#This scripts described how to pre-process 10X Genomics multiome data starting with an output of cell-ranger-arc#For questions contact the Drost Lab: j.drost@prinsesmaximacentrum.nl#Pipeline as described below was used for organoids as well as tissue samples used for scMultiome-seq#Libraries neededlibrary(Signac)library(JASPAR2020)library(TFBSTools)library(Seurat)library(EnsDb.Hsapiens.v86)library(BSgenome.Hsapiens.UCSC.hg38)library(future)library(ggplot2)library(Matrix)library(dplyr)library(ggplot2)library(scales)library(patchwork)library(Polychrome)library(biomaRt)library(GO.db)library(org.Hs.eg.db)library(RSQLite)library(stringr)library(SingleR)library(clustree)library(infercnv)library(celldex)library(tidyr)library(tibble)library(plyr)library(plyranges)library(chromVAR)#Change to your working directorysetwd("/hpc/pmc_drost/rstudio_ire/multiome_IP/libraries")#plan("multicore", workers = 4)options(future.globals.maxSize =50*1024^3)##Load gene sets used to remove confounder effect of cell cycle and red blood cellsgenesRM<-readRDS("genes_remove_HB_CC_stress.RDS")# Configure variables ----basedir<-getwd()CFG<-list()CFG$data_dir<-paste0(basedir, "/data/")CFG$work_dir<-paste0(basedir, "/work_dir/")CFG$output_dir<-paste0(basedir, "/output/")CFG$metadata_dir<-paste0(basedir)#Needs to be changed depending on where this file is storedCFG$workspace_dir<-paste0(basedir, "/workspace/")CFG$ndims<-40#Number of dimensions for dimensional reductionCFG$random_seed<-1234#Random seed for UMAP generationCFG$mito_pattern<-"^MT-"CFG$min_txpts<-800#As discussed with the single cell facility in the Princess Maxima Center CFG$max_txpts<-30000CFG$min_features<-500CFG$max_pctmito<-20CFG$min_TSS<-1CFG$max_nucleo<-1.5CFG$max_frgmts<-50000CFG$min_frgmts<-800CFG$max_pct_hemo<-5CFG$gradient_colors=viridis::viridis(101, direction =-1)CFG$cbPalette4<-c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#EE5050","#111111", "#00E656", "#C8C8C8", "#E6009F", "hotpink1", "#D55E00", "#CC79A7","cyan2", "darkmagenta")set.seed(CFG$random_seed)#Get gene annotations for hg38annotation<-GetGRangesFromEnsDb(ensdb =EnsDb.Hsapiens.v86)ucsc.levels<-str_replace(string=paste("chr",seqlevels(annotation),sep=""), pattern="chrMT", replacement="chrM")#mitochondria annotationseqlevels(annotation)<-ucsc.levelsgenome(annotation)<-"hg38"# Load data ----## Load metadata and create directory information ----metadata.exp<-read.table(paste0('/hpc/pmc_drost/rstudio_ire/multiome_IP/libraries/multiome_drost_metadata_new.txt'), header=T)#Loads metadata table for experiments (M) with accompanying library and analysis numbersexperiments<-unique(metadata.exp[,1])#Here you can subset for experiments (M) of interest (or just take all)lib<-unique(paste0(metadata.exp$experiment, "_",metadata.exp$RNA_lib, "_",metadata.exp$ATAC_lib, "_",metadata.exp$analysis))#Generates character vector with the full path name on the surfdrive seq_data folder to identify the libraries per experiment#Generates dataframe with the directory for the count install.packages ATAC fragments (column 3) to be retrieveddir<-data.frame(experiments, dirFilt =paste0(CFG$data_dir, lib, "/filtered_feature_bc_matrix"), dirAll =paste0(CFG$data_dir, lib))select_data<-unique((metadata.exp%>%filter(source%in%c("organoid","organoids")))[,1])#this line kchanges if you take the tissue datadir<-dir[which(dir$experiments%in%select_data),]## Function to calculate colSums in GEX data per experiment ---## Function to calculate colSums in GEX data per experiment ---fun.GEX.colSums<-function(a){#b <- Matrix::colSums(counts.list[[a]]) b<-Matrix::colSums(counts.list[[a]]$'Gene Expression')##this is for combined analysisreturn(b)}# load the RNA and ATAC data and create a Seurat object##Here we read only GEX datafun.READ10X<-function(a){counts.list<-Read10X(data.dir =dir[dir$experiments==a, 2])#counts.list <- counts.list[[1]] ##comment this if running both ATAC and GEX togetherreturn(counts.list)}fun.GEX.meta<-function(a){data<-counts.list[[a]]$'Gene Expression'#use this if doing combined ATAC and GEX reading#data <- counts.list[[a]] mitos<-grep(CFG$mito_pattern, rownames(data), value =TRUE)#Find mitochondrial genespercent_mito<-100*Matrix::colSums(data[mitos, ])/GEX.colSums[[a]]#Calculate percentage of mitochondrial genes per cell; gives named vector which is necessary for adding the meta.data later (barcodes have to be included)nuclear<-setdiff(rownames(data), mitos)#Define nuclear genes via setdiff functionlog2_transcript_counts<-log2(1+Matrix::colSums(data[nuclear , ]))#log2_transcript_counts for nuclear geneslog2_feature_counts<-log2(1+Matrix::colSums(data[nuclear, ]>0))meta<-data.frame(percent_mito =percent_mito, #create meta.data df for Seurat object log2_counts =log2_transcript_counts, log2_features =log2_feature_counts, source =rep(metadata.exp[match(a, metadata.exp[,1]),6], ncol(data)), lib =rep(a, ncol(data)))#Automatically adds correct experiment to dflist<-list(meta, nuclear)return(list)}fun.GEX.Seurat<-function(a){counts<-counts.list[[a]]$`Gene Expression`[GEX.meta[[a]][[2]],]#use this if doing combined ATAC and GEX reading#counts <- counts.list[[a]]srat<-CreateSeuratObject(counts =counts, assay ="RNA", meta =GEX.meta[[a]][[1]])#Add information in meta.data from abovereturn(srat)}## Load data for selected data/library ----counts.list<-lapply(select_data, fun.READ10X)#Make sure to use the correct function (both, GEX, ATAC)names(counts.list)<-select_dataGEX.colSums<-sapply(select_data, fun.GEX.colSums)names(GEX.colSums)<-select_data## Generate GEX meta.data list ----GEX.meta<-lapply(select_data, fun.GEX.meta)names(GEX.meta)<-select_data##Create Seurat object with data and metadatasrat.GEX<-sapply(select_data, fun.GEX.Seurat)# create ATAC assay and add it to the object##But first, read peak filesgrFiles<-list()for(expinselect_data){# ##read peaks filebedFile<-read.table(paste0(dir$dirAll[match(exp, dir$experiments)], "/atac_peaks.bed"), col.names =c("chr", "start", "end"))grFiles[[exp]]<-makeGRangesFromDataFrame(bedFile)}# Create a unified set of peaks to quantify in each datasetcombined.peaks<-disjoin(x =c(grFiles[[1]], grFiles[[2]], grFiles[[3]]))# Filter out bad peaks based on length and combine peakspeakwidths<-width(combined.peaks)combined.peaks<-combined.peaks[peakwidths<10000&peakwidths>20]##Here filter further to only keep standard chromosomescombined.peaks<-keepStandardChromosomes(combined.peaks, pruning.mode="coarse")##Here filter to remove blacklisted genescombined.peaks<-subsetByOverlaps(x =combined.peaks, ranges =blacklist_hg38_unified, invert =TRUE)atacAll<-srat.GEXfor(expinselect_data){fragpath<-paste0(dir$dirAll[match(exp, dir$experiments)], "/atac_fragments.tsv.gz")atacAll[[exp]][["ATAC"]]<-CreateChromatinAssay( counts =counts.list[[exp]]$Peaks, sep =c(":", "-"), fragments =fragpath, annotation =annotation)DefaultAssay(atacAll[[exp]])<-"ATAC"atacAll[[exp]]<-NucleosomeSignal(atacAll[[exp]])atacAll[[exp]]<-TSSEnrichment(atacAll[[exp]])##Create fragment object for each samplefrags<-CreateFragmentObject(path =fragpath)CRcounts<-FeatureMatrix(fragments =Fragments(atacAll[[exp]]), features =combined.peaks, cells =colnames(atacAll[[exp]]))atacAll[[exp]][["peaks"]]<-CreateChromatinAssay( counts =CRcounts, fragments =fragpath, annotation =annotation)pdf(file=paste0(CFG$output_dir, exp, "_QC_Seurat_merge_blacklist.pdf"))p<-VlnPlot( object =atacAll[[exp]], features =c("nCount_RNA", "nCount_ATAC", "TSS.enrichment", "nucleosome_signal"), ncol =4, pt.size =0)print(p)dev.off()saveRDS(atacAll, "srat_merged_disjoin_organoids.RDS")}for(expinselect_data){atacAll[[exp]]<-subset( x =atacAll[[exp]], subset =nCount_RNA>=CFG$min_txpts&#Minimum threshold transcripts nCount_RNA<=CFG$max_txpts&#Maximum threshold transcriptspercent_mito<=CFG$max_pctmito&nucleosome_signal<CFG$max_nucleo&TSS.enrichment>CFG$min_TSS&nCount_ATAC<CFG$max_frgmts&nCount_ATAC>CFG$min_frgmts)pdf(file=paste0(CFG$output_dir, exp, "_QC_filtered_merged_BL.pdf"))p<-VlnPlot( object =atacAll[[exp]], features =c("nCount_RNA", "nCount_ATAC", "TSS.enrichment", "nucleosome_signal"), ncol =4, pt.size =0)print(p)dev.off()}##Merge seurat objects from the filtered librariessrat<-merge(atacAll[[1]], c(atacAll[[2]], atacAll[[3]]), add.cell.ids =names(atacAll))## Normalize GEX data using LogNormalize ----DefaultAssay(srat)<-"RNA"srat<-NormalizeData(srat, normalization.method ="LogNormalize")#Goes to slot "Data" of RNA assay (GetAssayData function)srat<-ScaleData(srat, features =rownames(srat), verbose =FALSE)#Goes to slot "scale.data" of RNA assay (GetAssayData function)srat<-FindVariableFeatures(srat)## Normalize GEX data using SCTransform ----# SCTransform automatically scales data and finds variablesDefaultAssay(srat)<-"RNA"srat<-SCTransform(srat,verbose =FALSE, variable.features.n =3000)#Specify number of features for downstream analysissaveRDS(srat, "090523_srat_SCT_done")srat<-readRDS("120523_srat_SCT_done_organoids")## Inspect most highly variable genes ----DefaultAssay(srat)<-"SCT"top10<-head(VariableFeatures(srat), 10)plot1<-VariableFeaturePlot(srat)plot2<-LabelPoints(plot =plot1, points =top10)plot2##Dimensional reduction ----### PCA: Calculation ----# There can only be one PCA in the Seurat object. Start with SCTransform to generate UMAP; LogNormalized will be used for gene expression analysissrat<-RunPCA(srat, npcs =50)#PCA for SCTransformed Seurat object### PCA: Visualize nuclei along the first two principal axes per normalization method ----DimPlot(srat, reduction ="pca", pt.size =1, group.by ='lib', cols =CFG$cbPalette4)### Elbow Plot to choose number of dimensions ----ElbowPlot(srat, ndims =50, reduction ="pca")###Alternatively, consider a more quantitative approach to determine how many pcs to use# Determine percent of variation associated with each PCpct<-srat@reductions$pca@stdev/sum(srat@reductions$pca@stdev)*100# Calculate cumulative percents for each PCcum<-cumsum(pct)# Determine which PC exhibits cumulative percent greater than 90% and % variation associated with the PC as less than 5co1<-which(cum>90&pct<5)[1]# Determine the difference between variation of PC and subsequent PCco2<-sort(which((pct[1:length(pct)-1]-pct[2:length(pct)])>0.1), decreasing =T)[1]+1# last point where change of % of variation is more than 0.1%.# Minimum of the two calculationpcs<-min(co1, co2)# change to any other number### PC scores heatmaps ----DimHeatmap(srat, dims =1:20, cells =100)### UMAP (before filtering) ----srat<-RunUMAP(srat, dims =1:30, n.neighbors =30)#Default n.neighbors = 30pdf(file ="Unfiltered variable features UMAP_merged.pdf")DimPlot(srat, reduction ="umap", pt.size =0.5, group.by ='lib', cols =CFG$cbPalette4)+ggtitle("SCT unfiltered UMAP, 20 PCs")dev.off()## Filter certain gene groups from variable features ----# This concerns cell cycle genes, stress genes etc. ##It makes use of the SCutils package developed by our scGenomics facility### Remove cells with high hemoglobin content ----hb.genes<-genesRM$hb.geneshb.genes<-intersect(hb.genes, rownames(srat))hb.counts<-Matrix::colSums(srat@assays$RNA@counts[hb.genes,])srat<-AddMetaData(srat, col.name ="log2hb_genes", metadata =log2(1+hb.counts))srat<-AddMetaData(srat, col.name ="pct_hemo", metadata =100*hb.counts/srat@meta.data$nCount_RNA)srat<-AddMetaData(srat, col.name ="percent.rb", metadata =PercentageFeatureSet(srat, pattern ="^RP[SL]"))srat<-CellCycleScoring(object =srat, g2m.features =cc.genes$g2m.genes, s.features =cc.genes$s.genes)srat<-subset(srat, pct_hemo<=CFG$max_pct_hemo)#Will likely only filter out few nuclei#Now we have a filtered dataset where we need to add a chromvar assay #Add a chromVAR array to the seurat object # Get a list of motif position frequency matrices from the JASPAR databasepfm<-getMatrixSet( x =JASPAR2020, opts =list(collection ="CORE",species ="Homo sapiens", tax_group ='vertebrates', all_versions =FALSE))# add motif informationDefaultAssay(srat)<-'peaks'tissue<-SetIdent(srat, value="cell_status_and_subtype")tissue<-AddMotifs( object =tissue, genome =BSgenome.Hsapiens.UCSC.hg38, pfm =pfm)#Add chromVar assay to the seurat objectregister(MulticoreParam(8))register(SerialParam())DefaultAssay(srat)<-"peaks"# Calculate chromvar scores on the peak assay#Run this as seperate script as job instead of in the command line tissue<-RunChromVAR( object =srat, assay ='peaks', genome =BSgenome.Hsapiens.UCSC.hg38)