Doublet Removal

# Load conda environment with python instalation. This depends on each user's.
reticulate::use_condaenv(condaenv = "rstudio_server")

# Import scrublet.
scrublet <- reticulate::import("scrublet")

# Compute doublets per patient.
list.output <- list()
for (samp in unique(sample$orig.ident)){
  rlang::inform(paste0(add_info(), crayon_body("Analysing sample: "), crayon_key(samp)))
  
  sample.use <- sample[, sample$orig.ident == samp]
  
  counts_transposed <- Matrix::t(sample.use@assays$RNA@counts)

  # Run scrublet.
  # Code adapted from: https://github.com/swolock/scrublet/blob/master/examples/scrublet_basics.ipynb
  scrub = scrublet$Scrublet(counts_transposed, expected_doublet_rate = 0.06)
  
  # Compute the doublets.
  return_list = scrub$scrub_doublets() # List with the output from scrublet.
  scrublet_score <- return_list[[1]] # Scrublet scores per cell.
  scrublet_binary <- return_list[[2]] # Scrublet assignment for each cell.
  
  # Add cell names to the output, so it can be integrated in the Seurat object.
  row.names(scrublet_score) <- colnames(sample.use)
  list.output[[samp]] <- scrublet_score
}

# Get the doublet scores.
list.matrices <- list()
for (samp in names(list.output)){
  data <- list.output[[samp]]
  
  data.use <- data.frame("Cell" = names(data),
                         "Score" = unname(data),
                         "orig.ident" = rep(samp, length(data))) %>% 
              tibble::as_tibble() %>% 
              dplyr::left_join(y = sample@meta.data %>% tibble::rownames_to_column(var = "Cell") %>% dplyr::select(dplyr::all_of(c("Cell", "nCount_RNA", "nFeature_RNA"))),
                               by = "Cell")
  list.matrices[[samp]] <- data.use
}
data <- do.call(rbind, list.matrices)

# Add them as metadata.
sample@meta.data <- sample@meta.data %>% 
                    tibble::rownames_to_column(var = "Cell") %>% 
                    dplyr::left_join(y = data %>% dplyr::select(dplyr::all_of(c("Cell", "Score"))),
                                     by = "Cell") %>% 
                    tibble::column_to_rownames(var = "Cell") %>% 
                    dplyr::rename("Scrublet Score" = "Score")

# Inspect the distribution of scores per patient and manually select a doublet score cutoff for each of them.
cutoffs <- list("H049-ATRT-0014" = 0.175,
                "H049-ATRT-0005" = 0.225,
                "H049-JVCT" = 0.175,
                "H049-ATRT-0001" = 0.175,
                "H049-ATRT-0013" = 0.175,
                "H049-ATRT-0003" = 0.175,
                "H049-ATRT-0010" = 0.175,
                "H049-ATRT-0006" = 0.2,
                "H049-ATRT-0022-P" = 0.175,
                "H049-UV6K" = 0.2,
                "H049-ATRT-0007" = 0.2,
                "H049-ATRT-0009" = 0.175,
                "ATRT21" = 0.175,
                "ATRT24" = 0.175,
                "ATRT04" = 0.175,
                "ATRT05.2" = 0.175,
                "ATRT15_RV3" = 0.2,
                "JD113T" = 0.175,
                "ATRT14" = 0.175,
                "CG_SB_NB8358" = 0.175)

# Retrieve cells to exclude.
cells.exclude <- NULL
for (patient in unique(sample$orig.ident)){
  cutoff <- cutoffs[[patient]]
  sample.test <- sample[, sample$orig.ident == patient]
  cells.exclude <- append(cells.exclude, names(sample.test$`Scrublet Score`[sample.test$`Scrublet Score` >= cutoff]))
}

# Exclude them.
sample <- sample[, !(colnames(sample) %in% cells.exclude)]