CNV analysis

# Methodology from: https://www.cell.com/cell-reports-medicine/pdf/S2666-3791(23)00426-3.pdf


# Subset Seurat object to only contain tumor cells.

sample@assays$RNA <- as(sample@assays$RNA, "Assay")

# Generate a new metadata column storing the mapping cell-metacell.
sample[["metacell_mapping"]] <- "not_mapped"
Seurat::Idents(sample) <- sample$TME_annotation

# Will store all the metacells.
whole_metacells <- data.frame(test = rownames(sample), row.names = rownames(sample))
# Will store the complete annotation for the metacells.
whole_annotation <- data.frame(cluster_names = "test", row.names = "test")

meta_counter <- 0 # To keep a count of the metacells that are created.
metacell_content <- 5 # How many cells per metacell.

for (cluster_id in rev(levels(sample))){
  print(sprintf("Computing metacells for cluster %s.", cluster_id))
  # Will store the metacells per cluster.
  metacells <- data.frame(test = rownames(sample), row.names = rownames(sample))
  
  # Subset the sample by each cluster ID.
  chunksample <- sample[, sample$TME_annotation == cluster_id]
  
  # Get the count data as a data frame and transpose it so columns are GENES and rows are CELLS.
  countdata <- t(as.data.frame(Seurat::GetAssayData(chunksample, assay = "RNA", slot =  "counts")))
  
  # Get the possible amount of metacells.
  times <- trunc(dim(countdata)[1] / metacell_content)
  
  for (i in seq(1,times)){
    meta_counter <- meta_counter + 1
    # Generate slice points for each metacell.
    start <- ((i -1) * metacell_content + 1)
    end <- i * metacell_content
    
    
    # Compute the slice as a data frame containing the sum of the subsetted cells. dims = 1 row (metacell), X columns (genes)
    slice <- as.data.frame(colSums(countdata[start:end, ]))
    
    # Get the name of the cells merged.
    cell_names <- rownames(countdata[start:end, ])
    
    # Add the metacell.
    col_name <- sprintf("metacell_%s", meta_counter)
    metacells[[col_name]] <- slice[,1]
    
    # Add the mapping.
    sample$metacell_mapping[colnames(sample) %in% cell_names] <- col_name
  }
  
  # Delete the test column as we already have more than 1 column in our data frame.
  metacells[["test"]] <- NULL
  
  # Will contain the annotation of the generated metacells. Columns: cluster identities. Rows: each metacell.
  annotation <- data.frame(cluster_names = colnames(metacells), row.names = colnames(metacells))
  # Replace the dummy cluster_names column's values for the actual label for the cluster.
  annotation$cluster_names <- cluster_id
  
  # Add the annotation data and the metacell data to the global containers. In the end: # Columns for metacell object = # rows for annotation object.
  whole_metacells <- cbind(whole_metacells, metacells)
  whole_annotation <- rbind(whole_annotation, annotation)
}

# Turn the names into characters for the sake of avoiding errors when subsetting.
whole_annotation$cluster_names <- as.character(whole_annotation$cluster_names)

# Delete the test row from the global annotation data.
whole_annotation <- whole_annotation[!rownames(whole_annotation) %in% c("test"), , drop = FALSE]

# Delete the test column from the global metacell data.
whole_metacells$test <- NULL

cnv_analysis_folder <- "" # Path to store the output of inferCNV
dir.create(cnv_analysis_folder, recursive = TRUE)
annotation_file <- paste0(cnv_analysis_folder, "/annotation_metacells.tsv")

# Save the annotation object.
utils::write.table(whole_annotation,
                   file = annotation_file,
                   sep = "\t",
                   row.names = TRUE,
                   col.names = FALSE,
                   quote = FALSE)

# Return the metacell object as a matrix (required for running inferCNV).
count_matrix <- as.matrix(whole_metacells)

# Run inferCNV:
gene_ordering_file <- "" # Path to where the file with the order of the genes is stored. It can also be downloaded here: https://data.broadinstitute.org/Trinity/CTAT/cnv/

ref_clusters <- c("") # Which clusters to use as a reference.

# Create the inferCNV object.
infercnv_obj <- infercnv::CreateInfercnvObject(raw_counts_matrix = count_matrix,
                                               annotations_file = annotation_file,
                                               delim = "\t",
                                               gene_order_file = gene_ordering_file,
                                               ref_group_names = ref_clusters)

# Run inferCNV.
cnv_analysis_folder_output <- paste0(cnv_analysis_folder, "/output") # This path needs to not exist in your filesystem otherwise inferCNV will stop complaining that the path exists.

infercnv_obj <- infercnv::run(infercnv_obj,
                              cutoff = 0.1,  # use 1 for smart-seq, 0.1 for 10x-genomics
                              min_cells_per_gene = 3, # Default.
                              out_dir = cnv_analysis_folder,  # dir is auto-created for storing outputs
                              cluster_by_groups = TRUE,   # Cluster by groups.
                              denoise = TRUE,
                              HMM = TRUE,
                              HMM_type = "i6",
                              window_length = 201,
                              num_threads = 8,
                              resume_mode=FALSE)

# For further denoising, you can apply median filtering, but this might remove true signal from the plot.
infercnv_obj_median_filtered = infercnv::apply_median_filtering(infercnv_obj)

infercnv::plot_cnv(infercnv_obj_median_filtered,
                   out_dir =  cnv_analysis_folder,
                   output_filename = 'infercnv.median_filtered',
                   x.range = "auto",
                   x.center = 1,
                   title = "infercnv",
                   color_safe_pal = FALSE)