13 CNVs in gene expression subtypes

This document performs copy number variation (CNV) analysis for multiple myeloma (MM) RNA-seq data. It processes CNV data, integrates it with genomic annotations, and generates heatmaps to visualize CNV patterns across chromosomal bands and arms, stratified by sample subtypes. Additionally, it conducts statistical enrichment analysis to identify significant CNV alterations.

13.1 Setup

Load required R packages and set up the working environment.

pkgs <- c("fs", "futile.logger", "configr", "ggpubr", "ggthemes",
          "jhtools", "glue", "ggsci", "patchwork", "tidyverse",
          "circlize", "ComplexHeatmap", "GenomicRanges", "jhuanglabRNAseq")

# Load packages silently
for (pkg in pkgs) {
  suppressPackageStartupMessages(library(pkg, character.only = TRUE))
}

# Define project, dataset, and species
project <- "mm"
dataset <- "meta"
species <- "human"

# Create and set working directory for output
workdir <- glue("~/projects/{project}/analysis/{dataset}/{species}/rnaseq/figures/heatmap") |> checkdir()
setwd(workdir)

# Set random seed for reproducibility
set.seed(2025)

13.2 Data Loading

Load CNV data, chromosomal band annotations, gene annotations, and sample metadata.

# Create output directory for CNV analysis results
out_dir <- "./cnv_rnaclusterenrich" |> checkdir()

# Load chromosomal band data and filter for autosomes (chr1-22)
band <- read_rds("/cluster/home/ztao_jh/projects/mm/analysis/meta/human/figures/heatmap/cnv_rnaclusterenrich/band.rds") |>
  dplyr::filter(str_detect(chromosome_name, pattern = "^\\d+$")) |>
  dplyr::rename(chr = chromosome_name, start = start, end = end, band = band) |>
  mutate(chr = paste0("chr", chr))

# Load gene annotations from CSV and select relevant columns
annobedx <- "/cluster/home/jhuang/projects/mm/docs/meta/tx2gene.gencode.v47.csv" |>
  read_csv(show_col_types = FALSE) |>
  dplyr::select(seqid, start, end, gene_id, gene_name) |>
  dplyr::distinct()

# Rename columns for consistency
annobed <- annobedx |> dplyr::rename(chr = seqid, Gene = gene_id, genes = gene_name)

# Load supplemental CNV data from RDS (originally from an Excel file)
sm8_fn <- "~/projects/mm/docs/commpass/41588_2024_1853_MOESM8_ESM.rds"
supcnv <- read_rds(sm8_fn)

# Load sample metadata
sampleinfo <- "~/projects/mm/docs/meta/sampleinfo/sampleinfo_jilin_commpass.rds" |> read_rds()

CNV Data Preparation Annotate CNV data with gene and chromosomal band information, and prepare it for analysis.

# Annotate CNV data with gene information
anno_supcnv <- supcnv |> 
  left_join(annobed[, c("genes", "chr", "start", "Gene")]) |> 
  dplyr::select(-Gene)

# Reorder columns and ensure chromosome levels are consistent
anno_supcnv <- anno_supcnv |> 
  dplyr::select("chr", "start", "genes", everything()) |>
  mutate(chr = factor(chr, levels = c(paste0("chr", 1:22), "chrX", "chrY"))) |>
  arrange(chr, start) |>
  dplyr::filter(!is.na(chr) & !is.na(start)) |>
  mutate(cnv_rowid = 1:n())

# Create GenomicRanges objects for CNV and band data
grA <- GRanges(seqnames = anno_supcnv$chr, ranges = IRanges(start = anno_supcnv$start, end = anno_supcnv$start))
grB <- GRanges(seqnames = band$chr, ranges = IRanges(start = band$start, end = band$end))

# Find overlaps between CNV and band regions
overlaps <- findOverlaps(grA, grB)

# Create a dataframe of overlaps
overlap_df <- data.frame(
  cnv_rowid = anno_supcnv$cnv_rowid[queryHits(overlaps)],
  A_chr = seqnames(grA)[queryHits(overlaps)],
  A_start = start(grA)[queryHits(overlaps)],
  A_end = end(grA)[queryHits(overlaps)],
  gene = anno_supcnv$genes[queryHits(overlaps)],
  band = band$band[subjectHits(overlaps)]
)

13.3 Long Format Transformation

Transform CNV data into long format for visualization and analysis by chromosomal bands.

# Merge overlap data with CNV annotations
anno_supcnv_long <- overlap_df[, c("cnv_rowid", "A_chr", "band")] |>
  left_join(anno_supcnv) |>
  pivot_longer(-c(cnv_rowid, A_chr, band, chr, start, genes), names_to = "sample_id") |>
  left_join(sampleinfo[, c("sample_id", "PrimaryCluster", "subtypes", "subtype_index")]) |>
  dplyr::filter(!is.na(PrimaryCluster))

# Summarize CNV values by sample, cluster, and band
anno_supcnv_long_s <- anno_supcnv_long |>
  group_by(sample_id, PrimaryCluster, subtypes, subtype_index, chr, band) |>
  summarise(value = mean(value)) |>
  ungroup()

# Binarize CNV values (gain: >0.2, loss: <-0.2, neutral: else)
anno_supcnv_long_sb <- anno_supcnv_long_s |>
  mutate(bivalue = case_when(
    value > 0.2 ~ 1,
    value < -0.2 ~ -1,
    TRUE ~ 0
  )) |>
  mutate(
    label1 = str_extract(chr, pattern = "\\d+") |> as.numeric(),
    label2 = ifelse(str_detect(band, pattern = "p"), 1, 2),
    label3 = str_extract(band, pattern = "\\d+") |> as.numeric(),
    label4 = str_extract(band, pattern = "(?<=\\.)\\d+") |> as.numeric(),
    fullchr = paste0(chr, band)
  ) |>
  arrange(chr, desc(label2), desc(label3), desc(label4)) |>
  mutate(fullchr = factor(fullchr, levels = unique(fullchr)))

# Ensure subtypes are factors
anno_supcnv_long_sb$subtypes <- factor(anno_supcnv_long_sb$subtypes)

# Prepare data for plotting
dat_anno <- anno_supcnv_long_sb |> 
  arrange(subtype_index) |>
  mutate(sample_id = factor(sample_id, levels = unique(sample_id)))

Heatmap Visualization (Chromosomal Bands) Generate a heatmap visualizing CNV patterns across chromosomal bands.

# Create PDF for heatmap
pdf(glue("{out_dir}/anno_supcnv_long_sb.pdf"), width = 10, height = 5)
ggplot(dat_anno, aes(x = sample_id, y = fullchr, fill = bivalue)) +
  geom_tile() +
  scale_fill_gradient2(low = "blue", high = "red") +
  theme_few() +
  facet_grid(chr ~ subtypes, scales = "free", space = "free") +
  theme(
    axis.text = element_blank(),
    axis.ticks = element_blank(),
    strip.text.y.right = element_text(angle = 0),
    panel.spacing = unit(0, "lines")
  )
dev.off()

13.4 Long Format Transformation (Chromosomal Arms)

Transform CNV data into long format for analysis by chromosomal arms.

# Extract chromosomal arm (p or q) and summarize CNV values
anno_supcnv_long_arm_s <- anno_supcnv_long |>
  mutate(arm = str_extract(band, pattern = "[pq]")) |>
  group_by(sample_id, PrimaryCluster, subtypes, subtype_index, chr, arm) |>
  summarise(value = mean(value)) |>
  ungroup()

# Binarize CNV values for arms
anno_supcnv_long_arm_sb <- anno_supcnv_long_arm_s |>
  mutate(bivalue = case_when(
    value > 0.2 ~ 1,
    value < -0.2 ~ -1,
    TRUE ~ 0
  )) |>
  mutate(
    label1 = str_extract(chr, pattern = "\\d+") |> as.numeric(),
    label2 = ifelse(str_detect(arm, pattern = "p"), 1, 2),
    fullchr = paste0(chr, arm)
  ) |>
  arrange(chr, desc(label2)) |>
  mutate(fullchr = factor(fullchr, levels = unique(fullchr)))

# Ensure subtypes are factors
anno_supcnv_long_arm_sb$subtypes <- factor(anno_supcnv_long_arm_sb$subtypes)

# Prepare data for plotting
dat_anno_arm <- anno_supcnv_long_arm_sb |> 
  arrange(subtype_index) |>
  mutate(sample_id = factor(sample_id, levels = unique(sample_id)))

13.5 Heatmap Visualization (Chromosomal Arms)

Generate a heatmap visualizing CNV patterns across chromosomal arms.

# Create PDF for arm-level heatmap
pdf(glue("{out_dir}/anno_supcnv_long_sb_arm.pdf"), height = 4, width = 12)
ggplot(dat_anno_arm, aes(x = sample_id, y = fullchr, fill = bivalue)) +
  geom_tile() +
  scale_fill_gradient2(low = "blue", high = "red") +
  theme_few() +
  facet_grid(chr ~ subtypes, scales = "free", space = "free") +
  theme(
    axis.text = element_blank(),
    axis.ticks = element_blank(),
    strip.text.y.right = element_text(angle = 0),
    panel.spacing = unit(0, "lines")
  )
dev.off()

13.6 Enrichment Analysis Preparation

Calculate distributions of CNV events across chromosomal arms and subtypes.

# Calculate overall CNV distribution by arm
edgedistribute <- anno_supcnv_long_arm_sb |>
  group_by(chr, arm, fullchr, bivalue) |> 
  summarise(Nn = n()) |>
  group_by(chr, arm, fullchr) |> 
  mutate(N = sum(Nn)) |>
  ungroup() |>
  mutate(pct = Nn / N)

# Calculate CNV distribution by subtype
clusterdistribute <- anno_supcnv_long_arm_sb |>
  group_by(chr, arm, fullchr, bivalue, subtypes) |> 
  summarise(n = n()) |>
  group_by(chr, arm, fullchr, subtypes) |> 
  mutate(NC = sum(n)) |>
  ungroup()

# Merge distributions
fulldistribute <- clusterdistribute |> left_join(edgedistribute)

13.7 Statistical Enrichment Function

Define a function to compute p-values for CNV enrichment using binomial and hypergeometric tests.

# Function to calculate enrichment p-values
bphp <- function(drawn_positive_n, drawn_size, poistove_n, global_n, pct) {
  dbv <- dbinom(x = 0:drawn_size, size = drawn_size, prob = pct)
  dhv <- dhyper(x = 0:drawn_size, m = poistove_n, n = global_n - poistove_n, k = drawn_size)
  tibble(
    down_db_p = sum(dbv[0:drawn_positive_n]),
    up_db_p = sum(dbv[drawn_positive_n:drawn_size]),
    down_dh_p = sum(dhv[0:drawn_positive_n]),
    up_dh_p = sum(dhv[drawn_positive_n:drawn_size]),
    OE = drawn_positive_n / (pct * drawn_size)
  )
}

# Apply enrichment function to distribution data
fulldistribute_pp <- fulldistribute |> 
  rowwise() |> 
  mutate(plist = list(bphp(n, NC, Nn, N, pct))) |> 
  unnest(cols = plist)

Enrichment Analysis (Losses) Perform enrichment analysis for CNV losses (bivalue = -1) and visualize results.

# Filter for losses and compute significance
fulldistribute_pp_sig <- fulldistribute_pp |>
  rowwise() |>
  dplyr::filter(bivalue == -1) |>
  mutate(
    sigdown = down_db_p < 0.025 | down_dh_p < 0.025,
    sigup = up_db_p < 0.025 | up_dh_p < 0.025,
    twosidep = min(down_db_p, down_dh_p, up_db_p, up_dh_p) * 2
  ) |>
  arrange(subtypes)

# Volcano plot for losses
pdf(glue("{out_dir}/anno_supcnv_long_sb_armsig_vol_dn.pdf"), height = 7, width = 4)
ggplot(fulldistribute_pp_sig, aes(x = log2(OE), y = -log10(twosidep))) +
  geom_point()
dev.off()

# Heatmap for significant losses
pdf(glue("{out_dir}/anno_supcnv_long_sb_armsig_dn.pdf"), height = 4, width = 12)
ggplot(fulldistribute_pp_sig |>
         dplyr::mutate(
           OE = case_when(
             twosidep >= 0.05 ~ 1,
             OE < 1 ~ 1,
             TRUE ~ OE
           )
         ), aes(x = subtypes, y = fullchr, fill = log2(OE))) +
  geom_tile() +
  geom_point(data = fulldistribute_pp_sig |> dplyr::filter(twosidep < 0.05 & OE > 1)) +
  scale_fill_gradient2(low = "blue", high = "blue") +
  theme_few() +
  facet_grid(chr ~ subtypes, scales = "free", space = "free") +
  theme(
    axis.text = element_blank(),
    axis.ticks = element_blank(),
    strip.text.y.right = element_text(angle = 0),
    panel.spacing = unit(0, "lines")
  )
dev.off()