pkgs <- c("fs", "futile.logger", "configr", "stringr", "ggpubr", "ggthemes",
"jhtools", "glue", "ggsci", "patchwork", "tidyverse", "dplyr",
"SummarizedExperiment", "jhuanglabRNAseq", "DESeq2", "maftools",
"matrixStats", "Matrix", "furrr", "ComplexHeatmap")
for (pkg in pkgs){
suppressPackageStartupMessages(library(pkg, character.only = T))
}
setwd("/cluster/home/jhuang/projects/mm/analysis/meta/human/manuscript")
# ======================== mixed plot of key mutations across ethnicity groups =======================
# load data info
merged_sinfo <- read_rds("~/projects/mm/docs/meta/sampleinfo/sampleinfo_jilin_commpass.rds")
keygenes <- c("KRAS", "NRAS", "DIS3", "FAM46C", "BRAF", "TP53", "HIST1H1E", "FGFR3", "SP140", "MAX", "CYLD",
"IGLL5", "DUSP2", "PRDM1", "TRAF3", "IRF4", "PRKD2", "PTPN11", "ATM", "KLHL6", "SAMHD1", "LTB",
"IKZF3", "CCND1", "IKBKB", "DDX5", "BCL7A", "HIST1H1B", "HNRNPU", "TCL1A", "SMU1", "RB1",
"RNASEH2C", "RPL5", "PIM1", "EGR1", "FIP1L1", "CDKN1B", "SGPP1", "TGDS", "EEF1A1", "SETD2",
"FRMPD2", "ANKRD9", "TRAF2", "FAM196A", "HIST1H4E", "IRF1", "HIST1H1D", "RASA2", "NFKBIA",
"ACTG1", "SP3", "EFTUD2", "EHD1", "HOOK1", "SF3B1", "FKBP14", "HNRNPA2B1", "ATP5D", "RFTN1",
"ZNF292", "IDH2", "ZFP36L1", "PIK3CA", "MAF", "ARID1A", "FUBP1", "XBP1", "IDH1", "HUWE1", "NF1",
"NCOR1", "ARID2", "CDKN2C", "EP300", "MAML2", "C8orf34", "KMT2C", "ABCF1", "MAN2C1", "NFKB2",
"MAFB", "TET2", "KMT2B", "CREBBP", "DNMT3A", "KDM6A", "ATRX", "UBR5", "KDM5C")
config_fn <- "rds/colors_fix.yaml"
config_list <- show_me_the_colors(config_fn, "all")
mut_colors <- config_list$mutation_type
smmh <- readr::read_rds("/rds/mutation/samtools_merge_mut_hotspots.rds")
#
# commpass_maf <- readr::read_rds("rds/41588_2024_1853_MOESM5_ESM_hg38.rds")
# commpass_maf_sub <- commpass_maf %>% dplyr::select(
# sample_id = sample, GENE, var = EFFECT,
# chr = Chromosome, pos = Start_Position, REF, ALT, HGVS_P)|>
# dplyr::filter(HGVS_P != ".") |>
# mutate(aa = str_extract(HGVS_P, pattern = "(?<=p\\.)[A-Za-z]+\\d+"))
# cannot exactly map to mutation types!
# transform into long format
mut_long <- smmh[, -(ncol(smmh))] %>%
pivot_longer(cols = -sample_id, names_to = "gene_name", values_to = "mutation_status") %>%
separate(gene_name, into = c("Gene", "aa"), sep = "_") %>%
group_by(Gene) %>%
mutate(hotspot_type = ifelse(all(grepl("other", aa)), FALSE, TRUE))
mut_long <- mut_long[!(grepl("_other$", mut_long$gene_name) & mut_long$hotspot_type), ] %>%
group_by(sample_id, Gene) %>%
summarise(mutation_status = any(mutation_status)) %>%
ungroup()
mut_long <- mut_long[mut_long$mutation_status %in% TRUE, ]
# add information
all_mut_p <- left_join(mut_long, merged_sinfo[, c("sample_id", "ethnicity")])
all_mut_p <- all_mut_p[all_mut_p$ethnicity %notin% "other", ]
colnames(all_mut_p) <- c("Sample", "Gene", "mutation_status", "Cohort")
all_mut_p <- all_mut_p[all_mut_p$Gene %notin% c("HNRNPA2B1", "NOTCH2"), ]
all_mut_p$MutationType <- "missense"
mutation_summary <- all_mut_p %>%
distinct(Sample, Gene, Cohort) %>%
group_by(Cohort) %>%
mutate(TotalSamples = n_distinct(Sample)) %>%
group_by(Cohort, Gene, TotalSamples) %>%
summarise(MutatedSamples = n_distinct(Sample), .groups = "drop") %>%
mutate(MutationRatio = MutatedSamples / TotalSamples)
# data for plot
dot_data <- mutation_summary
cohort_colors <- config_list$ethnicity
dot_data$Gene <- factor(dot_data$Gene, levels = keygenes[keygenes %in% unique(dot_data$Gene)])
side_data <- all_mut_p %>%
group_by(Gene, MutationType) %>%
summarise(Count = n(), .groups = "drop")
side_data$Gene <- factor(side_data$Gene, levels = keygenes[keygenes %in% unique(side_data$Gene)])
# gene re-level by functions:
gene_sets_anno <- list(
"RTK/RAS/MAPK" = c("KRAS", "NRAS", "BRAF", "FGFR3", "PTPN11", "NF1", "RASA2"),
"Transcription factor" = c("MAF", "MAFB", "IRF4", "IRF1", "PRDM1", "SP3", "EGR1", "MAX"),
"Chromatin remodeling" = c("KMT2C", "KMT2B", "KDM6A", "KDM5C", "SETD2", "ARID1A", "ARID2", "CREBBP", "EP300", "NCOR1"),
"Methylation regulator" = c("IDH1", "IDH2", "TET2", "DNMT3A"),
"RNA Processing" = c("DIS3", "SF3B1", "EEF1A1", "FIP1L1", "DDX5", "SMU1", "EFTUD2", "HNRNPA2B1", "HNRNPU"),
"DNA repair/Cell cycle" = c("TP53", "RB1", "ATM", "ATRX", "CDKN1B", "CDKN2C", "HUWE1", "UBR5"),
"NFkB/immune-related" = c("TRAF2", "TRAF3", "NFKBIA", "NFKB2", "LTB", "IKBKB", "IKZF3", "CYLD", "SP140"),
"RNA stability" = c("RPL5", "ATP5D", "ZFP36L1", "ABCF1", "RNASEH2C", "TGDS", "MAN2C1"),
"Ig/Plasma-related" = c("IGLL5", "XBP1", "FAM46C", "CCND1", "TCL1A", "PIM1", "SGPP1", "BCL7A", "FUBP1", "ZNF292", "HOOK1"),
"Histone-related" = c("HIST1H1E", "HIST1H1D", "HIST1H1B", "HIST1H4E")
)
gene_group <- stack(gene_sets_anno)
colnames(gene_group) <- c("Gene", "Group")
gene_group$Group <- as.character(gene_group$Group)
dot_data_annotated <- dot_data %>% left_join(gene_group, by = "Gene") %>% na.omit()
# reorder - group
group_levels <- c(names(gene_sets_anno), "Other")
dot_data_annotated$Group <- factor(dot_data_annotated$Group, levels = group_levels)
# reorder - genes
gene_order <- dot_data_annotated %>%
group_by(Group, Gene) %>%
summarise(max_ratio = max(MutationRatio, na.rm = TRUE), .groups = "drop") %>%
arrange(Group, desc(max_ratio)) %>%
pull(Gene)
dot_data_annotated$Gene <- factor(dot_data_annotated$Gene, levels = unique(gene_order))
gene_group_colors <- config_list$gene_group_colors
# update dotplot
p_dot <- ggplot(dot_data_annotated, aes(x = Gene, y = Cohort)) +
geom_point(aes(size = MutationRatio, color = Group)) +
scale_color_manual(values = gene_group_colors,
guide = guide_legend(nrow = 3)) +
scale_size(range = c(2, 6),
guide = guide_legend(ncol = 1)) +
theme_bw() +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
legend.position = "bottom") +
labs(title = "Mutation Frequency by Gene and Cohort", x = "Gene", y = "Cohort")
# update barplot
side_data$Gene <- factor(side_data$Gene, levels = levels(dot_data_annotated$Gene))
side_data <- na.omit(side_data)
p_bar <- ggplot(side_data, aes(x = Gene, y = Count, fill = MutationType)) +
geom_bar(stat = "identity") +
scale_fill_manual(values = mut_colors) +
theme_minimal() +
labs(title = "Mutation Type Distribution", y = "Count") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# merge and output
pdf("figure2_dotplot_3groups.pdf", height = 6, width = 15)
final_plot <- ggarrange(
p_bar,
p_dot,
ncol = 1,
heights = c(1.25, 1.75),
align = "v"
)
print(final_plot)
dev.off()