26  Mutation frequency by ethnicity

pkgs <- c("fs", "futile.logger", "configr", "stringr", "ggpubr", "ggthemes",
          "jhtools", "glue", "ggsci", "patchwork", "tidyverse", "dplyr",
          "SummarizedExperiment", "jhuanglabRNAseq", "DESeq2", "maftools",
          "matrixStats", "Matrix", "furrr", "ComplexHeatmap")
for (pkg in pkgs){
  suppressPackageStartupMessages(library(pkg, character.only = T))
}

setwd("/cluster/home/jhuang/projects/mm/analysis/meta/human/manuscript")


# ======================== mixed plot of key mutations across ethnicity groups =======================

# load data info
merged_sinfo <- read_rds("~/projects/mm/docs/meta/sampleinfo/sampleinfo_jilin_commpass.rds")

keygenes <- c("KRAS", "NRAS", "DIS3", "FAM46C", "BRAF", "TP53", "HIST1H1E", "FGFR3", "SP140", "MAX", "CYLD",
              "IGLL5", "DUSP2", "PRDM1", "TRAF3", "IRF4", "PRKD2", "PTPN11", "ATM", "KLHL6", "SAMHD1", "LTB",
              "IKZF3", "CCND1", "IKBKB", "DDX5", "BCL7A", "HIST1H1B", "HNRNPU", "TCL1A", "SMU1", "RB1",
              "RNASEH2C", "RPL5", "PIM1", "EGR1", "FIP1L1", "CDKN1B", "SGPP1", "TGDS", "EEF1A1", "SETD2",
              "FRMPD2", "ANKRD9", "TRAF2", "FAM196A", "HIST1H4E", "IRF1", "HIST1H1D", "RASA2", "NFKBIA",
              "ACTG1", "SP3", "EFTUD2", "EHD1", "HOOK1", "SF3B1", "FKBP14", "HNRNPA2B1", "ATP5D", "RFTN1",
              "ZNF292", "IDH2", "ZFP36L1", "PIK3CA", "MAF", "ARID1A", "FUBP1", "XBP1", "IDH1", "HUWE1", "NF1",
              "NCOR1", "ARID2", "CDKN2C", "EP300", "MAML2", "C8orf34", "KMT2C", "ABCF1", "MAN2C1", "NFKB2",
              "MAFB", "TET2", "KMT2B", "CREBBP", "DNMT3A", "KDM6A", "ATRX", "UBR5", "KDM5C")

config_fn <- "rds/colors_fix.yaml"
config_list <- show_me_the_colors(config_fn, "all")
mut_colors <- config_list$mutation_type
smmh <- readr::read_rds("/rds/mutation/samtools_merge_mut_hotspots.rds")

#
# commpass_maf <- readr::read_rds("rds/41588_2024_1853_MOESM5_ESM_hg38.rds")
# commpass_maf_sub <- commpass_maf %>% dplyr::select(
#   sample_id = sample, GENE, var = EFFECT,
#   chr = Chromosome, pos = Start_Position, REF, ALT, HGVS_P)|>
#   dplyr::filter(HGVS_P != ".") |>
#   mutate(aa = str_extract(HGVS_P, pattern = "(?<=p\\.)[A-Za-z]+\\d+"))

# cannot exactly map to mutation types!


# transform into long format
mut_long <- smmh[, -(ncol(smmh))] %>%
  pivot_longer(cols = -sample_id, names_to = "gene_name", values_to = "mutation_status") %>%
  separate(gene_name, into = c("Gene", "aa"), sep = "_") %>%
  group_by(Gene) %>%
  mutate(hotspot_type = ifelse(all(grepl("other", aa)), FALSE, TRUE))
mut_long <- mut_long[!(grepl("_other$", mut_long$gene_name) & mut_long$hotspot_type), ] %>%
  group_by(sample_id, Gene) %>%
  summarise(mutation_status = any(mutation_status)) %>%
  ungroup()
mut_long <- mut_long[mut_long$mutation_status %in% TRUE, ]

# add information
all_mut_p <- left_join(mut_long, merged_sinfo[, c("sample_id", "ethnicity")])
all_mut_p <- all_mut_p[all_mut_p$ethnicity %notin% "other", ]
colnames(all_mut_p) <- c("Sample", "Gene", "mutation_status", "Cohort")
all_mut_p <- all_mut_p[all_mut_p$Gene %notin% c("HNRNPA2B1", "NOTCH2"), ]
all_mut_p$MutationType <- "missense"

mutation_summary <- all_mut_p %>%
  distinct(Sample, Gene, Cohort) %>%
  group_by(Cohort) %>%
  mutate(TotalSamples = n_distinct(Sample)) %>%
  group_by(Cohort, Gene, TotalSamples) %>%
  summarise(MutatedSamples = n_distinct(Sample), .groups = "drop") %>%
  mutate(MutationRatio = MutatedSamples / TotalSamples)


# data for plot
dot_data <- mutation_summary
cohort_colors <- config_list$ethnicity
dot_data$Gene <- factor(dot_data$Gene, levels = keygenes[keygenes %in% unique(dot_data$Gene)])

side_data <- all_mut_p %>%
  group_by(Gene, MutationType) %>%
  summarise(Count = n(), .groups = "drop")
side_data$Gene <- factor(side_data$Gene, levels = keygenes[keygenes %in% unique(side_data$Gene)])



# gene re-level by functions:
gene_sets_anno <- list(
  "RTK/RAS/MAPK" = c("KRAS", "NRAS", "BRAF", "FGFR3", "PTPN11", "NF1", "RASA2"),
  "Transcription factor" = c("MAF", "MAFB", "IRF4", "IRF1", "PRDM1", "SP3", "EGR1", "MAX"),
  "Chromatin remodeling" = c("KMT2C", "KMT2B", "KDM6A", "KDM5C", "SETD2", "ARID1A", "ARID2", "CREBBP", "EP300", "NCOR1"),
  "Methylation regulator" = c("IDH1", "IDH2", "TET2", "DNMT3A"),
  "RNA Processing" = c("DIS3", "SF3B1", "EEF1A1", "FIP1L1", "DDX5", "SMU1", "EFTUD2", "HNRNPA2B1", "HNRNPU"),
  "DNA repair/Cell cycle" = c("TP53", "RB1", "ATM", "ATRX", "CDKN1B", "CDKN2C", "HUWE1", "UBR5"),
  "NFkB/immune-related" = c("TRAF2", "TRAF3", "NFKBIA", "NFKB2", "LTB", "IKBKB", "IKZF3", "CYLD", "SP140"),
  "RNA stability" = c("RPL5", "ATP5D", "ZFP36L1", "ABCF1", "RNASEH2C", "TGDS", "MAN2C1"),
  "Ig/Plasma-related" = c("IGLL5", "XBP1", "FAM46C", "CCND1", "TCL1A", "PIM1", "SGPP1", "BCL7A", "FUBP1", "ZNF292", "HOOK1"),
  "Histone-related" = c("HIST1H1E", "HIST1H1D", "HIST1H1B", "HIST1H4E")
)

gene_group <- stack(gene_sets_anno)
colnames(gene_group) <- c("Gene", "Group")
gene_group$Group <- as.character(gene_group$Group)
dot_data_annotated <- dot_data %>% left_join(gene_group, by = "Gene") %>% na.omit()

# reorder - group
group_levels <- c(names(gene_sets_anno), "Other")
dot_data_annotated$Group <- factor(dot_data_annotated$Group, levels = group_levels)
# reorder - genes
gene_order <- dot_data_annotated %>%
  group_by(Group, Gene) %>%
  summarise(max_ratio = max(MutationRatio, na.rm = TRUE), .groups = "drop") %>%
  arrange(Group, desc(max_ratio)) %>%
  pull(Gene)
dot_data_annotated$Gene <- factor(dot_data_annotated$Gene, levels = unique(gene_order))

gene_group_colors <- config_list$gene_group_colors

# update dotplot
p_dot <- ggplot(dot_data_annotated, aes(x = Gene, y = Cohort)) +
  geom_point(aes(size = MutationRatio, color = Group)) +
  scale_color_manual(values = gene_group_colors,
                     guide = guide_legend(nrow = 3)) +
  scale_size(range = c(2, 6),
             guide = guide_legend(ncol = 1)) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        legend.position = "bottom") +
  labs(title = "Mutation Frequency by Gene and Cohort", x = "Gene", y = "Cohort")


# update barplot

side_data$Gene <- factor(side_data$Gene, levels = levels(dot_data_annotated$Gene))
side_data <- na.omit(side_data)

p_bar <- ggplot(side_data, aes(x = Gene, y = Count, fill = MutationType)) +
  geom_bar(stat = "identity") +
  scale_fill_manual(values = mut_colors) +
  theme_minimal() +
  labs(title = "Mutation Type Distribution", y = "Count") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# merge and output
pdf("figure2_dotplot_3groups.pdf", height = 6, width = 15)
final_plot <- ggarrange(
  p_bar,
  p_dot,
  ncol = 1,
  heights = c(1.25, 1.75),
  align = "v"
)
print(final_plot)
dev.off()