Load packages

suppressMessages(library(dplyr))
suppressMessages(library(umap))
suppressMessages(library(ggplot2))
suppressMessages(library(devtools))
suppressMessages(library(gdata))

Directories and Files

Directories

# Data directory
data_dir <- file.path("~/Documents/GitHub/jharenza.github.io/openPBTA-notebooks/mb-subtypes/data/")
pbta_data_dir <- file.path("~/OpenPBTA-analysis/data/release-v16-20200320/")
# Create a results directory
results_dir <- "results"
if (!dir.exists(results_dir)) {
  dir.create(results_dir, recursive = TRUE)
}

Input Filepaths

pbta_rnaseq_file <- file.path(pbta_data_dir, "pbta-gene-expression-rsem-fpkm-collapsed.stranded.rds")
pbta_rnaseq_file_full <- file.path(pbta_data_dir, "pbta-gene-expression-rsem-fpkm.stranded.rds")
pbta_hist_file <- file.path(pbta_data_dir, "pbta-histologies.tsv")
pptc_annot_file <- file.path(data_dir, "pptc_annot.tsv")
pptc_hist_file <- file.path(data_dir, "pptc-pdx-clinical-web.txt")
classifier_path_file <- file.path(data_dir, "rathi_results.tsv")

Read in files

# PBTA
pbta_fpkm_collapsed_mat <- readRDS(pbta_rnaseq_file)
pbta_fpkm_full_mat <- readRDS(pbta_rnaseq_file_full)
pbta_histologies_df <- read.delim(pbta_hist_file, sep = "\t", header = T)
# PPTC
load(file.path(data_dir, "2019-02-14-pptc-STAR_cufflinks_hg19_matrix_244.rda"), verbose = T)
Loading objects:
  rna.mat
pptc_subtypes_df <- read.delim(pptc_annot_file, sep = "\t", header = T)
# Rename PPTC df, remove gene name column, subset MB samples
mv(from = "rna.mat", to = "pptc_fpkm_df")
pptc_fpkm_df$gene_short_name <- NULL
pptc_mb_df <- pptc_fpkm_df %>%
  select(pptc_subtypes_df$Sample)
# Michael Taylor (MB classifier training set), rename
load(file.path(data_dir, "mt_rnaseq.RData"), verbose = T)
Loading objects:
  mt_mat
  mt_annot
mv(from = "mt_mat", to = "mt_fpkm_df")
head(mt_fpkm_df)
head(mt_annot)
# MM2S PBTA classifier results, MB classifier results, pathology data
# MM2S Paper: https://scfbm.biomedcentral.com/articles/10.1186/s13029-016-0053-y
pbta_combined_subtypes <- read.delim(classifier_path_file, sep = "\t", header = T)
# How many of each subtype identified by pathology?
table(pbta_combined_subtypes$pathology_subtype)

Group 3 or 4      Group 4      non-WNT          SHH          WNT 
          11            2            1           12            6 

PBTA wrangling

# Only pull out sample identifiers (KidsFirst biospecimen identifiers) that correspond to medulloblastoma samples
medulloblastoma_samples <- pbta_histologies_df %>%
  filter(short_histology == "Medulloblastoma") %>%
  filter(experimental_strategy == "RNA-Seq") %>%
  filter(RNA_library == "stranded") %>%
  pull(Kids_First_Biospecimen_ID)
# Calculate N
length(medulloblastoma_samples)
[1] 121
# Collect subtypes and count
mb_molecular_subtype_df <- pbta_histologies_df %>%
  filter(short_histology == "Medulloblastoma") %>%
  filter(experimental_strategy == "RNA-Seq") %>%
  filter(RNA_library == "stranded") %>%
  select(Kids_First_Biospecimen_ID, molecular_subtype, seq_center)
# Calculate N
nrow(mb_molecular_subtype_df)
[1] 121
# How many per subtype?
table(droplevels(mb_molecular_subtype_df$molecular_subtype))

Group3 Group4    SHH    WNT 
    15     67     28     11 

Create matrices of only MB samples

# select PBTA MB samples only
pbta_mb_coll_df <- pbta_fpkm_collapsed_mat %>% 
  select(medulloblastoma_samples)
pbta_mb_full_df <- pbta_fpkm_full_mat %>% 
  select(medulloblastoma_samples)
# select PPTC MB samples only
pptc_mb_df <- pptc_fpkm_df %>% 
  select(pptc_subtypes_df$Sample)
length(pptc_subtypes_df$Sample)
[1] 18
# How many per subtype?
table(pptc_subtypes_df$class_subtype)

Group3 Group4    SHH    WNT 
     7      2      7      2 
table(pptc_subtypes_df$path_subtype)

Group 3 Group 4     SHH     WNT 
      5       1       6       1 
pptc_subtypes_df
# 12/13 classified correctly - too few samples for clustering
12/13*100
[1] 92.30769
# How many in MT dataset?
length(mt_annot$Sample)
[1] 97
table(mt_annot$Subgroup)

Group3 Group4    SHH    WNT 
    19     25     50      3 
# convert to matrix
pbta_mb_coll_mat <- as.matrix(pbta_mb_coll_df)
pbta_mb_full_mat <- as.matrix(pbta_mb_full_df)
mt_fpkm_mat <- as.matrix(mt_fpkm_df)

Transform matrices

log2 matrix

pbta_coll_log2 <- log2(pbta_mb_coll_mat)
pbta_full_log2 <- log2(pbta_mb_full_mat)
mt_mat_log2 <- log2(mt_fpkm_mat)

Calculate variance for OpenPBTA collapsed data

gene_variance <- matrixStats::rowVars(pbta_coll_log2)
# Find the value that we'll use as a threshold to filter the top 5%
variance_threshold <- quantile(gene_variance, 0.95, na.rm = T)
# Row indices of high variance genes
high_variance_index <- which(gene_variance > variance_threshold)

Set seed for reproducible UMAP results

set.seed(2020)

OpenPBTA stranded RNA-Seq (N = 121) UMAP clustering

Using collapsed matrix

# expects features (genes) to be columns, so we have to use t()
umap_results <- umap::umap(t(pbta_coll_log2[high_variance_index, ]))
# Make a data frame of the layout results and join with molecular subtype 
umap_plot_df <- data.frame(umap_results$layout) %>%
  tibble::rownames_to_column("Kids_First_Biospecimen_ID") %>%
  inner_join(mb_molecular_subtype_df)
Joining, by = "Kids_First_Biospecimen_ID"
Column `Kids_First_Biospecimen_ID` joining character vector and factor, coercing into character vector
# Plot by subtype
umap_plot_df %>%
  ggplot(aes(x = X1, 
             y = X2,
             color = molecular_subtype)) +
  geom_point(size = 3, alpha = 0.7) +
  theme_bw() +
  xlab("UMAP1") +
  ylab("UMAP2")

# Plot by sequencing center - is this having any effect? (Not really)
umap_plot_df %>%
  ggplot(aes(x = X1, 
             y = X2,
             color = seq_center)) +
  geom_point(size = 3, alpha = 0.7) +
  theme_bw() +
  xlab("UMAP1") +
  ylab("UMAP2")

Merge MM2S classifier UMAP

Will use this later

pbta_results_mer <- merge(umap_plot_df, pbta_combined_subtypes, by = "Kids_First_Biospecimen_ID")
head(pbta_results_mer)

Calculate variance for OpenPBTA non-collapsed data

Does this give similar results?

gene_variance <- matrixStats::rowVars(pbta_full_log2)
variance_threshold <- quantile(gene_variance, 0.95, na.rm = T)
high_variance_index <- which(gene_variance > variance_threshold)

OpenPBTA stranded RNA-Seq (N = 121) UMAP clustering - full matrix

umap_results <- umap::umap(t(pbta_full_log2[high_variance_index, ]))
umap_plot_df <- data.frame(umap_results$layout) %>%
  tibble::rownames_to_column("Kids_First_Biospecimen_ID") %>%
  inner_join(mb_molecular_subtype_df)
Joining, by = "Kids_First_Biospecimen_ID"
Column `Kids_First_Biospecimen_ID` joining character vector and factor, coercing into character vector
umap_plot_df %>%
  ggplot(aes(x = X1, 
             y = X2,
             color = molecular_subtype)) +
  geom_point(size = 3, alpha = 0.7) +
  theme_bw() +
  xlab("UMAP1") +
  ylab("UMAP2")

  • Yes, will proceed only with the collapsed matrix.

Exploration

Are the subtypes that don’t classify by unsupervised clustering the same subtypes that did not match pathology subtypes?

pbta_results_mer %>%
  ggplot(aes(x = X1, 
             y = X2,
             color = pathology_subtype,
             shape = mb_classifier_prediction)) +
  geom_point(size = 3, alpha = 0.7) +
  theme_bw() +
  xlab("UMAP1") +
  ylab("UMAP2")

Misclassified samples:

  • For the 3 samples within the WNT cluster not classified as WNT, two were WNT by pathology and the other was not noted by pathology.
  • For 1 sample classified as WNT that clusterd with SHH, pathology was not noted.
  • For the 6 samples within the SHH cluster that were classified as Group 4, 3 were noted as SHH by pathology.
  • 1 was SHH by pathology.
  • Within this cluster, one sample was classified as SHH but pathology noted as WNT.
  • The other 3 samples were not noted by pathology.
  • Two samples classified as WNT present in the Group 3/4 cluster were not noted by pathology.
  • 6 samples were classified as SHH but are in the Group 3/4 cluster. 2 of these were SHH by pathology, and the rest were not noted. # Examine accuracy for this dataset
# Subset for only samples that had pathology results
subtypes_calc <- subset(pbta_combined_subtypes, !is.na(pathology_subtype)) 
# convert to character to perform ifelse on factors with different levels
subtypes_calc$pathology_subtype <- as.character(subtypes_calc$pathology_subtype)
subtypes_calc$mb_classifier_prediction <- as.character(subtypes_calc$mb_classifier_prediction)
# Create column to calculate matches
subtypes_calc$mbclass_v_path <- ifelse(subtypes_calc$pathology_subtype == subtypes_calc$mb_classifier_prediction, "true", 
                                       ifelse(subtypes_calc$pathology_subtype == "Group 3 or 4" & subtypes_calc$mb_classifier_prediction == "Group3", "true", 
                                              ifelse(subtypes_calc$pathology_subtype == "Group 3 or 4" & subtypes_calc$mb_classifier_prediction == "Group4", "true", "false")))
# How many were predicted correctly (true)?
table(subtypes_calc$mbclass_v_path)

false  true 
    6    26 
# Percent of classifications that match MB classifier calls
(26/(6+26))*100
[1] 81.25
# Create column to calculate matches for MM2S classifier
subtypes_calc$MM2S_v_path <- ifelse(subtypes_calc$pathology_subtype == subtypes_calc$MM2S_prediction, "true", 
                                       ifelse(subtypes_calc$pathology_subtype == "Group 3 or 4" & subtypes_calc$MM2S_prediction == "Group3", "true", 
                                              ifelse(subtypes_calc$pathology_subtype == "Group 3 or 4" & subtypes_calc$MM2S_prediction == "Group4", "true", "false")))
table(subtypes_calc$MM2S_v_path)

false  true 
    7    25 
# Percent of classifications that match MM2S classifier calls
(25/(7+25))*100
[1] 78.125

plot MM2S predictions

First, combine matrix with subtypes

gene_variance <- matrixStats::rowVars(pbta_coll_log2)
variance_threshold <- quantile(gene_variance, 0.95, na.rm = T)
high_variance_index <- which(gene_variance > variance_threshold)
umap_results <- umap::umap(t(pbta_coll_log2[high_variance_index, ]))
umap_plot_df <- data.frame(umap_results$layout) %>%
  tibble::rownames_to_column("Kids_First_Biospecimen_ID") %>%
  inner_join(pbta_combined_subtypes)
Joining, by = "Kids_First_Biospecimen_ID"
Column `Kids_First_Biospecimen_ID` joining character vector and factor, coercing into character vector

Does this data look better?

umap_plot_df %>%
  ggplot(aes(x = X1, 
             y = X2,
             color = MM2S_prediction,
             shape = mb_classifier_prediction)) +
  geom_point(size = 3, alpha = 0.7) +
  theme_bw() +
  xlab("UMAP1") +
  ylab("UMAP2")

Calculate variance for Michael Taylor RNA-Seq data

gene_variance <- matrixStats::rowVars(mt_mat_log2)
variance_threshold <- quantile(gene_variance, 0.95, na.rm = T)
high_variance_index <- which(gene_variance > variance_threshold)

Michael Taylor (N = 97) UMAP clustering

umap_results <- umap::umap(t(mt_mat_log2[high_variance_index, ]))
umap_plot_df <- data.frame(umap_results$layout) %>%
  tibble::rownames_to_column("Sample") %>%
  inner_join(mt_annot)
Joining, by = "Sample"
# Plot
umap_plot_df %>%
  ggplot(aes(x = X1, 
             y = X2,
             color = Subgroup)) +
  geom_point(size = 3, alpha = 0.7) +
  theme_bw() +
  xlab("UMAP1") +
  ylab("UMAP2")

Observation:

  • It looks like there are too few WNT samples (N = 3) to enable unsupervised clustering of those samples in this dataset.

Rerun of OpenPBTA stranded RNA-Seq (N = 121) UMAP clustering

  • Turns out the initial FPKM values were not log2 transformed, thus MB subtypes in pbta-histologies.tsv from MB classifier are not accurate.
# Replot UMAP with new subtype labels
gene_variance <- matrixStats::rowVars(pbta_coll_log2)
variance_threshold <- quantile(gene_variance, 0.95, na.rm = T)
high_variance_index <- which(gene_variance > variance_threshold)
umap_results <- umap::umap(t(pbta_coll_log2[high_variance_index, ]))
# Make a data frame of the layout results and join with molecular subtype 
umap_plot_df <- data.frame(umap_results$layout) %>%
  tibble::rownames_to_column("Kids_First_Biospecimen_ID") %>%
  inner_join(pbta_combined_subtypes)
Joining, by = "Kids_First_Biospecimen_ID"
Column `Kids_First_Biospecimen_ID` joining character vector and factor, coercing into character vector
# Plot by subtype
umap_plot_df %>%
  ggplot(aes(x = X1, 
             y = X2,
             color = mb_classifier_prediction)) +
  geom_point(size = 3, alpha = 0.7) +
  theme_bw() +
  xlab("UMAP1") +
  ylab("UMAP2")

  • This looks A LOT better!

session info

session_info()
─ Session info ───────────────────────────────────────────────────────────────────────────────────────────────────────

─ Packages ───────────────────────────────────────────────────────────────────────────────────────────────────────────
 package     * version date       lib source        
 askpass       1.1     2019-01-13 [1] CRAN (R 3.6.0)
 assertthat    0.2.1   2019-03-21 [1] CRAN (R 3.6.0)
 backports     1.1.5   2019-10-02 [1] CRAN (R 3.6.0)
 base64enc     0.1-3   2015-07-28 [1] CRAN (R 3.6.0)
 callr         3.3.2   2019-09-22 [1] CRAN (R 3.6.0)
 cli           1.1.0   2019-03-19 [1] CRAN (R 3.6.0)
 colorspace    1.4-1   2019-03-18 [1] CRAN (R 3.6.0)
 crayon        1.3.4   2017-09-16 [1] CRAN (R 3.6.0)
 desc          1.2.0   2018-05-01 [1] CRAN (R 3.6.0)
 devtools    * 2.2.1   2019-09-24 [1] CRAN (R 3.6.0)
 digest        0.6.22  2019-10-21 [1] CRAN (R 3.6.1)
 dplyr       * 0.8.3   2019-07-04 [1] CRAN (R 3.6.0)
 ellipsis      0.3.0   2019-09-20 [1] CRAN (R 3.6.0)
 evaluate      0.14    2019-05-28 [1] CRAN (R 3.6.0)
 fs            1.3.1   2019-05-06 [1] CRAN (R 3.6.0)
 gdata       * 2.18.0  2017-06-06 [1] CRAN (R 3.6.0)
 ggplot2     * 3.2.1   2019-08-10 [1] CRAN (R 3.6.0)
 glue          1.3.1   2019-03-12 [1] CRAN (R 3.6.0)
 gtable        0.3.0   2019-03-25 [1] CRAN (R 3.6.0)
 gtools        3.8.1   2018-06-26 [1] CRAN (R 3.6.0)
 htmltools     0.4.0   2019-10-04 [1] CRAN (R 3.6.0)
 jsonlite      1.6     2018-12-07 [1] CRAN (R 3.6.0)
 knitr         1.25    2019-09-18 [1] CRAN (R 3.6.0)
 labeling      0.3     2014-08-23 [1] CRAN (R 3.6.0)
 lattice       0.20-38 2018-11-04 [1] CRAN (R 3.6.1)
 lazyeval      0.2.2   2019-03-15 [1] CRAN (R 3.6.0)
 lifecycle     0.2.0   2020-03-06 [1] CRAN (R 3.6.0)
 magrittr      1.5     2014-11-22 [1] CRAN (R 3.6.0)
 Matrix        1.2-17  2019-03-22 [1] CRAN (R 3.6.1)
 matrixStats   0.55.0  2019-09-07 [1] CRAN (R 3.6.0)
 memoise       1.1.0   2017-04-21 [1] CRAN (R 3.6.0)
 munsell       0.5.0   2018-06-12 [1] CRAN (R 3.6.0)
 openssl       1.4.1   2019-07-18 [1] CRAN (R 3.6.0)
 pillar        1.4.6   2020-07-10 [1] CRAN (R 3.6.2)
 pkgbuild      1.0.6   2019-10-09 [1] CRAN (R 3.6.0)
 pkgconfig     2.0.3   2019-09-22 [1] CRAN (R 3.6.0)
 pkgload       1.0.2   2018-10-29 [1] CRAN (R 3.6.0)
 prettyunits   1.0.2   2015-07-13 [1] CRAN (R 3.6.0)
 processx      3.4.1   2019-07-18 [1] CRAN (R 3.6.0)
 ps            1.3.0   2018-12-21 [1] CRAN (R 3.6.0)
 purrr         0.3.3   2019-10-18 [1] CRAN (R 3.6.0)
 R6            2.4.0   2019-02-14 [1] CRAN (R 3.6.0)
 Rcpp          1.0.5   2020-07-06 [1] CRAN (R 3.6.2)
 remotes       2.1.0   2019-06-24 [1] CRAN (R 3.6.0)
 reticulate    1.16    2020-05-27 [1] CRAN (R 3.6.2)
 rlang         0.4.7   2020-07-09 [1] CRAN (R 3.6.2)
 rmarkdown     1.16    2019-10-01 [1] CRAN (R 3.6.0)
 rprojroot     1.3-2   2018-01-03 [1] CRAN (R 3.6.0)
 RSpectra      0.16-0  2019-12-01 [1] CRAN (R 3.6.0)
 rstudioapi    0.10    2019-03-19 [1] CRAN (R 3.6.0)
 scales        1.0.0   2018-08-09 [1] CRAN (R 3.6.0)
 sessioninfo   1.1.1   2018-11-05 [1] CRAN (R 3.6.0)
 stringi       1.4.3   2019-03-12 [1] CRAN (R 3.6.0)
 stringr       1.4.0   2019-02-10 [1] CRAN (R 3.6.0)
 testthat      2.2.1   2019-07-25 [1] CRAN (R 3.6.0)
 tibble        3.0.3   2020-07-10 [1] CRAN (R 3.6.2)
 tidyselect    1.1.0   2020-05-11 [1] CRAN (R 3.6.2)
 umap        * 0.2.6.0 2020-06-16 [1] CRAN (R 3.6.2)
 usethis     * 1.5.1   2019-07-04 [1] CRAN (R 3.6.0)
 vctrs         0.3.2   2020-07-15 [1] CRAN (R 3.6.2)
 withr         2.1.2   2018-03-15 [1] CRAN (R 3.6.0)
 xfun          0.10    2019-10-01 [1] CRAN (R 3.6.0)
 yaml          2.2.0   2018-07-25 [1] CRAN (R 3.6.0)

[1] /Library/Frameworks/R.framework/Versions/3.6/Resources/library
---
title: "Explore medulloblastoma subtype calls in the context of unsupervised clustering"
output: html_notebook    
toc: TRUE
toc_float: TRUE
editor_options: 
  chunk_output_type: inline
---
```{r global_options, include=F}
knitr::opts_chunk$set(error = F, echo = T, warning = T, message = T)
```


# Load packages
```{r}
suppressMessages(library(dplyr))
suppressMessages(library(umap))
suppressMessages(library(ggplot2))
suppressMessages(library(devtools))
suppressMessages(library(gdata))
```

# Directories and Files
## Directories
```{r}
# Data directory
data_dir <- file.path("~/Documents/GitHub/jharenza.github.io/openPBTA-notebooks/mb-subtypes/data/")
pbta_data_dir <- file.path("~/OpenPBTA-analysis/data/release-v16-20200320/")
# Create a results directory
results_dir <- "results"
if (!dir.exists(results_dir)) {
  dir.create(results_dir, recursive = TRUE)
}
```

## Input Filepaths
```{r}
pbta_rnaseq_file <- file.path(pbta_data_dir, "pbta-gene-expression-rsem-fpkm-collapsed.stranded.rds")
pbta_rnaseq_file_full <- file.path(pbta_data_dir, "pbta-gene-expression-rsem-fpkm.stranded.rds")
pbta_hist_file <- file.path(pbta_data_dir, "pbta-histologies.tsv")
pptc_annot_file <- file.path(data_dir, "pptc_annot.tsv")
pptc_hist_file <- file.path(data_dir, "pptc-pdx-clinical-web.txt")
classifier_path_file <- file.path(data_dir, "rathi_results.tsv")
```
## Read in files
```{r}
# PBTA
pbta_fpkm_collapsed_mat <- readRDS(pbta_rnaseq_file)
pbta_fpkm_full_mat <- readRDS(pbta_rnaseq_file_full)
pbta_histologies_df <- read.delim(pbta_hist_file, sep = "\t", header = T)
# PPTC
load(file.path(data_dir, "2019-02-14-pptc-STAR_cufflinks_hg19_matrix_244.rda"), verbose = T)
pptc_subtypes_df <- read.delim(pptc_annot_file, sep = "\t", header = T)
# Rename PPTC df, remove gene name column, subset MB samples
mv(from = "rna.mat", to = "pptc_fpkm_df")
pptc_fpkm_df$gene_short_name <- NULL
pptc_mb_df <- pptc_fpkm_df %>%
  select(pptc_subtypes_df$Sample)

# Michael Taylor (MB classifier training set), rename
load(file.path(data_dir, "mt_rnaseq.RData"), verbose = T)
mv(from = "mt_mat", to = "mt_fpkm_df")
head(mt_fpkm_df)
head(mt_annot)

# MM2S PBTA classifier results, MB classifier results, pathology data
# MM2S Paper: https://scfbm.biomedcentral.com/articles/10.1186/s13029-016-0053-y
pbta_combined_subtypes <- read.delim(classifier_path_file, sep = "\t", header = T)
# How many of each subtype identified by pathology?
table(pbta_combined_subtypes$pathology_subtype)
```

# PBTA wrangling
```{r}
# Only pull out sample identifiers (KidsFirst biospecimen identifiers) that correspond to medulloblastoma samples
medulloblastoma_samples <- pbta_histologies_df %>%
  filter(short_histology == "Medulloblastoma") %>%
  filter(experimental_strategy == "RNA-Seq") %>%
  filter(RNA_library == "stranded") %>%
  pull(Kids_First_Biospecimen_ID)
# Calculate N
length(medulloblastoma_samples)
# Collect subtypes and count
mb_molecular_subtype_df <- pbta_histologies_df %>%
  filter(short_histology == "Medulloblastoma") %>%
  filter(experimental_strategy == "RNA-Seq") %>%
  filter(RNA_library == "stranded") %>%
  select(Kids_First_Biospecimen_ID, molecular_subtype, seq_center)
# Calculate N
nrow(mb_molecular_subtype_df)
# How many per subtype?
table(droplevels(mb_molecular_subtype_df$molecular_subtype))
```
# Create matrices of only MB samples
```{r}
# select PBTA MB samples only
pbta_mb_coll_df <- pbta_fpkm_collapsed_mat %>% 
  select(medulloblastoma_samples)
pbta_mb_full_df <- pbta_fpkm_full_mat %>% 
  select(medulloblastoma_samples)
# select PPTC MB samples only
pptc_mb_df <- pptc_fpkm_df %>% 
  select(pptc_subtypes_df$Sample)
length(pptc_subtypes_df$Sample)
# How many per subtype?
table(pptc_subtypes_df$class_subtype)
table(pptc_subtypes_df$path_subtype)
pptc_subtypes_df
# 12/13 classified correctly - too few samples for clustering
12/13*100

# How many in MT dataset?
length(mt_annot$Sample)
table(mt_annot$Subgroup)

# convert to matrix
pbta_mb_coll_mat <- as.matrix(pbta_mb_coll_df)
pbta_mb_full_mat <- as.matrix(pbta_mb_full_df)
mt_fpkm_mat <- as.matrix(mt_fpkm_df)
```
# Transform matrices
## log2 matrix
```{r}
pbta_coll_log2 <- log2(pbta_mb_coll_mat)
pbta_full_log2 <- log2(pbta_mb_full_mat)
mt_mat_log2 <- log2(mt_fpkm_mat)
```
# Calculate variance for OpenPBTA collapsed data
```{r}
gene_variance <- matrixStats::rowVars(pbta_coll_log2)
# Find the value that we'll use as a threshold to filter the top 5%
variance_threshold <- quantile(gene_variance, 0.95, na.rm = T)
# Row indices of high variance genes
high_variance_index <- which(gene_variance > variance_threshold)
```
# Set seed for reproducible UMAP results
```{r}
set.seed(2020)
```

# OpenPBTA stranded RNA-Seq (N = 121) UMAP clustering
## Using collapsed matrix
```{r}
# expects features (genes) to be columns, so we have to use t()
umap_results <- umap::umap(t(pbta_coll_log2[high_variance_index, ]))
# Make a data frame of the layout results and join with molecular subtype 
umap_plot_df <- data.frame(umap_results$layout) %>%
  tibble::rownames_to_column("Kids_First_Biospecimen_ID") %>%
  inner_join(mb_molecular_subtype_df)
# Plot by subtype
umap_plot_df %>%
  ggplot(aes(x = X1, 
             y = X2,
             color = molecular_subtype)) +
  geom_point(size = 3, alpha = 0.7) +
  theme_bw() +
  xlab("UMAP1") +
  ylab("UMAP2")
```
```{r}
# Plot by sequencing center - is this having any effect? (Not really)
umap_plot_df %>%
  ggplot(aes(x = X1, 
             y = X2,
             color = seq_center)) +
  geom_point(size = 3, alpha = 0.7) +
  theme_bw() +
  xlab("UMAP1") +
  ylab("UMAP2")
```
# Merge MM2S classifier UMAP
## Will use this later
```{r}
pbta_results_mer <- merge(umap_plot_df, pbta_combined_subtypes, by = "Kids_First_Biospecimen_ID")
head(pbta_results_mer)
```
# Calculate variance for OpenPBTA non-collapsed data
## Does this give similar results?

```{r}
gene_variance <- matrixStats::rowVars(pbta_full_log2)
variance_threshold <- quantile(gene_variance, 0.95, na.rm = T)
high_variance_index <- which(gene_variance > variance_threshold)
```
# OpenPBTA stranded RNA-Seq (N = 121) UMAP clustering - full matrix

```{r}
umap_results <- umap::umap(t(pbta_full_log2[high_variance_index, ]))
umap_plot_df <- data.frame(umap_results$layout) %>%
  tibble::rownames_to_column("Kids_First_Biospecimen_ID") %>%
  inner_join(mb_molecular_subtype_df)
umap_plot_df %>%
  ggplot(aes(x = X1, 
             y = X2,
             color = molecular_subtype)) +
  geom_point(size = 3, alpha = 0.7) +
  theme_bw() +
  xlab("UMAP1") +
  ylab("UMAP2")
```
- Yes, will proceed only with the collapsed matrix.

# Exploration
## Are the subtypes that don't classify by unsupervised clustering the same subtypes that did not match pathology subtypes?
```{r}
pbta_results_mer %>%
  ggplot(aes(x = X1, 
             y = X2,
             color = pathology_subtype,
             shape = mb_classifier_prediction)) +
  geom_point(size = 3, alpha = 0.7) +
  theme_bw() +
  xlab("UMAP1") +
  ylab("UMAP2")
```
## Misclassified samples:
- For the 3 samples within the WNT cluster not classified as WNT, two were WNT by pathology and the other was not noted by pathology. 
- For 1 sample classified as WNT that clusterd with SHH, pathology was not noted. 
- For the 6 samples within the SHH cluster that were classified as Group 4, 3 were noted as SHH by pathology. 
- 1 was SHH by pathology. 
- Within this cluster, one sample was classified as SHH but pathology noted as WNT. 
- The other 3 samples were not noted by pathology. 
- Two samples classified as WNT present in the Group 3/4 cluster were not noted by pathology. 
- 6 samples were classified as SHH but are in the Group 3/4 cluster. 2 of these were SHH by pathology, and the rest were not noted.
# Examine accuracy for this dataset
```{r}
# Subset for only samples that had pathology results
subtypes_calc <- subset(pbta_combined_subtypes, !is.na(pathology_subtype)) 
# convert to character to perform ifelse on factors with different levels
subtypes_calc$pathology_subtype <- as.character(subtypes_calc$pathology_subtype)
subtypes_calc$mb_classifier_prediction <- as.character(subtypes_calc$mb_classifier_prediction)
# Create column to calculate matches
subtypes_calc$mbclass_v_path <- ifelse(subtypes_calc$pathology_subtype == subtypes_calc$mb_classifier_prediction, "true", 
                                       ifelse(subtypes_calc$pathology_subtype == "Group 3 or 4" & subtypes_calc$mb_classifier_prediction == "Group3", "true", 
                                              ifelse(subtypes_calc$pathology_subtype == "Group 3 or 4" & subtypes_calc$mb_classifier_prediction == "Group4", "true", "false")))
# How many were predicted correctly (true)?
table(subtypes_calc$mbclass_v_path)
# Percent of classifications that match MB classifier calls
(26/(6+26))*100
# Create column to calculate matches for MM2S classifier
subtypes_calc$MM2S_v_path <- ifelse(subtypes_calc$pathology_subtype == subtypes_calc$MM2S_prediction, "true", 
                                       ifelse(subtypes_calc$pathology_subtype == "Group 3 or 4" & subtypes_calc$MM2S_prediction == "Group3", "true", 
                                              ifelse(subtypes_calc$pathology_subtype == "Group 3 or 4" & subtypes_calc$MM2S_prediction == "Group4", "true", "false")))
table(subtypes_calc$MM2S_v_path)
# Percent of classifications that match MM2S classifier calls
(25/(7+25))*100
```

# plot MM2S predictions
## First, combine matrix with subtypes

```{r}
gene_variance <- matrixStats::rowVars(pbta_coll_log2)
variance_threshold <- quantile(gene_variance, 0.95, na.rm = T)
high_variance_index <- which(gene_variance > variance_threshold)
umap_results <- umap::umap(t(pbta_coll_log2[high_variance_index, ]))
umap_plot_df <- data.frame(umap_results$layout) %>%
  tibble::rownames_to_column("Kids_First_Biospecimen_ID") %>%
  inner_join(pbta_combined_subtypes)
```  
## Does this data look better?
```{r}
umap_plot_df %>%
  ggplot(aes(x = X1, 
             y = X2,
             color = MM2S_prediction,
             shape = mb_classifier_prediction)) +
  geom_point(size = 3, alpha = 0.7) +
  theme_bw() +
  xlab("UMAP1") +
  ylab("UMAP2")
```


# Calculate variance for Michael Taylor RNA-Seq data
```{r}
gene_variance <- matrixStats::rowVars(mt_mat_log2)
variance_threshold <- quantile(gene_variance, 0.95, na.rm = T)
high_variance_index <- which(gene_variance > variance_threshold)
```

# Michael Taylor (N = 97) UMAP clustering 
```{r}
umap_results <- umap::umap(t(mt_mat_log2[high_variance_index, ]))
umap_plot_df <- data.frame(umap_results$layout) %>%
  tibble::rownames_to_column("Sample") %>%
  inner_join(mt_annot)
# Plot
umap_plot_df %>%
  ggplot(aes(x = X1, 
             y = X2,
             color = Subgroup)) +
  geom_point(size = 3, alpha = 0.7) +
  theme_bw() +
  xlab("UMAP1") +
  ylab("UMAP2")
```
## Observation:
- It looks like there are too few WNT samples (N = 3) to enable unsupervised clustering of those samples in this dataset.

# Rerun of OpenPBTA stranded RNA-Seq (N = 121) UMAP clustering
- Turns out the initial FPKM values were not log2 transformed, thus MB subtypes in pbta-histologies.tsv from MB classifier are not accurate.
```{r}
# Replot UMAP with new subtype labels
gene_variance <- matrixStats::rowVars(pbta_coll_log2)
variance_threshold <- quantile(gene_variance, 0.95, na.rm = T)
high_variance_index <- which(gene_variance > variance_threshold)
umap_results <- umap::umap(t(pbta_coll_log2[high_variance_index, ]))
# Make a data frame of the layout results and join with molecular subtype 
umap_plot_df <- data.frame(umap_results$layout) %>%
  tibble::rownames_to_column("Kids_First_Biospecimen_ID") %>%
  inner_join(pbta_combined_subtypes)
# Plot by subtype
umap_plot_df %>%
  ggplot(aes(x = X1, 
             y = X2,
             color = mb_classifier_prediction)) +
  geom_point(size = 3, alpha = 0.7) +
  theme_bw() +
  xlab("UMAP1") +
  ylab("UMAP2")
```
-  This looks A LOT better!

# session info
```{r}
session_info()
```
