Load packages
suppressMessages(library(dplyr))
suppressMessages(library(umap))
suppressMessages(library(ggplot2))
suppressMessages(library(devtools))
suppressMessages(library(gdata))
Directories and Files
Directories
# Data directory
data_dir <- file.path("~/Documents/GitHub/jharenza.github.io/openPBTA-notebooks/mb-subtypes/data/")
pbta_data_dir <- file.path("~/OpenPBTA-analysis/data/release-v16-20200320/")
# Create a results directory
results_dir <- "results"
if (!dir.exists(results_dir)) {
dir.create(results_dir, recursive = TRUE)
}
Read in files
# PBTA
pbta_fpkm_collapsed_mat <- readRDS(pbta_rnaseq_file)
pbta_fpkm_full_mat <- readRDS(pbta_rnaseq_file_full)
pbta_histologies_df <- read.delim(pbta_hist_file, sep = "\t", header = T)
# PPTC
load(file.path(data_dir, "2019-02-14-pptc-STAR_cufflinks_hg19_matrix_244.rda"), verbose = T)
Loading objects:
rna.mat
pptc_subtypes_df <- read.delim(pptc_annot_file, sep = "\t", header = T)
# Rename PPTC df, remove gene name column, subset MB samples
mv(from = "rna.mat", to = "pptc_fpkm_df")
pptc_fpkm_df$gene_short_name <- NULL
pptc_mb_df <- pptc_fpkm_df %>%
select(pptc_subtypes_df$Sample)
# Michael Taylor (MB classifier training set), rename
load(file.path(data_dir, "mt_rnaseq.RData"), verbose = T)
Loading objects:
mt_mat
mt_annot
mv(from = "mt_mat", to = "mt_fpkm_df")
head(mt_fpkm_df)
head(mt_annot)
# MM2S PBTA classifier results, MB classifier results, pathology data
# MM2S Paper: https://scfbm.biomedcentral.com/articles/10.1186/s13029-016-0053-y
pbta_combined_subtypes <- read.delim(classifier_path_file, sep = "\t", header = T)
# How many of each subtype identified by pathology?
table(pbta_combined_subtypes$pathology_subtype)
Group 3 or 4 Group 4 non-WNT SHH WNT
11 2 1 12 6
PBTA wrangling
# Only pull out sample identifiers (KidsFirst biospecimen identifiers) that correspond to medulloblastoma samples
medulloblastoma_samples <- pbta_histologies_df %>%
filter(short_histology == "Medulloblastoma") %>%
filter(experimental_strategy == "RNA-Seq") %>%
filter(RNA_library == "stranded") %>%
pull(Kids_First_Biospecimen_ID)
# Calculate N
length(medulloblastoma_samples)
[1] 121
# Collect subtypes and count
mb_molecular_subtype_df <- pbta_histologies_df %>%
filter(short_histology == "Medulloblastoma") %>%
filter(experimental_strategy == "RNA-Seq") %>%
filter(RNA_library == "stranded") %>%
select(Kids_First_Biospecimen_ID, molecular_subtype, seq_center)
# Calculate N
nrow(mb_molecular_subtype_df)
[1] 121
# How many per subtype?
table(droplevels(mb_molecular_subtype_df$molecular_subtype))
Group3 Group4 SHH WNT
15 67 28 11
Create matrices of only MB samples
# select PBTA MB samples only
pbta_mb_coll_df <- pbta_fpkm_collapsed_mat %>%
select(medulloblastoma_samples)
pbta_mb_full_df <- pbta_fpkm_full_mat %>%
select(medulloblastoma_samples)
# select PPTC MB samples only
pptc_mb_df <- pptc_fpkm_df %>%
select(pptc_subtypes_df$Sample)
length(pptc_subtypes_df$Sample)
[1] 18
# How many per subtype?
table(pptc_subtypes_df$class_subtype)
Group3 Group4 SHH WNT
7 2 7 2
table(pptc_subtypes_df$path_subtype)
Group 3 Group 4 SHH WNT
5 1 6 1
pptc_subtypes_df
# 12/13 classified correctly - too few samples for clustering
12/13*100
[1] 92.30769
# How many in MT dataset?
length(mt_annot$Sample)
[1] 97
table(mt_annot$Subgroup)
Group3 Group4 SHH WNT
19 25 50 3
# convert to matrix
pbta_mb_coll_mat <- as.matrix(pbta_mb_coll_df)
pbta_mb_full_mat <- as.matrix(pbta_mb_full_df)
mt_fpkm_mat <- as.matrix(mt_fpkm_df)
Calculate variance for OpenPBTA collapsed data
gene_variance <- matrixStats::rowVars(pbta_coll_log2)
# Find the value that we'll use as a threshold to filter the top 5%
variance_threshold <- quantile(gene_variance, 0.95, na.rm = T)
# Row indices of high variance genes
high_variance_index <- which(gene_variance > variance_threshold)
Set seed for reproducible UMAP results
set.seed(2020)
OpenPBTA stranded RNA-Seq (N = 121) UMAP clustering
Using collapsed matrix
# expects features (genes) to be columns, so we have to use t()
umap_results <- umap::umap(t(pbta_coll_log2[high_variance_index, ]))
# Make a data frame of the layout results and join with molecular subtype
umap_plot_df <- data.frame(umap_results$layout) %>%
tibble::rownames_to_column("Kids_First_Biospecimen_ID") %>%
inner_join(mb_molecular_subtype_df)
Joining, by = "Kids_First_Biospecimen_ID"
Column `Kids_First_Biospecimen_ID` joining character vector and factor, coercing into character vector
# Plot by subtype
umap_plot_df %>%
ggplot(aes(x = X1,
y = X2,
color = molecular_subtype)) +
geom_point(size = 3, alpha = 0.7) +
theme_bw() +
xlab("UMAP1") +
ylab("UMAP2")

# Plot by sequencing center - is this having any effect? (Not really)
umap_plot_df %>%
ggplot(aes(x = X1,
y = X2,
color = seq_center)) +
geom_point(size = 3, alpha = 0.7) +
theme_bw() +
xlab("UMAP1") +
ylab("UMAP2")

Merge MM2S classifier UMAP
Will use this later
pbta_results_mer <- merge(umap_plot_df, pbta_combined_subtypes, by = "Kids_First_Biospecimen_ID")
head(pbta_results_mer)
Calculate variance for OpenPBTA non-collapsed data
Does this give similar results?
gene_variance <- matrixStats::rowVars(pbta_full_log2)
variance_threshold <- quantile(gene_variance, 0.95, na.rm = T)
high_variance_index <- which(gene_variance > variance_threshold)
OpenPBTA stranded RNA-Seq (N = 121) UMAP clustering - full matrix
umap_results <- umap::umap(t(pbta_full_log2[high_variance_index, ]))
umap_plot_df <- data.frame(umap_results$layout) %>%
tibble::rownames_to_column("Kids_First_Biospecimen_ID") %>%
inner_join(mb_molecular_subtype_df)
Joining, by = "Kids_First_Biospecimen_ID"
Column `Kids_First_Biospecimen_ID` joining character vector and factor, coercing into character vector
umap_plot_df %>%
ggplot(aes(x = X1,
y = X2,
color = molecular_subtype)) +
geom_point(size = 3, alpha = 0.7) +
theme_bw() +
xlab("UMAP1") +
ylab("UMAP2")

- Yes, will proceed only with the collapsed matrix.
Exploration
Are the subtypes that don’t classify by unsupervised clustering the same subtypes that did not match pathology subtypes?
pbta_results_mer %>%
ggplot(aes(x = X1,
y = X2,
color = pathology_subtype,
shape = mb_classifier_prediction)) +
geom_point(size = 3, alpha = 0.7) +
theme_bw() +
xlab("UMAP1") +
ylab("UMAP2")

Misclassified samples:
- For the 3 samples within the WNT cluster not classified as WNT, two were WNT by pathology and the other was not noted by pathology.
- For 1 sample classified as WNT that clusterd with SHH, pathology was not noted.
- For the 6 samples within the SHH cluster that were classified as Group 4, 3 were noted as SHH by pathology.
- 1 was SHH by pathology.
- Within this cluster, one sample was classified as SHH but pathology noted as WNT.
- The other 3 samples were not noted by pathology.
- Two samples classified as WNT present in the Group 3/4 cluster were not noted by pathology.
- 6 samples were classified as SHH but are in the Group 3/4 cluster. 2 of these were SHH by pathology, and the rest were not noted. # Examine accuracy for this dataset
# Subset for only samples that had pathology results
subtypes_calc <- subset(pbta_combined_subtypes, !is.na(pathology_subtype))
# convert to character to perform ifelse on factors with different levels
subtypes_calc$pathology_subtype <- as.character(subtypes_calc$pathology_subtype)
subtypes_calc$mb_classifier_prediction <- as.character(subtypes_calc$mb_classifier_prediction)
# Create column to calculate matches
subtypes_calc$mbclass_v_path <- ifelse(subtypes_calc$pathology_subtype == subtypes_calc$mb_classifier_prediction, "true",
ifelse(subtypes_calc$pathology_subtype == "Group 3 or 4" & subtypes_calc$mb_classifier_prediction == "Group3", "true",
ifelse(subtypes_calc$pathology_subtype == "Group 3 or 4" & subtypes_calc$mb_classifier_prediction == "Group4", "true", "false")))
# How many were predicted correctly (true)?
table(subtypes_calc$mbclass_v_path)
false true
6 26
# Percent of classifications that match MB classifier calls
(26/(6+26))*100
[1] 81.25
# Create column to calculate matches for MM2S classifier
subtypes_calc$MM2S_v_path <- ifelse(subtypes_calc$pathology_subtype == subtypes_calc$MM2S_prediction, "true",
ifelse(subtypes_calc$pathology_subtype == "Group 3 or 4" & subtypes_calc$MM2S_prediction == "Group3", "true",
ifelse(subtypes_calc$pathology_subtype == "Group 3 or 4" & subtypes_calc$MM2S_prediction == "Group4", "true", "false")))
table(subtypes_calc$MM2S_v_path)
false true
7 25
# Percent of classifications that match MM2S classifier calls
(25/(7+25))*100
[1] 78.125
plot MM2S predictions
First, combine matrix with subtypes
gene_variance <- matrixStats::rowVars(pbta_coll_log2)
variance_threshold <- quantile(gene_variance, 0.95, na.rm = T)
high_variance_index <- which(gene_variance > variance_threshold)
umap_results <- umap::umap(t(pbta_coll_log2[high_variance_index, ]))
umap_plot_df <- data.frame(umap_results$layout) %>%
tibble::rownames_to_column("Kids_First_Biospecimen_ID") %>%
inner_join(pbta_combined_subtypes)
Joining, by = "Kids_First_Biospecimen_ID"
Column `Kids_First_Biospecimen_ID` joining character vector and factor, coercing into character vector
Does this data look better?
umap_plot_df %>%
ggplot(aes(x = X1,
y = X2,
color = MM2S_prediction,
shape = mb_classifier_prediction)) +
geom_point(size = 3, alpha = 0.7) +
theme_bw() +
xlab("UMAP1") +
ylab("UMAP2")

Calculate variance for Michael Taylor RNA-Seq data
gene_variance <- matrixStats::rowVars(mt_mat_log2)
variance_threshold <- quantile(gene_variance, 0.95, na.rm = T)
high_variance_index <- which(gene_variance > variance_threshold)
Michael Taylor (N = 97) UMAP clustering
umap_results <- umap::umap(t(mt_mat_log2[high_variance_index, ]))
umap_plot_df <- data.frame(umap_results$layout) %>%
tibble::rownames_to_column("Sample") %>%
inner_join(mt_annot)
Joining, by = "Sample"
# Plot
umap_plot_df %>%
ggplot(aes(x = X1,
y = X2,
color = Subgroup)) +
geom_point(size = 3, alpha = 0.7) +
theme_bw() +
xlab("UMAP1") +
ylab("UMAP2")

Observation:
- It looks like there are too few WNT samples (N = 3) to enable unsupervised clustering of those samples in this dataset.
Rerun of OpenPBTA stranded RNA-Seq (N = 121) UMAP clustering
- Turns out the initial FPKM values were not log2 transformed, thus MB subtypes in pbta-histologies.tsv from MB classifier are not accurate.
# Replot UMAP with new subtype labels
gene_variance <- matrixStats::rowVars(pbta_coll_log2)
variance_threshold <- quantile(gene_variance, 0.95, na.rm = T)
high_variance_index <- which(gene_variance > variance_threshold)
umap_results <- umap::umap(t(pbta_coll_log2[high_variance_index, ]))
# Make a data frame of the layout results and join with molecular subtype
umap_plot_df <- data.frame(umap_results$layout) %>%
tibble::rownames_to_column("Kids_First_Biospecimen_ID") %>%
inner_join(pbta_combined_subtypes)
Joining, by = "Kids_First_Biospecimen_ID"
Column `Kids_First_Biospecimen_ID` joining character vector and factor, coercing into character vector
# Plot by subtype
umap_plot_df %>%
ggplot(aes(x = X1,
y = X2,
color = mb_classifier_prediction)) +
geom_point(size = 3, alpha = 0.7) +
theme_bw() +
xlab("UMAP1") +
ylab("UMAP2")

session info
session_info()
─ Session info ───────────────────────────────────────────────────────────────────────────────────────────────────────
─ Packages ───────────────────────────────────────────────────────────────────────────────────────────────────────────
package * version date lib source
askpass 1.1 2019-01-13 [1] CRAN (R 3.6.0)
assertthat 0.2.1 2019-03-21 [1] CRAN (R 3.6.0)
backports 1.1.5 2019-10-02 [1] CRAN (R 3.6.0)
base64enc 0.1-3 2015-07-28 [1] CRAN (R 3.6.0)
callr 3.3.2 2019-09-22 [1] CRAN (R 3.6.0)
cli 1.1.0 2019-03-19 [1] CRAN (R 3.6.0)
colorspace 1.4-1 2019-03-18 [1] CRAN (R 3.6.0)
crayon 1.3.4 2017-09-16 [1] CRAN (R 3.6.0)
desc 1.2.0 2018-05-01 [1] CRAN (R 3.6.0)
devtools * 2.2.1 2019-09-24 [1] CRAN (R 3.6.0)
digest 0.6.22 2019-10-21 [1] CRAN (R 3.6.1)
dplyr * 0.8.3 2019-07-04 [1] CRAN (R 3.6.0)
ellipsis 0.3.0 2019-09-20 [1] CRAN (R 3.6.0)
evaluate 0.14 2019-05-28 [1] CRAN (R 3.6.0)
fs 1.3.1 2019-05-06 [1] CRAN (R 3.6.0)
gdata * 2.18.0 2017-06-06 [1] CRAN (R 3.6.0)
ggplot2 * 3.2.1 2019-08-10 [1] CRAN (R 3.6.0)
glue 1.3.1 2019-03-12 [1] CRAN (R 3.6.0)
gtable 0.3.0 2019-03-25 [1] CRAN (R 3.6.0)
gtools 3.8.1 2018-06-26 [1] CRAN (R 3.6.0)
htmltools 0.4.0 2019-10-04 [1] CRAN (R 3.6.0)
jsonlite 1.6 2018-12-07 [1] CRAN (R 3.6.0)
knitr 1.25 2019-09-18 [1] CRAN (R 3.6.0)
labeling 0.3 2014-08-23 [1] CRAN (R 3.6.0)
lattice 0.20-38 2018-11-04 [1] CRAN (R 3.6.1)
lazyeval 0.2.2 2019-03-15 [1] CRAN (R 3.6.0)
lifecycle 0.2.0 2020-03-06 [1] CRAN (R 3.6.0)
magrittr 1.5 2014-11-22 [1] CRAN (R 3.6.0)
Matrix 1.2-17 2019-03-22 [1] CRAN (R 3.6.1)
matrixStats 0.55.0 2019-09-07 [1] CRAN (R 3.6.0)
memoise 1.1.0 2017-04-21 [1] CRAN (R 3.6.0)
munsell 0.5.0 2018-06-12 [1] CRAN (R 3.6.0)
openssl 1.4.1 2019-07-18 [1] CRAN (R 3.6.0)
pillar 1.4.6 2020-07-10 [1] CRAN (R 3.6.2)
pkgbuild 1.0.6 2019-10-09 [1] CRAN (R 3.6.0)
pkgconfig 2.0.3 2019-09-22 [1] CRAN (R 3.6.0)
pkgload 1.0.2 2018-10-29 [1] CRAN (R 3.6.0)
prettyunits 1.0.2 2015-07-13 [1] CRAN (R 3.6.0)
processx 3.4.1 2019-07-18 [1] CRAN (R 3.6.0)
ps 1.3.0 2018-12-21 [1] CRAN (R 3.6.0)
purrr 0.3.3 2019-10-18 [1] CRAN (R 3.6.0)
R6 2.4.0 2019-02-14 [1] CRAN (R 3.6.0)
Rcpp 1.0.5 2020-07-06 [1] CRAN (R 3.6.2)
remotes 2.1.0 2019-06-24 [1] CRAN (R 3.6.0)
reticulate 1.16 2020-05-27 [1] CRAN (R 3.6.2)
rlang 0.4.7 2020-07-09 [1] CRAN (R 3.6.2)
rmarkdown 1.16 2019-10-01 [1] CRAN (R 3.6.0)
rprojroot 1.3-2 2018-01-03 [1] CRAN (R 3.6.0)
RSpectra 0.16-0 2019-12-01 [1] CRAN (R 3.6.0)
rstudioapi 0.10 2019-03-19 [1] CRAN (R 3.6.0)
scales 1.0.0 2018-08-09 [1] CRAN (R 3.6.0)
sessioninfo 1.1.1 2018-11-05 [1] CRAN (R 3.6.0)
stringi 1.4.3 2019-03-12 [1] CRAN (R 3.6.0)
stringr 1.4.0 2019-02-10 [1] CRAN (R 3.6.0)
testthat 2.2.1 2019-07-25 [1] CRAN (R 3.6.0)
tibble 3.0.3 2020-07-10 [1] CRAN (R 3.6.2)
tidyselect 1.1.0 2020-05-11 [1] CRAN (R 3.6.2)
umap * 0.2.6.0 2020-06-16 [1] CRAN (R 3.6.2)
usethis * 1.5.1 2019-07-04 [1] CRAN (R 3.6.0)
vctrs 0.3.2 2020-07-15 [1] CRAN (R 3.6.2)
withr 2.1.2 2018-03-15 [1] CRAN (R 3.6.0)
xfun 0.10 2019-10-01 [1] CRAN (R 3.6.0)
yaml 2.2.0 2018-07-25 [1] CRAN (R 3.6.0)
[1] /Library/Frameworks/R.framework/Versions/3.6/Resources/library
---
title: "Explore medulloblastoma subtype calls in the context of unsupervised clustering"
output: html_notebook    
toc: TRUE
toc_float: TRUE
editor_options: 
  chunk_output_type: inline
---
```{r global_options, include=F}
knitr::opts_chunk$set(error = F, echo = T, warning = T, message = T)
```


# Load packages
```{r}
suppressMessages(library(dplyr))
suppressMessages(library(umap))
suppressMessages(library(ggplot2))
suppressMessages(library(devtools))
suppressMessages(library(gdata))
```

# Directories and Files
## Directories
```{r}
# Data directory
data_dir <- file.path("~/Documents/GitHub/jharenza.github.io/openPBTA-notebooks/mb-subtypes/data/")
pbta_data_dir <- file.path("~/OpenPBTA-analysis/data/release-v16-20200320/")
# Create a results directory
results_dir <- "results"
if (!dir.exists(results_dir)) {
  dir.create(results_dir, recursive = TRUE)
}
```

## Input Filepaths
```{r}
pbta_rnaseq_file <- file.path(pbta_data_dir, "pbta-gene-expression-rsem-fpkm-collapsed.stranded.rds")
pbta_rnaseq_file_full <- file.path(pbta_data_dir, "pbta-gene-expression-rsem-fpkm.stranded.rds")
pbta_hist_file <- file.path(pbta_data_dir, "pbta-histologies.tsv")
pptc_annot_file <- file.path(data_dir, "pptc_annot.tsv")
pptc_hist_file <- file.path(data_dir, "pptc-pdx-clinical-web.txt")
classifier_path_file <- file.path(data_dir, "rathi_results.tsv")
```
## Read in files
```{r}
# PBTA
pbta_fpkm_collapsed_mat <- readRDS(pbta_rnaseq_file)
pbta_fpkm_full_mat <- readRDS(pbta_rnaseq_file_full)
pbta_histologies_df <- read.delim(pbta_hist_file, sep = "\t", header = T)
# PPTC
load(file.path(data_dir, "2019-02-14-pptc-STAR_cufflinks_hg19_matrix_244.rda"), verbose = T)
pptc_subtypes_df <- read.delim(pptc_annot_file, sep = "\t", header = T)
# Rename PPTC df, remove gene name column, subset MB samples
mv(from = "rna.mat", to = "pptc_fpkm_df")
pptc_fpkm_df$gene_short_name <- NULL
pptc_mb_df <- pptc_fpkm_df %>%
  select(pptc_subtypes_df$Sample)

# Michael Taylor (MB classifier training set), rename
load(file.path(data_dir, "mt_rnaseq.RData"), verbose = T)
mv(from = "mt_mat", to = "mt_fpkm_df")
head(mt_fpkm_df)
head(mt_annot)

# MM2S PBTA classifier results, MB classifier results, pathology data
# MM2S Paper: https://scfbm.biomedcentral.com/articles/10.1186/s13029-016-0053-y
pbta_combined_subtypes <- read.delim(classifier_path_file, sep = "\t", header = T)
# How many of each subtype identified by pathology?
table(pbta_combined_subtypes$pathology_subtype)
```

# PBTA wrangling
```{r}
# Only pull out sample identifiers (KidsFirst biospecimen identifiers) that correspond to medulloblastoma samples
medulloblastoma_samples <- pbta_histologies_df %>%
  filter(short_histology == "Medulloblastoma") %>%
  filter(experimental_strategy == "RNA-Seq") %>%
  filter(RNA_library == "stranded") %>%
  pull(Kids_First_Biospecimen_ID)
# Calculate N
length(medulloblastoma_samples)
# Collect subtypes and count
mb_molecular_subtype_df <- pbta_histologies_df %>%
  filter(short_histology == "Medulloblastoma") %>%
  filter(experimental_strategy == "RNA-Seq") %>%
  filter(RNA_library == "stranded") %>%
  select(Kids_First_Biospecimen_ID, molecular_subtype, seq_center)
# Calculate N
nrow(mb_molecular_subtype_df)
# How many per subtype?
table(droplevels(mb_molecular_subtype_df$molecular_subtype))
```
# Create matrices of only MB samples
```{r}
# select PBTA MB samples only
pbta_mb_coll_df <- pbta_fpkm_collapsed_mat %>% 
  select(medulloblastoma_samples)
pbta_mb_full_df <- pbta_fpkm_full_mat %>% 
  select(medulloblastoma_samples)
# select PPTC MB samples only
pptc_mb_df <- pptc_fpkm_df %>% 
  select(pptc_subtypes_df$Sample)
length(pptc_subtypes_df$Sample)
# How many per subtype?
table(pptc_subtypes_df$class_subtype)
table(pptc_subtypes_df$path_subtype)
pptc_subtypes_df
# 12/13 classified correctly - too few samples for clustering
12/13*100

# How many in MT dataset?
length(mt_annot$Sample)
table(mt_annot$Subgroup)

# convert to matrix
pbta_mb_coll_mat <- as.matrix(pbta_mb_coll_df)
pbta_mb_full_mat <- as.matrix(pbta_mb_full_df)
mt_fpkm_mat <- as.matrix(mt_fpkm_df)
```
# Transform matrices
## log2 matrix
```{r}
pbta_coll_log2 <- log2(pbta_mb_coll_mat)
pbta_full_log2 <- log2(pbta_mb_full_mat)
mt_mat_log2 <- log2(mt_fpkm_mat)
```
# Calculate variance for OpenPBTA collapsed data
```{r}
gene_variance <- matrixStats::rowVars(pbta_coll_log2)
# Find the value that we'll use as a threshold to filter the top 5%
variance_threshold <- quantile(gene_variance, 0.95, na.rm = T)
# Row indices of high variance genes
high_variance_index <- which(gene_variance > variance_threshold)
```
# Set seed for reproducible UMAP results
```{r}
set.seed(2020)
```

# OpenPBTA stranded RNA-Seq (N = 121) UMAP clustering
## Using collapsed matrix
```{r}
# expects features (genes) to be columns, so we have to use t()
umap_results <- umap::umap(t(pbta_coll_log2[high_variance_index, ]))
# Make a data frame of the layout results and join with molecular subtype 
umap_plot_df <- data.frame(umap_results$layout) %>%
  tibble::rownames_to_column("Kids_First_Biospecimen_ID") %>%
  inner_join(mb_molecular_subtype_df)
# Plot by subtype
umap_plot_df %>%
  ggplot(aes(x = X1, 
             y = X2,
             color = molecular_subtype)) +
  geom_point(size = 3, alpha = 0.7) +
  theme_bw() +
  xlab("UMAP1") +
  ylab("UMAP2")
```
```{r}
# Plot by sequencing center - is this having any effect? (Not really)
umap_plot_df %>%
  ggplot(aes(x = X1, 
             y = X2,
             color = seq_center)) +
  geom_point(size = 3, alpha = 0.7) +
  theme_bw() +
  xlab("UMAP1") +
  ylab("UMAP2")
```
# Merge MM2S classifier UMAP
## Will use this later
```{r}
pbta_results_mer <- merge(umap_plot_df, pbta_combined_subtypes, by = "Kids_First_Biospecimen_ID")
head(pbta_results_mer)
```
# Calculate variance for OpenPBTA non-collapsed data
## Does this give similar results?

```{r}
gene_variance <- matrixStats::rowVars(pbta_full_log2)
variance_threshold <- quantile(gene_variance, 0.95, na.rm = T)
high_variance_index <- which(gene_variance > variance_threshold)
```
# OpenPBTA stranded RNA-Seq (N = 121) UMAP clustering - full matrix

```{r}
umap_results <- umap::umap(t(pbta_full_log2[high_variance_index, ]))
umap_plot_df <- data.frame(umap_results$layout) %>%
  tibble::rownames_to_column("Kids_First_Biospecimen_ID") %>%
  inner_join(mb_molecular_subtype_df)
umap_plot_df %>%
  ggplot(aes(x = X1, 
             y = X2,
             color = molecular_subtype)) +
  geom_point(size = 3, alpha = 0.7) +
  theme_bw() +
  xlab("UMAP1") +
  ylab("UMAP2")
```
- Yes, will proceed only with the collapsed matrix.

# Exploration
## Are the subtypes that don't classify by unsupervised clustering the same subtypes that did not match pathology subtypes?
```{r}
pbta_results_mer %>%
  ggplot(aes(x = X1, 
             y = X2,
             color = pathology_subtype,
             shape = mb_classifier_prediction)) +
  geom_point(size = 3, alpha = 0.7) +
  theme_bw() +
  xlab("UMAP1") +
  ylab("UMAP2")
```
## Misclassified samples:
- For the 3 samples within the WNT cluster not classified as WNT, two were WNT by pathology and the other was not noted by pathology. 
- For 1 sample classified as WNT that clusterd with SHH, pathology was not noted. 
- For the 6 samples within the SHH cluster that were classified as Group 4, 3 were noted as SHH by pathology. 
- 1 was SHH by pathology. 
- Within this cluster, one sample was classified as SHH but pathology noted as WNT. 
- The other 3 samples were not noted by pathology. 
- Two samples classified as WNT present in the Group 3/4 cluster were not noted by pathology. 
- 6 samples were classified as SHH but are in the Group 3/4 cluster. 2 of these were SHH by pathology, and the rest were not noted.
# Examine accuracy for this dataset
```{r}
# Subset for only samples that had pathology results
subtypes_calc <- subset(pbta_combined_subtypes, !is.na(pathology_subtype)) 
# convert to character to perform ifelse on factors with different levels
subtypes_calc$pathology_subtype <- as.character(subtypes_calc$pathology_subtype)
subtypes_calc$mb_classifier_prediction <- as.character(subtypes_calc$mb_classifier_prediction)
# Create column to calculate matches
subtypes_calc$mbclass_v_path <- ifelse(subtypes_calc$pathology_subtype == subtypes_calc$mb_classifier_prediction, "true", 
                                       ifelse(subtypes_calc$pathology_subtype == "Group 3 or 4" & subtypes_calc$mb_classifier_prediction == "Group3", "true", 
                                              ifelse(subtypes_calc$pathology_subtype == "Group 3 or 4" & subtypes_calc$mb_classifier_prediction == "Group4", "true", "false")))
# How many were predicted correctly (true)?
table(subtypes_calc$mbclass_v_path)
# Percent of classifications that match MB classifier calls
(26/(6+26))*100
# Create column to calculate matches for MM2S classifier
subtypes_calc$MM2S_v_path <- ifelse(subtypes_calc$pathology_subtype == subtypes_calc$MM2S_prediction, "true", 
                                       ifelse(subtypes_calc$pathology_subtype == "Group 3 or 4" & subtypes_calc$MM2S_prediction == "Group3", "true", 
                                              ifelse(subtypes_calc$pathology_subtype == "Group 3 or 4" & subtypes_calc$MM2S_prediction == "Group4", "true", "false")))
table(subtypes_calc$MM2S_v_path)
# Percent of classifications that match MM2S classifier calls
(25/(7+25))*100
```

# plot MM2S predictions
## First, combine matrix with subtypes

```{r}
gene_variance <- matrixStats::rowVars(pbta_coll_log2)
variance_threshold <- quantile(gene_variance, 0.95, na.rm = T)
high_variance_index <- which(gene_variance > variance_threshold)
umap_results <- umap::umap(t(pbta_coll_log2[high_variance_index, ]))
umap_plot_df <- data.frame(umap_results$layout) %>%
  tibble::rownames_to_column("Kids_First_Biospecimen_ID") %>%
  inner_join(pbta_combined_subtypes)
```  
## Does this data look better?
```{r}
umap_plot_df %>%
  ggplot(aes(x = X1, 
             y = X2,
             color = MM2S_prediction,
             shape = mb_classifier_prediction)) +
  geom_point(size = 3, alpha = 0.7) +
  theme_bw() +
  xlab("UMAP1") +
  ylab("UMAP2")
```


# Calculate variance for Michael Taylor RNA-Seq data
```{r}
gene_variance <- matrixStats::rowVars(mt_mat_log2)
variance_threshold <- quantile(gene_variance, 0.95, na.rm = T)
high_variance_index <- which(gene_variance > variance_threshold)
```

# Michael Taylor (N = 97) UMAP clustering 
```{r}
umap_results <- umap::umap(t(mt_mat_log2[high_variance_index, ]))
umap_plot_df <- data.frame(umap_results$layout) %>%
  tibble::rownames_to_column("Sample") %>%
  inner_join(mt_annot)
# Plot
umap_plot_df %>%
  ggplot(aes(x = X1, 
             y = X2,
             color = Subgroup)) +
  geom_point(size = 3, alpha = 0.7) +
  theme_bw() +
  xlab("UMAP1") +
  ylab("UMAP2")
```
## Observation:
- It looks like there are too few WNT samples (N = 3) to enable unsupervised clustering of those samples in this dataset.

# Rerun of OpenPBTA stranded RNA-Seq (N = 121) UMAP clustering
- Turns out the initial FPKM values were not log2 transformed, thus MB subtypes in pbta-histologies.tsv from MB classifier are not accurate.
```{r}
# Replot UMAP with new subtype labels
gene_variance <- matrixStats::rowVars(pbta_coll_log2)
variance_threshold <- quantile(gene_variance, 0.95, na.rm = T)
high_variance_index <- which(gene_variance > variance_threshold)
umap_results <- umap::umap(t(pbta_coll_log2[high_variance_index, ]))
# Make a data frame of the layout results and join with molecular subtype 
umap_plot_df <- data.frame(umap_results$layout) %>%
  tibble::rownames_to_column("Kids_First_Biospecimen_ID") %>%
  inner_join(pbta_combined_subtypes)
# Plot by subtype
umap_plot_df %>%
  ggplot(aes(x = X1, 
             y = X2,
             color = mb_classifier_prediction)) +
  geom_point(size = 3, alpha = 0.7) +
  theme_bw() +
  xlab("UMAP1") +
  ylab("UMAP2")
```
-  This looks A LOT better!

# session info
```{r}
session_info()
```
