Exploration and Visualization of Breast Cancer Proteomes
Feb 14, 2018
Francisco Requena
6 minute read

In this post, we will try to explore and visualize datasets of proteome expression in breast cancer patients from this paper: Proteogenomics connects somatic mutations to signalling in breast cancer, Nature 2016. The data was obtained from Kaggle.

The PAM50 test allows to separate the breast cancer patients into four different groups (Basal-like, HER2-enriched, Luminal A, Luminal B) depending on mRNA levels of a set of 50 genes with biological relevance. In this case, we will use protein expression instead of mRNA levels with the same list of genes used in PAM50. In fact, we will use 35 of 50 genes because of high number of NA values in 15 of them.

1. Load libraries

library(org.Hs.eg.db)  # Annotation data (install from Bioconductor)
library(ggbiplot) # biplots with ggplot2 (install from Github)
library(dplyr) # Cleaning data
library(tidyr) # Cleaning data
library(ggplot2)  # Visualization
library(gridExtra) # Combine plots
library(Hmisc) # Imputation NA values
library(factoextra) # PCA
library(reshape2) # Reshaping data
library(AnnotationDbi) # Annotation data (install from Bioconductor)
library(clusterProfiler) # Gene Ontology enrichment (install from Bioconductor)
library(d3heatmap) # Heatmaps plots
library(topGO) # Gene ontology
library(DT) # Make interactive tables in Rmarkdown

2. Read data

# Dataset with the protein expression for every patient
proteomes <- read.csv('data/proteomes/77_cancer_proteomes_CPTAC_itraq.csv')

# Clinical information for every patient, such as cancer phase, age, genre...
clinical <- read.csv('data/proteomes/clinical_data_breast_cancer.csv')

# Data with the id's of genes
genes <- read.csv('data/proteomes/PAM50_proteins.csv')

3. Exploration of clinical data

clinical_stage <- clinical %>%
  group_by(AJCC.Stage) %>%
  count_(.) %>%
  mutate(perc = round(n/105,2) * 100)

ggplot(clinical_stage, aes(AJCC.Stage, perc)) +
  geom_col(aes(fill = AJCC.Stage)) +
  ylab('Percentage (%)') +
  theme(axis.text.x = element_text(face="bold", angle=90)) +
  ggtitle('Percentage of patients by breast cancer phase')

clinical_patho <- clinical %>%
  group_by(ER.Status , PR.Status , HER2.Final.Status) %>%
  count_(.) %>%
  mutate(Percentage = round(n / 105,2) * 100) %>%
  select(-n)

datatable(clinical_patho, 
          options = list(pageLength = 10, dom = 'tip'), rownames = FALSE)
clinical_class <- clinical %>%
  group_by(PAM50.mRNA) %>%
  count_(.) %>%
  mutate(perc = n/105)
p1 <- ggplot(clinical_class, aes(PAM50.mRNA, n)) +
  geom_col(aes(fill = PAM50.mRNA), color = 'black') +
  scale_fill_discrete(name = "PAM50 subtypes") +
  ylab('Percentage') +
  theme(axis.text.x = element_text(face="bold", angle=90))

p2 <- ggplot(clinical, aes(PAM50.mRNA, OS.Time)) +
  geom_boxplot(aes(fill = PAM50.mRNA)) +
  geom_point(size = 0.5) +
  ylab('Overall Survival (days)') +
  scale_fill_discrete(name = "PAM50 subtypes") +
  theme(axis.text.x = element_text(face="bold", angle=90))

p3 <- ggplot(clinical, aes(OS.Time)) +
  geom_density(aes(group = PAM50.mRNA, color = PAM50.mRNA )) +
  xlab('Overall Survival (days)') +
  theme(axis.text.x = element_text(face="bold", angle=90))

p4 <- ggplot(clinical, aes(PAM50.mRNA, Age.at.Initial.Pathologic.Diagnosis)) +
  geom_boxplot(aes(fill = PAM50.mRNA)) +
  geom_point(size = 0.5) +
  ylab('Age of diagnosis') +
  scale_fill_discrete(name = "PAM50 subtypes") +
  theme(axis.text.x = element_text(face="bold", angle=90))

grid.arrange(p1, p3, p2, p4, ncol = 2)

4. Proteome - Imputation of NA values

proteomes <- proteomes[c(1,4:(ncol(proteomes)-3))]

number_na <- as.data.frame(table(apply(proteomes,1,function(x) sum(is.na(x)))))

# Plot of NA values by protein. 
p1 <- ggplot(number_na[-1,], aes(Var1, Freq)) +
  geom_col(aes(fill = Freq)) +
  scale_fill_continuous(guide = guide_legend(title = "Frequency"),
                        name = "Frequency of NA values") +
  xlab('Number of NA values') +
  ylab('Nº of proteins') +
  geom_vline(xintercept = 20, color = 'red') +
  theme_minimal() +
  ggtitle('Total number of proteins')

# Delete proteins with at least 20 % of NA values
number_row <- apply(proteomes,1,function(x) sum(is.na(x)))
proteomes <- proteomes[number_row <= 20,]
number_na <-  as.data.frame(table(apply(proteomes,1,function(x) sum(is.na(x)))))

# Plot of NA values by protein. 
p2 <- ggplot(number_na[-1,], aes(Var1, Freq)) +
  geom_col(aes(fill = Freq)) +
  scale_fill_continuous(guide = guide_legend(title = "Frequency"),
                        name = "Frequency of NA values") +
  xlab('Number of NA values') +
  ylab('Nº of proteins') +
  theme_minimal() +
  ggtitle('Number of proteins with NA values below 20')


grid.arrange(p1, p2)

5. Proteome - cleaning data

# Merge columns from the same sample
# AO-A12D.01TCGA - AO-A12D.05TCGA (same sample)
# C8-A131.01TCGA - C8-A131.32TCGA (same sample)
# AO-A12B.01TCGA - AO-A12B.34TCGA (same sample)
  proteomes <- proteomes %>%
    mutate(AO.A12D.01TCGA  = (AO.A12D.01TCGA + AO.A12D.05TCGA)/2) %>%
    mutate(C8.A131.01TCGA = (C8.A131.01TCGA + C8.A131.32TCGA)/2) %>%
    mutate(AO.A12B.01TCGA = (AO.A12B.01TCGA + AO.A12B.34TCGA)/2) %>%
    dplyr::select(-c(AO.A12D.05TCGA, C8.A131.32TCGA, AO.A12B.34TCGA))
  
# Gathering dataset
proteomes[,2:ncol(proteomes)] <- apply(proteomes[,2:ncol(proteomes)], 2,
                                       function(x) impute(x, mean))
  
proteomes <- gather(proteomes, 'patient', 'expression', c(2:ncol(proteomes)))
colnames(proteomes)[1] <- 'id_protein'

# Regular expression to simplify the ids of patients
proteomes$patient <- gsub('.[0-9]{2}TCGA', '', proteomes$patient )
proteomes$patient <- gsub('[A-Z&0-9]{2}\\.', '', proteomes$patient )
clinical$Complete.TCGA.ID <- gsub('TCGA-[A-Z|0-9]{2}\\-', '', clinical$Complete.TCGA.ID)

colnames(clinical)[c(1,21)] <- c('patient', 'type')

protclinical <- left_join(proteomes, dplyr::select(clinical, patient, type), by = 'patient')

genes <- genes %>%
  select(RefSeqProteinID, GeneSymbol) %>%
  mutate(RefSeqProteinID = as.character(RefSeqProteinID))

protclinical <- left_join(protclinical, genes, by = c('id_protein' = 'RefSeqProteinID'))

head(protclinical)
##   id_protein patient expression          type GeneSymbol
## 1  NP_958782    A12D   1.098410 HER2-enriched       <NA>
## 2  NP_958785    A12D   1.106029 HER2-enriched       <NA>
## 3  NP_958786    A12D   1.106029 HER2-enriched       <NA>
## 4  NP_000436    A12D   1.104124 HER2-enriched       <NA>
## 5  NP_958781    A12D   1.104269 HER2-enriched       <NA>
## 6  NP_958780    A12D   1.102292 HER2-enriched       <NA>

6. Getting dataframes

# Select those proteins that belong to the set of genes used by PAM50
pam50_genes <- protclinical[protclinical$id_protein %in% genes$RefSeqProteinID,]
pam50_genes <- pam50_genes %>%
   group_by(GeneSymbol, patient, type) %>%
   dplyr::summarise(expression = median(expression)) %>%
   ungroup()

7. Expression of proteins by subtype.

ggplot(pam50_genes[1:1463,], aes(GeneSymbol, expression)) +
  geom_boxplot(aes(color = type)) +
  ggtitle('Expression of proteins by subtype (PAM50 Model) - 1º part') +
  theme(axis.text.x=element_text(angle=90,hjust=1)) 

ggplot(pam50_genes[1464:nrow(pam50_genes),], aes(GeneSymbol, expression)) +
  geom_boxplot(aes(color = type)) +
  ggtitle('Expression of proteins by subtype (PAM50 Model) - 2º part') +
  theme(axis.text.x=element_text(angle=90,hjust=1))

8. 50 proteins model (PAM50)

8.1 PCA

df_model <- dcast(pam50_genes, patient + type ~ GeneSymbol, value.var = 'expression')

df_pca <- prcomp(df_model[,3:ncol(df_model)], center = TRUE, scale. = TRUE)

p1 <- ggbiplot(df_pca, groups = df_model$type, obs.scale = 1,var.scale = 1,var.axes = TRUE, ellipse = TRUE)
p2 <- fviz_contrib(df_pca, choice = "var", axes = 1, top = 15)
p3 <- fviz_contrib(df_pca, choice = "var", axes = 2, top = 15)

p1

grid.arrange(p2, p3)

8.2 Clustering k-means

fviz_nbclust(df_model[,-c(1,2)], kmeans, method = "wss")

k2 <- kmeans(df_model[3:ncol(df_model)], centers = 2, nstart = 25)
k3 <- kmeans(df_model[3:ncol(df_model)], centers = 3, nstart = 25)
k4 <- kmeans(df_model[3:ncol(df_model)], centers = 4, nstart = 25)
k5 <- kmeans(df_model[3:ncol(df_model)], centers = 5, nstart = 25)

p1 <- fviz_cluster(k2, geom = "point", data = df_model[3:ncol(df_model)]) + ggtitle("k = 2") + theme_minimal()
p2 <- fviz_cluster(k3, geom = "point",  data = df_model[3:ncol(df_model)]) + ggtitle("k = 3") + theme_minimal()
p3 <- fviz_cluster(k4, geom = "point",  data = df_model[3:ncol(df_model)]) + ggtitle("k = 4") + theme_minimal()
p4 <- fviz_cluster(k5, geom = "point",  data = df_model[3:ncol(df_model)]) + ggtitle("k = 5") + theme_minimal()

grid.arrange(p1, p2, p3, p4, nrow = 2)

8.3 Hierachical clustering (Interactive)

pam50_genes_cast <- dcast(pam50_genes, GeneSymbol ~ patient, value.var = 'expression')
rownames(pam50_genes_cast) <- pam50_genes_cast[,1]
pam50_genes_cast <- pam50_genes_cast[,-1]
d3heatmap(pam50_genes_cast, scale = "column", colors = "RdYlBu", k_row = 4, k_col = 3)

9. Gene Ontology

vector_genes <- AnnotationDbi::select(org.Hs.eg.db, 
       keys = as.character(unique(pam50_genes$GeneSymbol)),
       columns = c("ENTREZID", "SYMBOL"),
       keytype = "SYMBOL")

9.1 GO Enrichment Analysis

ego <- enrichGO(gene          = vector_genes$ENTREZID,
                OrgDb         = org.Hs.eg.db,
                ont           = "BP",
                pAdjustMethod = "BH",
                readable      = TRUE)


dotplot(ego)



comments powered by Disqus