Add MLL DEGs

NCBI-Hackathons · Feb 6, 2019 · 3c1ceda · 3c1ceda
1 parent d4431e0
commit 3c1ceda
Show file tree

Hide file tree

Showing 2 changed files with 1,690 additions and 15 deletions.
diff --git a/r_code/Differential_Expression_and_Lasso.Rmd b/r_code/Differential_Expression_and_Lasso.Rmd
@@ -143,20 +143,27 @@ CDE.train <- AML.CDE.s %>%
     grepl("High|Standard", Risk.group) ~ "Yes", 
     grepl("Low", Risk.group) ~ "No", 
     grepl("Unknown", Risk.group) ~ "Unknown" )) %>% 
-  mutate(Age.Class=ifelse(Age.at.Diagnosis.in.Days > median(Age.at.Diagnosis.in.Days), "Yes", "No"))
+  mutate(Age.Class=ifelse(Age.at.Diagnosis.in.Days > median(Age.at.Diagnosis.in.Days), "Yes", "No")) %>%
+  mutate(MLL.Update=case_when(
+    Primary.Cytogenetic.Code == "MLL" ~ "MLL", 
+    Primary.Cytogenetic.Code == "Unknown" ~ "Unknown", 
+    TRUE ~ "No"))
 
 dim(CDE.train)
 
 table(CDE.train$Risk.group)
 table(CDE.train$RiskGroup.Class)
 table(CDE.train$Age.Class)
+table(CDE.train$MLL.Update)
 ```
 
+```{r}
+# lapply(CDE.train[c(15:35,45:47)], table)
+```
 
 
 #Split the Expression Data into Train/Test
 
-
 ```{r}
 #remove any patient samples that are not diagnositic
 Keep <- colnames(counts) %>% 
@@ -181,7 +188,6 @@ dim(counts.train)
 ```
 
 
-
 #Differential Expression Analysis
 
 Split median Age (ref == young (0))
@@ -192,6 +198,8 @@ Split on High+std vs Low (ref)
 source("r_code/Limma_Voom_DE_Function.R")
 ```
 
+## Risk Group DEGs
+
 ```{r}
 pheno <- CDE.train$RiskGroup.Class %>% 
   set_names(CDE.train$TARGET.USI) %>%
@@ -202,32 +210,24 @@ length(pheno)
 table(pheno)
 ```
 
-
 ```{r}
 DE.RG <- voom_DE(counts.df = counts.sub, ref="No",pheno=pheno)
 ```
 
-
 ```{r}
 DE.RG$desingMatrix[1:5,]
 table(DE.RG$desingMatrix[,1])
 dim(DE.RG$voomTransformation$E) #17445    93
 ```
 
-```{r}
-CDE.train %>% 
-  select( "RiskGroup.Class")
-  filter(TARGET.USI %in% rownames(DE.RG$desingMatrix[1:5,])) %>%
- 
-```
-
 ```{r}
 head(DE.RG$DEGs)
 dim(DE.RG$DEGs)
 
 # write.csv(DE.RG$DEGs, "r_code/TARGET_AML_High.Std.Risk_vs_LowRisk_DEGs.csv")
 ```
 
+## Age DEGs (by Median)
 
 ```{r}
 pheno2 <- CDE.train$Age.Class %>% 
@@ -244,7 +244,6 @@ table(pheno2)
 DE.Age <- voom_DE(counts.df = counts.sub, ref="No",pheno=pheno2)
 ```
 
-
 ```{r}
 head(DE.Age$DEGs)
 
@@ -256,13 +255,113 @@ Deliverables
 1. notebook/script
 2. significance of findings
 
+#MLL 
+
+```{r}
+pheno3 <- CDE.train$MLL.Update %>% 
+  set_names(CDE.train$TARGET.USI) %>% 
+  .[. != "Unknown"]
+
+table(pheno3)
+```
+
+```{r}
+DE.MLL <- voom_DE(counts.df = counts.train, ref = "No", pheno = pheno3)
+```
+
+
+```{r}
+dim(DE.MLL$DEGs) # 1575    6
+
+# write.csv(DE.MLL$DEGs, "r_code/TARGET_AML_MLL_vs_Others_DEGs.csv")
+```
+
+```{r}
+table(CDE.train$MLL, CDE.train$Primary.Cytogenetic.Code)
+```
+
+
 
 #Lasso with DEGs
 
-There is not too much class imbalance on either 
+```{r}
+library(glmnet)
+```
 
 ```{r}
- 
+glm.binom <- function(x,y,df,standardize=FALSE){
+  library(glmnet)
+  #df is the matrix with the response and  gene expression. Patietns as rownames.
+  #x is the expresion matrix, genes as rownames. 
+  
+  response <- y
+  predictors <- x
+  
+  y <- factor(df[,y])
+  x <- as.matrix(df[,x]) #NOTE: for categorical predictors data, should use model.matrix 
+  
+  if (any(is.na(y))){
+    print("There Are Missing Values.")
+    return(y)
+  }else if (length(levels(y)) > 2 ){
+    print("More tha two levels")
+    
+    sel <- grepl("yes|no", df[,response]) #select only the yes or no. 
+    df <- df[sel, ] #subset the df
+    
+    y <- factor(df[,response])
+    x <- as.matrix(df[,predictors])
+  }
+  
+  #Check the reference level of the response.
+  contrast <- contrasts(y)
+
+  #Use validation set approach. split observations into approx. equal groups.
+  set.seed(1) #changing this, see dramaticaly changes in results. THUS NEED an outer loop for sample cross-validation.  
+  train <- sample(c(TRUE,FALSE), nrow(x), replace = TRUE)
+  test <- (!train)
+
+  train.names <- rownames(df)[train]
+  test.names <- rownames(df)[test]
+
+  #grid of lambda values to test.
+  grid <- 10^ seq(10,-2, length=100)
+
+  #training model.
+  mod <- glmnet(x[train,], y[train],family = "binomial",
+                  standardize = standardize, lambda = grid, intercept = FALSE)
+
+  #use cross-validation on the training model.CV only for lambda
+  set.seed(1) 
+  cv.lamdba <- cv.glmnet(x[train,], y[train],family = "binomial",
+                  standardize = standardize, lambda = grid, nfolds = 5,
+                  type.measure = "deviance", intercept = FALSE)
+
+  #Select lambda min.
+  lambda.min <- cv.lamdba$lambda.min
+
+  #predict the classes
+  pred.class <- predict(mod, newx = x[test,], type="class", s=lambda.min)
+
+  #find the test error
+  tab <- table(pred.class,y[test])
+  testError <- mean(pred.class != y[test]) #how many predicted classes were incorrect
+
+  #Fit the full dataset.
+  final <- glmnet(x, y,family = "binomial",
+                  standardize = standardize, lambda = grid, intercept = FALSE)
+
+  #Extract the coefficients
+  coef <- predict(final, type="coefficients", s=lambda.min)
+  idx <- which(coef != 0)
+  nonZero <- coef[idx,]
+
+  list <- list(train.names, test.names, contrast, mod, cv.lamdba,testError, final, nonZero)
+  names(list) <- c("training.set", "testing.set","contrast", "train.model",
+                   "cv.train", "test.error", "final.model", "nonzero.coef")
+  return(list)
+  
+}
 ```