Skip to content

Commit

Permalink
Add MLL DEGs
Browse files Browse the repository at this point in the history
  • Loading branch information
Jenny Smith committed Feb 6, 2019
1 parent d4431e0 commit 3c1ceda
Show file tree
Hide file tree
Showing 2 changed files with 1,690 additions and 15 deletions.
129 changes: 114 additions & 15 deletions r_code/Differential_Expression_and_Lasso.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -143,20 +143,27 @@ CDE.train <- AML.CDE.s %>%
grepl("High|Standard", Risk.group) ~ "Yes",
grepl("Low", Risk.group) ~ "No",
grepl("Unknown", Risk.group) ~ "Unknown" )) %>%
mutate(Age.Class=ifelse(Age.at.Diagnosis.in.Days > median(Age.at.Diagnosis.in.Days), "Yes", "No"))
mutate(Age.Class=ifelse(Age.at.Diagnosis.in.Days > median(Age.at.Diagnosis.in.Days), "Yes", "No")) %>%
mutate(MLL.Update=case_when(
Primary.Cytogenetic.Code == "MLL" ~ "MLL",
Primary.Cytogenetic.Code == "Unknown" ~ "Unknown",
TRUE ~ "No"))
dim(CDE.train)
table(CDE.train$Risk.group)
table(CDE.train$RiskGroup.Class)
table(CDE.train$Age.Class)
table(CDE.train$MLL.Update)
```

```{r}
# lapply(CDE.train[c(15:35,45:47)], table)
```


#Split the Expression Data into Train/Test


```{r}
#remove any patient samples that are not diagnositic
Keep <- colnames(counts) %>%
Expand All @@ -181,7 +188,6 @@ dim(counts.train)
```



#Differential Expression Analysis

Split median Age (ref == young (0))
Expand All @@ -192,6 +198,8 @@ Split on High+std vs Low (ref)
source("r_code/Limma_Voom_DE_Function.R")
```

## Risk Group DEGs

```{r}
pheno <- CDE.train$RiskGroup.Class %>%
set_names(CDE.train$TARGET.USI) %>%
Expand All @@ -202,32 +210,24 @@ length(pheno)
table(pheno)
```


```{r}
DE.RG <- voom_DE(counts.df = counts.sub, ref="No",pheno=pheno)
```


```{r}
DE.RG$desingMatrix[1:5,]
table(DE.RG$desingMatrix[,1])
dim(DE.RG$voomTransformation$E) #17445 93
```

```{r}
CDE.train %>%
select( "RiskGroup.Class")
filter(TARGET.USI %in% rownames(DE.RG$desingMatrix[1:5,])) %>%
```

```{r}
head(DE.RG$DEGs)
dim(DE.RG$DEGs)
# write.csv(DE.RG$DEGs, "r_code/TARGET_AML_High.Std.Risk_vs_LowRisk_DEGs.csv")
```

## Age DEGs (by Median)

```{r}
pheno2 <- CDE.train$Age.Class %>%
Expand All @@ -244,7 +244,6 @@ table(pheno2)
DE.Age <- voom_DE(counts.df = counts.sub, ref="No",pheno=pheno2)
```


```{r}
head(DE.Age$DEGs)
Expand All @@ -256,13 +255,113 @@ Deliverables
1. notebook/script
2. significance of findings

#MLL

```{r}
pheno3 <- CDE.train$MLL.Update %>%
set_names(CDE.train$TARGET.USI) %>%
.[. != "Unknown"]
table(pheno3)
```

```{r}
DE.MLL <- voom_DE(counts.df = counts.train, ref = "No", pheno = pheno3)
```


```{r}
dim(DE.MLL$DEGs) # 1575 6
# write.csv(DE.MLL$DEGs, "r_code/TARGET_AML_MLL_vs_Others_DEGs.csv")
```

```{r}
table(CDE.train$MLL, CDE.train$Primary.Cytogenetic.Code)
```



#Lasso with DEGs

There is not too much class imbalance on either
```{r}
library(glmnet)
```

```{r}
glm.binom <- function(x,y,df,standardize=FALSE){
library(glmnet)
#df is the matrix with the response and gene expression. Patietns as rownames.
#x is the expresion matrix, genes as rownames.
response <- y
predictors <- x
y <- factor(df[,y])
x <- as.matrix(df[,x]) #NOTE: for categorical predictors data, should use model.matrix
if (any(is.na(y))){
print("There Are Missing Values.")
return(y)
}else if (length(levels(y)) > 2 ){
print("More tha two levels")
sel <- grepl("yes|no", df[,response]) #select only the yes or no.
df <- df[sel, ] #subset the df
y <- factor(df[,response])
x <- as.matrix(df[,predictors])
}
#Check the reference level of the response.
contrast <- contrasts(y)
#Use validation set approach. split observations into approx. equal groups.
set.seed(1) #changing this, see dramaticaly changes in results. THUS NEED an outer loop for sample cross-validation.
train <- sample(c(TRUE,FALSE), nrow(x), replace = TRUE)
test <- (!train)
train.names <- rownames(df)[train]
test.names <- rownames(df)[test]
#grid of lambda values to test.
grid <- 10^ seq(10,-2, length=100)
#training model.
mod <- glmnet(x[train,], y[train],family = "binomial",
standardize = standardize, lambda = grid, intercept = FALSE)
#use cross-validation on the training model.CV only for lambda
set.seed(1)
cv.lamdba <- cv.glmnet(x[train,], y[train],family = "binomial",
standardize = standardize, lambda = grid, nfolds = 5,
type.measure = "deviance", intercept = FALSE)
#Select lambda min.
lambda.min <- cv.lamdba$lambda.min
#predict the classes
pred.class <- predict(mod, newx = x[test,], type="class", s=lambda.min)
#find the test error
tab <- table(pred.class,y[test])
testError <- mean(pred.class != y[test]) #how many predicted classes were incorrect
#Fit the full dataset.
final <- glmnet(x, y,family = "binomial",
standardize = standardize, lambda = grid, intercept = FALSE)
#Extract the coefficients
coef <- predict(final, type="coefficients", s=lambda.min)
idx <- which(coef != 0)
nonZero <- coef[idx,]
list <- list(train.names, test.names, contrast, mod, cv.lamdba,testError, final, nonZero)
names(list) <- c("training.set", "testing.set","contrast", "train.model",
"cv.train", "test.error", "final.model", "nonzero.coef")
return(list)
}
```


Expand Down
Loading

0 comments on commit 3c1ceda

Please sign in to comment.