markdown_part2.Rmd

---
title: "R analysis of RNASeq Count Data Workflow: 2"
author: "Saradha Venkatachapathy"
output: 
  html_document: 
    fig_caption: yes
    fig_height: 6
    highlight: tango
    toc: yes
---
# DESeq normalisation

As an alternative to TPKM, DESeq estimates the variance by taking into account both the coverage within each library ("size factors"), the expression strength under each condition, and the per-gene variance. Library coverage is determined by comparing the ratio of gene counts between replicate experiments. Expression strength is determined by the mean counts for a gene for each condition. The per-gene variance is assumed to be a function of the mean that is approximated by empirical fit to the data. P-values are calculated through a method that is analogous to a Fisher's exact test, using a 2x2 contingency table, but instead of assuming that the probabilities follow a hypergeometric distribution, they follow a negative binomial distribution parametrized from the mean and the estimated dispersion.



## Normalise the data, plot their distribution, and compare with raw counts and TPKM normalisation

The steps to normalise are as follows. Take the geometric mean of each condition for a gene and use that as the reference expression data set. For each condition, get a list of quotients of each gene expression value to its reference expression. The median of each condition quotient list is the normalization factor for that data set.

Plot the calculated Size factor and the summed column values for each sample. Plot the gene dispersion vs mean read counts. 
Further, the relationship between the samples (normalized and un-normalized) is calculated using principal component analysis, distance matrices and hierarchical clustering. 

Lastly, the distribution of the normalized transcripts is plotted.

```{r}
# Load the gene-level expression table and discard its first column.
# NOTE(review): the dropped column is presumably a row index written by
# write.csv() -- confirm against the input file.
rnaseq_mapped_genes <- read.csv(
  "./allcombined_gene_level_expression_mapped_to_gene_name.csv"
)[, -1]

# DESeq comes from Bioconductor; one-off installation:
#source("https://bioconductor.org/biocLite.R")
#biocLite("DESeq")
library("DESeq")

# Count table: columns 50-61 of the expression table, with row names taken
# from the table's first column.
counttable <- rnaseq_mapped_genes[, 50:61]
rownames(counttable) <- rnaseq_mapped_genes[, 1]

# Condition label of each of the twelve count columns: S1..S4, repeated
# three times in column order.
sample_conditions <- rep(c("S1", "S2", "S3", "S4"), times = 3)
Design <- data.frame(
  row.names = colnames(counttable),
  condition = sample_conditions
)
condition <- factor(sample_conditions)

# Build the DESeq data set, estimate the per-sample size factors used for
# normalisation, and print them.
cds <- newCountDataSet(counttable, condition)
cds <- estimateSizeFactors(cds)
sizeFactors(cds)

library("corrplot")

# Bold axis annotation and axis titles for the base-graphics plots below.
par(font.axis = 2, font.lab = 2)

# One bar per sample showing its DESeq size factor.
barplot(sizeFactors(cds), ylab = "DEseq_sizeFactor", las = 2)
box()

# Count matrices with and without division by the size factors.
norm_counts <- counts(cds, normalized = TRUE)
raw_counts <- counts(cds, normalized = FALSE)

# Column totals of the normalised counts, one bar per sample.
par(font.axis = 2, font.lab = 2)
barplot(
  colSums(norm_counts[, 1:12]),
  ylab = "summed norm.count in each sample",
  las = 2,
  cex.axis = 0.6
)
box()

# Sample-by-sample correlation between normalised and raw counts.
tej <- cor(norm_counts, raw_counts)
corrplot(
  as.matrix(tej),
  is.corr = FALSE, tl.col = "black", cl.pos = "b", cl.ratio = 0.3
)

# Same comparison against columns 63-74 of the expression table
# (named tpkm_norm, i.e. the TPKM-normalised values).
tpkm_norm <- as.matrix(rnaseq_mapped_genes[, 63:74])
tej <- cor(norm_counts, tpkm_norm)
corrplot(
  as.matrix(tej),
  is.corr = FALSE, tl.col = "black", cl.pos = "b", cl.ratio = 0.3
)

# Convert the normalised count matrix to a data frame and append:
#   column 13     - the gene identifier (copied from the row names)
#   columns 14-17 - the mean over the three replicates (B1, B2, B3) of each
#                   stage, named "S1".."S4"
norm_counts <- as.data.frame(norm_counts)
norm_counts[, 13] <- rownames(norm_counts)

stage_names <- c("S1", "S2", "S3", "S4")
for (stage_idx in seq_along(stage_names)) {
  # Each stage must be averaged over its OWN three replicate columns,
  # e.g. S2_B1, S2_B2, S2_B3 (never a column from another stage, and
  # never the same replicate twice).
  replicate_cols <- paste0(stage_names[stage_idx], "_B", 1:3)
  norm_counts[, 13 + stage_idx] <- rowMeans(
    norm_counts[, replicate_cols],
    na.rm = TRUE
  )
}
colnames(norm_counts)[14:17] <- stage_names

# DESeq models each gene's variance (dispersion) as a function of its mean.
# Estimate the dispersions, show the structure of the fit, and plot the
# dispersion estimates.
cds <- estimateDispersions(cds)
str(fitInfo(cds))
plotDispEsts(cds)

# PCA
# log2 of the twelve per-sample normalised counts; infinite values (log2 of
# zero) are set to NA so that na.omit() drops those genes before the PCA.
exon <- log2(norm_counts[, 1:12])
is.na(exon) <- vapply(exon, is.infinite, logical(nrow(exon)))
exon.pca <- prcomp(na.omit(exon), scale. = FALSE)

# One row of loadings per sample; plot the second component against the
# first. The four colours are recycled across the twelve samples in column
# order.
stage_colours <- c("purple", "blue", "red", "green4")
pca_loadings <- as.data.frame(exon.pca$rotation)
plot(
  pca_loadings[, 2] ~ pca_loadings[, 1],
  pch = 18, col = stage_colours, cex.axis = 1, las = 1, cex = 1.2,
  ylab = "PC2", xlab = "PC1"
)
legend(
  -0.2805761, 0.08247002, c("S1", "S2", "S3", "S4"),
  lty = c(1, 1), lwd = c(2.5, 2.5), col = stage_colours
)

# Sample-to-sample distances on the log2 normalised counts, shown as a heat
# map and as a hierarchical clustering tree, for two distance measures.
# t(exon) puts samples in rows so that dist() compares samples, not genes.
library(corrplot)
par(mfrow = c(2, 2))

# Euclidean distance. The method is given to dist() itself: as.matrix()
# silently ignores a `method` argument.
sample_dist <- as.matrix(dist(t(exon), method = "euclidean"))
corrplot(sample_dist, is.corr = FALSE, tl.col = "black", method = "color")
# The dist() call is written inline because plot() uses the deparsed
# expression as the x-axis label of the dendrogram.
plot(hclust(dist(t(exon), method = "euclidean")))

# Maximum (Chebyshev) distance.
sample_dist <- as.matrix(dist(t(exon), method = "maximum"))
corrplot(sample_dist, is.corr = FALSE, tl.col = "black", method = "color")
plot(hclust(dist(t(exon), method = "maximum")))

# Histograms of the log2 per-stage means (columns 14-17 of norm_counts),
# one panel per stage in a 2 x 2 grid.
par(font.axis = 2, font.lab = 2)
par(mfrow = c(2, 2))
stage_titles <- c("S1", "S2", "S3", "S4")
for (stage_idx in seq_along(stage_titles)) {
  hist(
    log(norm_counts[, 13 + stage_idx], base = 2),
    col = "gray", las = 1, cex.axis = 0.8,
    xlab = "DESeq_norm", main = stage_titles[stage_idx], xlim = c(-3, 20)
  )
  box()
}

# The same four distributions as overlaid density curves in a single panel.
# S4 (column 17) is drawn first and sets the axes; S3, S2 and S1 are added
# on top, so `d` ends up holding the S1 density.
par(font.axis = 2, font.lab = 2, mfrow = c(1, 1))
d <- density(log(norm_counts[, 17], base = 2))
plot(d, col = "green4", las = 1, lwd = 1, main = "")
overlay_colours <- c("red", "blue", "purple")
for (offset in 1:3) {
  d <- density(log(norm_counts[, 17 - offset], base = 2))
  lines(d, col = overlay_colours[offset], lwd = 1)
}
legend(
  12, 0.04176209, c("S1", "S2", "S3", "S4"),
  lty = c(1, 1), lwd = c(2.5, 2.5),
  col = c("purple", "blue", "red", "green4")
)



```


## Calculate the significantly differing genes
Plot the p-value distribution, MA plot and histogram of fold change. Write the significant genes to a csv file. Store the number and names of genes.

```{r}
# Run one pairwise DESeq negative-binomial test and report on it.
#
# Arguments:
#   cds          CountDataSet with size factors and dispersions estimated.
#   cond_a,
#   cond_b       the two condition labels handed to nbinomTest().
#   padj_cutoff  rows whose adjusted p-value is below this are "significant".
#
# Side effects:
#   - MA plot of the full result
#   - histogram of the raw p-values
#   - writes "<cond_a><cond_b>sig_deseq_norm.csv" to the working directory
#   - histogram of column 6 of the significant rows (labelled
#     "log2Foldchange"); skipped when there is nothing finite to plot,
#     because hist() fails on empty input
#
# Returns: list(all = full result table, sig = significant rows only).
compare_conditions <- function(cds, cond_a, cond_b, padj_cutoff = 0.1) {
  pair_label <- paste0(cond_a, cond_b)

  res <- nbinomTest(cds, cond_a, cond_b)
  plotMA(res)
  hist(res$pval, breaks = 25, main = "", col = "gray")
  box()

  # Rows with an NA adjusted p-value come back as all-NA rows from the
  # logical subset; complete.cases() removes them (and any other row that
  # contains a missing value).
  sig <- res[res$padj < padj_cutoff, ]
  sig <- sig[complete.cases(sig), ]
  write.csv(sig, file = paste0(pair_label, "sig_deseq_norm.csv"))

  if (any(is.finite(sig[, 6]))) {
    hist(
      sig[, 6],
      col = "gray", las = 1, cex.axis = 0.8,
      xlab = "log2Foldchange", breaks = 50,
      main = paste0(pair_label, "DESeqnorm")
    )
    box()
  }

  list(all = res, sig = sig)
}

# All six pairs of stages, in the order S1S2, S1S3, S1S4, S2S3, S2S4, S3S4.
# Later chunks read the results by name, so for every pair XY this defines:
#   XY       full nbinomTest() table
#   XYSig    significant rows
#   XY_num   number of significant rows
#   XY_gene  first column of the significant rows (the gene identifiers)
for (stage_pair in combn(c("S1", "S2", "S3", "S4"), 2, simplify = FALSE)) {
  pair_label <- paste0(stage_pair[1], stage_pair[2])
  de_result <- compare_conditions(cds, stage_pair[1], stage_pair[2])
  assign(pair_label, de_result$all)
  assign(paste0(pair_label, "Sig"), de_result$sig)
  assign(paste0(pair_label, "_num"), nrow(de_result$sig))
  assign(paste0(pair_label, "_gene"), de_result$sig[, 1])
}

# Symmetric stage-by-stage table of the number of significant genes per
# pairwise comparison (zero on the diagonal), shown as a colour map.
stage_names <- c("S1", "S2", "S3", "S4")
pair_counts <- c(
  0,        S1S2_num, S1S3_num, S1S4_num,
  S1S2_num, 0,        S2S3_num, S2S4_num,
  S1S3_num, S2S3_num, 0,        S3S4_num,
  S1S4_num, S2S4_num, S3S4_num, 0
)
DE_norm_number <- as.data.frame(
  matrix(
    pair_counts,
    nrow = 4, ncol = 4,
    dimnames = list(stage_names, stage_names)
  )
)

library(corrplot)
corrplot(
  as.matrix(DE_norm_number),
  is.corr = FALSE, tl.col = "black", cl.pos = "r", cl.ratio = 0.3,
  method = "color"
)

```
## Characterizing the expression of candidate genes
Read in the candidate gene lists: Mesenchymal (Me), Embryonic Stem cell(ES) and induced Pluripotent Stem Cells(iPSC). Plot the heat map, summed z score and summed TPKM plots for each category. Visualize how these genes change further using egg-plots (z scores), projected 3D histogram and a scatter-cor-plot. 

Next, calculate the number of candidate transcripts and genes in the top 10% highly expressed genes. Plot this number as a heat map.

Note, the candidate gene and the important gene lists were obtained from Bibhas Roy.
```{r}
# Candidate gene lists, one list per column; the first three columns are
# used as the Me, ES and iPSC lists respectively and are forced to
# character so they can be matched against gene names.
candidate_genes <- read.csv("D:/saradha/RNA seq/candidate_genes.csv")
candidate_genes[1:3] <- lapply(candidate_genes[1:3], as.character)

# Rows of norm_counts whose gene name (column 13) appears in each list.
gene_names <- norm_counts[, 13]
Me <- norm_counts[gene_names %in% candidate_genes[, 1], ]
ES <- norm_counts[gene_names %in% candidate_genes[, 2], ]
iPSC <- norm_counts[gene_names %in% candidate_genes[, 3], ]

library("gplots")

# Draw the three summary plots for one candidate gene set.
#
# Arguments:
#   expr          rows of norm_counts for the gene set; column 13 holds the
#                 gene names and columns 14-17 the per-stage means.
#   sum_cex_axis  axis annotation size for the summed-expression barplot
#                 (defaults to the current par("cex.axis")).
#
# Plots, in order:
#   1. row-scaled heat map of the four stage means (no clustering)
#   2. barplot of the z-scores summed over genes, per stage
#   3. barplot of the stage means summed over genes, per stage
#
# Returns: the scaled values drawn in the heat map (the `carpet` element of
# the heatmap.2() result) as a data frame.
plot_candidate_set <- function(expr, sum_cex_axis = par("cex.axis")) {
  stage_names <- c("S1", "S2", "S3", "S4")

  par(font.axis = 2, font.lab = 2)
  heat <- heatmap.2(
    as.matrix(expr[, 14:17]),
    col = greenred(75),
    scale = "row", key = TRUE, symkey = TRUE, Rowv = FALSE, Colv = FALSE,
    cexRow = 0.9, trace = "none", dendrogram = "none", cexCol = 1.2,
    srtCol = 90, key.title = " ", key.xlab = "", cex.axis = 0.7,
    ylab = "", labRow = expr[, 13],
    labCol = c("3 Hour", "3 Days", "6 Days", "10 Days"),
    density.info = "none"
  )
  zscore <- as.data.frame(heat$carpet)

  # na.rm = TRUE for every gene set: a missing scaled value would otherwise
  # make the sums, and with them the y-axis limits, non-finite.
  zscore_sum <- rowSums(zscore, na.rm = TRUE)
  par(font.axis = 2, font.lab = 2)
  barplot(
    zscore_sum,
    las = 1, ylab = "Summed zscore", names.arg = stage_names,
    ylim = c(min(zscore_sum) - 5, max(zscore_sum) + 5)
  )
  box()
  abline(h = 0)

  # NOTE(review): the values summed here are the stage means stored in
  # columns 14-17 of norm_counts (DESeq-normalised counts), although the
  # axis label says "Tpkm" -- confirm the intended label.
  expr_sum <- colSums(expr[, 14:17])
  par(font.axis = 2, font.lab = 2)
  barplot(
    expr_sum,
    las = 1, ylab = "Summed Tpkm", names.arg = stage_names,
    ylim = c(0, max(expr_sum) + 200), cex.axis = sum_cex_axis
  )
  box()

  zscore
}

me_zscore <- plot_candidate_set(Me, sum_cex_axis = 0.7)
es_zscore <- plot_candidate_set(ES)
iPSC_zscore <- plot_candidate_set(iPSC)

# top genes
# Size of the "top 10%" set: a tenth of the rows of norm_counts, rounded.
n <- round(nrow(norm_counts) * 0.1, 0)

# The n rows of norm_counts with the largest values in column `mean_col`
# (sorted ascending, then the last n rows kept).
top_expressed <- function(mean_col) {
  tail(norm_counts[order(norm_counts[, mean_col]), ], n)
}
S1_topgenes <- top_expressed(14)
S2_topgenes <- top_expressed(15)
S3_topgenes <- top_expressed(16)
S4_topgenes <- top_expressed(17)

# For one top-gene table, the number of its genes (column 13) found in each
# of the three candidate lists (columns 1-3 of candidate_genes).
candidate_hits <- function(top_genes) {
  vapply(
    1:3,
    function(list_col) sum(top_genes[, 13] %in% candidate_genes[, list_col]),
    integer(1)
  )
}
S1_fractions <- candidate_hits(S1_topgenes)
S2_fractions <- candidate_hits(S2_topgenes)
S3_fractions <- candidate_hits(S3_topgenes)
S4_fractions <- candidate_hits(S4_topgenes)

# Candidate list (rows) by stage (columns) table of those counts.
top_10_percent <- data.frame(
  S1 = S1_fractions,
  S2 = S2_fractions,
  S3 = S3_fractions,
  S4 = S4_fractions,
  row.names = c("Mesenchymal", "ES", "iPSC")
)

library("corrplot")
corrplot(
  as.matrix(top_10_percent),
  is.corr = FALSE, tl.col = "black", cl.pos = "b", cl.ratio = 0.3
)

# Same table with time-point column labels, drawn as a gridded colour map.
temp <- setNames(top_10_percent, c("3 Hour", "3 Days", "6 Days", "10 Days"))
par(font.axis = 2, font.lab = 2)
corrplot(
  as.matrix(temp),
  is.corr = FALSE, tl.col = "black", cl.pos = "b", cl.ratio = 0.3,
  method = "color", addgrid.col = "black"
)

# Count, for every pair of stages, how many of that pair's significant genes
# belong to a candidate list.
#
# Arguments:
#   pair_genes  named list of gene vectors, one per stage pair, with names
#               formed by pasting the two stage names ("S1S2", "S1S3", ...).
#   candidates  vector of candidate gene names.
#   stages      stage names; every pair of them must be present in
#               pair_genes.
#
# Returns: a symmetric stage-by-stage data frame of counts with zeros on the
# diagonal.
candidate_overlap_table <- function(pair_genes, candidates,
                                    stages = c("S1", "S2", "S3", "S4")) {
  overlap <- matrix(
    0,
    nrow = length(stages), ncol = length(stages),
    dimnames = list(stages, stages)
  )
  for (stage_pair in combn(stages, 2, simplify = FALSE)) {
    genes <- pair_genes[[paste0(stage_pair[1], stage_pair[2])]]
    # Each list is tested against the candidates with its OWN membership
    # mask; a mask built from a different pair's list has the wrong length
    # and gives a meaningless count.
    n_hits <- sum(genes %in% candidates)
    overlap[stage_pair[1], stage_pair[2]] <- n_hits
    overlap[stage_pair[2], stage_pair[1]] <- n_hits
  }
  as.data.frame(overlap)
}

# Significant genes of the six pairwise comparisons.
de_pair_genes <- list(
  S1S2 = S1S2_gene, S1S3 = S1S3_gene, S1S4 = S1S4_gene,
  S2S3 = S2S3_gene, S2S4 = S2S4_gene, S3S4 = S3S4_gene
)

library(corrplot)

#ME
DE_norm_number_me <- candidate_overlap_table(de_pair_genes, candidate_genes[, 1])
corrplot(
  as.matrix(DE_norm_number_me),
  is.corr = FALSE, tl.col = "black", cl.pos = "r", cl.ratio = 0.3,
  method = "color"
)

#ES
DE_norm_number_es <- candidate_overlap_table(de_pair_genes, candidate_genes[, 2])
corrplot(
  as.matrix(DE_norm_number_es),
  is.corr = FALSE, tl.col = "black", cl.pos = "r", cl.ratio = 0.3,
  method = "color"
)

#iPSC
DE_norm_number_ip <- candidate_overlap_table(de_pair_genes, candidate_genes[, 3])
corrplot(
  as.matrix(DE_norm_number_ip),
  is.corr = FALSE, tl.col = "black", cl.pos = "r", cl.ratio = 0.3,
  method = "color"
)

```

## Calculating the similarities between the RNAseqdata and H3K9ac promoter occupancy
Epitect qPCR was used to look at H3K9ac occupancy levels in candidate MET/reprogramming genes. This data is obtained from Bibhas. First, heat maps showing H3K9ac promoter occupancy levels and RNAseq data are plotted. 

Obtain Epithelial, Mesenchymal, Proliferative and Reprogramming gene lists. Aggregate the sum values and plot bar plot for both RNAseq and H3K9ac levels

```{r}
#epitect
# Read the Epitect table; its first column (gene names) is forced to
# character.
epitect <- read.csv("D:/saradha/RNA seq/epitect.csv")
epitect[, 1] <- as.character(epitect[, 1])

# Rows of norm_counts whose gene name (column 13) occurs in the table.
rna_epi <- norm_counts[norm_counts[, 13] %in% epitect[, 1], ]

# Call the first column "name" and use a unique, syntactically valid version
# of it as the row names.
names(epitect)[1] <- "name"
rownames(epitect) <- make.names(epitect[, 1], unique = TRUE)

# Row-scaled heat map of the four stage means for those genes, without
# clustering of rows or columns.
library("gplots")
par(font.axis = 2, font.lab = 2)
x <- heatmap.2(
  as.matrix(rna_epi[, 14:17]),
  col = greenred(75),
  scale = "row", key = TRUE, symkey = TRUE, Rowv = FALSE, Colv = FALSE,
  cexRow = 0.9, trace = "none", dendrogram = "none", cexCol = 1.2,
  srtCol = 90, key.title = " ", key.xlab = "", cex.axis = 0.7,
  ylab = "", labRow = rna_epi[, 13],
  labCol = c("3 Hour", "3 Days", "6 Days", "10 Days"),
  density.info = "none"
)

# RNA-seq rows for the genes of one Epitect category.
#
# `category` is matched against column 6 of the Epitect table. The result
# holds column 1 and the four stage means (columns 14-17) of norm_counts,
# plus a sixth column carrying the category label.
# NOTE(review): column 1 of norm_counts is the first per-sample count
# column, not the gene name; it is only carried along here and is removed
# again before aggregation.
rna_for_category <- function(category) {
  genes <- epitect[which(epitect[, 6] == category), 1]
  rna <- norm_counts[norm_counts[, 13] %in% genes, c(1, 14:17)]
  rna[, 6] <- category
  rna
}

# Gene names per category.
te <- epitect[which(epitect[, 6] == "Epithelial"), 1]
tm <- epitect[which(epitect[, 6] == "Mesenchymal"), 1]
tp <- epitect[which(epitect[, 6] == "Proliferation"), 1]
tr <- epitect[which(epitect[, 6] == "Reprogramming"), 1]

# Matching RNA-seq rows per category, then all four stacked into one table.
te_rna <- rna_for_category("Epithelial")
tm_rna <- rna_for_category("Mesenchymal")
tp_rna <- rna_for_category("Proliferation")
tr_rna <- rna_for_category("Reprogramming")
rna_epitect_groups <- rbind(te_rna, tm_rna, tp_rna, tr_rna)

# Per-category summaries of the RNA-seq stage means and of the Epitect
# values, drawn side by side as horizontal barplots: first the category
# sums, then the category means.
#
# Titles are attached by row position, i.e. rows 1-4 of each aggregated
# table are taken to be Epithelial, Mesenchymal, Proliferation and
# Reprogramming in that order.
# NOTE(review): this relies on aggregate() returning exactly these four
# groups in this order for both tables -- confirm for the Epitect file.
category_titles <- c(
  "Epithelial", "Mesenchymal", "Proliferation", "Reprogramming"
)

for (summary_fun in list(sum, mean)) {
  # The first column of each table is dropped before aggregating; the
  # grouping columns are V6 (RNA-seq) and X.1 (Epitect).
  temp_rna <- aggregate(
    . ~ V6,
    data = rna_epitect_groups[, -1], FUN = summary_fun
  )
  temp_epitect <- aggregate(. ~ X.1, data = epitect[, -1], FUN = summary_fun)

  par(mfrow = c(1, 2), font.axis = 2, font.lab = 2)
  for (row_idx in seq_along(category_titles)) {
    # Left panel: RNA-seq summary of this category.
    k <- as.matrix(temp_rna[row_idx, -1])
    barplot(
      k,
      las = 1, horiz = TRUE,
      main = paste0(category_titles[row_idx], "_RNAseq")
    )
    box()

    # Right panel: Epitect summary of the same category.
    k <- as.matrix(temp_epitect[row_idx, -1])
    barplot(
      k,
      las = 1, horiz = TRUE,
      main = paste0(category_titles[row_idx], "_Epitect")
    )
    box()
  }
}

```

# Perform pairwise Wilcoxon tests on the genes normalised with TPKM

Wilcoxon tests (as opposed to t-tests, e.g.) are non-parametric. They do not assume anything about the distributions. When the data are very non-normal, t-tests may not be appropriate and one alternative is Wilcoxon. In addition, Wilcoxon tests entire distributions whereas t-tests are tests of the means. It is also known as Mann-Whitney test. Significant genes are identified with p.value < 0.1

```{r warning=FALSE}
# TPKM values (columns 63-74 of the expression table) as a NUMERIC matrix
# with the gene names as row names. The name column is deliberately kept
# out of the matrix: as.matrix() on a data frame that mixes text and
# numbers turns every number into a formatted string, which loses
# precision before the values reach the test.
d <- as.matrix(rnaseq_mapped_genes[, 63:74])
rownames(d) <- as.character(rnaseq_mapped_genes[, 1])

# Column positions of the three replicates of each stage within `d`
# (the columns cycle S1, S2, S3, S4 three times).
stage_cols <- list(
  S1 = c(1, 5, 9),
  S2 = c(2, 6, 10),
  S3 = c(3, 7, 11),
  S4 = c(4, 8, 12)
)

# Gene-wise two-sample Wilcoxon test between two sets of columns.
#
# Arguments:
#   expr            numeric matrix, one row per gene.
#   cols_a, cols_b  column positions of the two groups.
#
# Returns: a data frame with one row per gene holding the p-value, the test
# statistic and, in column 3, the gene name.
wilcox_between <- function(expr, cols_a, cols_b) {
  stats <- apply(expr, 1, function(values) {
    unlist(
      wilcox.test(values[cols_a], values[cols_b])[c("p.value", "statistic")]
    )
  })
  stats <- as.data.frame(t(stats))
  stats[, 3] <- as.character(rownames(stats))
  stats
}

# All six pairs of stages. For every pair XY this defines:
#   wil_XY   the per-gene test table
#   XY_num   number of genes with p-value < 0.1
#   XY_gene  names of those genes
# (XY_num and XY_gene replace the DESeq-based values of the same names.)
#
# NOTE(review): with three values per group and no ties, the smallest
# two-sided exact p-value is 2 / choose(6, 3) = 0.1, so the strict `< 0.1`
# cutoff can presumably only be met by genes with tied values, where
# wilcox.test() falls back to a normal approximation -- confirm that this
# is the intended selection.
for (stage_pair in combn(names(stage_cols), 2, simplify = FALSE)) {
  pair_label <- paste0(stage_pair[1], stage_pair[2])
  pair_result <- wilcox_between(
    d,
    stage_cols[[stage_pair[1]]],
    stage_cols[[stage_pair[2]]]
  )
  # which() drops genes whose p-value is missing.
  significant <- which(pair_result[, 1] < 0.1)
  assign(paste0("wil_", pair_label), pair_result)
  assign(paste0(pair_label, "_num"), length(significant))
  assign(paste0(pair_label, "_gene"), pair_result[significant, 3])
}

# Symmetric stage-by-stage table of the number of genes passing the Wilcoxon
# cutoff in each pairwise comparison (zero on the diagonal), shown as a
# colour map.
stage_names <- c("S1", "S2", "S3", "S4")
pair_counts <- c(
  0,        S1S2_num, S1S3_num, S1S4_num,
  S1S2_num, 0,        S2S3_num, S2S4_num,
  S1S3_num, S2S3_num, 0,        S3S4_num,
  S1S4_num, S2S4_num, S3S4_num, 0
)
Wilcox_number <- as.data.frame(
  matrix(
    pair_counts,
    nrow = 4, ncol = 4,
    dimnames = list(stage_names, stage_names)
  )
)

library(corrplot)
corrplot(
  as.matrix(Wilcox_number),
  is.corr = FALSE, tl.col = "black", cl.pos = "r", cl.ratio = 0.3,
  method = "color"
)


# Count, for every pair of stages, how many of that pair's Wilcoxon-selected
# genes belong to a candidate list.
#
# Arguments:
#   pair_genes  named list of gene vectors, one per stage pair, with names
#               formed by pasting the two stage names ("S1S2", "S1S3", ...).
#   candidates  vector of candidate gene names.
#   stages      stage names; every pair of them must be present in
#               pair_genes.
#
# Returns: a symmetric stage-by-stage data frame of counts with zeros on the
# diagonal.
wilcox_overlap_table <- function(pair_genes, candidates,
                                 stages = c("S1", "S2", "S3", "S4")) {
  overlap <- matrix(
    0,
    nrow = length(stages), ncol = length(stages),
    dimnames = list(stages, stages)
  )
  for (stage_pair in combn(stages, 2, simplify = FALSE)) {
    genes <- pair_genes[[paste0(stage_pair[1], stage_pair[2])]]
    # Each list is tested against the candidates with its OWN membership
    # mask; a mask built from a different pair's list has the wrong length
    # and gives a meaningless count.
    n_hits <- sum(genes %in% candidates)
    overlap[stage_pair[1], stage_pair[2]] <- n_hits
    overlap[stage_pair[2], stage_pair[1]] <- n_hits
  }
  as.data.frame(overlap)
}

# Genes selected by the six pairwise Wilcoxon comparisons.
wilcox_pair_genes <- list(
  S1S2 = S1S2_gene, S1S3 = S1S3_gene, S1S4 = S1S4_gene,
  S2S3 = S2S3_gene, S2S4 = S2S4_gene, S3S4 = S3S4_gene
)

library(corrplot)

#ME
Wilcox_number_me <- wilcox_overlap_table(wilcox_pair_genes, candidate_genes[, 1])
corrplot(
  as.matrix(Wilcox_number_me),
  is.corr = FALSE, tl.col = "black", cl.pos = "r", cl.ratio = 0.3,
  method = "color"
)

#ES
Wilcox_number_es <- wilcox_overlap_table(wilcox_pair_genes, candidate_genes[, 2])
corrplot(
  as.matrix(Wilcox_number_es),
  is.corr = FALSE, tl.col = "black", cl.pos = "r", cl.ratio = 0.3,
  method = "color"
)

#iPSC
Wilcox_number_ip <- wilcox_overlap_table(wilcox_pair_genes, candidate_genes[, 3])
corrplot(
  as.matrix(Wilcox_number_ip),
  is.corr = FALSE, tl.col = "black", cl.pos = "r", cl.ratio = 0.3,
  method = "color"
)

```

# Session Information
```{r}
# Record the R version, platform and loaded package versions used to render
# this document, so the analysis can be reproduced.
sessionInfo()
```