########## Select a dataset ########## dat <- read.table(file="z:\\tc102\\final_project\\GSE5056_no_header.txt", header=T, row.names=1) ann <- read.table(file="z:\\tc102\\final_project\\GSE5056_annotations.txt", header=T) cls <- names(ann)[2:dim(ann)[2]] ########## Process outliers ########## # correlation plot dat.cor <- cor(dat) image(dat.cor, axes=F, col=rainbow(dim(dat)[1])) axis(1,at=seq(0,1,length=ncol(dat.cor)),label=1:dim(dat)[2],cex.axis=0.4,las=2) axis(2,at=seq(0,1,length=ncol(dat.cor)),label=1:dim(dat)[2],cex.axis=0.4,las=2) title("Correlation plot for all genes", xlab="Label 1", ylab="Label 2") # cluster tree tdat <- t(dat) tdat.dist <- dist(tdat, method="euclidean") tdat.clust <- hclust(tdat.dist, method="single") plot(tdat.clust, labels=1:dim(tdat)[1], cex=0.75) # cv vs. mean dat.mean <- apply(dat,2,mean) dat.sd <- sqrt(apply(dat,2,var)) dat.cv <- dat.sd/dat.mean plot(dat.mean,dat.cv,main="Sample CV vs. Mean",xlab="Mean",ylab="CV",col="blue",cex=0.5,pch=19) text(dat.mean,dat.cv,label=1:dim(dat)[2],pos=1,cex=0.5) ########## Filter genes ########## # cv filter dat.mean <- apply(dat,1,mean) dat.sd <- sqrt(apply(dat,1,var)) dat.cv <- dat.sd/dat.mean dat.threshold <- quantile(dat.cv, probs=0.05) dat.cv.filtered <- dat[dat.cv>=dat.threshold,] # gene expression < 50 avg_expr <- rowMeans(dat.cv.filtered) dat.filtered <- dat.cv.filtered[avg_expr>=50,] # student's t-test s1.non.smokers <- regexpr("non\\.smoker", cls)!=-1 s2.smokers <- regexpr("\\.\\.smoker", cls)!=-1 t.test.all.genes <- function(x,s1,s2) { x1 <- x[s1] x2 <- x[s2] x1 <- as.numeric(x1) x2 <- as.numeric(x2) t.out <- t.test(x1,x2, alternative="two.sided",var.equal=T) out <- as.numeric(t.out$p.value) return(out) } raw.p <- apply(dat.filtered, 1, t.test.all.genes, s1=s1.non.smokers, s2=s2.smokers) # multiple test correction holm.p <- p.adjust(raw.p, method="holm") #hist(raw.p, col="blue") #hist(holm.p, col="blue") dat.selection <- dat.filtered[holm.p<.01,] hist(holm.p[holm.p<0.01], col="green", main="Histogram of p-values for Remaining Genes", xlab="p-value") ########## Classify Data ########## # PCA plot dat.pca <- prcomp(t(dat.selection),cor=F) dat.loadings <- dat.pca$x[,1:2] plot(range(dat.loadings[,1]),range(dat.loadings[,2]),xlab="p1",ylab="p2",main="PCA plot of Filtered GSE5056 Data") points(dat.loadings[,1][s1.non.smokers], dat.loadings[,2][s1.non.smokers],col="red",pch="N",cex=0.75) points(dat.loadings[,1][s2.smokers], dat.loadings[,2][s2.smokers],col="blue",pch="S",cex=0.75) legend(200000,50000,c("Non-smokers","Smokers"),col=c("red","blue"),pch=c("N","S"),cex=.7,horiz=F) # Scree plot dat.pca.var <- round(dat.pca$sdev^2 / sum(dat.pca$sdev^2)*100,2) plot(c(1:length(dat.pca.var)),dat.pca.var,type="b",xlab="# components",ylab="% variance",main="Scree plot") # Classification Test library(MASS) cls[s1.non.smokers] <- "Non-smoker" cls[s2.smokers] <- "Smoker" dat.cls <- data.frame(cls, dat.loadings) training.indeces <- c(sample(1:18,10), sample(19:44,10)) training.indeces testing.indeces <- setdiff(1:44, training.indeces) testing.indeces dat.qda <- qda(cls~., dat.cls[training.indeces,]) pred <- predict(dat.qda, dat.cls[testing.indeces,]) table(pred$class, cls[testing.indeces]) # PCA plot plot(range(dat.loadings[testing.indeces,1]),range(dat.loadings[testing.indeces,2]),xlab="p1",ylab="p2",main="PCA plot of Filtered GSE5056 Data") points(dat.loadings[testing.indeces,1][s1.non.smokers[testing.indeces]], dat.loadings[testing.indeces,2][s1.non.smokers[testing.indeces]],col="red",pch="N",cex=0.75) points(dat.loadings[testing.indeces,1][s2.smokers[testing.indeces]], dat.loadings[testing.indeces,2][s2.smokers[testing.indeces]],col="blue",pch="S",cex=0.75) legend(75000,15000,c("Actual non-smoker, predicted non-smoker","Actual non-smoker, predicted smoker","Actual smoker, predicted smoker","Actual smoker, predicted non-smoker"),col=c("red","blue","blue","red"),pch=c("N","N","S","S"),cex=.7,horiz=F) ########## Identify Gene ########## fold <- sort(apply(log2(dat.selection[,s1.non.smokers]),1,mean) - apply(log2(dat.selection[,s2.smokers]),1,mean)) names(fold[1:5]) names(fold[(length(fold)-4):length(fold)])