- ###################################################
- ### Gene (Feature) Selection 基因特征选择: 1.过滤方法;2.封装方法
- ###################################################
- if (!requireNamespace("BiocManager", quietly = TRUE))
- install.packages("BiocManager")
- BiocManager::install(version = "3.11")
- BiocManager::install(c("Biobase", "genefilter"))
- BiocManager::install(c("ALL"))
-
- library(Biobase)
- library(ALL)
- data(ALL)
-
- ALLb <- ALL[,tgt.cases]
-
- rowIQRs <- function(em)
- rowQ(em,ceiling(0.75*ncol(em))) - rowQ(em,floor(0.25*ncol(em)))
- plot(rowMedians(es),rowIQRs(es),
- xlab='Median expression level',
- ylab='IQR expression level',
- main='Main Characteristics of Genes Expression Levels')
-
- library(genefilter)
- ALLb <- nsFilter(ALLb,
- var.func=IQR,
- var.cutoff=IQR(as.vector(es))/5,
- feature.exclude="^AFFX")
- ALLb <- ALLb$eset
- es <- exprs(ALLb)
- dim(es)
-
- #ANOVA过滤
- f <- Anova(ALLb$mol.bio,p=0.01)
- ff <- filterfun(f)
- selGenes <- genefilter(exprs(ALLb),ff)
-
- sum(selGenes)
- ALLb <- ALLb[selGenes,]
- ALLb
-
- es <- exprs(ALLb)
- plot(rowMedians(es),rowIQRs(es),
- xlab='Median expression level',
- ylab='IQR expression level',
- main='Distribution Properties of the Selected Genes')
-
- # 用随机森林(适合用于处理包含大量特征的问题)进行过滤:随机森林由一组决策树构成,取决于分析的问题采用回归树还是分类树 - 每棵树都是通过自助法抽样(从原始数据集中用有放回抽样法随机抽取N个个案)进行训练
- # 对于回归问题,采用每棵树的预测值得平均值作为这些组合的预测值。对于分类问题,则采用投票机制
- featureNames(ALLb) <- make.names(featureNames(ALLb))
- es <- exprs(ALLb)
-
- library(randomForest)
- dt <- data.frame(t(es),Mut=ALLb$mol.bio)
- rf <- randomForest(Mut ~ .,dt,importance=T)
- imp <- importance(rf)
- imp <- imp[,ncol(imp)-1]
- rf.genes <- names(imp)[order(imp,decreasing=T)[1:30]]
-
- sapply(rf.genes,function(g) tapply(dt[,g],dt$Mut,median))
-
- library(lattice)
- ordMut <- order(dt$Mut)
- levelplot(as.matrix(dt[ordMut,rf.genes]),
- aspect='fill', xlab='', ylab='',
- scales=list(
- x=list(
- labels=c('+','-','*','|')[as.integer(dt$Mut[ordMut])],
- cex=0.7,
- tck=0)
- ),
- main=paste(paste(c('"+"','"-"','"*"','"|"'),
- levels(dt$Mut)
- ),
- collapse='; '),
- col.regions=colorRampPalette(c('white','orange','blue'))
- )
-
- #用特征聚类的组合进行过滤
- library(Hmisc)
- vc <- varclus(t(es))
- clus30 <- cutree(vc$hclust,30)
- table(clus30)
-
- getVarsSet <- function(cluster,nvars=30,seed=NULL,verb=F)
- {
- if (!is.null(seed)) set.seed(seed)
-
- cls <- cutree(cluster,nvars)
- tots <- table(cls)
- vars <- c()
- vars <- sapply(1:nvars,function(clID)
- {
- if (!length(tots[clID])) stop('Empty cluster! (',clID,')')
- x <- sample(1:tots[clID],1)
- names(cls[cls==clID])[x]
- })
- if (verb) structure(vars,clusMemb=cls,clusTots=tots)
- else vars
- }
- getVarsSet(vc$hclust)