Learning R: Text Analysis and Classification Modeling
(2015-04-01 10:56:21)
Tags: R, text analysis, classification algorithms, SVM, random forest
There are many methods for classifying and predicting on text data.
Below, SVM, random forest, neural network, and KNN models (among others) are applied to the task.
# Example: finding users with purchase intent among forum posts
library(rJava)
library(Rwordseg)
library(tm)
library(slam)
#1. Read the data
allf<-read.csv('F:/test.csv',header=T,sep=',')
# Sample rows (txt.title, yn), where yn = 1 marks purchase intent:
#   求推荐滚筒洗衣机,最好5000以内!  ("Recommend a drum washing machine, 5000 at most!")   1
#   帮忙选一下, 洗衣机 三选一  ("Help me choose among these three washing machines")   1
#   滚筒洗衣机是用什么洗衣粉  ("What detergent do drum washing machines use?")   0
#   三洋波轮洗衣机的型号"DB"开头和"XQB"开头是什么区别?  ("Sanyo models starting with DB vs. XQB: what is the difference?")   0
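The later steps assume test.csv provides at least the title column txt.title and the label column yn; the reporting step at the end also reads txt.auth and txt.stime. A quick structural check:
str(allf)  # expect columns txt.title, yn and, for the final report, txt.auth, txt.stime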
# Split into training and test sets (roughly 70/30)
set.seed(1)
sp<-sample(c(0,1),size=length(allf[,1]),prob=c(0.7,0.3),replace=TRUE)
trainf<-allf[sp==0,]
testf<-allf[sp==1,]
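A quick sanity check, not in the original, that the sampling above yields roughly a 70/30 train/test split:
table(sp)/length(sp)  # should be close to 0.7 / 0.3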
#2. Word segmentation
segword<-function(unf)
{
pp<-as.character(unf$txt.title)
#insertWords(kw)  # optionally register custom words with the segmenter first
rf<-segmentCN(pp,nature=TRUE)  # segment each title, keeping part-of-speech tags as names
rf
}
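With nature=TRUE, Rwordseg's segmentCN returns for each title a character vector of words whose names are part-of-speech tags; the filter in the next step keys off those names. An illustrative peek (the actual words and tags depend on your dictionary):
wf<-segword(trainf)
wf[[1]]
# a named vector along the lines of:
#       v        n
#  "推荐"  "洗衣机" ...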
#3. Keep only nouns and verbs
nvword<-function(wf)
{
wf.n=wf
for(i in 1:length(wf)){
# keep nouns (n, nr, nz), verbs (v), and user-defined words
wf.n[[i]]<-wf[[i]][names(wf[[i]]) %in% c('n','v','nr','nz','userDefine')]
# drop uninformative high-frequency words
wf.n[[i]]<-wf.n[[i]][!wf.n[[i]] %in% c("是","有","可以")]
}
ovid<-Corpus(VectorSource(wf.n))  # one document per post
ovid
}
#4. Build the document-term matrix for modeling
modelword<-function(ovid,unf)
{
dtm<-DocumentTermMatrix(ovid,control=list(wordLengths=c(2,Inf)))
rtdtm<-removeSparseTerms(dtm,0.999)  # drop extremely sparse terms
rtdtm<-rtdtm[row_sums(rtdtm)>0,]     # drop documents left with no terms
bin<-weightBin(rtdtm)                # binary (presence/absence) weighting
#bin<-weightTfIdf(rtdtm)             # tf-idf weighting is an alternative
dtmtx<-as.matrix(bin)
doc<-as.integer(dimnames(rtdtm)$Docs)  # ids of the retained documents
nb<-as.matrix(unf$yn[doc])             # their labels
modeldata<-data.frame(nb,dtmtx)
rtn<-list(modeldata,doc)
}
wf1<-segword(trainf)
ovid1<-nvword(wf1)
rtn1<-modelword(ovid1,trainf)
modeldata1<-rtn1[[1]]
doc1<-rtn1[[2]]
#5. Feature selection with LASSO (glmnet)
library(glmnet)
dd<-as.matrix(modeldata1[,-1])
cvfit<-cv.glmnet(dd,modeldata1$nb,family="multinomial",type.measure="mae")
plot(cvfit)
res<-list()
vf<-as.matrix(coef(cvfit)$'0')   # coefficients for class 0 at the chosen lambda
res[[1]]<-names(vf[vf!=0,])[-1]  # non-zero terms, intercept dropped
vf<-as.matrix(coef(cvfit)$'1')   # coefficients for class 1
res[[2]]<-names(vf[vf!=0,])[-1]
modelky1<-unique(unlist(res))    # union of the selected terms
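Before rebuilding the modeling data it is worth checking how many terms survived the LASSO (diagnostic only; the exact terms depend on the data):
length(modelky1)   # number of selected terms
head(modelky1,20)  # a sample of them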
#6. Rebuild the modeling data restricted to the selected terms
prepword<-function(modelky,ovid,unf)
{
dic<-Dictionary(as.character(modelky))
pdtm<-DocumentTermMatrix(ovid,list(dictionary=dic,wordLengths=c(2,Inf)))
pdtm<-pdtm[row_sums(pdtm)>0,]  # drop documents containing none of the selected terms
pbin<-weightBin(pdtm)
pbintx<-as.matrix(pbin)
pdoc<-as.integer(dimnames(pdtm)$Docs)
pnb<-as.matrix(unf$yn[pdoc])
pmodeldata<-data.frame(pnb,pbintx)
prtn<-list(pmodeldata,pdoc)
}
# Training set
wf1<-segword(trainf)
ovid1<-nvword(wf1)
prtn1<-prepword(modelky1,ovid1,trainf)
traindata<-prtn1[[1]]  # modeling data restricted to the selected terms
pdoc1<-prtn1[[2]]
zz<-c(1:length(trainf[,1]))
# labels of training posts dropped for containing none of the selected terms;
# these posts get the default prediction 0 during evaluation
qyn1<-trainf[which(is.na(match(zz,pdoc1))),]$yn
# Test set
wf2<-segword(testf)
ovid2<-nvword(wf2)
prtn2<-prepword(modelky1,ovid2,testf)
testdata<-prtn2[[1]]
pdoc2<-prtn2[[2]]
tt<-c(1:length(testf[,1]))
qyn2<-testf[which(is.na(match(tt,pdoc2))),]$yn
#7. Modeling and evaluation: SVM
library(kernlab)
sample_ksvm<-ksvm(pnb~.,data=traindata,type="C-svc",kernel="rbfdot",prob.model=TRUE)
svmCl<-predict(sample_ksvm,traindata[,-1],type="probabilities")
svmpred<-apply(svmCl,1,function(x) which(x==max(x))-1)  # class with the highest probability
svmpred<-c(svmpred,rep(0,length(qyn1)))  # dropped documents default to class 0
trainyn<-c(traindata$pnb,qyn1)
svmTable<-table(SVM=svmpred,sample=trainyn)
svmTable
# Training-set accuracy and recall
sum(diag(svmTable))/sum(svmTable)
sum(svmTable[2:3,2:3])/sum(svmTable[,2:3])
svmT<-predict(sample_ksvm,testdata[,-1],type="probabilities")
svmpred<-apply(svmT,1,function(x) which(x==max(x))-1)
svmpred<-c(svmpred,rep(0,length(qyn2)))
testyn<-c(testdata$pnb,qyn2)
svmTable<-table(SVM=svmpred,sample=testyn)
svmTable
# Test-set accuracy and recall
sum(diag(svmTable))/sum(svmTable)
sum(svmTable[2:3,2:3])/sum(svmTable[,2:3])
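The accuracy and recall lines above repeat verbatim for every model below. A small helper, hypothetical and not part of the original post, would keep the evaluation consistent; like the original [2:3,2:3] indices, it assumes the labels take three values with 0 meaning "no purchase intent":
evalmodel<-function(pred,truth){
tb<-table(pred=pred,sample=truth)
list(table=tb,
     accuracy=sum(diag(tb))/sum(tb),          # share of correct predictions
     recall=sum(tb[2:3,2:3])/sum(tb[,2:3]))   # recall over the non-zero classes
}
evalmodel(svmpred,testyn)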
#7. Modeling: random forest
library(randomForest)
set.seed(2013)
traindata.rf<-randomForest(as.factor(pnb)~.,data=traindata,importance=TRUE,proximity=TRUE)
print(traindata.rf)
# Variable importance
imp<-round(importance(traindata.rf),2)
impvar<-imp[order(imp[,3],decreasing=TRUE),]
impvar
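randomForest also ships a plotting helper for the same importance scores, which is often easier to read than the raw matrix:
varImpPlot(traindata.rf)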
prf<-predict(traindata.rf,traindata[,-1])
prfpred<-c(as.integer(prf)-1,rep(0,length(qyn1)))  # factor levels back to 0-based labels
trainyn<-c(traindata$pnb,qyn1)
rp<-table(RF=prfpred,sample=trainyn)
rp
# Training-set accuracy and recall
sum(diag(rp))/sum(rp)
sum(rp[2:3,2:3])/sum(rp[,2:3])
prf<-predict(traindata.rf,testdata[,-1])
prfpred<-c(as.integer(prf)-1,rep(0,length(qyn2)))
testyn<-c(testdata$pnb,qyn2)
rp<-table(RF=prfpred,sample=testyn)
rp
# Test-set accuracy and recall
sum(diag(rp))/sum(rp)
sum(rp[2:3,2:3])/sum(rp[,2:3])
#7. Modeling: decision tree (rpart)
library(rpart)
model.tree<-rpart(pnb~.,data=traindata,method='class')
pre.tree<-predict(model.tree,newdata=traindata,type='class')
prfpred<-c(as.integer(pre.tree)-1,rep(0,length(qyn1)))
trainyn<-c(traindata$pnb,qyn1)
rp<-table(rpart=prfpred,sample=trainyn)
rp
sum(diag(rp))/sum(rp)
sum(rp[2:3,2:3])/sum(rp[,2:3])
#7. Modeling: neural network (parameter settings still an open question?)
library(RSNNS)
tTargets=decodeClassLabels(traindata$pnb)  # one-hot encode the class labels
model.net<-mlp(traindata[,-1],tTargets,size=5,learnFunc="Quickprop",learnFuncParams=c(0.1,2.0,0.0001,0.1),maxit=100)
predictions<-predict(model.net,traindata[,-1])
nntpred<-apply(predictions,1,function(x) which(x==max(x))-1)
nntpred<-c(nntpred,rep(0,length(qyn1)))
trainyn<-c(traindata$pnb,qyn1)
nntTable<-table(NNT=nntpred,sample=trainyn)
nntTable
sum(diag(nntTable))/sum(nntTable)
sum(nntTable[2:3,2:3])/sum(nntTable[,2:3])
predictions<-predict(model.net,testdata[,-1])
nntpred<-apply(predictions,1,function(x) which(x==max(x))-1)
nntpred<-c(nntpred,rep(0,length(qyn2)))
testyn<-c(testdata$pnb,qyn2)
nntTable<-table(NNT=nntpred,sample=testyn)
nntTable
sum(diag(nntTable))/sum(nntTable)
sum(nntTable[2:3,2:3])/sum(nntTable[,2:3])
#7. Modeling: KNN
library(class)
sample_knnCl<-knn(traindata[,-1],traindata[,-1],traindata$pnb)
prfpred<-c(as.integer(sample_knnCl)-1,rep(0,length(qyn1)))
trainyn<-c(traindata$pnb,qyn1)
kn<-table(KNN=prfpred,sample=trainyn)
kn
sum(diag(kn))/sum(kn)
sum(kn[2:3,2:3])/sum(kn[,2:3])
sample_knnT<-knn(traindata[,-1],testdata[,-1],traindata$pnb)
prfpred<-c(as.integer(sample_knnT)-1,rep(0,length(qyn2)))
testyn<-c(testdata$pnb,qyn2)
kn<-table(KNN=prfpred,sample=testyn)
kn
sum(diag(kn))/sum(kn)
sum(kn[2:3,2:3])/sum(kn[,2:3])
#7. Modeling: Bayesian (BART)
# The BayesTree package implements the Bayesian Additive Regression Trees (BART)
# algorithm; the response y must be binary
library(BayesTree)
set.seed(1)
sample_bayes<-bart(traindata[,-1],as.factor(traindata$pnb),traindata[,-1])
plot(sample_bayes)
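For a binary response, bart fits a probit model, so the posterior draws in yhat.train live on the latent z scale. A hedged sketch, assuming the standard yhat.train field of the BayesTree output, of turning the draws into class probabilities and hard labels:
probs<-pnorm(colMeans(sample_bayes$yhat.train))  # posterior mean through the probit link
bartpred<-as.integer(probs>0.5)
table(BART=bartpred,sample=traindata$pnb)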
#8. Score the full dataset
dic<-Dictionary(as.character(modelky1))
wf2<-segword(allf)
ovid2<-nvword(wf2)
pdtm<-DocumentTermMatrix(ovid2,list(dictionary=dic,wordLengths=c(2,Inf)))
pdtm<-pdtm[row_sums(pdtm)>0,]
pbin<-weightBin(pdtm)
pbintx<-as.matrix(pbin)
pdoc<-as.integer(dimnames(pdtm)$Docs)
# SVM predictions with class probabilities
svmres<-predict(sample_ksvm,pbintx,type="probabilities")
pdc<-apply(svmres,1,function(x) max(x))               # highest class probability
pred<-apply(svmres,1,function(x) which(x==max(x))-1)  # predicted class, 0-based as above
buyyn<-data.frame(title=allf$txt.title[pdoc],auth=allf$txt.auth[pdoc],time=allf$txt.stime[pdoc],pred=pdc,yn=pred)
table(buyyn$yn)
subbuy<-subset(buyyn,buyyn$yn!=0)  # keep posts predicted to show purchase intent
head(subbuy[order(-subbuy$pred),],n=20)
# Random-forest predictions
rfres<-predict(traindata.rf,pbintx)
buyyn<-data.frame(title=allf$txt.title[pdoc],auth=allf$txt.auth[pdoc],time=allf$txt.stime[pdoc],yn=rfres)
table(buyyn$yn)
subbuy<-subset(buyyn,buyyn$yn!=0)
head(subbuy,n=20)
# Save the prediction results
write.table(subbuy,'F:/subbuy.csv',row.names=F,sep='\t')
# Save the models
save(traindata.rf,file="F:/rfxyj.RData")
save(sample_ksvm,file="F:/svmxyj.RData")
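In a later session the saved objects can be restored under their original names and applied to a matrix built with the same dictionary, for example:
load("F:/rfxyj.RData")   # restores traindata.rf
load("F:/svmxyj.RData")  # restores sample_ksvm
predict(traindata.rf,pbintx)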