
R Data Analysis and Mining in Action, Chapter 5: Mining and Modeling

(2017-07-21 15:58:13)
Category: R Data Analysis and Mining in Action
1. Logistic regression
setwd("E:/code/dm_R/")
Data = read.csv("./chapter5/bankloan.csv")[2:701,]
#数据命名
colnames(Data) <- c("x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8","y")
#logistic回归模型
glm = glm(y~x1+x2+x3+x4+x5+x6+x7+x8, family = binomial(link = logit),data = Data)
summary(glm)

Call:
glm(formula = y ~ x1 + x2 + x3 + x4 + x5 + x6 + x7 + x8, family = binomial(link = logit), 
    data = Data)

Deviance Residuals: 
   Min      1Q  Median      3Q     Max  
-2.352  -0.646  -0.293   0.234   3.009  

Coefficients:
            Estimate Std. Error z value Pr(>|z|)    
(Intercept) -1.55006    0.61818   -2.51    0.012 *  
x1           0.03464    0.01735    2.00    0.046 *  
x2           0.09029    0.12284    0.74    0.462    
x3          -0.25753    0.03310   -7.78  7.2e-15 ***
x4          -0.10476    0.02320   -4.52  6.3e-06 ***
x5          -0.00907    0.00774   -1.17    0.241    
x6           0.06723    0.03029    2.22    0.026 *  
x7           0.61509    0.11322    5.43  5.5e-08 ***
x8           0.06838    0.07701    0.89    0.375    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 801.68  on 698  degrees of freedom
Residual deviance: 551.00  on 690  degrees of freedom
  (1 observation deleted due to missingness)
AIC: 569

Number of Fisher Scoring iterations: 6
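The coefficients are on the log-odds scale; exponentiating them gives odds ratios, which are easier to read. This step is not in the book's listing, just a common follow-up sketch:

# Optional sketch: odds ratios and Wald-type 95% intervals
exp(coef(glm))             # e.g. exp(0.615) for x7 is about 1.85
exp(confint.default(glm))  # confidence intervals on the odds-ratio scale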
# Stepwise selection (both directions)
logit.step <- step(glm, direction = "both")
# Forward selection; starting from the full model there is nothing left to
# add, so this simply returns the full model (see the output below)
logit.step <- step(glm, direction = "forward")

Start:  AIC=569
y ~ x1 + x2 + x3 + x4 + x5 + x6 + x7 + x8

> summary(logit.step)

Call:
glm(formula = y ~ x1 + x2 + x3 + x4 + x5 + x6 + x7 + x8, family = binomial(link = logit), 
    data = Data)

Deviance Residuals: 
   Min      1Q  Median      3Q     Max  
-2.352  -0.646  -0.293   0.234   3.009  

Coefficients:
            Estimate Std. Error z value Pr(>|z|)    
(Intercept) -1.55006    0.61818   -2.51    0.012 *  
x1           0.03464    0.01735    2.00    0.046 *  
x2           0.09029    0.12284    0.74    0.462    
x3          -0.25753    0.03310   -7.78  7.2e-15 ***
x4          -0.10476    0.02320   -4.52  6.3e-06 ***
x5          -0.00907    0.00774   -1.17    0.241    
x6           0.06723    0.03029    2.22    0.026 *  
x7           0.61509    0.11322    5.43  5.5e-08 ***
x8           0.06838    0.07701    0.89    0.375    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 801.68  on 698  degrees of freedom
Residual deviance: 551.00  on 690  degrees of freedom
  (1 observation deleted due to missingness)
AIC: 569

Number of Fisher Scoring iterations: 6
# Backward elimination
logit.step <- step(glm, direction = "backward")
summary(logit.step)

Call:
glm(formula = y ~ x1 + x3 + x4 + x6 + x7, family = binomial(link = logit), 
    data = Data)

Deviance Residuals: 
   Min      1Q  Median      3Q     Max  
-2.341  -0.653  -0.295   0.259   2.914  

Coefficients:
            Estimate Std. Error z value Pr(>|z|)    
(Intercept)  -1.6456     0.5131   -3.21   0.0013 ** 
x1            0.0327     0.0172    1.91   0.0567 .  
x3           -0.2596     0.0301   -8.62  < 2e-16 ***
x4           -0.1034     0.0231   -4.48  7.4e-06 ***
x6            0.0908     0.0187    4.84  1.3e-06 ***
x7            0.5604     0.0892    6.28  3.4e-10 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 801.68  on 698  degrees of freedom
Residual deviance: 552.70  on 693  degrees of freedom
  (1 observation deleted due to missingness)
AIC: 564.7

Number of Fisher Scoring iterations: 6
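Backward elimination keeps x1, x3, x4, x6, and x7, and the AIC drops from 569 to 564.7. The book stops at the summary; here is a sketch (not part of the original listing) of turning the fitted probabilities into yes/no predictions at a conventional 0.5 cutoff:

# Sketch: in-sample predictions from the reduced model
p <- predict(logit.step, type = "response")  # fitted default probabilities
pred_class <- ifelse(p > 0.5, 1, 0)          # 0.5 is a common, not mandatory, cutoff
table(observed = logit.step$model$y, predicted = pred_class)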
2. Decision trees
# Predicting high/low sales with a hand-rolled ID3-style decision tree
# (the class column of sales_data.csv is assumed to be named "result",
# since buildTree below refers to data$result)
data = read.csv("./chapter5/sales_data.csv")[, 2:5]
# Information entropy of a single column
calculateEntropy <- function(data) {
  t = table(data)
  sum = sum(t)
  t = t[t != 0]  # drop empty levels to avoid log2(0)
  entropy <- -sum(log2(t / sum) * (t / sum))
  return(entropy)
}
# Conditional entropy of column 2 given column 1
calculateEntropy2 <- function(data) {
  var = table(data[1])
  p = var / sum(var)  # proportions of each value of the splitting variable
  varnames = names(var)
  array = c()
  for (name in varnames) {
    array = append(array, calculateEntropy(subset(data, data[1] == name, select = 2)))
  }
  return(sum(array * p))  # weighted average of the per-group entropies
}
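A quick sanity check on the helpers (not part of the book's listing): a balanced two-class column has entropy exactly 1 bit, and the conditional entropy weights each group's entropy by its share of the rows:

# Sketch: toy checks for the entropy helpers
calculateEntropy(c("high", "high", "low", "low"))  # expected: 1 bit
toy <- data.frame(x = c("a", "a", "b", "b"),
                  y = c("high", "low", "high", "high"))
calculateEntropy2(toy)  # expected: 0.5*1 + 0.5*0 = 0.5 bits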
buildTree = function(data) {
  # All remaining rows share one class: print it as a leaf
  if (length(unique(data$result)) == 1) {
    cat(data$result[1])
    return()
  }
  # No attributes left to split on
  if (length(names(data)) == 1) {
    cat("...")
    return()
  }
  entropy = calculateEntropy(data$result)
  labels = names(data)
  label = ""
  temp = Inf
  subentropy = c()
  # Pick the attribute with the lowest conditional entropy,
  # i.e. the highest information gain
  # (entropy and subentropy are computed but not used further
  # in this printing-only version)
  for (i in 1:(length(data) - 1)) {
    temp2 = calculateEntropy2(data[c(i, length(labels))])
    if (temp2 < temp) {
      temp = temp2
      label = labels[i]
    }
    subentropy = append(subentropy, temp2)
  }
  cat(label)
  cat("[")
  nextLabels = labels[labels != label]
  # Recurse into each value of the chosen attribute
  for (value in unlist(unique(data[label]))) {
    cat(value, ":")
    buildTree(subset(data, data[label] == value, select = nextLabels))
    cat(";")
  }
  cat("]")
}
# Build and print the classification tree
buildTree(data)

是否周末[yes :天气[bad :是否有促销[yes :...;no :...;];good :是否有促销[yes :...;no :...;];];no :是否有促销[yes :天气[bad :...;good :...;];no :天气[good :...;bad :...;];];]
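(The labels in the printed tree are the CSV's Chinese column names: 是否周末 = weekend, 天气 = weather, 是否有促销 = promotion.) The same kind of tree can also be fitted with a standard package; the sketch below uses rpart as a cross-check, which is not what the book's listing does, and which splits on the Gini index by default rather than information gain:

# Sketch: CART tree via rpart instead of the hand-rolled recursion
library(rpart)
tree <- rpart(result ~ ., data = data, method = "class",
              control = rpart.control(minsplit = 2, cp = 0))
print(tree)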


# Neural network
library(nnet)
colnames(data) <- c("x1", "x2", "x3", "y")
model1 = nnet(y ~ ., data, size = 6, decay = 5e-4, maxit = 1000)
# weights:  31
initial  value 26.051318 
iter  10 value 15.922331
iter  20 value 15.086145
iter  30 value 14.986261
[... iterations 40 to 280 omitted; the value decreases slowly toward 14.9034 ...]
iter 290 value 14.903420
iter 300 value 14.903382
final  value 14.903382 
pred = predict(model1, data[,1:3], type = "class")
(P = sum(as.numeric(pred == data$y))/nrow(data))
[1] 0.7647
table(data$y, pred)
      pred
       high low
  high   14   4
  low     4  12

> prop.table(table(data$y, pred), 1)
      pred
         high    low
  high 0.7778 0.2222
  low  0.2500 0.7500
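nnet starts from random initial weights, so the trace above and the 0.7647 in-sample accuracy vary between runs; fixing the RNG seed makes the example repeatable. This wrapper is an addition, not part of the book's code:

# Sketch: reproducible refit (the seed value is arbitrary)
set.seed(123)
model1 <- nnet(y ~ ., data, size = 6, decay = 5e-4, maxit = 1000, trace = FALSE)
mean(predict(model1, data[, 1:3], type = "class") == data$y)  # in-sample accuracy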

# k-means clustering
setwd("E:/code/dm_R")
Data = read.csv("./chapter5/consumption_data.csv", header = TRUE)[, 2:4]
km = kmeans(Data, centers = 3)
> km$centers
         R         F         M
1 18.43978 11.238095 1207.5434
2 16.23223 10.819905 1925.1083
3 15.41667  7.376344  431.9666
km$size/sum(km$size)
[1] 0.3797872 0.2244681 0.3957447
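kmeans draws its initial centers at random, so the cluster numbering and sizes can change between runs (the simulated-data example further down passes nstart = 20 for exactly this reason). A sketch of a stabilized call, not in the book's listing:

# Sketch: fix the seed and keep the best of 20 random starts
set.seed(123)
km <- kmeans(Data, centers = 3, nstart = 20)
km$size / sum(km$size)  # cluster shares, as above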
# Split the data by cluster
aaa = data.frame(Data, km$cluster)
Data1 = Data[which(aaa$km.cluster == 1), ]
Data2 = Data[which(aaa$km.cluster == 2), ]
Data3 = Data[which(aaa$km.cluster == 3), ]
par(mfrow = c(1, 3))  # density plots of R, F, M for customer cluster 1
plot(density(Data1[, 1]), col = 'red', main = "R")
plot(density(Data1[, 2]), col = 'red', main = "F")
plot(density(Data1[, 3]), col = 'red', main = "M")
par(mfrow = c(1, 3))  # density plots of R, F, M for customer cluster 2
plot(density(Data2[, 1]), col = 'red', main = "R")
plot(density(Data2[, 2]), col = 'red', main = "F")
plot(density(Data2[, 3]), col = 'red', main = "M")
par(mfrow = c(1, 3))  # density plots of R, F, M for customer cluster 3
plot(density(Data3[, 1]), col = 'red', main = "R")
plot(density(Data3[, 2]), col = 'red', main = "F")
plot(density(Data3[, 3]), col = 'red', main = "M")
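The three copy-pasted plotting blocks above can be collapsed into one loop; this refactor is equivalent, just tidier:

# Sketch: the same nine density plots via a loop over the clusters
groups <- list(Data1, Data2, Data3)
for (g in seq_along(groups)) {
  par(mfrow = c(1, 3))  # one row of R, F, M panels per cluster
  for (j in 1:3) {
    plot(density(groups[[g]][, j]), col = "red", main = c("R", "F", "M")[j])
  }
}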
# Build a k-means model with the kmeans function and plot the clustered
# points together with the cluster centers
set.seed(2)
x = matrix(rnorm(50 * 2), ncol = 2)  # a 50 x 2 matrix of standard normals
x[1:25, 1] = x[1:25, 1] + 3  # shift the first 25 points to create a second cluster
x[1:25, 2] = x[1:25, 2] - 4
km.out = kmeans(x, 2, nstart = 20)
km.out$cluster
[1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
plot(x, col = (km.out$cluster + 1), main = "K-Means Clustering Results with K = 2",
xlab = "",ylab = "", pch = 20, cex = 2)
km.out$centers
        [,1]       [,2]
1 -0.1956978 -0.1848774
2  3.3339737 -4.0761910
points(km.out$centers[1,1],km.out$centers[1,2],pch = 10,col = "red", cex = 2)
points(km.out$centers[2,1],km.out$centers[2,2],pch = 10,col = "green", cex = 2)
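nstart = 20 repeats the whole algorithm from 20 random initializations and keeps the solution with the lowest total within-cluster sum of squares. A sketch of why that matters, not from the book (with the same seed, the multi-start result can never be worse than the single start):

# Sketch: single start vs. 20 starts, compared on tot.withinss
set.seed(3)
kmeans(x, 4, nstart = 1)$tot.withinss   # may land in a poor local optimum
set.seed(3)
kmeans(x, 4, nstart = 20)$tot.withinss  # best of 20 starts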

# Association rules
library(arules)
setwd("E:/code/dm_R")
tr <- read.transactions("./chapter5/menu_orders.txt", format = "basket", sep = ",")
summary(tr)
 
transactions as itemMatrix in sparse format with
 10 rows (elements/itemsets/transactions) and
 5 columns (items) and a density of 0.54 

most frequent items:
      b       a       c       e       d (Other) 
      8       7       7       3       2       0 

element (itemset/transaction) length distribution:
sizes
2 3 4 
5 3 2 

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
    2.0     2.0     2.5     2.7     3.0     4.0 

includes extended item information - examples:
  labels
1      a
2      b
3      c
> inspect(tr)
     items    
[1]  {a,c,e}  
[2]  {b,d}    
[3]  {b,c}    
[4]  {a,b,c,d}
[5]  {a,b}    
[6]  {b,c}    
[7]  {a,b}    
[8]  {a,b,c,e}
[9]  {a,b,c}  
[10] {a,c,e}  
> rules0 = apriori(tr, parameter = list(support = 0.2, confidence = 0.5))
Apriori

Parameter specification:
 confidence minval smax arem  aval originalSupport maxtime support minlen maxlen target   ext
        0.5    0.1    1 none FALSE            TRUE       5     0.2      1     10  rules FALSE

Algorithmic control:
 filter tree heap memopt load sort verbose
    0.1 TRUE TRUE  FALSE TRUE    2    TRUE

Absolute minimum support count: 2 

set item appearances ...[0 item(s)] done [0.00s].
set transactions ...[5 item(s), 10 transaction(s)] done [0.00s].
sorting and recoding items ... [5 item(s)] done [0.00s].
creating transaction tree ... done [0.00s].
checking subsets of size 1 2 3 done [0.00s].
writing ... [18 rule(s)] done [0.00s].
creating S4 object ... done [0.00s].
> rules0
set of 18 rules
> inspect(rules0)
     lhs      rhs support confidence lift     
[1]  {}    => {c} 0.7     0.7000000  1.0000000
[2]  {}    => {b} 0.8     0.8000000  1.0000000
[3]  {}    => {a} 0.7     0.7000000  1.0000000
[4]  {d}   => {b} 0.2     1.0000000  1.2500000
[5]  {e}   => {c} 0.3     1.0000000  1.4285714
[6]  {e}   => {a} 0.3     1.0000000  1.4285714
[7]  {c}   => {b} 0.5     0.7142857  0.8928571
[8]  {b}   => {c} 0.5     0.6250000  0.8928571
[9]  {c}   => {a} 0.5     0.7142857  1.0204082
[10] {a}   => {c} 0.5     0.7142857  1.0204082
[11] {b}   => {a} 0.5     0.6250000  0.8928571
[12] {a}   => {b} 0.5     0.7142857  0.8928571
[13] {c,e} => {a} 0.3     1.0000000  1.4285714
[14] {a,e} => {c} 0.3     1.0000000  1.4285714
[15] {a,c} => {e} 0.3     0.6000000  2.0000000
[16] {b,c} => {a} 0.3     0.6000000  0.8571429
[17] {a,c} => {b} 0.3     0.6000000  0.7500000
[18] {a,b} => {c} 0.3     0.6000000  0.8571429
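Even this toy basket yields 18 rules, several with an empty antecedent or lift below 1; in practice one filters and ranks them. The arules calls below are standard, though this step is not in the book's listing:

# Sketch: drop empty-antecedent rules, keep lift > 1, rank by lift
rules1 <- rules0[size(lhs(rules0)) > 0 & quality(rules0)$lift > 1]
inspect(sort(rules1, by = "lift"))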
# Outlier detection
# Note: the column selection [, 2:4] is assumed here, as in the k-means
# example above; the printed centers below show only R, F, M, and without
# the selection scale() would also standardize the ID column
Data = read.csv("./chapter5/consumption_data.csv")[, 2:4]
Data = scale(Data)  # standardize each column to mean 0, sd 1
set.seed(12)
km = kmeans(Data, centers = 3)
> print(km)
K-means clustering with 3 clusters of sizes 559, 341, 40

Cluster means:
           R          F          M
1 -0.1493534 -0.6588930 -0.2717798
2 -0.1604506  1.1148015  0.3928444
3  3.4550549 -0.2956536  0.4491234

Clustering vector:
  [1] 1 1 2 1 1 1 1 1 1 2 1 1 2 2 1 2 3 1 1 1 1 1 2 1 1 2 1 2 1 3 2 1 1 1 2 2 1 1 2 1 1 2 1 1
[... assignments for the remaining samples, up to sample 940, omitted here ...]

Within cluster sum of squares by cluster:
[1] 689.9626 668.0462 188.2933
 (between_SS / total_SS =  45.1 %)

Available components:
[1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss" "betweenss"   
[7] "size"         "iter"         "ifault"      

> km$centers
           R          F          M
1 -0.1493534 -0.6588930 -0.2717798
2 -0.1604506  1.1148015  0.3928444
3  3.4550549 -0.2956536  0.4491234
# Euclidean distance from each sample to each of the three cluster centers
x = list()
juli = list()  # "juli" is pinyin for "distance"
for (i in 1:3) {
  # replicate center i into a 940 x 3 matrix, one row per sample
  x[[i]] = matrix(km$centers[i, ], nrow = 940, ncol = 3, byrow = T)
  juli[[i]] = sqrt(rowSums((Data - x[[i]])^2))
}
juli1 = juli[[1]]
juli2 = juli[[2]]
juli3 = juli[[3]]
dist = data.frame(juli1, juli2, juli3)
colnames(dist) <- c('juli1', 'juli2', 'juli3')
# Distance from each sample to its nearest center
y = apply(dist, 1, min)  # MARGIN = 1: take the minimum over each row
plot(1:940, y, xlim = c(0, 940), xlab = "sample index", ylab = "Euclidean distance")
points(which(y > 2.5), y[which(y > 2.5)], pch = 19, col = "red")  # flag points farther than 2.5
> which(y > 2.5)
[1]  30  39 226 339 484 525 654 670 933
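The nine flagged samples can then be pulled out of the unscaled data for inspection; a short sketch, again not part of the book's listing:

# Sketch: list the flagged outliers with their cluster labels
outliers <- which(y > 2.5)
raw <- read.csv("./chapter5/consumption_data.csv")[, 2:4]  # unscaled R, F, M
cbind(raw[outliers, ], cluster = km$cluster[outliers])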

