
R Data Analysis and Mining in Action, Chapter 5: Mining and Modeling

(2017-07-21 15:58:13)
Category: R Data Analysis and Mining in Action
1. Logistic regression
setwd("E:/code/dm_R/")
Data = read.csv("./chapter5/bankloan.csv")[2:701,]
#数据命名
colnames(Data) <- c("x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8","y")
#logistic回归模型
glm = glm(y~x1+x2+x3+x4+x5+x6+x7+x8, family = binomial(link = logit),data = Data)
summary(glm)

Call:
glm(formula = y ~ x1 + x2 + x3 + x4 + x5 + x6 + x7 + x8, family = binomial(link = logit), 
    data = Data)

Deviance Residuals: 
   Min      1Q  Median      3Q     Max  
-2.352  -0.646  -0.293   0.234   3.009  

Coefficients:
            Estimate Std. Error z value Pr(>|z|)    
(Intercept) -1.55006    0.61818   -2.51    0.012 *  
x1           0.03464    0.01735    2.00    0.046 *  
x2           0.09029    0.12284    0.74    0.462    
x3          -0.25753    0.03310   -7.78  7.2e-15 ***
x4          -0.10476    0.02320   -4.52  6.3e-06 ***
x5          -0.00907    0.00774   -1.17    0.241    
x6           0.06723    0.03029    2.22    0.026 *  
x7           0.61509    0.11322    5.43  5.5e-08 ***
x8           0.06838    0.07701    0.89    0.375    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 801.68  on 698  degrees of freedom
Residual deviance: 551.00  on 690  degrees of freedom
  (1 observation deleted due to missingness)
AIC: 569

Number of Fisher Scoring iterations: 6
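The coefficients are on the log-odds scale; exponentiating them gives odds ratios, which are easier to read. This step is not in the book's listing, just a common follow-up sketch:

# Optional sketch: odds ratios and Wald-type 95% intervals
exp(coef(glm))             # e.g. exp(0.615) for x7 is about 1.85
exp(confint.default(glm))  # confidence intervals on the odds-ratio scale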
# Stepwise selection (both directions)
logit.step <- step(glm, direction = "both")
# Forward selection; starting from the full model there is nothing left to
# add, so this simply returns the full model (see the output below)
logit.step <- step(glm, direction = "forward")

Start:  AIC=569
y ~ x1 + x2 + x3 + x4 + x5 + x6 + x7 + x8

> summary(logit.step)

Call:
glm(formula = y ~ x1 + x2 + x3 + x4 + x5 + x6 + x7 + x8, family = binomial(link = logit), 
    data = Data)

Deviance Residuals: 
   Min      1Q  Median      3Q     Max  
-2.352  -0.646  -0.293   0.234   3.009  

Coefficients:
            Estimate Std. Error z value Pr(>|z|)    
(Intercept) -1.55006    0.61818   -2.51    0.012 *  
x1           0.03464    0.01735    2.00    0.046 *  
x2           0.09029    0.12284    0.74    0.462    
x3          -0.25753    0.03310   -7.78  7.2e-15 ***
x4          -0.10476    0.02320   -4.52  6.3e-06 ***
x5          -0.00907    0.00774   -1.17    0.241    
x6           0.06723    0.03029    2.22    0.026 *  
x7           0.61509    0.11322    5.43  5.5e-08 ***
x8           0.06838    0.07701    0.89    0.375    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 801.68  on 698  degrees of freedom
Residual deviance: 551.00  on 690  degrees of freedom
  (1 observation deleted due to missingness)
AIC: 569

Number of Fisher Scoring iterations: 6
# Backward elimination
logit.step <- step(glm, direction = "backward")
summary(logit.step)

Call:
glm(formula = y ~ x1 + x3 + x4 + x6 + x7, family = binomial(link = logit), 
    data = Data)

Deviance Residuals: 
   Min      1Q  Median      3Q     Max  
-2.341  -0.653  -0.295   0.259   2.914  

Coefficients:
            Estimate Std. Error z value Pr(>|z|)    
(Intercept)  -1.6456     0.5131   -3.21   0.0013 ** 
x1            0.0327     0.0172    1.91   0.0567 .  
x3           -0.2596     0.0301   -8.62  < 2e-16 ***
x4           -0.1034     0.0231   -4.48  7.4e-06 ***
x6            0.0908     0.0187    4.84  1.3e-06 ***
x7            0.5604     0.0892    6.28  3.4e-10 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 801.68  on 698  degrees of freedom
Residual deviance: 552.70  on 693  degrees of freedom
  (1 observation deleted due to missingness)
AIC: 564.7

Number of Fisher Scoring iterations: 6
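Backward elimination keeps x1, x3, x4, x6, and x7, and the AIC drops from 569 to 564.7. The book stops at the summary; here is a sketch (not part of the original listing) of turning the fitted probabilities into yes/no predictions at a conventional 0.5 cutoff:

# Sketch: in-sample predictions from the reduced model
p <- predict(logit.step, type = "response")  # fitted default probabilities
pred_class <- ifelse(p > 0.5, 1, 0)          # 0.5 is a common, not mandatory, cutoff
table(observed = logit.step$model$y, predicted = pred_class)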
2. Decision trees
# Predicting high/low sales with a hand-rolled ID3-style decision tree
# (the class column of sales_data.csv is assumed to be named "result",
# since buildTree below refers to data$result)
data = read.csv("./chapter5/sales_data.csv")[, 2:5]
# Information entropy of a single column
calculateEntropy <- function(data) {
  t = table(data)
  sum = sum(t)
  t = t[t != 0]  # drop empty levels to avoid log2(0)
  entropy <- -sum(log2(t / sum) * (t / sum))
  return(entropy)
}
# Conditional entropy of column 2 given column 1
calculateEntropy2 <- function(data) {
  var = table(data[1])
  p = var / sum(var)  # proportions of each value of the splitting variable
  varnames = names(var)
  array = c()
  for (name in varnames) {
    array = append(array, calculateEntropy(subset(data, data[1] == name, select = 2)))
  }
  return(sum(array * p))  # weighted average of the per-group entropies
}
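A quick sanity check on the helpers (not part of the book's listing): a balanced two-class column has entropy exactly 1 bit, and the conditional entropy weights each group's entropy by its share of the rows:

# Sketch: toy checks for the entropy helpers
calculateEntropy(c("high", "high", "low", "low"))  # expected: 1 bit
toy <- data.frame(x = c("a", "a", "b", "b"),
                  y = c("high", "low", "high", "high"))
calculateEntropy2(toy)  # expected: 0.5*1 + 0.5*0 = 0.5 bits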
buildTree = function(data) {
  # All remaining rows share one class: print it as a leaf
  if (length(unique(data$result)) == 1) {
    cat(data$result[1])
    return()
  }
  # No attributes left to split on
  if (length(names(data)) == 1) {
    cat("...")
    return()
  }
  entropy = calculateEntropy(data$result)
  labels = names(data)
  label = ""
  temp = Inf
  subentropy = c()
  # Pick the attribute with the lowest conditional entropy,
  # i.e. the highest information gain
  # (entropy and subentropy are computed but not used further
  # in this printing-only version)
  for (i in 1:(length(data) - 1)) {
    temp2 = calculateEntropy2(data[c(i, length(labels))])
    if (temp2 < temp) {
      temp = temp2
      label = labels[i]
    }
    subentropy = append(subentropy, temp2)
  }
  cat(label)
  cat("[")
  nextLabels = labels[labels != label]
  # Recurse into each value of the chosen attribute
  for (value in unlist(unique(data[label]))) {
    cat(value, ":")
    buildTree(subset(data, data[label] == value, select = nextLabels))
    cat(";")
  }
  cat("]")
}
# Build and print the classification tree
buildTree(data)

是否周末[yes :天气[bad :是否有促销[yes :...;no :...;];good :是否有促销[yes :...;no :...;];];no :是否有促销[yes :天气[bad :...;good :...;];no :天气[good :...;bad :...;];];]
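(The labels in the printed tree are the CSV's Chinese column names: 是否周末 = weekend, 天气 = weather, 是否有促销 = promotion.) The same kind of tree can also be fitted with a standard package; the sketch below uses rpart as a cross-check, which is not what the book's listing does, and which splits on the Gini index by default rather than information gain:

# Sketch: CART tree via rpart instead of the hand-rolled recursion
library(rpart)
tree <- rpart(result ~ ., data = data, method = "class",
              control = rpart.control(minsplit = 2, cp = 0))
print(tree)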


# Neural network
library(nnet)
colnames(data) <- c("x1", "x2", "x3", "y")
model1 = nnet(y ~ ., data, size = 6, decay = 5e-4, maxit = 1000)
# weights:  31
initial  value 26.051318 
iter  10 value 15.922331
iter  20 value 15.086145
iter  30 value 14.986261
[... iterations 40 to 280 omitted; the value decreases slowly toward 14.9034 ...]
iter 290 value 14.903420
iter 300 value 14.903382
final  value 14.903382 
pred = predict(model1, data[,1:3], type = "class")
(P = sum(as.numeric(pred == data$y))/nrow(data))
[1] 0.7647
table(data$y, pred)
      pred
       high low
  high   14   4
  low     4  12

> prop.table(table(data$y, pred), 1)
      pred
         high    low
  high 0.7778 0.2222
  low  0.2500 0.7500
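nnet starts from random initial weights, so the trace above and the 0.7647 in-sample accuracy vary between runs; fixing the RNG seed makes the example repeatable. This wrapper is an addition, not part of the book's code:

# Sketch: reproducible refit (the seed value is arbitrary)
set.seed(123)
model1 <- nnet(y ~ ., data, size = 6, decay = 5e-4, maxit = 1000, trace = FALSE)
mean(predict(model1, data[, 1:3], type = "class") == data$y)  # in-sample accuracy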

# k-means clustering
setwd("E:/code/dm_R")
Data = read.csv("./chapter5/consumption_data.csv", header = TRUE)[, 2:4]
km = kmeans(Data, centers = 3)
> km$centers
         R         F         M
1 18.43978 11.238095 1207.5434
2 16.23223 10.819905 1925.1083
3 15.41667  7.376344  431.9666
km$size/sum(km$size)
[1] 0.3797872 0.2244681 0.3957447
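kmeans draws its initial centers at random, so the cluster numbering and sizes can change between runs (the simulated-data example further down passes nstart = 20 for exactly this reason). A sketch of a stabilized call, not in the book's listing:

# Sketch: fix the seed and keep the best of 20 random starts
set.seed(123)
km <- kmeans(Data, centers = 3, nstart = 20)
km$size / sum(km$size)  # cluster shares, as above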
# Split the data by cluster
aaa = data.frame(Data, km$cluster)
Data1 = Data[which(aaa$km.cluster == 1), ]
Data2 = Data[which(aaa$km.cluster == 2), ]
Data3 = Data[which(aaa$km.cluster == 3), ]
par(mfrow = c(1, 3))  # density plots of R, F, M for customer cluster 1
plot(density(Data1[, 1]), col = 'red', main = "R")
plot(density(Data1[, 2]), col = 'red', main = "F")
plot(density(Data1[, 3]), col = 'red', main = "M")
par(mfrow = c(1, 3))  # density plots of R, F, M for customer cluster 2
plot(density(Data2[, 1]), col = 'red', main = "R")
plot(density(Data2[, 2]), col = 'red', main = "F")
plot(density(Data2[, 3]), col = 'red', main = "M")
par(mfrow = c(1, 3))  # density plots of R, F, M for customer cluster 3
plot(density(Data3[, 1]), col = 'red', main = "R")
plot(density(Data3[, 2]), col = 'red', main = "F")
plot(density(Data3[, 3]), col = 'red', main = "M")
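The three copy-pasted plotting blocks above can be collapsed into one loop; this refactor is equivalent, just tidier:

# Sketch: the same nine density plots via a loop over the clusters
groups <- list(Data1, Data2, Data3)
for (g in seq_along(groups)) {
  par(mfrow = c(1, 3))  # one row of R, F, M panels per cluster
  for (j in 1:3) {
    plot(density(groups[[g]][, j]), col = "red", main = c("R", "F", "M")[j])
  }
}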
# Build a k-means model with the kmeans function and plot the clustered
# points together with the cluster centers
set.seed(2)
x = matrix(rnorm(50 * 2), ncol = 2)  # a 50 x 2 matrix of standard normals
x[1:25, 1] = x[1:25, 1] + 3  # shift the first 25 points to create a second cluster
x[1:25, 2] = x[1:25, 2] - 4
km.out = kmeans(x, 2, nstart = 20)
km.out$cluster
[1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
plot(x, col = (km.out$cluster + 1), main = "K-Means Clustering Results with K = 2",
xlab = "",ylab = "", pch = 20, cex = 2)
km.out$centers
        [,1]       [,2]
1 -0.1956978 -0.1848774
2  3.3339737 -4.0761910
points(km.out$centers[1,1],km.out$centers[1,2],pch = 10,col = "red", cex = 2)
points(km.out$centers[2,1],km.out$centers[2,2],pch = 10,col = "green", cex = 2)
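nstart = 20 repeats the whole algorithm from 20 random initializations and keeps the solution with the lowest total within-cluster sum of squares. A sketch of why that matters, not from the book (with the same seed, the multi-start result can never be worse than the single start):

# Sketch: single start vs. 20 starts, compared on tot.withinss
set.seed(3)
kmeans(x, 4, nstart = 1)$tot.withinss   # may land in a poor local optimum
set.seed(3)
kmeans(x, 4, nstart = 20)$tot.withinss  # best of 20 starts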

# Association rules
library(arules)
setwd("E:/code/dm_R")
tr <- read.transactions("./chapter5/menu_orders.txt", format = "basket", sep = ",")
summary(tr)
 
transactions as itemMatrix in sparse format with
 10 rows (elements/itemsets/transactions) and
 5 columns (items) and a density of 0.54 

most frequent items:
      b       a       c       e       d (Other) 
      8       7       7       3       2       0 

element (itemset/transaction) length distribution:
sizes
2 3 4 
5 3 2 

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
    2.0     2.0     2.5     2.7     3.0     4.0 

includes extended item information - examples:
  labels
1      a
2      b
3      c
> inspect(tr)
     items    
[1]  {a,c,e}  
[2]  {b,d}    
[3]  {b,c}    
[4]  {a,b,c,d}
[5]  {a,b}    
[6]  {b,c}    
[7]  {a,b}    
[8]  {a,b,c,e}
[9]  {a,b,c}  
[10] {a,c,e}  
> rules0 = apriori(tr, parameter = list(support = 0.2, confidence = 0.5))
Apriori

Parameter specification:
 confidence minval smax arem  aval originalSupport maxtime support minlen maxlen target   ext
        0.5    0.1    1 none FALSE            TRUE       5     0.2      1     10  rules FALSE

Algorithmic control:
 filter tree heap memopt load sort verbose
    0.1 TRUE TRUE  FALSE TRUE    2    TRUE

Absolute minimum support count: 2 

set item appearances ...[0 item(s)] done [0.00s].
set transactions ...[5 item(s), 10 transaction(s)] done [0.00s].
sorting and recoding items ... [5 item(s)] done [0.00s].
creating transaction tree ... done [0.00s].
checking subsets of size 1 2 3 done [0.00s].
writing ... [18 rule(s)] done [0.00s].
creating S4 object ... done [0.00s].
> rules0
set of 18 rules
> inspect(rules0)
     lhs      rhs support confidence lift     
[1]  {}    => {c} 0.7     0.7000000  1.0000000
[2]  {}    => {b} 0.8     0.8000000  1.0000000
[3]  {}    => {a} 0.7     0.7000000  1.0000000
[4]  {d}   => {b} 0.2     1.0000000  1.2500000
[5]  {e}   => {c} 0.3     1.0000000  1.4285714
[6]  {e}   => {a} 0.3     1.0000000  1.4285714
[7]  {c}   => {b} 0.5     0.7142857  0.8928571
[8]  {b}   => {c} 0.5     0.6250000  0.8928571
[9]  {c}   => {a} 0.5     0.7142857  1.0204082
[10] {a}   => {c} 0.5     0.7142857  1.0204082
[11] {b}   => {a} 0.5     0.6250000  0.8928571
[12] {a}   => {b} 0.5     0.7142857  0.8928571
[13] {c,e} => {a} 0.3     1.0000000  1.4285714
[14] {a,e} => {c} 0.3     1.0000000  1.4285714
[15] {a,c} => {e} 0.3     0.6000000  2.0000000
[16] {b,c} => {a} 0.3     0.6000000  0.8571429
[17] {a,c} => {b} 0.3     0.6000000  0.7500000
[18] {a,b} => {c} 0.3     0.6000000  0.8571429
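Even this toy basket yields 18 rules, several with an empty antecedent or lift below 1; in practice one filters and ranks them. The arules calls below are standard, though this step is not in the book's listing:

# Sketch: drop empty-antecedent rules, keep lift > 1, rank by lift
rules1 <- rules0[size(lhs(rules0)) > 0 & quality(rules0)$lift > 1]
inspect(sort(rules1, by = "lift"))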
# Outlier detection
# Note: the column selection [, 2:4] is assumed here, as in the k-means
# example above; the printed centers below show only R, F, M, and without
# the selection scale() would also standardize the ID column
Data = read.csv("./chapter5/consumption_data.csv")[, 2:4]
Data = scale(Data)  # standardize each column to mean 0, sd 1
set.seed(12)
km = kmeans(Data, centers = 3)
> print(km)
K-means clustering with 3 clusters of sizes 559, 341, 40

Cluster means:
           R          F          M
1 -0.1493534 -0.6588930 -0.2717798
2 -0.1604506  1.1148015  0.3928444
3  3.4550549 -0.2956536  0.4491234

Clustering vector:
  [1] 1 1 2 1 1 1 1 1 1 2 1 1 2 2 1 2 3 1 1 1 1 1 2 1 1 2 1 2 1 3 2 1 1 1 2 2 1 1 2 1 1 2 1 1
[... assignments for the remaining samples, up to sample 940, omitted here ...]

Within cluster sum of squares by cluster:
[1] 689.9626 668.0462 188.2933
 (between_SS / total_SS =  45.1 %)

Available components:
[1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss" "betweenss"   
[7] "size"         "iter"         "ifault"      

> km$centers
           R          F          M
1 -0.1493534 -0.6588930 -0.2717798
2 -0.1604506  1.1148015  0.3928444
3  3.4550549 -0.2956536  0.4491234
# Euclidean distance from each sample to each of the three cluster centers
x = list()
juli = list()  # "juli" is pinyin for "distance"
for (i in 1:3) {
  # replicate center i into a 940 x 3 matrix, one row per sample
  x[[i]] = matrix(km$centers[i, ], nrow = 940, ncol = 3, byrow = T)
  juli[[i]] = sqrt(rowSums((Data - x[[i]])^2))
}
juli1 = juli[[1]]
juli2 = juli[[2]]
juli3 = juli[[3]]
dist = data.frame(juli1, juli2, juli3)
colnames(dist) <- c('juli1', 'juli2', 'juli3')
# Distance from each sample to its nearest center
y = apply(dist, 1, min)  # MARGIN = 1: take the minimum over each row
plot(1:940, y, xlim = c(0, 940), xlab = "sample index", ylab = "Euclidean distance")
points(which(y > 2.5), y[which(y > 2.5)], pch = 19, col = "red")  # flag points farther than 2.5
> which(y > 2.5)
[1]  30  39 226 339 484 525 654 670 933
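The nine flagged samples can then be pulled out of the unscaled data for inspection; a short sketch, again not part of the book's listing:

# Sketch: list the flagged outliers with their cluster labels
outliers <- which(y > 2.5)
raw <- read.csv("./chapter5/consumption_data.csv")[, 2:4]  # unscaled R, F, M
cbind(raw[outliers, ], cluster = km$cluster[outliers])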

