当前位置:   article > 正文

基于逻辑回归、GBDT、AdaBoost模型的客户流失预测(1)_基于gbdt算法预估电信客户流失

基于gbdt算法预估电信客户流失
  1. # 导入所需的包
  2. library(gbm)
  3. library(ggplot2)
  4. library(ada)
  5. library(class)
  6. library(MASS)
  7. library(caret)
  8. load_data = function(file_path) {
  9. names = c('state', 'acc_length', 'area', 'ph_num', 'inter_plan', 'vm_plan', 'num_vm_message', 'day_min',
  10. 'day_calls', 'day_charge', 'eve_min', 'eve_calls', 'eve_charge', 'night_min', 'night_calls',
  11. 'night_charge', 'inter_min', 'inter_calls', 'inter_charge', 'cus_ser_calls', 'churn')
  12. data = read.csv(file_path, header = TRUE)
  13. colnames(data) = names
  14. return(data)
  15. }
  16. check_data_feature = function(data) {
  17. print(dim(data))
  18. print(str(data))
  19. print(table(data$churn))
  20. }
  21. draw_feature_plot = function(data){
  22. par(mfrow = c(1,2))
  23. barplot(table(data$churn), col = "skyblue", main = "churn True or False", las = 1)
  24. barplot(table(data$cus_ser_calls),
  25. col = "skyblue", main = "customer service calls times", las = 1)
  26. par(mfrow = c(4, 3), mar = c(3, 3, 2, 1))
  27. name <- c('day_min', 'day_calls', 'day_charge', 'eve_min', 'eve_calls', 'eve_charge',
  28. 'night_min', 'night_calls', 'night_charge', 'inter_min', 'inter_calls', 'inter_charge')
  29. for (i in seq_along(name)) {
  30. numdata = as.numeric(data[[name[i]]])
  31. hist(numdata, main = paste('Density of', name[i]), xlab = name[i], col = 'skyblue',
  32. border = 'black', probability = TRUE)
  33. lines(density(numdata), col = 'red', lwd = 2)
  34. }
  35. }
  36. feature_associated = function(data){
  37. inter_plan_counts <- table(data$inter_plan, data$churn)
  38. df_inter <- as.data.frame(inter_plan_counts)
  39. names(df_inter) <- c('inter_plan', 'churn', 'count')
  40. p1 <- ggplot(df_inter, aes(x = inter_plan, y = count, fill = churn)) +
  41. geom_bar(stat = 'identity', position = 'stack') +
  42. labs(title = 'Inter or No Inter of Churn', x = 'Inter or Not Inter', y = 'Number')
  43. cus_calls_counts <- table(data$cus_ser_calls, data$churn)
  44. df_cus <- as.data.frame(cus_calls_counts)
  45. names(df_cus) <- c('cus_ser_calls', 'churn', 'count')
  46. p2 <- ggplot(df_cus, aes(x = cus_ser_calls, y = count, fill = churn)) +
  47. geom_bar(stat = 'identity', position = 'stack') +
  48. labs(title = 'Customer Service Calls about Churn', x = 'Customer Service Calls', y = 'Numbers') +
  49. theme_minimal()
  50. print(p1)
  51. print(p2)
  52. }
  53. deal_data <- function(data) {
  54. # 分离目标变量
  55. y <- ifelse(data$churn == "True", 1, 0)
  56. # 将分类变量转换为虚拟变量
  57. new_inter <- model.matrix(~inter_plan - 1, data)
  58. new_vm_plan <- model.matrix(~vm_plan - 1, data)
  59. # 合并数据
  60. data_temp <- cbind(data, new_inter, new_vm_plan)
  61. # 删除无用的特征
  62. to_drop <- c('state', 'area', 'ph_num', 'inter_plan', 'vm_plan', 'churn')
  63. data_df <- data_temp[, !(names(data_temp) %in% to_drop)]
  64. # 标准化特征数据
  65. X <- scale(data_df)
  66. return(list(X = X, y = y))
  67. }
  68. choose_algorithm = function(X, y){
  69. set.seed(1)
  70. train_index <- sample(3333, 2333)
  71. # 从全体数据中选择训练集和测试集
  72. X_train <- X[train_index, ]
  73. X_test <- X[-train_index, ]
  74. y_train <- y[train_index]
  75. y_test <- y[-train_index]
  76. # 逻辑回归
  77. fit_lr <- glm(y_train ~ ., data = as.data.frame(X_train), family = binomial)
  78. pred_test_lr <- predict(fit_lr, newdata = as.data.frame(X_test), type = "response") > 0.5
  79. conf_matrix_test_lr <- table(predicted = pred_test_lr, Actual = y_test)
  80. Accuracy_test_lr <- sum(diag(conf_matrix_test_lr)) / sum(conf_matrix_test_lr)
  81. # 线性判别分析
  82. fit_lda <- lda(y_train ~ ., data = as.data.frame(X_train))
  83. pred_test_lda <- predict(fit_lda, newdata = as.data.frame(X_test))$class
  84. conf_matrix_test_lda <- table(predicted = pred_test_lda, Actual = y_test)
  85. Accuracy_test_lda <- sum(diag(conf_matrix_test_lda)) / sum(conf_matrix_test_lda)
  86. # K最近邻
  87. fit_knn <- knn(X_train, X_test, cl = y_train, k = 3)
  88. conf_matrix_test_knn <- table(predicted = fit_knn, Actual = y_test)
  89. Accuracy_test_knn <- sum(diag(conf_matrix_test_knn)) / sum(conf_matrix_test_knn)
  90. # 输出每个模型的测试集准确率
  91. cat("Logistic Regression Accuracy:", Accuracy_test_lr, "\n")
  92. cat("Linear Discriminant Analysis Accuracy:", Accuracy_test_lda, "\n")
  93. cat("K-Nearest Neighbors Accuracy:", Accuracy_test_knn, "\n")
  94. # 创建一个数据框,包含模型名称和准确率
  95. df <- data.frame(Model = c("LR", "LDA", "KNN(k=3)"),
  96. Accuracy = c(Accuracy_test_lr, Accuracy_test_lda, Accuracy_test_knn))
  97. # 使用 ggplot2 绘制箱线图
  98. ggplot(df, aes(x = Model, y = Accuracy)) +
  99. geom_boxplot(fill = "lightblue", color = "darkblue") +
  100. geom_point(size = 3, color = "red") +
  101. labs(title = "Model Accuracies on Test Set", y = "Accuracy") +
  102. theme_minimal()
  103. }
  104. improve_result = function(X, y){
  105. set.seed(1)
  106. train_index <- sample(3333, 2333)
  107. # 从全体数据中选择训练集和测试集
  108. X_train <- X[train_index, ]
  109. X_test <- X[-train_index, ]
  110. y_train <- y[train_index]
  111. y_test <- y[-train_index]
  112. # 使用 gbm 进行梯度提升
  113. gb_model <- gbm(y_train ~ ., data = as.data.frame(X_train), distribution = "bernoulli", n.trees = 100, interaction.depth = 1, shrinkage = 0.1)
  114. # 在训练集上的预测
  115. prob_train <- predict(gb_model, newdata = as.data.frame(X_train), n.trees = 100, type = "response")
  116. pred_train <- prob_train > 0.5
  117. # 在测试集上的预测
  118. prob_test <- predict(gb_model, newdata = as.data.frame(X_test), n.trees = 100, type = "response")
  119. pred_test <- prob_test > 0.5
  120. # 创建混淆矩阵并计算准确率
  121. conf_matrix_train <- table(predicted = pred_train, Actual = y_train)
  122. accuracy_train <- sum(diag(conf_matrix_train)) / sum(conf_matrix_train)
  123. conf_matrix_test <- table(predicted = pred_test, Actual = y_test)
  124. accuracy_test <- sum(diag(conf_matrix_test)) / sum(conf_matrix_test)
  125. cat("Gradient Boosting训练集准确率:", accuracy_train, "\n")
  126. cat("Gradient Boosting测试集准确率:", accuracy_test, "\n")
  127. # 使用 ada 进行训练
  128. ada_model <- ada(y_train ~ ., data = as.data.frame(cbind(y_train, X_train)), iter = 100)
  129. # 在训练集上的预测
  130. pred_train_ada <- predict(ada_model, newdata = as.data.frame(X_train))
  131. # 在测试集上的预测
  132. pred_test_ada <- predict(ada_model, newdata = as.data.frame(X_test))
  133. # 创建混淆矩阵并计算准确率
  134. conf_matrix_train_ada <- table(predicted = pred_train_ada, Actual = y_train)
  135. accuracy_train_ada <- sum(diag(conf_matrix_train_ada)) / sum(conf_matrix_train_ada)
  136. conf_matrix_test_ada <- table(predicted = pred_test_ada, Actual = y_test)
  137. accuracy_test_ada <- sum(diag(conf_matrix_test_ada)) / sum(conf_matrix_test_ada)
  138. cat("AdaBoost训练集准确率:", accuracy_train_ada, "\n")
  139. cat("AdaBoost测试集准确率:", accuracy_test_ada, "\n")
  140. }
  141. hunxiao = function(){
  142. # 创建一个简单的混淆矩阵数据框
  143. conf_matrix_data <- data.frame(
  144. Actual = rep(c("0", "1"), each = 2),
  145. Predicted = rep(c("0", ""), times = 2),
  146. Value = c(869, 34, 4, 103) # 修改为新的混淆矩阵的实际值
  147. )
  148. # 使用 ggplot2 创建混淆矩阵图
  149. ggplot(conf_matrix_data, aes(x = Actual, y = Predicted, fill = Value)) +
  150. geom_tile() +
  151. geom_text(aes(label = Value), vjust = 1) +
  152. scale_fill_gradient(low = "lightblue", high = "darkblue") +
  153. theme_minimal() +
  154. theme(
  155. axis.text.x = element_text(angle = 45, hjust = 1),
  156. axis.title = element_blank(),
  157. axis.ticks = element_blank(),
  158. panel.grid = element_blank(),
  159. legend.position = "none"
  160. ) +
  161. labs(title = "Custom Confusion Matrix")
  162. }
  163. main = function(){
  164. file_path = "C:/Users/27128/Desktop/R_project/R_P/data.csv"
  165. data = load_data(file_path)
  166. check_data_feature(data)
  167. draw_feature_plot(data)
  168. feature_associated(data)
  169. result <- deal_data(data)
  170. X <- result$X
  171. y <- result$y
  172. choose_algorithm(X, y)
  173. improve_result(X, y)
  174. hunxiao()
  175. }
  176. main()

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/小蓝xlanll/article/detail/584327
推荐阅读
相关标签
  

闽ICP备14008679号