# Load required packages
library(gbm)
library(ggplot2)
library(ada)
library(class)
library(MASS)
library(caret)

# Read the churn CSV and attach readable column names
load_data = function(file_path) {
  names = c('state', 'acc_length', 'area', 'ph_num', 'inter_plan', 'vm_plan', 'num_vm_message', 'day_min',
            'day_calls', 'day_charge', 'eve_min', 'eve_calls', 'eve_charge', 'night_min', 'night_calls',
            'night_charge', 'inter_min', 'inter_calls', 'inter_charge', 'cus_ser_calls', 'churn')

  data = read.csv(file_path, header = TRUE)
  colnames(data) = names
  return(data)
}

# Print basic dimensions, the structure of the columns, and the churn class balance
check_data_feature = function(data) {
  print(dim(data))
  str(data)
  print(table(data$churn))
}

# Bar plots of churn and customer-service-call counts, plus histograms with
# density overlays for the numeric usage features
draw_feature_plot = function(data){
  par(mfrow = c(1, 2))
  barplot(table(data$churn), col = "skyblue", main = "churn True or False", las = 1)
  barplot(table(data$cus_ser_calls),
          col = "skyblue", main = "customer service calls times", las = 1)

  par(mfrow = c(4, 3), mar = c(3, 3, 2, 1))
  name <- c('day_min', 'day_calls', 'day_charge', 'eve_min', 'eve_calls', 'eve_charge',
            'night_min', 'night_calls', 'night_charge', 'inter_min', 'inter_calls', 'inter_charge')
  for (i in seq_along(name)) {
    numdata = as.numeric(data[[name[i]]])
    hist(numdata, main = paste('Density of', name[i]), xlab = name[i], col = 'skyblue',
         border = 'black', probability = TRUE)
    lines(density(numdata), col = 'red', lwd = 2)
  }
}

# Stacked bar charts relating churn to the international plan and to the
# number of customer service calls
feature_associated = function(data){
  inter_plan_counts <- table(data$inter_plan, data$churn)
  df_inter <- as.data.frame(inter_plan_counts)
  names(df_inter) <- c('inter_plan', 'churn', 'count')
  p1 <- ggplot(df_inter, aes(x = inter_plan, y = count, fill = churn)) +
    geom_bar(stat = 'identity', position = 'stack') +
    labs(title = 'International Plan vs. Churn', x = 'International Plan or Not', y = 'Number')

  cus_calls_counts <- table(data$cus_ser_calls, data$churn)
  df_cus <- as.data.frame(cus_calls_counts)
  names(df_cus) <- c('cus_ser_calls', 'churn', 'count')
  p2 <- ggplot(df_cus, aes(x = cus_ser_calls, y = count, fill = churn)) +
    geom_bar(stat = 'identity', position = 'stack') +
    labs(title = 'Customer Service Calls vs. Churn', x = 'Customer Service Calls', y = 'Number') +
    theme_minimal()
  print(p1)
  print(p2)
}

# Encode categoricals, drop identifier columns, and standardize the features
deal_data <- function(data) {
  # Separate the target variable (1 = churned, 0 = retained)
  y <- ifelse(data$churn == "True", 1, 0)

  # Convert the categorical plan variables to dummy variables
  new_inter <- model.matrix(~inter_plan - 1, data)
  new_vm_plan <- model.matrix(~vm_plan - 1, data)

  # Combine the dummies with the original data
  data_temp <- cbind(data, new_inter, new_vm_plan)

  # Drop identifiers and the original categorical/target columns
  to_drop <- c('state', 'area', 'ph_num', 'inter_plan', 'vm_plan', 'churn')
  data_df <- data_temp[, !(names(data_temp) %in% to_drop)]

  # Standardize the feature matrix
  X <- scale(data_df)

  return(list(X = X, y = y))
}
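
# Optional sanity check of the preprocessing output. A minimal sketch:
# check_processed is an illustrative helper, not part of the original pipeline,
# and it assumes `data` has been loaded with load_data as in main below.
# After scale(), column means should be near 0, and table(y) should show the
# same class imbalance as the raw churn column.
check_processed <- function(data) {
  res <- deal_data(data)
  print(dim(res$X))                  # rows x encoded, standardized features
  print(round(colMeans(res$X), 3))   # column means should be ~0 after scaling
  print(table(res$y))                # 0/1 class balance of the target
}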

# Compare logistic regression, LDA, and KNN on a held-out test set
choose_algorithm = function(X, y){
  set.seed(1)
  # 70/30 train/test split (the original code assumed 3333 rows)
  train_index <- sample(nrow(X), round(0.7 * nrow(X)))

  # Split the data into training and test sets
  X_train <- X[train_index, ]
  X_test <- X[-train_index, ]
  y_train <- y[train_index]
  y_test <- y[-train_index]

  # Logistic regression
  fit_lr <- glm(y_train ~ ., data = as.data.frame(X_train), family = binomial)
  pred_test_lr <- predict(fit_lr, newdata = as.data.frame(X_test), type = "response") > 0.5
  conf_matrix_test_lr <- table(predicted = pred_test_lr, Actual = y_test)
  Accuracy_test_lr <- sum(diag(conf_matrix_test_lr)) / sum(conf_matrix_test_lr)

  # Linear discriminant analysis
  fit_lda <- lda(y_train ~ ., data = as.data.frame(X_train))
  pred_test_lda <- predict(fit_lda, newdata = as.data.frame(X_test))$class
  conf_matrix_test_lda <- table(predicted = pred_test_lda, Actual = y_test)
  Accuracy_test_lda <- sum(diag(conf_matrix_test_lda)) / sum(conf_matrix_test_lda)

  # K-nearest neighbors
  fit_knn <- knn(X_train, X_test, cl = y_train, k = 3)
  conf_matrix_test_knn <- table(predicted = fit_knn, Actual = y_test)
  Accuracy_test_knn <- sum(diag(conf_matrix_test_knn)) / sum(conf_matrix_test_knn)

  # Print each model's test-set accuracy
  cat("Logistic Regression Accuracy:", Accuracy_test_lr, "\n")
  cat("Linear Discriminant Analysis Accuracy:", Accuracy_test_lda, "\n")
  cat("K-Nearest Neighbors Accuracy:", Accuracy_test_knn, "\n")

  # Data frame of model names and test-set accuracies
  df <- data.frame(Model = c("LR", "LDA", "KNN(k=3)"),
                   Accuracy = c(Accuracy_test_lr, Accuracy_test_lda, Accuracy_test_knn))

  # Plot the accuracies with ggplot2 (explicit print so the plot shows when
  # this function is called from main)
  p <- ggplot(df, aes(x = Model, y = Accuracy)) +
    geom_boxplot(fill = "lightblue", color = "darkblue") +
    geom_point(size = 3, color = "red") +
    labs(title = "Model Accuracies on Test Set", y = "Accuracy") +
    theme_minimal()
  print(p)
}
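
# The caret package is loaded above but not otherwise used; a single 70/30
# split can be noisy, so a cross-validated comparison is one way to make the
# model choice more robust. A minimal sketch using caret's train() with 5-fold
# CV; compare_with_cv and its settings are illustrative assumptions, not part
# of the original script.
compare_with_cv <- function(X, y) {
  df <- as.data.frame(X)
  df$churn <- factor(y, levels = c(0, 1), labels = c("no", "yes"))
  ctrl <- trainControl(method = "cv", number = 5)

  set.seed(1)
  fit_glm <- train(churn ~ ., data = df, method = "glm", trControl = ctrl)
  set.seed(1)
  fit_lda <- train(churn ~ ., data = df, method = "lda", trControl = ctrl)
  set.seed(1)
  fit_knn <- train(churn ~ ., data = df, method = "knn",
                   tuneGrid = data.frame(k = 3), trControl = ctrl)

  # resamples() collects per-fold accuracies so the three models can be compared
  summary(resamples(list(LR = fit_glm, LDA = fit_lda, KNN = fit_knn)))
}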

# Boosted models: gradient boosting (gbm) and AdaBoost (ada)
improve_result = function(X, y){
  set.seed(1)
  # Same 70/30 train/test split as in choose_algorithm
  train_index <- sample(nrow(X), round(0.7 * nrow(X)))
  X_train <- X[train_index, ]
  X_test <- X[-train_index, ]
  y_train <- y[train_index]
  y_test <- y[-train_index]

  # Gradient boosting with gbm
  gb_model <- gbm(y_train ~ ., data = as.data.frame(X_train), distribution = "bernoulli",
                  n.trees = 100, interaction.depth = 1, shrinkage = 0.1)

  # Predictions on the training set
  prob_train <- predict(gb_model, newdata = as.data.frame(X_train), n.trees = 100, type = "response")
  pred_train <- prob_train > 0.5

  # Predictions on the test set
  prob_test <- predict(gb_model, newdata = as.data.frame(X_test), n.trees = 100, type = "response")
  pred_test <- prob_test > 0.5

  # Confusion matrices and accuracies
  conf_matrix_train <- table(predicted = pred_train, Actual = y_train)
  accuracy_train <- sum(diag(conf_matrix_train)) / sum(conf_matrix_train)

  conf_matrix_test <- table(predicted = pred_test, Actual = y_test)
  accuracy_test <- sum(diag(conf_matrix_test)) / sum(conf_matrix_test)

  cat("Gradient Boosting training accuracy:", accuracy_train, "\n")
  cat("Gradient Boosting test accuracy:", accuracy_test, "\n")

  # AdaBoost with the ada package
  ada_model <- ada(y_train ~ ., data = as.data.frame(cbind(y_train, X_train)), iter = 100)

  # Predictions on the training set
  pred_train_ada <- predict(ada_model, newdata = as.data.frame(X_train))

  # Predictions on the test set
  pred_test_ada <- predict(ada_model, newdata = as.data.frame(X_test))

  # Confusion matrices and accuracies
  conf_matrix_train_ada <- table(predicted = pred_train_ada, Actual = y_train)
  accuracy_train_ada <- sum(diag(conf_matrix_train_ada)) / sum(conf_matrix_train_ada)

  conf_matrix_test_ada <- table(predicted = pred_test_ada, Actual = y_test)
  accuracy_test_ada <- sum(diag(conf_matrix_test_ada)) / sum(conf_matrix_test_ada)

  cat("AdaBoost training accuracy:", accuracy_train_ada, "\n")
  cat("AdaBoost test accuracy:", accuracy_test_ada, "\n")
}
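
# Accuracy alone can be misleading on churn data, where non-churners dominate;
# recall on the churn class shows how many actual churners are caught. A short
# sketch: churn_metrics is an illustrative helper, and pred/actual are assumed
# to be the TRUE/FALSE predictions and 0/1 labels used above (e.g. pred_test
# and y_test inside improve_result).
churn_metrics <- function(pred, actual) {
  tp <- sum(pred == TRUE  & actual == 1)   # churners correctly flagged
  fp <- sum(pred == TRUE  & actual == 0)   # non-churners flagged as churn
  fn <- sum(pred == FALSE & actual == 1)   # churners that were missed
  c(precision = tp / (tp + fp), recall = tp / (tp + fn))
}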

# Heatmap of a hand-entered confusion matrix ("hunxiao" = confusion)
hunxiao = function(){
  # Build a small data frame holding the confusion matrix cells
  conf_matrix_data <- data.frame(
    Actual = rep(c("0", "1"), each = 2),
    Predicted = rep(c("0", "1"), times = 2),
    Value = c(869, 34, 4, 103) # cell counts of the confusion matrix
  )

  # Draw the confusion matrix with ggplot2 (explicit print so the plot shows
  # when this function is called from main)
  p <- ggplot(conf_matrix_data, aes(x = Actual, y = Predicted, fill = Value)) +
    geom_tile() +
    geom_text(aes(label = Value), vjust = 1) +
    scale_fill_gradient(low = "lightblue", high = "darkblue") +
    theme_minimal() +
    theme(
      axis.text.x = element_text(angle = 45, hjust = 1),
      axis.title = element_blank(),
      axis.ticks = element_blank(),
      panel.grid = element_blank(),
      legend.position = "none"
    ) +
    labs(title = "Custom Confusion Matrix")
  print(p)
}
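
# The four cell counts above are typed in by hand. The same plotting data frame
# can also be built from a table() confusion matrix; a small sketch, where
# conf_to_df is an illustrative helper and it assumes a confusion matrix such
# as conf_matrix_test from improve_result is made available:
conf_to_df <- function(cm) {
  df <- as.data.frame(cm)                  # long format: one row per cell
  names(df) <- c("Predicted", "Actual", "Value")
  df
}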

# Run the full pipeline: load, explore, preprocess, model, and visualize
main = function(){
  file_path = "C:/Users/27128/Desktop/R_project/R_P/data.csv"
  data = load_data(file_path)
  check_data_feature(data)
  draw_feature_plot(data)
  feature_associated(data)
  result <- deal_data(data)
  X <- result$X
  y <- result$y
  choose_algorithm(X, y)
  improve_result(X, y)
  hunxiao()
}

main()