
Machine Learning with Sklearn in Practice: Regression Applications, XGBoost, and LightGBM


Face Completion with Regression Algorithms

import numpy as np
import matplotlib.pyplot as plt
# equation-based linear models
from sklearn.linear_model import LinearRegression, Ridge, Lasso
# KNN regression: predicts by averaging neighbors rather than fitting an equation
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn import datasets
from sklearn.model_selection import train_test_split

faces = datasets.fetch_olivetti_faces()
X = faces.data
images = faces.images  # same pixel data as X, just reshaped to 64x64
y = faces.target
display(X.shape, y.shape, images.shape)

Result:

(400, 4096)
(400,)
(400, 64, 64)
plt.figure(figsize=(2, 2))
index = np.random.randint(0, 400, size=1)[0]
img = images[index]
plt.imshow(img, cmap=plt.cm.gray)

Result: (the randomly selected face rendered as a 64x64 grayscale image)

# split X (the face data) into an upper half and a lower half per face
X_up = X[:, :2048]
X_down = X[:, 2048:]
index = np.random.randint(0, 400, size=1)[0]
axes = plt.subplot(1, 3, 1)
up_face = X_up[index].reshape(32, 64)
axes.imshow(up_face, cmap=plt.cm.gray)
axes = plt.subplot(1, 3, 2)
down_face = X_down[index].reshape(32, 64)
axes.imshow(down_face, cmap=plt.cm.gray)
axes = plt.subplot(1, 3, 3)
face = X[index].reshape(64, 64)
axes.imshow(face, cmap=plt.cm.gray)

# features: upper halves; targets: lower halves
X_train, X_test, y_train, y_test = train_test_split(X_up, X_down, test_size=30)
estimators = {}
estimators["linear"] = LinearRegression()
estimators["ridge"] = Ridge(alpha=0.1)
estimators["lasso"] = Lasso(alpha=1)
estimators["knn"] = KNeighborsRegressor(n_neighbors=5)
estimators["tree"] = DecisionTreeRegressor()
result = {}
for key, model in estimators.items():
    model.fit(X_train, y_train)
    y_ = model.predict(X_test)  # the prediction is the lower half of the face
    result[key] = y_
### visualization ###
plt.figure(figsize=(7 * 2, 10 * 2))
for i in range(0, 10):
    # column 1: the upper half of the face
    axes = plt.subplot(10, 7, i * 7 + 1)
    up_face = X_test[i].reshape(32, 64)
    axes.imshow(up_face, cmap=plt.cm.gray)
    axes.axis("off")
    if i == 0:
        axes.set_title("up-face")
    # column 7: the true full face
    axes = plt.subplot(10, 7, i * 7 + 7)
    down_face = y_test[i].reshape(32, 64)
    total_face = np.concatenate([up_face, down_face])
    axes.imshow(total_face, cmap=plt.cm.gray)
    axes.axis("off")
    if i == 0:
        axes.set_title("True-face")
    # columns 2-6: predictions; result is a dict mapping algorithm name to predicted lower faces
    for j, key in enumerate(result):  # j = 0, 1, 2, 3, 4
        axes = plt.subplot(10, 7, i * 7 + j + 2)
        predict_down_face = result[key][i].reshape(32, 64)
        predict_face = np.concatenate([up_face, predict_down_face])
        axes.imshow(predict_face, cmap=plt.cm.gray)
        axes.axis("off")
        if i == 0:
            axes.set_title(key)
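
The grid above compares the models only qualitatively; none of them is ever scored. A minimal sketch of a numeric comparison, reusing the result dict and y_test from above (lower mean squared error is better):

from sklearn.metrics import mean_squared_error

# average pixel-wise squared error of each model's predicted lower faces
for key, y_ in result.items():
    print(key, mean_squared_error(y_test, y_))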

Comparing Different Regression Algorithms

import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor

X = np.linspace(0, 2 * np.pi, 50).reshape(-1, 1)
y = np.sin(X)
plt.scatter(X, y)

Linear Regression

linear = LinearRegression()
linear.fit(X, y)
x = np.linspace(0, 2 * np.pi, 150).reshape(-1, 1)
y_ = linear.predict(x)
plt.scatter(X, y)
plt.plot(x, y_, c="g")
print(linear.coef_, linear.intercept_)

KNN 

# KNN regression does not fit an equation; it predicts by averaging the targets of the
# k nearest neighbors (here k = 1, so each prediction is just the closest sample's value)
knn = KNeighborsRegressor(n_neighbors=1)
knn.fit(X, y)
x = np.linspace(0, 2 * np.pi, 150).reshape(-1, 1)
y_ = knn.predict(x)
plt.scatter(X, y)
plt.plot(x, y_, c="g")
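
To make the neighbor-averaging idea concrete, here is a small NumPy re-implementation of KNN regression; knn_predict is an illustrative helper, not part of sklearn:

def knn_predict(X_train, y_train, X_query, k=5):
    """Predict each query point as the mean target of its k nearest training points."""
    preds = []
    for xq in X_query.ravel():
        dist = np.abs(X_train.ravel() - xq)   # distances to every training point (1-D data)
        nearest = np.argsort(dist)[:k]        # indices of the k closest neighbors
        preds.append(y_train.ravel()[nearest].mean())
    return np.array(preds)

# should closely track KNeighborsRegressor(n_neighbors=5) on this data
y_manual = knn_predict(X, y, x, k=5)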

Decision Tree

model = DecisionTreeRegressor()
model.fit(X, y)
x = np.linspace(0, 2 * np.pi, 150).reshape(-1, 1)
y_ = model.predict(x)
plt.scatter(X, y)
plt.plot(x, y_, c="g")

from sklearn import tree

plt.figure(figsize=(16, 12))
tree.plot_tree(model, filled=True)

# extrapolate beyond the training range [0, 2π]
x = np.linspace(-np.pi, 3 * np.pi, 200).reshape(-1, 1)
linear = LinearRegression()
linear.fit(X, y)
y_ = linear.predict(x)
plt.scatter(X, y)
plt.plot(x, y_, c="g")

knn = KNeighborsRegressor()
knn.fit(X, y)
y_ = knn.predict(x)
plt.scatter(X, y)
plt.plot(x, y_, c="g")

dt = DecisionTreeRegressor()
dt.fit(X, y)
# preprocessing: sin is periodic, so map x back into the training range [0, 2π]
# (one shift suffices since x lies in [-π, 3π]; np.mod(x, 2*np.pi) would generalize)
pre_x = x.copy()
cond = pre_x > 2 * np.pi
pre_x[cond] -= 2 * np.pi
cond2 = pre_x < 0
pre_x[cond2] += 2 * np.pi
y_ = dt.predict(pre_x)
plt.scatter(X, y)
plt.plot(x, y_, c="g")

 

Linear regression works well for problems that are linear in the features, including polynomial relationships such as quadratics once the right features are constructed. Guessing the functional form is the important step.

f = lambda x: (x - 3) ** 2 + 3.6 * x + 2.718
X = np.linspace(-2, 4, 50).reshape(-1, 1)
y = f(X)
plt.scatter(X, y)

# add x² as a feature so the model is linear in [x², x]
X = np.concatenate([X ** 2, X], axis=1)
X_test = np.linspace(-4, 8, 200).reshape(-1, 1)
X_test = np.concatenate([X_test ** 2, X_test], axis=1)
linear = LinearRegression()
linear.fit(X, y)
y_ = linear.predict(X_test)
plt.scatter(X[:, 1], y)
plt.plot(X_test[:, 1], y_, c="g")
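
Building the x² column by hand is instructive, but sklearn ships the same expansion as PolynomialFeatures; a minimal sketch of that alternative, reusing f from above:

from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

# degree-2 expansion of the raw x column, followed by ordinary least squares
poly_model = make_pipeline(PolynomialFeatures(degree=2), LinearRegression())
x_raw = np.linspace(-2, 4, 50).reshape(-1, 1)
poly_model.fit(x_raw, f(x_raw))
y_poly = poly_model.predict(np.linspace(-4, 8, 200).reshape(-1, 1))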

The XGBoost Algorithm

XGBoost for Classification

XGBoost is an optimized distributed gradient boosting library, designed to be efficient, flexible, and portable. It implements machine learning algorithms under the gradient boosting framework. XGBoost provides parallel tree boosting (also known as GBDT or GBM) that solves many data science problems quickly and accurately. The same code runs on major distributed environments (Hadoop, SGE, MPI) and can scale to problems with billions of examples.

CPUs are designed for complex, branch-heavy computation, while GPUs excel at large volumes of simple, repetitive computation; for that kind of workload a GPU can be roughly ten times faster than a CPU.

XGBoost

import xgboost as xgb
import numpy as np
from xgboost import XGBClassifier, XGBRegressor
from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split

X, y = datasets.load_wine(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y)
clf = XGBClassifier(n_estimators=100, max_depth=3)
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

Result:

0.9555555555555556

Random Forest

forest = RandomForestClassifier(max_depth=3, n_estimators=100)
forest.fit(X_train, y_train)
forest.score(X_test, y_test)

Result:

0.9777777777777777

AdaBoost

ada = AdaBoostClassifier(n_estimators=100)
ada.fit(X_train, y_train)
ada.score(X_test, y_test)

Result:

0.6

GBDT (Gradient Boosting)

gbdt = GradientBoostingClassifier(n_estimators=100, max_depth=3)
gbdt.fit(X_train, y_train)
gbdt.score(X_test, y_test)

Result:

0.9777777777777777

XGBoost stores data in a sparse format: values that are present are stored and missing ones are not, which saves memory.
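
A minimal sketch of how sparse input can be passed to XGBoost's native DMatrix container; the random matrix below is purely illustrative:

import numpy as np
import xgboost as xgb
from scipy.sparse import csr_matrix

# a mostly-zero feature matrix stored sparsely: only the non-zero entries take memory
dense = np.random.rand(100, 50)
dense[dense < 0.9] = 0.0
X_sparse = csr_matrix(dense)
labels = np.random.randint(0, 2, size=100)

dtrain = xgb.DMatrix(X_sparse, label=labels)  # DMatrix accepts scipy sparse matrices directly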

XGBoost for Regression

Linear Regression

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

train = pd.read_csv("/Users/zhucan/Desktop/zhengqi_train.txt", sep="\t")
X = train.iloc[:, 0:-1]
y = train["target"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
linear = LinearRegression()
linear.fit(X_train, y_train)
linear.score(X_test, y_test)  # R² on the held-out split

Result:

0.8778958117853413

y_ = linear.predict(X_test)
mean_squared_error(y_, y_test)

Result:

0.11247900373481347

AdaBoost

from sklearn.ensemble import AdaBoostRegressor

ada = AdaBoostRegressor()
ada.fit(X_train, y_train)
ada.score(X_test, y_test)

Result:

0.8209707181954986

y_ = ada.predict(X_test)
mean_squared_error(y_, y_test)

Result:

0.16491682677852665

XGBoost

from xgboost import XGBRegressor

xgb = XGBRegressor()
xgb.fit(X_train, y_train)
xgb.score(X_test, y_test)

Result:

0.8682503016507106

y_ = xgb.predict(X_test)
mean_squared_error(y_, y_test)

Result:

0.12136418110931947

# save the output (the path was left blank in the original; `data` stands for the values to write, e.g. the predictions y_)
pd.Series(data).to_csv("", index=False)

The LightGBM Algorithm

LightGBM is a gradient boosting framework that uses tree-based learning algorithms. It is designed to be distributed and efficient, with the following advantages (a minimal native-API sketch follows this list):

1. Faster training speed and higher efficiency.

2. Lower memory usage.

3. Better accuracy.

4. Support for parallel, distributed, and GPU learning.

5. Capable of handling large-scale data.
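
The comparison below uses the sklearn-style wrapper LGBMRegressor; for reference, here is a minimal sketch of the same kind of fit through LightGBM's native API (toy data and untuned parameter values, purely illustrative):

import lightgbm as lgb
import numpy as np

# toy regression data so the sketch is self-contained
X = np.random.rand(500, 10)
y = 2 * X[:, 0] + np.random.randn(500) * 0.1

dtrain = lgb.Dataset(X, label=y)  # LightGBM's native data container
params = {"objective": "regression", "num_leaves": 31, "learning_rate": 0.1}
booster = lgb.train(params, dtrain, num_boost_round=100)
preds = booster.predict(X)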

(1) XGBoost vs. LightGBM without data cleaning

from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
import numpy as np
import pandas as pd

train = pd.read_csv(r"C:\Users\dream\Documents\Tencent Files\1799785728\FileRecv\zhengqi_train的副本.txt", sep="\t")
test = pd.read_csv(r"C:\Users\dream\Documents\Tencent Files\1799785728\FileRecv\zhengqi_test的副本.txt", sep="\t")
X_train = train.iloc[:, :-1]
y_train = train["target"]

# each %%time block below is its own Jupyter cell (%%time must be the first line of a cell)

%%time
light = LGBMRegressor()
light.fit(X_train, y_train)
y_ = light.predict(test)

%%time
xgb = XGBRegressor()
xgb.fit(X_train, y_train)
y_ = xgb.predict(test)

Result:

Wall time: 273 ms
Wall time: 1.35 s
xgb score: 0.1416
LGBM score: 0.1399

(2) After data cleaning

# drop features whose absolute covariance with the target is below 0.1
cov = train.cov()
cov.loc["target"]  # inspect each feature's covariance with the target
drop_labels = cov.index[cov.loc["target"].abs() < 0.1]
X_train.drop(drop_labels, axis=1, inplace=True)
test.drop(drop_labels, axis=1, inplace=True)

%%time
light = LGBMRegressor()
light.fit(X_train, y_train)
y_ = light.predict(test)

%%time
xgb = XGBRegressor()
xgb.fit(X_train, y_train)
y_ = xgb.predict(test)

Result:

Wall time: 194 ms
Wall time: 610 ms
xgb score:
LGBM score: 0.1491

Dropping features whose absolute covariance with the target is below 0.1 clearly speeds up both algorithms; whether it also helps accuracy should be checked against the scores above rather than assumed.
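
Covariance is scale-dependent, so a common scale-free variant of this filter uses correlation instead; a minimal sketch under that assumption, reusing the train DataFrame and keeping the 0.1 threshold for illustration:

corr = train.corr()
low_corr = corr.index[corr["target"].abs() < 0.1]  # features weakly correlated with the target
X_train_filtered = train.iloc[:, :-1].drop(columns=low_corr, errors="ignore")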

# these features are distributed very differently in the training and test samples
drop_labels = ["V5", "V9", "V11", "V17", "V22", "V28"]
X_train = train.iloc[:, 0:-1]
X_test = test.copy()
X_train.drop(drop_labels, axis=1, inplace=True)
test.drop(drop_labels, axis=1, inplace=True)

%%time
light = LGBMRegressor()
light.fit(X_train, y_train)
y_ = light.predict(test)

%%time
xgb = XGBRegressor()
xgb.fit(X_train, y_train)
y_ = xgb.predict(test)

Result:

Wall time: 148 ms
Wall time: 654 ms
xgb score:
LGBM score: 0.1421