当前位置:   article > 正文

python开源代码算法_常用机器学习算法的python源码实现

python 3.8 平台中利用 sklearn 库的“RandomForestClassifier”模块实现 RF 的构建

常用机器学习算法的python源码实现

本文代码大致基于 sklearn 实现

一、思维导图

二、Python源码

## 二分类问题

* 使用sklearn自带的逻辑回归、支持向量机、决策树API进行二分类的任务

* 使用sklearn的iris数据集,将iris数据集变成一个二分类的数据集,删除类别为2的数据

* 使用准确率对模型进行评价

### 准备数据

import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn import linear_model
from sklearn import tree
from sklearn.svm import LinearSVC
import matplotlib.pyplot as plt
# BUG FIX: sklearn.cross_validation was removed in scikit-learn 0.20;
# train_test_split now lives in model_selection.
from sklearn import model_selection
from sklearn import metrics

# Load iris and reduce it to a binary problem by dropping class 2.
iris = datasets.load_iris()
feature_columns = iris.feature_names
target_column = ['res']

np.where(iris.target == 2)  # shows class 2 occupies rows 100-149

features = pd.DataFrame(iris.data, columns=feature_columns)
labels = pd.DataFrame(iris.target, columns=target_column)

# Keep only classes 0 and 1 (rows 0-99).
features = features[0:100]
labels = labels[0:100]
train_test_data = pd.concat([features, labels], axis=1)

# Split into training and test sets (10% held out).
train, test = model_selection.train_test_split(train_test_data, test_size=0.1)

train_X = train[feature_columns].values
# BUG FIX: the original called reshape(train_y.size) before train_y existed
# (NameError); flatten the (n, 1) column to (n,) with reshape(-1) instead.
train_y = train[target_column].values.reshape(-1)
test_X = test[feature_columns].values
test_y = test[target_column].values.reshape(-1)

### 训练模型

# --- Train and score three binary classifiers on the held-out split ---

# Logistic regression (L2 penalty, C=1.0 — the defaults, spelled out).
log_reg = linear_model.LogisticRegression(penalty='l2', C=1.0)
log_reg.fit(train_X, train_y)
pred_lr = log_reg.predict(test_X)
metrics.accuracy_score(test_y, pred_lr)

# Decision tree, capped at depth 3.
dtree = tree.DecisionTreeClassifier(max_depth=3).fit(train_X, train_y)
pred_dt = dtree.predict(test_X)
metrics.accuracy_score(test_y, pred_dt)

# Linear support vector machine.
svc = LinearSVC().fit(train_X, train_y)
pred_svm = svc.predict(test_X)
metrics.accuracy_score(test_y, pred_svm)

## 多分类问题

* 使用SVM进行一对一,一对多多分类

* 使用决策树进行多分类

* 使用随机森林进行多分类

import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn import linear_model
from sklearn import tree
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
# BUG FIX: sklearn.cross_validation was removed in scikit-learn 0.20;
# use model_selection instead.
from sklearn import model_selection
from sklearn import metrics

### Prepare the data (all three iris classes this time)
iris = datasets.load_iris()
feature_columns = iris.feature_names
target_column = ['res']

features = pd.DataFrame(iris.data, columns=feature_columns)
labels = pd.DataFrame(iris.target, columns=target_column)
train_test_data = pd.concat([features, labels], axis=1)

# 90/10 train/test split.
train, test = model_selection.train_test_split(train_test_data, test_size=0.1)

train_X = train[feature_columns].values
train_y = train[target_column].values.reshape(-1)  # flatten (n, 1) -> (n,)
test_X = test[feature_columns].values
test_y = test[target_column].values.reshape(-1)

### 训练模型

# LinearSVC uses a one-vs-rest multi-class strategy.
Linear_SVM = LinearSVC()
Linear_SVM = Linear_SVM.fit(train_X, train_y)
preLS = Linear_SVM.predict(test_X)

# SVC (C-Support Vector Classification) uses a one-vs-one strategy.
C_SVM = SVC()
C_SVM = C_SVM.fit(train_X, train_y)
# BUG FIX: the original predicted with Linear_SVM here, so the SVC model
# trained just above was never actually evaluated.
preCS = C_SVM.predict(test_X)
metrics.accuracy_score(test_y, preCS)

# Multi-class decision tree.
DT = tree.DecisionTreeClassifier(max_depth=3)
DT = DT.fit(train_X, train_y)
preT = DT.predict(test_X)
metrics.accuracy_score(test_y, preT)

# Multi-class random forest.
RF = RandomForestClassifier()
RF = RF.fit(train_X, train_y)
preRF = RF.predict(test_X)
metrics.accuracy_score(test_y, preRF)

# Accuracy of the one-vs-rest LinearSVC trained above.
metrics.accuracy_score(test_y, preLS)

### 回归

* 使用boston数据集

* 使用线性回归

* 使用树回归

* 使用支持向量机进行回归

# Regression on the Boston housing data: load, frame, and split.
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn import tree
from sklearn import linear_model
from sklearn import svm
from sklearn import model_selection
from sklearn import metrics

# NOTE(review): load_boston was removed in scikit-learn 1.2 — on newer
# versions substitute another regression dataset (e.g. California housing).
boston = datasets.load_boston()
feature_columns = boston.feature_names
target_column = ['target']

features = pd.DataFrame(boston.data, columns=feature_columns)
labels = pd.DataFrame(boston.target, columns=target_column)
train_test_data = pd.concat([features, labels], axis=1)

# Hold out 10% of the rows for testing.
train, test = model_selection.train_test_split(train_test_data, test_size=0.1)

train_X = train[feature_columns].values
train_y = train[target_column].values.reshape(-1)  # (n, 1) -> (n,)
test_X = test[feature_columns].values
test_y = test[target_column].values.reshape(-1)

### 训练模型

# --- Fit three regressors; each bare expression below is the test RMSE ---

# Ordinary least-squares linear regression.
ols = linear_model.LinearRegression()
ols.fit(train_X, train_y)
pred_ols = ols.predict(test_X)
metrics.mean_squared_error(test_y, pred_ols) ** 0.5

# Decision-tree regressor (default hyperparameters).
reg_tree = tree.DecisionTreeRegressor().fit(train_X, train_y)
pred_tree = reg_tree.predict(test_X)
metrics.mean_squared_error(test_y, pred_tree) ** 0.5

# Linear support vector regressor.
lin_svr = svm.LinearSVR().fit(train_X, train_y)
pred_svr = lin_svr.predict(test_X)
metrics.mean_squared_error(test_y, pred_svr) ** 0.5

## 特征清洗

* 通过pandas了解数据

* 通过pandas填充缺失的数据

# Inspect the data with pandas, then fill missing values.
data = pd.read_csv(path)  # read the CSV at `path` into a pd.DataFrame

data.head()      # first 5 rows
data.shape       # (rows, columns)
data.info()      # per-column dtypes and non-null counts (reveals missing data)
data.describe()  # summary statistics: mean, std, quartiles, ...

# BUG FIX: the original passed the unbound method `data['feature1'].mean`
# (missing parentheses) and discarded fillna's return value, so nothing was
# ever filled. Call mean() and assign the result back.
data['feature1'] = data['feature1'].fillna(value=data['feature1'].mean())

## 特征工程

### 数值型数据

* 幅度变换

* 计算统计值

* 特征之间进行算术和逻辑运算以产生新特征

* 产生高次特征和交叉特征

* 进行离散化

* One-hot 编码

# 1. Log transform — compresses the value range; can make skewed data
#    closer to a normal distribution.
import numpy as np

# np.log applied to the whole Series is vectorized — same result as the
# original element-wise apply(lambda x: np.log(x)), but idiomatic and faster.
log_feature1 = np.log(data['feature1'])
data.loc[:, 'log_feature1'] = log_feature1  # add the log feature as a new column

# sklearn's preprocessing scalers can also be used for amplitude scaling.

# Min-max scaling to [0, 1].
from sklearn.preprocessing import MinMaxScaler
mm_scaler = MinMaxScaler()
feature1_mms = mm_scaler.fit_transform(data[['feature1']])

# Standardization (zero mean, unit variance).
from sklearn.preprocessing import StandardScaler
std_scaler = StandardScaler()
feature1_ss = std_scaler.fit_transform(data[['feature1']])

# ... and so on.

# 2. Summary statistics.
data['feature1'].max()
data['feature1'].min()
data['feature1'].quantile(0.25)  # first quartile

# 3.1 Arithmetic / logical combinations of existing features.
data.loc[:, 'new_feature1'] = data['feature1'] + 4*data['feature2'] + 1
data.loc[:, 'new_feature2'] = (data['feature1'] == 0) & (data['feature2'] == 0)

# 4. Polynomial and interaction features (degree 2).
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=2)
poly_fea = poly.fit_transform(data[['feature1', 'feature2']])

# 5. Discretization with pandas cut (equal-width bins) and qcut
#    (equal-frequency bins).
# BUG FIX: the original indexed an undefined frame `df_train`; every other
# line in this section operates on `data`.
data.loc[:, 'feature1_cut'] = pd.cut(data['feature1'], 5)
data.loc[:, 'feature1_qcut'] = pd.qcut(data['feature1'], 5)

# 6. One-hot encoding.
feature1_oht = pd.get_dummies(data[['feature1']])

### 日期处理

# Convert the raw date column into a pandas datetime column.
# BUG FIX: the original passed format="" — an empty format string cannot
# match any date string and raises; let pandas infer the format instead.
data.loc[:, 'date'] = pd.to_datetime(data['date_t'])

data.loc[:, 'month'] = data['date'].dt.month    # month (1-12)
data.loc[:, 'dom'] = data['date'].dt.day        # day of month
data.loc[:, 'doy'] = data['date'].dt.dayofyear  # day of year (1-366)
data.loc[:, 'dow'] = data['date'].dt.dayofweek  # weekday (Monday = 0)

### 文本处理

* 词袋模型

* TF-IDF

# 1. Bag-of-words model.
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
corpus = [
    'This is the first document.',
    'This is the second second document.',
    'And the third one.',
    'Is this the first document?'
]
X = vectorizer.fit_transform(corpus)
# BUG FIX: get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() is the current API.
vectorizer.get_feature_names_out()  # vocabulary terms, one per column of X
X.toarray()  # densify the sparse count matrix into a numpy array

# 2. TF-IDF model.
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer()
tfidf_X = tfidf_vectorizer.fit_transform(corpus)
tfidf_vectorizer.get_feature_names_out()
tfidf_X.toarray()

## 特征选择

* Filter

* Wrapper

* Embedded

# Filter method: univariate scoring, keep the k best-scoring features.
from sklearn.feature_selection import SelectKBest

# NOTE(review): the default score_func here is f_classif (ANOVA F-test),
# not the chi-squared test the original comment claimed.
X_new = SelectKBest(k=2).fit_transform(X, y)

# Wrapper method: recursive feature elimination (RFE) driven by a model.
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()
rfe = RFE(estimator=rf, n_features_to_select=2)
X_rfe = rfe.fit_transform(X,y)

# Embedded method: an L1-regularized model zeroes out weak features,
# and SelectFromModel keeps the surviving ones.
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import LinearSVC

lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(X, y)
model = SelectFromModel(lsvc, prefit=True)  # prefit: reuse the fitted lsvc
X_embed = model.transform(X)

## 模型融合

* 投票器

* Bagging

* Adaboost

# Voting ensemble.
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier

# NOTE(review): `array` is never defined in this article — presumably a
# (n, 9) numpy array with features in columns 0-7 and the label in
# column 8 (e.g. the Pima diabetes dataset); confirm against the source.
X = array[:,0:8]
Y = array[:,8]

# BUG FIX: current scikit-learn raises ValueError when random_state is set
# while shuffle is False; enable shuffling so the seed is meaningful.
kfold = model_selection.KFold(n_splits=5, shuffle=True, random_state=2018)

# Build the voter's sub-models as (name, estimator) pairs.
estimators = []
model_1 = LogisticRegression()
estimators.append(('logistic', model_1))
model_2 = DecisionTreeClassifier()
estimators.append(('dt', model_2))
model_3 = SVC()
estimators.append(('svm', model_3))

# Majority-vote ensemble, scored with 5-fold cross-validation.
ensemble = VotingClassifier(estimators)
result = model_selection.cross_val_score(ensemble, X, Y, cv=kfold)
print(result.mean())

# Bagging: 100 decision trees fit on bootstrap samples.
from sklearn.ensemble import BaggingClassifier

dt = DecisionTreeClassifier()
num = 100
# BUG FIX: random_state requires shuffle=True in current scikit-learn.
kfold = model_selection.KFold(n_splits=5, shuffle=True, random_state=2018)
# Pass the base tree positionally: the keyword was renamed
# base_estimator -> estimator in scikit-learn 1.2, and the positional
# form works on both old and new versions.
model = BaggingClassifier(dt, n_estimators=num, random_state=2018)
result = model_selection.cross_val_score(model, X, Y, cv=kfold)
print(result.mean())

# AdaBoost with 25 boosting rounds.
from sklearn.ensemble import AdaBoostClassifier

num_trees = 25
# BUG FIX: random_state requires shuffle=True in current scikit-learn.
kfold = model_selection.KFold(n_splits=5, shuffle=True, random_state=2018)
model = AdaBoostClassifier(n_estimators=num_trees, random_state=2018)
result = model_selection.cross_val_score(model, X, Y, cv=kfold)
print(result.mean())

## xgboost

import pickle
import xgboost as xgb
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import KFold, train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, mean_squared_error
from sklearn.datasets import load_iris, load_digits, load_boston

# Regression example: grid-search cross-validation for XGBoost.
# (Demo only — it trains and "tests" on the same data, which of course is
# not the normal workflow.)
# NOTE(review): load_boston was removed in scikit-learn 1.2; substitute
# another regression dataset on newer versions.
boston = load_boston()
y = boston['target']
X = boston['data']

kf = KFold(n_splits=5, shuffle=True)
xgb_model = xgb.XGBRegressor()
clf = GridSearchCV(xgb_model,
                   {'max_depth': [2, 4, 6],
                    'n_estimators': [50, 100, 200]}, verbose=0, cv=kf)
# BUG FIX: the original read best_score_/best_params_ without ever running
# the search, which raises NotFittedError; fit first.
clf.fit(X, y)
print(clf.best_score_)  # mean CV score of the best params (a regressor's default scoring is R^2)
print(clf.best_params_)

# Refit with chosen hyperparameters and report in-sample MSE.
xgb1_model = xgb.XGBRegressor(max_depth=4, n_estimators=200).fit(X, y)
predictions = xgb1_model.predict(X)
actuals = y
print("MSE:", mean_squared_error(actuals, predictions))

## lightGBM

# Grid-search the LightGBM regressor's learning rate and tree count.
estimator = lgb.LGBMRegressor(num_leaves=31)
param_grid = {
    'learning_rate': [0.01, 0.1, 1],
    'n_estimators': [20, 40]
}
gbm = GridSearchCV(estimator, param_grid, cv=kf)
# BUG FIX: the grid search must be fitted before best_params_ exists.
gbm.fit(X, y)
print('用网格搜索找到的最优超参数为:')
print(gbm.best_params_)

# Refit a model with the chosen hyperparameters.
gbm = lgb.LGBMRegressor(objective='regression',
                        num_leaves=31,
                        learning_rate=0.1,
                        n_estimators=40)
# BUG FIX: the fit(...) call line was lost when this article was scraped,
# leaving its keyword arguments orphaned (a syntax error); restore it.
# NOTE(review): early_stopping_rounds was removed from fit() in
# lightgbm >= 4 — use callbacks=[lgb.early_stopping(5)] there.
gbm.fit(X, y,
        eval_set=[(X, y)],
        eval_metric='l1',
        early_stopping_rounds=5)

# Predict, stopping at the best iteration found by early stopping.
print('开始预测...')
y_pred = gbm.predict(X, num_iteration=gbm.best_iteration_)

# Evaluate with RMSE.
print('预测结果的rmse是:')
print(mean_squared_error(y, y_pred) ** 0.5)

三、参见模型调参

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/2023面试高手/article/detail/412111
推荐阅读
相关标签
  

闽ICP备14008679号