赞
踩
# Train and evaluate a logistic-regression classifier on the Wisconsin
# breast-cancer data set. Intermediate results (score, weight, bias,
# res_report) are kept in module-level variables for inspection.

# The file is read without a header row; column names are assigned below.
data = pd.read_csv('./breast-cancer-wisconsin.data', header=None)

# Attach descriptive column names.
columns = [
    'Sample code number',
    'Clump Thickness',
    'Uniformity of Cell Size',
    'Uniformity of Cell Shape',
    'Marginal Adhesion',
    'Single Epithelial Cell Size',
    'Bare Nuclei',
    'Bland Chromatin',
    'Normal Nucleoli',
    'Mitoses',
    'Class',
]
data.columns = columns

# '?' entries are replaced with NaN so they count as missing values,
# then every row containing at least one missing value is dropped.
data.replace('?', np.nan, inplace=True)
data.dropna(axis=0, how='any', inplace=True)

# Feature selection: drop the first column (the sample code number).
data = data.iloc[:, 1:]

# Features are all remaining columns except the last; the last is the target.
feature = data.iloc[:, :-1].values
target = data.iloc[:, -1].values

# Split into train/test sets (70% / 30%) with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    feature, target, test_size=0.3, random_state=1
)

# Standardize the features only; the target is left untouched.
# The scaler is fitted on the training split and then *applied* to the test
# split. Refitting on the test split would scale it with its own mean/std,
# leaking test statistics and putting both splits in different feature spaces.
stand = StandardScaler()
x_train = stand.fit_transform(x_train)
x_test = stand.transform(x_test)

# Logistic-regression classifier (original note: compare with SGD).
lr = LogisticRegression()
lr.fit(x_train, y_train)

# Predictions and accuracy on the held-out test split.
y_predict = lr.predict(x_test)
score = lr.score(x_test, y_test)

# Learned weights and bias.
weight = lr.coef_
bias = lr.intercept_

# Precision / recall / F1 report (higher recall and F1 are better).
# labels=[2, 4] together with target_names relabels the report rows from
# 2 / 4 to benign / malignant.
res_report = classification_report(
    y_test, y_predict, labels=[2, 4], target_names=['良性', '恶性']
)
AUC 指标主要用于样本不平衡的情况;本例中样本基本均衡,本不需要使用 AUC,下面的代码仅作为参考示例。
注:如果样本不均衡,应先对样本进行处理,使其变为均衡后再训练模型。
# Map the class labels 2 / 4 to 0 / 1 (values > 3 become 1) so the AUC is
# computed on a binary target.
y_test = np.where(y_test > 3, 1, 0)

# AUC is mainly meant for imbalanced samples; the samples here are balanced,
# so this is only a reference example.
# roc_auc_score expects continuous scores, not thresholded class labels, so
# the classifier's decision values are used instead of the hard predictions.
auc = roc_auc_score(y_test, lr.decision_function(x_test))
print(auc)
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report  # precision / recall report
from sklearn.metrics import roc_auc_score  # AUC metric
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the data; the file is read without a header row.
data = pd.read_csv('./breast-cancer-wisconsin.data', header=None)

# Attach descriptive column names.
columns = [
    'Sample code number',
    'Clump Thickness',
    'Uniformity of Cell Size',
    'Uniformity of Cell Shape',
    'Marginal Adhesion',
    'Single Epithelial Cell Size',
    'Bare Nuclei',
    'Bland Chromatin',
    'Normal Nucleoli',
    'Mitoses',
    'Class',
]
data.columns = columns

# '?' entries are replaced with NaN so they count as missing values,
# then every row containing at least one missing value is dropped.
data.replace('?', np.nan, inplace=True)
data.dropna(axis=0, how='any', inplace=True)

# Feature selection: drop the first column (the sample code number).
data = data.iloc[:, 1:]

# Features are all remaining columns except the last; the last is the target.
feature = data.iloc[:, :-1].values
target = data.iloc[:, -1].values

# Outlier handling: none (original note: the data has no outliers).

# Split into train/test sets (70% / 30%) with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    feature, target, test_size=0.3, random_state=1
)

# Standardize the features only; the target is left untouched.
# The scaler is fitted on the training split and then *applied* to the test
# split. Refitting on the test split would scale it with its own mean/std,
# leaking test statistics and putting both splits in different feature spaces.
stand = StandardScaler()
x_train = stand.fit_transform(x_train)
x_test = stand.transform(x_test)

# Logistic-regression classifier (original note: compare with SGD).
lr = LogisticRegression()
lr.fit(x_train, y_train)

# Predictions and accuracy on the held-out test split.
y_predict = lr.predict(x_test)
score = lr.score(x_test, y_test)

# Learned weights and bias.
weight = lr.coef_
bias = lr.intercept_

# Precision / recall / F1 report (higher recall and F1 are better).
# labels=[2, 4] together with target_names relabels the report rows from
# 2 / 4 to benign / malignant.
res_report = classification_report(
    y_test, y_predict, labels=[2, 4], target_names=['良性', '恶性']
)

# Map the class labels 2 / 4 to 0 / 1 (values > 3 become 1) so the AUC is
# computed on a binary target. This rebinds y_test, so it must stay after
# every use of the original 2 / 4 labels above.
y_test = np.where(y_test > 3, 1, 0)

# AUC is mainly meant for imbalanced samples; the samples here are balanced,
# so this is only a reference example.
# roc_auc_score expects continuous scores, not thresholded class labels, so
# the classifier's decision values are used instead of the hard predictions.
auc = roc_auc_score(y_test, lr.decision_function(x_test))
print(auc)

# NOTE: if the samples were imbalanced, they should be rebalanced first.
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。