2. Hands-on practice:
1) Classifying the handwritten digits dataset
Classify the digits with two methods: KNN and logistic regression.
from sklearn.linear_model import LogisticRegression
import sklearn.datasets as datasets
digits = datasets.load_digits()
digits
data = digits.data      # flattened 8x8 images, shape (1797, 64)
target = digits.target  # digit labels 0-9
data.shape
digits.images.shape
import matplotlib.pyplot as plt
%matplotlib inline
plt.figure(figsize=(1,1))
plt.imshow(data[100].reshape((8,8)),cmap="gray")  # show sample 100 as an 8x8 grayscale image
target[100]
# Split the data into training and test sets
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(data,target,test_size=0.02)
lgr = LogisticRegression()  # defaults may warn about convergence here; max_iter can be raised if needed
lgr.fit(x_train,y_train)
lgr.score(x_test,y_test)
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()  # default n_neighbors=5
knn.fit(x_train,y_train)
knn.score(x_test,y_test)
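To see which digits the two models confuse, a confusion matrix can be computed on the test set. A minimal sketch using sklearn.metrics, assuming the knn fitted above:
from sklearn.metrics import confusion_matrix
y_pred = knn.predict(x_test)
confusion_matrix(y_test, y_pred)  # rows: true digit, columns: predicted digit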
2) Classification on a dataset generated with make_blobs
The make_blobs() function creates a set of sample points; the number of features (dimensions) and the number of classes (centers) can be specified.
datasets.make_blobs(n_samples=10,n_features=3,random_state=10,centers=5)  # example: 10 points, 3 features, 5 clusters
data,target = datasets.make_blobs(n_samples=150,random_state=1,centers=3)
data.shape
(150, 2)
target
plt.scatter(data[:,0],data[:,1],c=target)
import numpy as np
# Divide the plane into regions for the three classes above
ymin,ymax = data[:,1].min(),data[:,1].max()
xmin,xmax = data[:,0].min(),data[:,0].max()
x = np.linspace(xmin,xmax,100)
y = np.linspace(ymin,ymax,100)
xx,yy = np.meshgrid(x,y)              # 100x100 grid covering the data
xy = np.c_[xx.ravel(),yy.ravel()]     # grid points as a (10000, 2) array
xy.shape
xx,x
lgr = LogisticRegression()
lgr.fit(data,target)
y_ = lgr.predict(xy)   # predict a class for every grid point
y_
plt.scatter(xy[:,0],xy[:,1],c=y_)                         # decision regions
plt.scatter(data[:,0],data[:,1],c=target,cmap="rainbow")  # original points on top
knn = KNeighborsClassifier()
knn.fit(data,target)
y_ = knn.predict(xy)
plt.scatter(xy[:,0],xy[:,1],c=y_)
plt.scatter(data[:,0],data[:,1],c=target,cmap="rainbow")
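The same decision regions can also be drawn with plt.contourf instead of scattering every grid point, which produces filled regions. A minimal sketch, assuming the xx, yy, xy grid and the fitted knn from above:
plt.contourf(xx, yy, knn.predict(xy).reshape(xx.shape), alpha=0.3, cmap="rainbow")
plt.scatter(data[:,0], data[:,1], c=target, cmap="rainbow")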
Gradient descent
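SGDClassifier, used below, trains a linear model with stochastic gradient descent: at every step the weights move against the gradient of the loss, w ← w − η·∇L(w). As a minimal illustration of that update (the toy data and names such as X_toy and eta are made up for this sketch and are not part of the original code):
import numpy as np
X_toy = np.array([[0.0, 1.0], [1.0, 0.0], [1.0, 1.0], [0.0, 0.0]])
y_toy = np.array([1, 0, 1, 0])
w, b, eta = np.zeros(2), 0.0, 0.1           # weights, bias, learning rate
for _ in range(100):                        # full-batch gradient descent for simplicity
    p = 1 / (1 + np.exp(-(X_toy @ w + b)))  # sigmoid predictions
    w -= eta * X_toy.T @ (p - y_toy) / len(y_toy)  # gradient of the log-loss w.r.t. w
    b -= eta * (p - y_toy).mean()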
import pandas as pd
# Predict whether cells are cancerous (benign vs. malignant)
data = pd.read_csv("../data/cencerData.csv")
data.head()
data.shape
(699, 11)
import numpy as np
# Inspecting the data shows it needs cleaning:
# replace every "?" with NaN
data.replace(to_replace="?",value=np.nan,inplace=True)
# drop the rows that contain missing values
data.dropna(how="any",inplace=True)
data.isnull().any()
data.columns
# Extract the features and the label
x = data[['Clump Thickness', 'Uniformity of Cell Size',
'Uniformity of Cell Shape', 'Marginal Ashesion',
'Single Epithelial Cell Size', 'Bare Nuclei', 'Bland Chromatin',
'Normal Nucleoli', 'Mitoses']]
y = data["Class"]  # 1-D label column; avoids a shape warning when fitting sklearn estimators
# Split into training and test data
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.25)
To keep features with large values from dominating the prediction, the data can be standardized.
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
x_train = ss.fit_transform(x_train)  # each feature now has mean 0 and standard deviation 1
x_test = ss.transform(x_test)        # reuse the statistics learned from the training data; do not refit on the test set
x_train,x_test
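A quick check that the scaling behaved as described (training columns with mean close to 0 and standard deviation close to 1):
x_train.mean(axis=0).round(6), x_train.std(axis=0).round(6)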
from sklearn.linear_model import LogisticRegression,SGDClassifier
sgd = SGDClassifier()
sgd.fit(x_train,y_train)
sgd.score(x_test,y_test)
0.96491228070175439
lgr = LogisticRegression()
lgr.fit(x_train,y_train)
lgr.score(x_test,y_test)
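Accuracy alone can hide how each class is handled; for medical data the per-class precision and recall are usually worth checking as well. A minimal sketch, assuming the lgr fitted above:
from sklearn.metrics import classification_report
print(classification_report(y_test, lgr.predict(x_test)))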