赞
踩
1.读取数据
2.数据的基本处理
3.特征工程
4.数据可视化
训练数据 特征:打斗和亲吻次数 类别:电影类型
预测数据
5.算法实现
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Configure matplotlib so Chinese labels and minus signs render correctly.
plt.rcParams.update({
    'font.sans-serif': ['Microsoft YaHei'],
    'axes.unicode_minus': False,
})
# Load the movie dataset (fight counts, kiss counts, movie type).
my_df = pd.read_excel('电影数据.xlsx')
print(my_df)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['SimHei']  # CJK-capable default font
plt.rcParams['axes.unicode_minus'] = False  # keep minus signs renderable

# Per-movie kiss counts (x), fight counts (y), and titles.
x = [5, 3, 31, 59, 60, 80]
y = [100, 95, 105, 2, 3, 10]
labels = ["《战狼》", "《红海行动》", "《碟中谍 6》", "《前任3》", "《春娇与志明》", "《泰坦尼 克号》"]
plt.xlabel("亲吻次数")
plt.ylabel("打斗次数")
plt.xticks(range(0, 150, 10))
plt.yticks(range(0, 150, 10))
# Annotate every point with its movie title.
for i, (x_i, y_i) in enumerate(zip(x, y)):
    plt.annotate(text='{}'.format(labels[i]), xy=(x_i, y_i), xytext=(x_i, y_i))
plt.scatter(x, y, s=100)
class MYknn(object): def __init__(self, train_df, k): """ :param train_df: 训练数据 :param k: 近邻点个数 """ self.train_df = train_df self.k = k def predict(self, test_df): """预测函数""" # 计算欧式距离 self.train_df['距离'] = np.sqrt((test_df['打斗次数']-train_df['打斗次数'])**2+(test_df['接吻次数']-train_df['接吻次数'])**2) # 按距离排序 前K个数据的类别 my_types = self.train_df.sort_values(by='距离').iloc[:self.k]['电影类型'] print(my_types) my_type = my_types.value_counts().index[0] print(my_type) # 训练数据 特征:打斗和亲吻次数 类别:电影类型 train_df = my_df.loc[:5, ['打斗次数', '接吻次数', '电影类型']] print(train_df) # 预测数据 test_df = my_df.loc[6, ['打斗次数', '接吻次数']] print(test_df)
# Build a 3-nearest-neighbor model on the movie training data.
mk = MYknn(train_df, 3)
# Classify the held-out movie (prints the k neighbors and the winning type).
mk.predict(test_df)
输出显示:距离最近的是索引 3 的训练样本,其次是索引 4,最后是索引 5,因此前 3 个近邻由它们组成。
import numpy as np
from sklearn import datasets # 可以提供数据集
from sklearn.model_selection import train_test_split # 分割训练数据和测试数据
from collections import Counter # 计数器
可以得到鸢尾花对应的四个特征和三个类别
data 对应的以下四个特征
花萼长度 花萼宽度 花瓣长度 花瓣宽度
target 类别目标对应以下三种类别
山鸢尾 变色鸢尾 维吉尼亚鸢尾
# Load the iris dataset: 150 samples, 4 features, 3 classes.
iris = datasets.load_iris()
X, Y = iris.data, iris.target
print(X, Y)
# Fixed random_state so the split is reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=2003)
print(len(X_train))
def euc_dis(instance1, instance2):
    """Euclidean distance between two feature vectors."""
    diff = instance1 - instance2
    return np.sqrt(sum(diff ** 2))


def knn_classify(X, y, testInstance, k=3):
    """Return the majority label among the k training points nearest to testInstance.

    :param X: training feature matrix (iterable of vectors)
    :param y: training labels, indexable by an integer array
    :param testInstance: feature vector to classify
    :param k: number of neighbors that vote
    """
    # Distance from each training sample to the test sample.
    distance = [euc_dis(sample, testInstance) for sample in X]
    print(distance)
    # Indices of the k smallest distances.
    kneighbors = np.argsort(distance)[:k]
    # Majority vote over the neighbors' labels.
    count = Counter(y[kneighbors])
    return count.most_common()[0][0]
# Sanity check: argsort returns the indices that would sort the list.
print(np.argsort([11, 2.5, 3.6, 1.5, 9]))
# Classify every test sample with the hand-rolled KNN (k=3).
pred = [knn_classify(X_train,Y_train,data,3) for data in X_test]
print(pred)
# Count correct predictions; list == ndarray broadcasts elementwise.
# NOTE(review): the trailing `== True` is redundant.
count = np.count_nonzero((pred == Y_test) == True)
print(count)
print('模型预测正确率:%.3f'%(count/len(X_test)))
import numpy as np
from sklearn import datasets  # built-in datasets
from sklearn.model_selection import train_test_split  # split train/test data
from sklearn.neighbors import KNeighborsClassifier
# BUGFIX: sklearn.externals.joblib was removed in scikit-learn 0.23;
# joblib is now imported as a standalone package.
import joblib  # save and load fitted models

iris = datasets.load_iris()
X = iris.data
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y)

# Fit a 7-nearest-neighbor classifier.
clf = KNeighborsClassifier(n_neighbors=7)
clf.fit(X_train, y_train)

# Predict one hand-made sample (sepal length/width, petal length/width).
data = np.array([[1, 5, 8, 6]])
res = clf.predict(data)
print(res)

# Accuracy on the held-out test set (redundant `== True` dropped).
count = np.count_nonzero(clf.predict(X_test) == y_test)
print('模型预测正确率:%.3f'%(count/len(X_test)))

# BUGFIX: the original used full-width quotes (“test.pkl”), a SyntaxError.
joblib.dump(clf, "test.pkl")
# Reload the persisted model and confirm it scores identically.
clf = joblib.load("test.pkl")
count = np.count_nonzero(clf.predict(X_test) == y_test)
print('模型预测正确率:%.3f'%(count/len(X_test)))
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets # 自带数据集
from sklearn.model_selection import train_test_split # 分割训练数据和测试数据
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
plt.rcParams['font.sans-serif'] = ['SimHei']  # CJK-capable default font
plt.rcParams['axes.unicode_minus'] = False  # keep minus signs renderable
iris = datasets.load_iris()
# Four features: sepal length/width, petal length/width.
X = iris.data
# Only the first two features are plotted.
data = iris.data[:, :2]
# Three classes: setosa, versicolor, virginica.
y = iris.target
label = np.array(y)
# Scatter each class with its own (marker, color) pair.
for cls, (marker, color) in enumerate([('x', 'b'), ('o', 'r'), ('s', 'g')]):
    idx = np.where(label == cls)
    plt.scatter(data[idx, 0], data[idx, 1], marker=marker, color=color, label=str(cls), s=15)
plt.xlabel('X1')
plt.ylabel('X2')
plt.legend()
plt.show()
X_train, X_test, y_train, y_test = train_test_split(X, y)
# Standardize features to zero mean / unit variance.
transfer = StandardScaler()
X_train = transfer.fit_transform(X_train)
# BUGFIX: the test set must reuse the statistics fitted on the training set.
# Calling fit_transform here would re-fit the scaler on the test data and
# leak test-set information into the preprocessing.
X_test = transfer.transform(X_test)
# Cross-validation configuration: 4 folds, candidate neighbor counts.
folds = 4
k_choices = [1, 3, 5, 7, 9, 13, 15, 21, 25, 27]
# Split the training set row-wise into `folds` equal groups.
X_folds = np.vsplit(X_train, folds)
y_folds = np.hsplit(y_train, folds)
# One accuracy list per candidate k, filled during cross-validation.
accuracy_of_k = {k: [] for k in k_choices}
for i in range(folds):
    # Hold out fold i for validation; concatenate the rest for training.
    # NOTE: this deliberately rebinds the module-level X_train/y_train,
    # which later fitting code relies on.
    X_train = np.vstack(X_folds[:i] + X_folds[i + 1:])
    X_val = X_folds[i]
    y_train = np.hstack(y_folds[:i] + y_folds[i + 1:])
    y_val = y_folds[i]
    # Evaluate every candidate k on this train/validation split.
    for k in k_choices:
        knn = KNeighborsClassifier(n_neighbors=k)
        knn.fit(X_train, y_train)
        fold_accuracy = np.mean(knn.predict(X_val) == y_val)
        accuracy_of_k[k].append(fold_accuracy)
# Report each fold's accuracy for every candidate k.
for k in sorted(k_choices):
    for acc in accuracy_of_k[k]:
        print(f'K={k},accuracy={acc:f}')
# Scatter the raw per-fold accuracies at each k.
for k in k_choices:
    plt.scatter([k] * len(accuracy_of_k[k]), accuracy_of_k[k])
# Mean +/- std error bars across folds, ordered by k.
sorted_items = sorted(accuracy_of_k.items())
accuracies_mean = np.array([np.mean(vals) for _, vals in sorted_items])
accuracies_std = np.array([np.std(vals) for _, vals in sorted_items])
plt.errorbar(k_choices, accuracies_mean, yerr=accuracies_std)
plt.title('在k上进行交叉验证')
plt.xlabel('k')
plt.ylabel('交叉验证准确性')
plt.show()
# Final model: k=7 (picked from the cross-validation above), fitted on the
# last fold-combination left in X_train/y_train by the CV loop.
clf = KNeighborsClassifier(n_neighbors=7)
clf.fit(X_train, y_train)
# Accuracy on the standardized test set (redundant `== True` dropped).
count = np.count_nonzero(clf.predict(X_test) == y_test)
print('模型预测正确率:%.3f'%(count/len(X_test)))
常用距离计算方法:
欧氏距离(欧几里得距离)
曼哈顿距离
闵可夫斯基距离
切比雪夫距离
余弦距离
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。