赞
踩
准备好相关数据
链接:https://pan.baidu.com/s/1EvuEnVhSAUghEkF5rckMoA?pwd=2222
提取码:2222
一.利用Kmeans分析时长与评分的关系
导入相关库
import re
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression as LR  # regression module
from sklearn.metrics import mean_absolute_error  # MAE
from sklearn.metrics import mean_squared_error  # MSE
from sklearn.metrics import r2_score  # R2
from sklearn.model_selection import train_test_split  # train/test split
# Select the SimHei font so the Chinese titles and axis labels used in the
# plots below can be rendered.
plt.rcParams['font.sans-serif'] = ['SimHei']

# Load the movie table from the author's local CSV export.
data = pd.read_csv('C:\\Users\\wt\\Desktop\\data1.csv')

# Column 1 is iterated below as the per-record genre string.
mold = data.iloc[:, 1]

# Column-wise mean of column 2 (kept as a one-element result).
avg_rating_num = np.mean(data.iloc[:, [2]], axis=0)

# Columns 0 and 2 as a float32 matrix; the scatter plot further down labels
# them as duration (x) and rating (y).
X = data.iloc[:, [0, 2]].values.astype('float32')
记录相关电影类型数目
# Running tally: genre name -> number of times it has been counted.
label = {}


def fetch(s):
    """Increment the count stored for genre ``s`` in the module-level ``label``.

    A genre seen for the first time starts at 1. Returns ``None``; the
    result is the mutation of ``label``.
    """
    label[s] = label.get(s, 0) + 1
# Count every genre that appears in each record's genre string.
#
# The previous version sliced fixed positions for string lengths 2, 3, 5, 8,
# 11 and 14 only, so any record of another length (for instance a
# three-character genre combined with others) was silently skipped.
# Splitting on runs of non-word characters handles any number of genres of
# any length, and, like the old positional slicing, does not depend on which
# separator character the data uses.
for i in mold:
    print(i)
    for genre in re.split(r'\W+', i):
        # A leading or trailing separator yields an empty piece; ignore it.
        if genre:
            fetch(genre)
打印相关系数
# Print the correlation-coefficient matrix of the two feature columns.
print(np.corrcoef(X[:, 0], X[:, 1]))

# Partition the records into four clusters; y_pred is the cluster index
# assigned to each row of X.
clf = KMeans(n_clusters=4)
y_pred = clf.fit_predict(X)

# Pie chart of the genre tally collected in `label`.
patches, text = plt.pie(label.values(), labels=label.keys(), radius=1)

# Blank out the captions of hand-picked wedges.
# NOTE(review): these positions are tied to the wedge order of one specific
# dataset - presumably the slices too small to label legibly; confirm.
for hidden in (-1, -2, -3, -5, -6, -7, -8, -9, 13):
    text[hidden].set_text('')

for t in text:
    t.set_size(10)
plt.title("高质量电影类型成分分析")
画图分析电影类型占比情况
# Scatter the two feature columns against each other, coloured by the
# cluster index that KMeans assigned to each record.
x = list(X[:, 0])
y = list(X[:, 1])
plt.scatter(x, y, c=y_pred, marker='x')
plt.title("Kmeans分析时长与评分")
plt.xlabel("时长")
plt.ylabel("评分")
plt.show()
利用Kmeans分析时长与评分的分布情况
二.多元回归模型分析播放量
记录电影种类
# Genre vocabulary collected so far; seeded with two genres and extended by
# the caller with whatever finds() returns.
item = ['剧情', '犯罪']


def finds(iss):
    """Return the genres in ``iss`` that are not yet in the global ``item``.

    ``iss`` is a "/"-separated genre string. The result keeps the order of
    first appearance and contains each new genre once.

    The earlier implementation removed only the first occurrence of each
    known genre, so a genre repeated inside one record was reported as new
    and would have been added to ``item`` a second time.
    """
    seen = set(item)
    fresh = []
    for genre in iss.split("/"):
        if genre not in seen:
            seen.add(genre)
            fresh.append(genre)
    return fresh
# Walk every record's genre string and append whatever finds() reports as
# not yet present, growing `item` into the full genre vocabulary.
for row in range(len(data.mold)):
    item = item + list(finds(data.mold[row]))
自定义独热编码
def my_get_dummies(ser, categories=None, sep=','):
    """One-hot encode a Series of delimiter-separated genre strings.

    Args:
        ser: pandas Series whose values are strings of genres joined by
            ``sep``.
        categories: column names of the result. Defaults to the
            module-level ``item`` list.
        sep: delimiter between genres inside one value.
            NOTE(review): the default "," is kept for backward
            compatibility, but ``finds`` in this file splits the same
            column on "/". If the data is "/"-separated, pass ``sep="/"``,
            otherwise multi-genre rows are only printed and never encoded.

    Returns:
        An integer DataFrame indexed like ``ser`` with one column per
        category; a cell is 1 when the row's string contains that category
        and 0 otherwise. Pieces that are not a known category are printed
        and skipped.
    """
    columns = list(item if categories is None else categories)
    # Size the zero matrix from the inputs instead of a hard-coded 250 x 27,
    # and use the builtin int (np.int was removed in NumPy 1.24).
    df = pd.DataFrame(
        np.zeros((len(ser), len(columns)), dtype=int),
        columns=columns,
        index=ser.index,
    )
    for irec in ser.index:
        for dirt in ser[irec].split(sep):
            if dirt not in columns:
                print(dirt)
            else:
                # Single .loc write: chained df[col][row] = 1 may assign to
                # a temporary copy and leave df unchanged.
                df.loc[irec, dirt] = 1
    return df
# Append the one-hot genre columns to the main table; join aligns the two
# frames on their row index.
genre_dummies = my_get_dummies(data.mold)
data = data.join(genre_dummies)
# Last expression of the notebook cell: display the widened table.
data
建立模型
- #二分原则为80%为样例数据作为模型训练集20%为样本数据作为测试集检查估计能力
-
- from sklearn.model_selection import train_test_split #划分测试集与训练集
- from sklearn.linear_model import LinearRegression as LR #回归模块
- ##在ipy中显示图像
- %matplotlib inline
- #设置绘图显示中文字体
-
- pd.set_option('display.max_columns', None)
- #特征提取
- film_type=data[item]
- film_type
- # total_layer=data.总楼层
- # 选择自变量与因变量
- X = pd.concat([film_type,data.duration,data.Wtsee_people,data.Rating_people,data.Comments_people,data.year,data.rating_num],axis=1)
- Y = data.Watching_people
- print(type(X))
- X = X.fillna(0)
- #划分测试集与训练集
- Xtrain,Xtest,Ytrain,Ytest=train_test_split(X,Y,test_size=0.2,random_state=420)
- reg=LR().fit(Xtrain,Ytrain)
- #预测
- Yhat=reg.predict(Xtest)
- #查看回归系数
- print(list(zip(X.columns,reg.coef_)))
- #查看截距
- print(reg.intercept_)
- from sklearn.metrics import mean_squared_error #MSE
- from sklearn.metrics import mean_absolute_error #MAE
- from sklearn.metrics import r2_score #R2
- mse= mean_squared_error(Ytest,Yhat)
- mae= mean_absolute_error(Ytest,Yhat)
- r2=r2_score(Ytest,Yhat)
- #调整R2
- n=Xtest.shape[0]
- k=Xtest.shape[1]
- adj_r2=1-(1-r2)*((n-1)/(n-k-1))
- print('MSE:'+str(mse))
- print('MAE:'+str(mae))
- print('R2:'+str(r2))
- print('调整后R2:'+str(adj_r2))
评估模型
绘制图表评测结果
- #绘制前50条记录
- n=50
- #绘制模型预测值
- plt.plot(range(len(Yhat[:n])),Yhat[:n])
- #绘制模型真实值
- plt.plot(range(len(Ytrain[:n])),Ytrain[:n])
- #图形设置
- plt.xlabel('个例')
- plt.ylabel('播放量')
- plt.title('线性回归预测结果')
- plt.legend(["预估","实际"])
将测试集真实值与模型预测值用折线图的形式表现出来
- #绘制前50条记录
- n=50
- #绘制模型预测值
- plt.plot(range(len(Yhat[:n])),Yhat[:n])
- #绘制模型测试真实值
- plt.plot(range(len(Ytest[:n])),Ytest[:n])
- #图形设置
- plt.xlabel('个例')
- plt.ylabel('播放量')
- plt.title('线性回归预测结果')
- plt.legend(["预估","实际"])
三.决策树预测评分
计算Pearson相关系数判断相关程度
from sklearn import tree  # decision-tree models
from sklearn.model_selection import train_test_split  # train/test split
from sklearn.model_selection import GridSearchCV  # hyper-parameter search
from scipy.stats import pearsonr

# Rule of thumb for reading the absolute value of the coefficient:
#   0.8-1.0  very strong correlation
#   0.6-0.8  strong correlation
#   0.4-0.6  moderate correlation
#   0.2-0.4  weak correlation
#   0.0-0.2  very weak or no correlation

# Correlate each candidate feature with the rating and print the
# coefficient and its significance level under a heading naming the feature.
# The Watching_people entry used to be headed '评分', which names the other
# variable in the test; it now carries the feature's own label.
for caption, column in (
    ('时长', 'duration'),
    ('播放量', 'Watching_people'),
    ('年份', 'year'),
    ('评价人数', 'Rating_people'),
    ('短评人数', 'Comments_people'),
    ('想看人数', 'Wtsee_people'),
):
    pccs = pearsonr(data[column], data['rating_num'])
    print(caption)
    print("相关系数:", pccs[0])
    print("显著性水平:", pccs[1])
建立树模型
# Feature matrix and target for the rating model.
# 'Watching_people' used to be listed twice, which only duplicated a column.
# NOTE(review): the correlation analysis above also covers 'duration';
# confirm whether the duplicate was meant to be that column.
feature_cols = ['Watching_people', 'Wtsee_people', 'Rating_people',
                'Comments_people', 'year']
X = data[feature_cols]
Y = data['rating_num']

# Split into training and test sets.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(
    X, Y, test_size=0.1, random_state=420)

# Hyper-parameter grid.
# - 'mse' / 'mae' were renamed 'squared_error' / 'absolute_error' in
#   scikit-learn 1.0 and the old spellings were removed in 1.2.
# - max_depth must be at least 1, so the grid starts at 1 rather than 0.
tree_param = {
    'criterion': ['squared_error', 'friedman_mse', 'absolute_error'],
    'max_depth': list(range(1, 10)),
}

# Grid search tries every parameter combination in the grid. With cv=3 each
# combination is scored by 3-fold cross-validation: the training data is cut
# into three parts, each part serves once as the validation fold while the
# other two train the model, and the three scores are averaged.
grid = GridSearchCV(tree.DecisionTreeRegressor(), param_grid=tree_param, cv=3)
grid.fit(Xtrain, Ytrain)

# Best parameter combination and its mean cross-validated score.
print(grid.best_params_)
print(grid.best_score_)

# Final tree using the Friedman-MSE criterion at depth 4.
# NOTE(review): these values are hard-coded rather than taken from
# grid.best_params_ - confirm they match the search result.
dtr = tree.DecisionTreeRegressor(criterion='friedman_mse', max_depth=4)
dtr.fit(Xtrain, Ytrain)

# Predict the ratings of the held-out test rows.
pred = dtr.predict(Xtest)
画图对比测试集中25条记录的真实评分与预测评分
# Overlay predicted (red) and actual (blue) ratings of the test records,
# plotted against their position in the test set.
fig, ax = plt.subplots(figsize=(15.6, 7.2))
positions = range(len(pred))
s1 = ax.scatter(positions, pred, facecolors="red", label='预测')
s2 = ax.scatter(range(len(Ytest)), Ytest, facecolors="blue", label='实际')
plt.legend()
可观察到有15条左右的预测评分接近真实值
误差在0.3左右
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。