赞
踩
流程:
一、爬取数据,保存到 MySQL 数据库。
二、读取数据,分析三种颜色的占比。
1. 爬取数据并保存到 MySQL 数据库:
- # -*- coding: utf-8 -*-
- """
- Created on Mon Mar 4 11:09:45 2019
- @author: Lenovo
- """""
- import urllib
- import mysql.connector
- import re
- import urllib.error
- import json
- import time as t
-
# `import urllib` alone does not load the `request` submodule, so import it
# explicitly before any `urllib.request.*` attribute is touched.
import urllib.request

# Request header sent with every fetch. `OpenerDirector.addheaders` expects a
# list of (name, value) pairs, so the header must carry its name as well as
# the browser string.
headers = (
    "User-Agent",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36",
)
opener = urllib.request.build_opener()
opener.addheaders = [headers]  # attach the User-Agent header
urllib.request.install_opener(opener)  # make this opener the global default


def use_proxy_1(url, proxy_add):
    """Fetch *url* through a proxy and return the body decoded as UTF-8.

    Args:
        url: Address of the resource to download.
        proxy_add: Proxy server as ``"host:port"``.

    Returns:
        The response body as a ``str``.

    Raises:
        urllib.error.URLError: If the request fails.
        UnicodeDecodeError: If the body is not valid UTF-8.

    The proxied opener is also installed as the global default, so later
    ``urllib.request.urlopen`` calls go through the same proxy.
    """
    # Register the proxy for both schemes: with only 'http' mapped, requests
    # to https:// URLs would bypass the proxy entirely.
    proxy = urllib.request.ProxyHandler({'http': proxy_add, 'https': proxy_add})
    proxied_opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
    # A freshly built opener only has urllib's default User-agent; re-attach
    # ours so installing this opener does not silently drop it.
    proxied_opener.addheaders = [headers]
    urllib.request.install_opener(proxied_opener)
    # The context manager closes the response even if read/decode fails.
    with proxied_opener.open(url) as response:
        return response.read().decode('utf-8')
-
# Review-list endpoint; the page number is appended by build_rate_url().
RATE_URL = (
    'https://rate.tmall.com/list_detail_rate.htm'
    '?itemId=558760911386&spuId=877095771&sellerId=2616970884&order=3'
    '&currentPage='
)
# Proxy server used for every page request.
PROXY_ADDRESS = "182.44.224.198:9999"
# Captures everything between the first "(" and the last ")" of the response.
JSONP_PATTERN = re.compile(r'^[^(]*?\((.*)\)[^)]*$')
# Values are bound by the driver, so quotes inside review text cannot break
# (or inject into) the statement.
INSERT_SQL = (
    "replace into iphone(name,content,time,color,rom,net) "
    "values (%s,%s,%s,%s,%s,%s)"
)


def build_rate_url(page):
    """Return the review-list URL for the given page number."""
    return RATE_URL + str(page)


def extract_jsonp_payload(commt_data):
    """Strip the ``callback( ... )`` wrapper and parse the JSON inside it.

    Raises:
        ValueError: If the text has no parenthesised payload or the payload
            is not valid JSON (``json.JSONDecodeError`` is a ``ValueError``).
    """
    match = JSONP_PATTERN.search(commt_data)
    if match is None:
        raise ValueError('response is not wrapped as callback(...)')
    return json.loads(match.group(1))


def parse_sku(sku):
    """Split an ``auctionSku`` string into ``(color, rom, net)``.

    The string is split on ``:`` and ``;``; the values are read by position
    (fields 3, 7 and 1). Missing fields become empty strings instead of
    raising ``IndexError``.
    """
    fields = re.split('[:;]', sku or '')
    fields += [''] * (8 - len(fields))
    return fields[3], fields[7], fields[1]


def review_rows(data):
    """Turn one parsed page into a list of rows matching INSERT_SQL.

    Raises:
        KeyError: If the page or one of its reviews lacks an expected key.
    """
    rows = []
    for review in data['rateDetail']['rateList']:
        color, rom, net = parse_sku(review['auctionSku'])
        rows.append((
            review['displayUserNick'],
            review['rateContent'],
            review['rateDate'],
            color,
            rom,
            net,
        ))
    return rows


def main():
    """Scrape review pages 25-99 and store every review in MySQL."""
    # One connection for the whole run; the finally clause guarantees it is
    # closed even when a page fails.
    conn = mysql.connector.connect(host='localhost', port=3306, user='root', passwd='password', db='tianmao', charset='utf8mb4')
    print('连接成功!!!!')
    try:
        cur = conn.cursor()
        for currentPage in range(25, 100):
            try:
                commt_data = use_proxy_1(build_rate_url(currentPage), PROXY_ADDRESS)
                rows = review_rows(extract_jsonp_payload(commt_data))
            except urllib.error.URLError as e:
                if hasattr(e, "code"):
                    print(e.code)
                if hasattr(e, "reason"):
                    print(e.reason)
            except (ValueError, KeyError, TypeError) as e:
                # Unparseable or unexpectedly shaped page: skip it, keep going.
                print('第' + str(currentPage) + '页数据解析失败: ' + repr(e))
            else:
                if rows:
                    cur.executemany(INSERT_SQL, rows)
                conn.commit()
                print('第'+str(currentPage)+'页数据保存完毕!')
            # Pause between requests so the server is not hammered.
            t.sleep(2)
    finally:
        conn.close()


if __name__ == "__main__":
    main()
效果:
2. 读取数据并分析颜色占比:
- # -*- coding: utf-8 -*-
- """
- Created on Mon Mar 4 16:55:25 2019
- @author: Lenovo
- """
- import mysql.connector
- import matplotlib.pyplot as plt
-
# Colours shown in the pie chart, in display order.
PIE_COLORS = ['银色', '金色', '深空灰色']


def count_colors(cur, colors):
    """Return the number of ``iphone`` rows for each colour in *colors*.

    Args:
        cur: An open DB-API cursor on the database holding the ``iphone`` table.
        colors: Colour names to look up.

    Returns:
        A list of ints in the same order as *colors*; a colour with no rows
        counts as 0.
    """
    # One grouped query instead of one round trip per colour.
    cur.execute("select color, count(*) from iphone group by color;")
    tallies = dict(cur.fetchall())
    return [int(tallies.get(color, 0)) for color in colors]


def main():
    """Print the total row count and draw the colour-share pie chart."""
    plt.rcParams['font.sans-serif'] = ['SimHei']  # render Chinese labels correctly
    plt.rcParams['axes.unicode_minus'] = False  # render the minus sign correctly

    conn = mysql.connector.connect(host='localhost', port=3306, user='root', passwd='password', db='tianmao', charset='utf8mb4')
    try:
        cur = conn.cursor()
        cur.execute("select count(*) from iphone;")
        # fetchone() yields a one-column row; unpack it to the bare number.
        total = cur.fetchone()[0]
        print("总数: "+str(total))
        xValues = count_colors(cur, PIE_COLORS)
    finally:
        conn.close()

    if not any(xValues):
        # A pie of all zeros cannot be normalised, so there is nothing to draw.
        print("没有可用于绘图的颜色数据")
        return

    plt.figure()
    plt.pie(xValues, labels=PIE_COLORS, autopct='%.2f%%')
    # The size must go in by keyword: the second positional parameter of
    # title() is `fontdict`, which has to be a dict.
    plt.title("颜色比例图", fontsize=14)
    plt.show()


if __name__ == "__main__":
    main()
-
-
-
-
-
效果:
由图可见,最受欢迎的颜色为深空灰色。
词云图:
- # -*- coding: utf-8 -*-
- """
- Created on Mon Mar 4 16:55:25 2019
- @author: Lenovo
- """
- import mysql.connector
- import matplotlib.pyplot as plt
-
- from wordcloud import WordCloud,STOPWORDS,ImageColorGenerator
- import jieba
-
# Words that carry no signal for this product and are left out of the cloud.
EXTRA_STOPWORDS = (
    '此用户没有填写评论!',
    '儿子',
    '第一次',
    '手机',
    '苏宁',
    '苹果',
    '还是',
    '不错',
    '问题',
    '收到',
    '用户没有',
    '那天',
    '非常',
)


def load_comments(path='content.txt'):
    """Read one comment per line from *path*, dropping blank lines.

    Args:
        path: UTF-8 text file holding the review texts.

    Returns:
        A list of comment strings with surrounding whitespace removed.
    """
    with open(path, mode='r', encoding='utf-8') as f:
        stripped = (line.strip() for line in f)
        return [line for line in stripped if line]


def main():
    """Segment the saved comments and render them as a word cloud."""
    comments = load_comments('content.txt')
    print('readlines:'+str(len(comments)))

    # Join the raw text before segmenting. Passing str(list) would feed the
    # list's repr (brackets, quotes and literal "\n" escapes) to the tokenizer.
    comment_after_split = jieba.cut('\n'.join(comments), cut_all=False)  # accurate (non-full) mode
    words = ' '.join(comment_after_split)  # space-separated tokens
    if not words.strip():
        # Nothing to draw; skip rendering rather than generating from no text.
        print('没有可用于生成词云的评论')
        return

    stopwords = set(STOPWORDS).union(EXTRA_STOPWORDS)

    # Background image that defines the shape of the cloud.
    bg_image = plt.imread('bg.jpg')
    # Arguments: canvas width/height, background colour, mask image, font
    # file, stopwords, largest font size and a fixed random seed.
    wc = WordCloud(width=1024, height=768, background_color='white', mask=bg_image, font_path='STKAITI.TTF',
                   stopwords=stopwords, max_font_size=400, random_state=50)
    wc.generate_from_text(words)
    plt.imshow(wc)
    plt.axis('off')  # hide the axes
    plt.show()
    # Save the rendered image next to the script.
    wc.to_file('词云图.jpg')


if __name__ == "__main__":
    main()
-
-
-
-
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。