赞
踩
由于没有找到途牛酒店列表页 URL 的规律,这里只能用 Selenium 模拟页面操作,逐个城市慢慢爬取数据。页面加载很慢,用一台电脑爬取 4000+ 条数据可能需要数小时。本文只简单实现了爬取每个城市的第一页数据,可以在这个基础上实现多个页面同时爬取,并补充翻页的功能。
一个py文件和一个文本文件就可以爬取了
首先是py文件
import json
import re
import time

import pymysql
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By

# CSS selector of the city search box on the hotel list page.
_CITY_INPUT_SELECTOR = ".city-div > input:nth-child(1)"

# Patterns applied to the HTML of a single hotel card. They are compiled once
# here instead of once per hotel inside the scraping loop.
# Hotel name.
_NAME_PATTERN = re.compile(r'span.*?hotel-name f-m.*?>(.*?)</span>')
# Diamond icon; counted the same way as a star.
_DIAMOND_PATTERN = re.compile(r'(icon icon-diamond)')
# Star icon.
_STAR_PATTERN = re.compile(r'(icon icon-star)')
# Rating.
_RATING_PATTERN = re.compile(
    r'"hotel-score f-b f-DINA" data-v-74d0f10f="" style="background: rgb.*?;">(.*?)</div')
# Number of comments.
_COMMENT_PATTERN = re.compile(
    r'</span><span class="comment-amount f-r" data-v-74d0f10f="">(.*?)条评论')
# Price.
_PRICE_PATTERN = re.compile(
    r'<span class="amount f-b f-DINA" data-v-74d0f10f="">(.*?)</span><span class="qi')

# Parameterized insert: the driver escapes the values, so a hotel name that
# contains a quote can neither break the statement nor inject SQL.
_INSERT_SQL = """
    insert into `TC_hotel`
    (hname,hbrand,province,city,starlevel,rating,comment_count,price)values
    (%s,%s,%s,%s,%s,%s,%s,%s)
"""

# Start the Chrome driver.
# NOTE(review): passing the driver path positionally is the Selenium 3 style;
# recent Selenium 4 releases expect a Service object instead - confirm the
# installed Selenium version before changing this line.
driver = webdriver.Chrome("chromedriver.exe")
# Open the Tuniu hotel list page (the URL points at Guangzhou).
driver.get("https://hotel.tuniu.com/list/602p0s0b0?cityName=%E5%B9%BF%E5%B7%9E")
# Maximize the window; the click offsets used below are absolute coordinates.
driver.maximize_window()
# Sleep 3 seconds to let the page load.
time.sleep(3)


def judgeLen(temp):
    """Return the first element of *temp*, or the string "null" if it is empty."""
    return temp[0] if temp else "null"


def _switch_city(place):
    """Type *place* into the city box and click the first suggestion."""
    city_input = driver.find_element(By.CSS_SELECTOR, _CITY_INPUT_SELECTOR)
    # Clear the previous city before typing the new one.
    city_input.clear()
    city_input.send_keys(place)
    time.sleep(2)
    # Click the first entry of the suggestion box; the page then jumps to
    # that city.
    ActionChains(driver).move_by_offset(226, 263).click().perform()
    # move_by_offset is relative to the current pointer position, so move
    # back to the origin to keep the next click at the same coordinates.
    ActionChains(driver).move_by_offset(-226, -263).perform()
    time.sleep(5)


def _parse_hotel(html):
    """Extract (hname, hbrand, star, rating, comm, price) from one hotel card.

    Fields that cannot be found are returned as the string "null".
    """
    name = judgeLen(_NAME_PATTERN.findall(html))
    if name.find("(") > 0:
        # "brand(branch)": text before "(" is the brand, text inside the
        # parentheses is stored as the hotel name.
        hname = name.split("(")[1][:-1]
        hbrand = name.split("(")[0]
    else:
        hname = name
        hbrand = "其他"

    # Star level is the number of icons; diamonds take precedence over stars.
    icon_count = len(_DIAMOND_PATTERN.findall(html))
    if icon_count == 0:
        icon_count = len(_STAR_PATTERN.findall(html))
    star = str(icon_count) + "星"

    rating = judgeLen(_RATING_PATTERN.findall(html))
    comm = judgeLen(_COMMENT_PATTERN.findall(html))
    price = judgeLen(_PRICE_PATTERN.findall(html))
    return hname, hbrand, star, rating, comm, price


def getData():
    """Scrape the first result page of every city and store it in MySQL.

    Reads the province -> ["code|city name", ...] mapping from AllCity.txt,
    switches the already opened page to each city in turn, parses every
    ``div.hotel-item`` card and inserts one row per hotel into `TC_hotel`.
    The table must already exist. The connection and cursor are closed when
    the function returns or raises.
    """
    # Load the list of all cities, grouped by province.
    with open("AllCity.txt", mode="r", encoding="utf-8") as file:
        jsondata = json.load(file)

    connect = pymysql.connect(host="xxxxx", port=12345, user="xxx", passwd="xxxx",
                              database="mydata", charset="utf8")
    try:
        cursor = connect.cursor()
        try:
            # pro is the province; each entry of its list is "code|city name".
            for pro in jsondata:
                for city in jsondata[pro]:
                    # The part after "|" is the Chinese city name.
                    place = str(city).split("|")[1]
                    _switch_city(place)

                    # Parse the page currently shown by the driver and visit
                    # every hotel card on it.
                    bs = BeautifulSoup(driver.page_source, "html.parser")
                    for div in bs.find_all("div", class_="hotel-item"):
                        hname, hbrand, star, rating, comm, price = _parse_hotel(str(div))
                        cursor.execute(
                            _INSERT_SQL,
                            (hname, hbrand, pro, place, star, rating, comm, price),
                        )
                        # Commit per row so an interruption loses at most the
                        # hotel being processed.
                        connect.commit()
                        print("插入数据 "+str(pro), str(place), str(hname), str(hbrand), str(star), str(rating), str(comm), str(price))
        finally:
            cursor.close()
    finally:
        connect.close()


if __name__ == '__main__':
    try:
        getData()
    finally:
        # Always shut the browser and the chromedriver process down.
        driver.quit()
还有一个文本文件
copy过去即可
{ "北京": ["bj|北京"], "天津": ["tj|天津"], "上海": ["sh|上海"], "台湾": ["tw|台湾"], "香港": ["hk|香港"], "澳门": ["am|澳门"], "河北": ["bd|保定", "cangzhou|沧州", "chengde|承德", "dingzhou|定州", "gt|馆陶", "hd|邯郸", "hs|衡水", "lf|廊坊", "qhd|秦皇岛", "sjz|石家庄", "ts|唐山", "xt|邢台", "zjk|张家口", "zd|正定", "zx|赵县", "zhangbei|张北"], "河南": ["ay|安阳", "changge|长葛", "hb|鹤壁", "jiaozuo|焦作", "jiyuan|济源", "kaifeng|开封", "luoyang|洛阳", "luohe|漯河", "mg|明港", "ny|南阳", "pds|平顶山", "puyang|濮阳", "sq|商丘", "smx|三门峡", "xx|新乡", "xc|许昌", "xy|信阳", "yuzhou|禹州", "yanling|鄢陵", "zz|郑州", "zk|周口", "zmd|驻马店"], "黑龙江": ["dq|大庆","dxal|大兴安岭", "hrb|哈尔滨", "hegang|鹤岗", "heihe|黑河", "jms|佳木斯", "jixi|鸡西", "mdj|牡丹江", "qqhr|齐齐哈尔", "qth|七台河", "suihua|绥化", "sys|双鸭山", "yich|伊春"], "吉林": ["bc|白城", "baishan|白山", "cc|长春", "jl|吉林", "liaoyuan|辽源", "songyuan|松原", "sp|四平", "th|通化", "yanbian|延边"], "辽宁" : ["as|鞍山", "benxi|本溪", "cy|朝阳", "dl|大连", "dandong|丹东", "fushun|抚顺", "fx|阜新", "hld|葫芦岛", "jinzhou|锦州", "liaoyang|辽阳", "pj|盘锦", "sy|沈阳", "tl|铁岭", "wfd|瓦房店", "yk|营口", "pld|庄河"], "山东": ["bz|滨州", "dz|德州", "dy|东营", "heze|菏泽", "jn|济南", "jining|济宁", "kl|垦利", "linyi|临沂", "lc|聊城", "lw|莱芜", "qd|青岛", "rizhao|日照", "shouguang|寿光", "longkou|龙口", "ta|泰安", "wf|潍坊", "weihai|威海", "yt|烟台", "zb|淄博", "zaozhuang|枣庄", "zhangqiu|章丘", "zc|诸城"], "内蒙古": ["alsm|阿拉善盟", "bt|包头", "bycem|巴彦淖尔", "chifeng|赤峰", "erds|鄂尔多斯", "hu|呼和浩特", "hlbe|呼伦贝尔", "hlr|海拉尔", "tongliao|通辽", "wuhai|乌海", "wlcb|乌兰察布", "xl|锡林郭勒", "xam|兴安盟"], "江苏": ["cz|常州", "dafeng|大丰", "danyang|丹阳", "dongtai|东台", "donghai|东海", "ha|淮安", "haimen|海门", "haian|海安", "jingjiang|靖江", "jianhu|建湖", "liyang|溧阳", "lyg|连云港", "nj|南京", "nt|南通", "pizhou|邳州", "qidong|启东", "rugao|如皋", "rudong|如东", "su|苏州", "shuyang|沭阳", "suqian|宿迁", "taizhou|泰州", "taixing|泰兴", "wx|无锡", "xinghuashi|兴化", "xinyishi|新沂", "xz|徐州", "xzpeixian|沛县", "yangzhong|扬中", "yz|扬州", "yancheng|盐城", "zj|镇江"], "安徽": ["anqing|安庆", "bengbu|蚌埠", "bozhou|亳州", "ch|巢湖", "chizhou|池州", "chuzhou|滁州", "fy|阜阳", "hf|合肥", "hn|淮南", "huaibei|淮北", "huangshan|黄山", "hexian|和县", "hq|霍邱", "la|六安", "mas|马鞍山", 
"ningguo|宁国", "suzhou|宿州", "tianchang|天长", "tongling|铜陵", "tongcheng|桐城", "wuhu|芜湖", "xuancheng|宣城"], "山西": ["changzhi|长治", "dt|大同", "jincheng|晋城", "jz|晋中", "lvliang|吕梁", "linfen|临汾", "linyixian|临猗", "qingxu|清徐", "shuozhou|朔州", "ty|太原", "xinzhou|忻州", "yuncheng|运城", "yq|阳泉"], "陕西": ["ankang|安康", "baoji|宝鸡", "hanzhong|汉中", "sl|商洛", "tc|铜川", "wn|渭南", "xa|西安", "xianyang|咸阳", "yanan|延安", "yl|榆林"], "甘肃": ["by|白银", "dx|定西", "gn|甘南", "jinchang|金昌", "jyg|嘉峪关", "jq|酒泉", "lz|兰州", "linxia|临夏", "ln|陇南", "pl|平凉", "qingyang|庆阳", "tianshui|天水", "wuwei|武威", "zhangye|张掖"], "浙江": ["hz|杭州", "cixi|慈溪", "changxing|长兴", "deqing|德清", "dongyang|东阳", "haining|海宁", "huzhou|湖州", "jiashanx|嘉善", "jx|嘉兴", "jh|金华", "lishui|丽水", "nb|宁波", "quzhou|衢州", "ruiancity|瑞安", "sx|绍兴", "tongxiang|桐乡", "tz|台州", "wenling|温岭", "wz|温州", "xiangshanxian|象山", "yiwu|义乌", "yueqingcity|乐清", "yuyao|余姚", "zhoushan|舟山", "zhuji|诸暨"], "江西": ["fuzhou|抚州", "ganzhou|赣州", "jj|九江", "ja|吉安", "jdz|景德镇", "nc|南昌", "px|萍乡", "sr|上饶", "xinyu|新余", "yingtan|鹰潭", "yichun|宜春", "yxx|永新"], "湖北": ["es|恩施", "ez|鄂州", "hshi|黄石", "hg|黄冈", "jingzhou|荆州", "jingmen|荆门", "qianjiang|潜江", "shiyan|十堰", "snj|神农架", "suizhou|随州", "tm|天门", "wh|武汉", "xf|襄阳", "xiaogan|孝感", "xiantao|仙桃", "xianning|咸宁", "yc|宜昌", "yidou|宜都"], "湖南": ["cs|长沙", "changde|常德", "chenzhou|郴州", "hy|衡阳", "hh|怀化", "ld|娄底", "shaoyang|邵阳", "xiangtan|湘潭", "xiangxi|湘西", "yy|岳阳", "yongzhou|永州", "yiyang|益阳", "zhuzhou|株洲", "zjj|张家界"], "贵州": ["anshun|安顺", "bijie|毕节", "gy|贵阳", "lps|六盘水", "qdn|黔东南", "qn|黔南", "qxn|黔西南", "tr|铜仁", "zunyi|遵义"], "四川": ["ab|阿坝", "bazhong|巴中", "cd|成都", "deyang|德阳", "dazhou|达州", "ga|广安", "guangyuan|广元", "ganzi|甘孜", "ls|乐山", "luzhou|泸州", "liangshan|凉山", "mianyang|绵阳", "ms|眉山", "scnj|内江", "nanchong|南充", "panzhihua|攀枝花", "suining|遂宁", "yb|宜宾", "ya|雅安", "zg|自贡", "zy|资阳"], "云南": ["bs|保山", "cx|楚雄", "dali|大理", "diqing|迪庆", "dh|德宏", "honghe|红河", "km|昆明", "lj|丽江", "lincang|临沧", "nujiang|怒江", "pe|普洱", "qj|曲靖", "ws|文山", "bn|西双版纳", "yx|玉溪", "zt|昭通"], "新疆": ["aks|阿克苏", "ale|阿拉尔", 
"bygl|巴音郭楞", "betl|博尔塔拉", "changji|昌吉", "hami|哈密", "ht|和田", "klmy|克拉玛依", "kel|库尔勒", "ks|喀什", "kzls|克孜勒苏", "shz|石河子", "tlf|吐鲁番", "tmsk|图木舒克", "xj|乌鲁木齐", "wjq|五家渠", "yili|伊犁", "alt|阿勒泰", "tac|塔城"], "宁夏": ["guyuan|固原", "szs|石嘴山", "wuzhong|吴忠", "yinchuan|银川", "zw|中卫"], "青海": ["guoluo|果洛", "huangnan|黄南", "hx|海西", "haidong|海东", "haibei|海北", "hainan|海南", "xn|西宁", "ys|玉树"], "西藏": ["al|阿里", "changdu|昌都", "lasa|拉萨", "linzhi|林芝", "nq|那曲", "rkz|日喀则", "sn|山南", "rituxian|日土", "gaizexian|改则"], "广西": ["baise|百色", "bh|北海", "chongzuo|崇左", "fcg|防城港", "gl|桂林", "gg|贵港", "hc|河池", "hezhou|贺州", "liuzhou|柳州", "lb|来宾", "nn|南宁", "qinzhou|钦州", "wuzhou|梧州", "yulin|玉林"], "广东": ["chaozhou|潮州", "dg|东莞", "fs|佛山", "gz|广州", "huidong|惠东", "huizhou|惠州", "heyuan|河源", "jm|江门", "jy|揭阳", "mm|茂名", "mz|梅州", "qingyuan|清远", "sd|顺德", "sz|深圳", "st|汕头", "sg|韶关", "sw|汕尾", "taishan|台山", "yj|阳江", "yangchun|阳春", "yf|云浮", "zh|珠海", "zs|中山", "zhanjiang|湛江", "zq|肇庆", "boluo|博罗"], "福建": ["fz|福州", "jinjiangshi|晋江", "ly|龙岩", "nd|宁德", "np|南平", "nananshi|南安", "pt|莆田", "qz|泉州", "sm|三明", "shishi|石狮", "wuyishan|武夷山", "xm|厦门", "zhangzhou|漳州"], "海南": ["haikou|海口", "sansha|三沙", "sanya|三亚", "wzs|五指山", "qh|琼海", "wenchang|文昌", "wanning|万宁", "tunchang|屯昌", "qiongzhong|琼中", "lingshui|陵水", "df|东方", "da|定安", "cm|澄迈", "baoting|保亭", "baish|白沙", "tanzhou|儋州"] }
爬出来的数据表(星钻可以不作区分)
设计表(方便插入数据就全部varchar,见谅)
圈起来的就是用到的
首先是app.py文件
from collections import defaultdict

from flask import Flask, render_template
from flask_sqlalchemy import SQLAlchemy

app = Flask(__name__)
app.config['SQLALCHEMY_DATABASE_URI'] = 'mysql+pymysql://用户名:密码@域名:端口/数据库?charset=utf8'
app.config.setdefault('SQLALCHEMY_TRACK_MODIFICATIONS', True)
db = SQLAlchemy(app)

# Requirements implemented by this app:
# 1) Compute a composite score for every hotel. First convert three fields:
#    - rating and comment_count are min-max normalized to [0, 1], giving the
#      rating score and the comment score;
#    - the star score is: number of stars * 0.2.
#    The composite score is the weighted average of the star score (30%),
#    the rating score (50%) and the comment score (20%).
# 2) Compute the average composite score of the hotels in each province.
# 3) The main title is "全国各省酒店综合得分" (red, bold).
# 4) Render the per-province composite scores on a map of China.


# SQLAlchemy mapping of the scraped hotel table (every column is a string).
class yang_Table(db.Model):
    __tablename__ = 'tc_hotel'
    hname = db.Column(db.String(50), primary_key=True)
    hbrand = db.Column(db.String(50))
    province = db.Column(db.String(50))
    city = db.Column(db.String(50))
    starlevel = db.Column(db.String(50))
    rating = db.Column(db.String(50))
    comment_count = db.Column(db.String(50))
    price = db.Column(db.String(50))


def _to_float(text):
    """Return *text* as a float, or None when it is not a number."""
    try:
        return float(text)
    except (TypeError, ValueError):
        return None


def _star_count(text):
    """Return the leading integer of a star level such as "4星", or None."""
    try:
        # Drop the one-character suffix, keep the count.
        return int(text[:-1])
    except (TypeError, ValueError):
        return None


def _normalize(value, low, span):
    """Min-max normalize *value*, rounded to 2 decimals; 0.0 if *span* is 0."""
    if span == 0:
        return 0.0
    return round((value - low) / span, 2)


@app.route("/")
@app.route("/china")
def china():
    """Render the map of average composite hotel scores per province.

    Rows whose star level, rating or comment count cannot be parsed as a
    number (for example the "null" placeholder) are left out of the
    statistics. An empty table renders an empty map.
    """
    rows = db.session.query(yang_Table.province, yang_Table.starlevel,
                            yang_Table.rating, yang_Table.comment_count).all()

    # Keep only the rows whose three numeric fields can be parsed.
    hotels = []
    for province, starlevel, rating, comment_count in rows:
        parsed = (_star_count(starlevel), _to_float(rating), _to_float(comment_count))
        if any(value is None for value in parsed):
            continue
        hotels.append((province,) + parsed)

    # Bounds for the min-max normalization.
    ratings = [hotel[2] for hotel in hotels]
    comments = [hotel[3] for hotel in hotels]
    rating_min = min(ratings, default=0.0)
    rating_span = max(ratings, default=0.0) - rating_min
    comment_min = min(comments, default=0.0)
    comment_span = max(comments, default=0.0) - comment_min

    # Composite score of every hotel, grouped by province. Grouping by key
    # makes the result independent of the order of the rows.
    scores_by_province = defaultdict(list)
    for province, stars, rating, comment in hotels:
        star_score = round(float(stars * 0.2), 2)
        rating_score = _normalize(rating, rating_min, rating_span)
        comment_score = _normalize(comment, comment_min, comment_span)
        # The weights 30/50/20 put the composite score on a 0-100 scale.
        score = round(float(star_score * 30 + rating_score * 50 + comment_score * 20), 2)
        scores_by_province[province].append(score)

    # Convert the averages to the [{"name": ..., "value": ...}] list that is
    # passed to the template as the map series data.
    result = [
        {"name": province, "value": round(float(sum(scores) / len(scores)), 2)}
        for province, scores in scores_by_province.items()
    ]

    title = "全国各省酒店综合得分"
    tips = '综合得分'
    # Hand the data to the front end.
    return render_template("china.html", data=result, title=title, tips=tips)


if __name__ == "__main__":
    app.run(host='127.0.0.1', port=5222, debug=True)
然后就是html文件
<!DOCTYPE html>
<html lang="zh-CN">
<head>
    <meta charset="UTF-8">
    <title>数据可视化</title>
    <style>
        #map {
            width : 1000px;
            height: 600px;
            margin : 50px auto;
        }
    </style>
</head>
<body>
<div id="map">
</div>
<!-- ECharts library and the China map script; both are loaded inside <body>
     because script elements are not allowed after the closing body tag. -->
<script src="../static/js/echarts.js" charset="utf-8"></script>
<script src="../static/js/china.js" charset="utf-8"></script>
<script>
    var myChart = echarts.init(document.getElementById('map'));
    // The values passed by the view are serialized with the tojson filter so
    // that they are always valid JavaScript literals, whatever characters or
    // values they contain.
    var option = {
        title: {
            text: {{ title|tojson }},
            textStyle: {
                color: 'red',
                fontSize: 16,
                fontWeight: 'bolder'
            },
            left: '40%'
        },
        tooltip: {
            // Shows the series name, then "province:value".
            formatter: function (params, ticket, callback) {
                return params.seriesName + '<br />' + params.name + ':' + params.value
            }
        },
        visualMap: {
            // The composite score is on a 0-100 scale.
            min: 0,
            max: 100,
            left: 'left',
            top: 'bottom',
            text: ['高', '低'],
            inRange: {
                color: ['#00FF00', '#FFFF00', '#FF0000']
            },
            show: true
        },
        geo: {
            map: 'china',
            roam: false,
            zoom: 1.23,
            label: {
                normal: {
                    show: true,
                    fontSize: '10',
                    color: 'rgba(0,0,0,0.7)'
                }
            },
            itemStyle: {
                normal: {
                    borderColor: 'rgba(0, 0, 0, 0.2)'
                },
                emphasis: {
                    areaColor: '#F3B329',
                    shadowOffsetX: 0,
                    shadowOffsetY: 0,
                    shadowBlur: 20,
                    borderWidth: 0,
                    shadowColor: 'rgba(0, 0, 0, 0.5)'
                }
            }
        },
        series: [
            {
                name: {{ tips|tojson }},
                type: 'map',
                // Draw this series on the geo component declared above.
                geoIndex: 0,
                data: {{ data|tojson }}
            }
        ]
    };
    myChart.setOption(option);
</script>
</body>
</html>
js文件如果没有的话可以私信我
原创不易,请给博主一个小小的赞吧~
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。