赞
踩
搜索旅游目的地,例如这里搜索“某某森林公园”,并跳转到相应的评论页
右键单击网页,选择**“检查”**,或者按F12
第一步
第二步
第三步
第四步
## 1. Import third-party packages
import json
import requests
from bs4 import BeautifulSoup
import bs4

## 2. Build the request body by copying the structure shown in the browser's
##    network inspector ("payload" tab for this endpoint).
payload = {
    "arg": {
        "channelType": 2,
        "collapseType": 0,
        "commentTagId": 0,
        "pageIndex": 3,       # which page of comments to fetch
        "pageSize": 10,       # comments per page
        "poiId": 78198,       # POI (scenic-spot) id — change this for other spots
        "sortType": 3,
        "sourceType": 1,
        "starType": 0
    },
    "head": {
        "auth": "",
        "cid": "09031167319556132059",
        "ctok": "",
        "cver": "1.0",
        "extension": [],
        "lang": "01",
        "sid": "8888",
        "syscode": "09",
        "xsid": ""
    }
}

## 3. Fetch the page data (endpoint taken from the inspector's "headers" tab).
post_url = "https://m.ctrip.com/restapi/soa2/13444/json/getCommentCollapseList"
# timeout added so a stalled connection cannot hang the script forever
html = requests.post(post_url, data=json.dumps(payload), timeout=30).text

## 4. Parse the response (only the comment text is shown here for the demo).
html_data = json.loads(html)
html_data['result']['items'][0]['content']  # the 0th comment
第一步,使用上述方法找到景区的编码
第二步,更新爬虫参数
第三步,查看返回结果
import pandas as pd
import numpy as np
import os
import json
import time
import requests
from bs4 import BeautifulSoup
import bs4


# 1. POST the Ctrip comment endpoint with the given payload and return the raw
#    response text. Returns "" on any network-level failure so callers can
#    treat a failed page as "no data" and keep going.
def get_xiecheng_comments(post_url, post_data):
    try:
        return requests.post(post_url, data=json.dumps(post_data), timeout=30).text
    except requests.RequestException:
        # BUG FIX: narrowed from a bare `except:`, which also swallowed
        # KeyboardInterrupt/SystemExit and hid programming errors.
        return ""


# 2. Parse the fields we need out of one page's JSON response string.
#    Returns four parallel lists: comment texts, their lengths, scores, times.
#    Always returns the 4-tuple, even for an empty/failed page.
def select_data(html_str):
    comments = []
    scores = []
    times = []
    length = []
    if html_str:
        html = json.loads(html_str)
        comment_data = html['result']['items']
        for comment in comment_data:
            com = comment['content']
            comments.append(com)
            length.append(len(com))
            scores.append(comment['score'])
            # first 10 chars of publishTypeTag appear to be a "YYYY-MM-DD"
            # date prefix — TODO confirm against a live response
            times.append(comment['publishTypeTag'][:10])
    return comments, length, scores, times


# 3. Build the Ctrip request body for one page of one POI
#    (structure is specific to this Ctrip endpoint).
def payload_data(page_number, poi_id):
    data = {
        "arg": {
            "channelType": 2,
            "collapseType": 0,
            "commentTagId": 0,
            "pageIndex": page_number,
            "pageSize": 10,
            "poiId": poi_id,
            "sortType": 3,
            "sourceType": 1,
            "starType": 0
        },
        "head": {
            "auth": "",
            "cid": "09031167319556132059",
            "ctok": "",
            "cver": "1.0",
            "extension": [],
            "lang": "01",
            "sid": "8888",
            "syscode": "09",
            "xsid": ""
        }
    }
    return data


# 4. Crawl every comment page of one route and accumulate the results.
#    Returns (raw page strings, comments, scores, times, comment lengths).
def get_tourist_route_comments(post_url, total_pages, route_query_id):
    html_data = []
    comments = []
    scores = []
    times = []
    length = []
    for page in range(1, total_pages + 1):
        # fetch one page
        payload = payload_data(page, route_query_id)
        html_str = get_xiecheng_comments(post_data=payload, post_url=post_url)
        # BUG FIX: the original `html_data += html_str` extended the list with
        # the response string one *character* at a time; append keeps one raw
        # JSON string per page as intended.
        html_data.append(html_str)
        # parse it and extend the accumulators
        page_comments, page_com_length, page_scores, page_times = select_data(html_str)
        comments += page_comments
        scores += page_scores
        times += page_times
        length += page_com_length
    return html_data, comments, scores, times, length
# 5. Save the crawled data of one route to an Excel file.
def save_data_to_excel(comments, scores, times, com_length, query_id):
    """Write comments, scores, times and comment lengths to ./xiecheng/<query_id>.xlsx."""
    # build the data frame (four parallel columns)
    save_data = pd.DataFrame({
        "comment": comments,
        "score": scores,
        "time": times,
        # BUG FIX: the original referenced the global name `length` instead of
        # the `com_length` parameter — a NameError when run without the global,
        # and silently wrong data when the global happened to exist.
        "comment_length": com_length,
    })
    # build the save path; create the folder first so to_excel does not fail
    # with FileNotFoundError on a fresh checkout
    save_dir = "./xiecheng/"
    os.makedirs(save_dir, exist_ok=True)
    save_path = save_dir + str(query_id) + ".xlsx"
    save_data.to_excel(save_path, index=False)
#### 按照queryID下载数据并保存到Excel中
# --- Step 1: crawl all comment pages for one route ---------------------------
# To download another route's comments, update the two values below and rerun.
post_url = "https://m.ctrip.com/restapi/soa2/13444/json/getCommentCollapseList"
total_pages = 298  # number of comment pages this route has
query_id = 78137   # the route's id on Ctrip
print("========================== start get data ==========================")
html_data, comments, scores, times, length = get_tourist_route_comments(post_url, total_pages, query_id)

# --- Step 2: persist the crawled data to Excel -------------------------------
print("========================== start save data ==========================")
save_data_to_excel(comments, scores, times, length, query_id)
print("========================== done ==========================")
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。