This post records, in code-plus-analysis form, how to crawl hotel information for multiple cities from Tripadvisor using the Scrapy framework and the requests library: 3,000,000+ records (1.09 GB), with a runtime of about 7 hours. Crawling multiple cities works the same way as crawling one, so to keep the code from becoming too long, this post uses the review data of London hotels as the only example. It assumes the reader already knows the basics of Python crawlers, the Scrapy framework, and HTML. Corrections and questions are welcome in the comments.
Goal: fetch the review data for every hotel in London and save the files into different folders according to hotel star rating.
This can be done in two steps: first collect the hotel URLs, then fetch each hotel's data.
Search for London under the Hotels section to reach the London hotels page (inspecting it in the browser console shows that the listing content is present in the page source). Each page shows 30 hotels; paging onward:
Page 2: https://www.tripadvisor.com/Hotels-g186338-oa30-London_England-Hotels.html
Page 3: https://www.tripadvisor.com/Hotels-g186338-oa60-London_England-Hotels.html
Continuing through the pages shows that the number after oa is the starting index of the hotels on the current page (it grows by 30 per page). Combined with the total hotel count shown on the page, a loop can generate every listing-page URL, and from those pages each hotel's URL and name can be collected.
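A minimal sketch of generating the listing-page URLs from the oa offset pattern, assuming the total of 4,364 hotels that the spider below also uses (the count shown on the page changes over time):

import requests

# The first page has no oa segment; every later page adds 30 to the offset.
total_hotels = 4364
page_urls = ['https://www.tripadvisor.com/Hotels-g186338-London_England-Hotels.html']
for offset in range(30, total_hotels, 30):
    page_urls.append(
        f'https://www.tripadvisor.com/Hotels-g186338-oa{offset}-London_England-Hotels.html')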
Analyzing an individual hotel page shows that the static hotel data (name, star rating, location) lives in the page source, while the review-related data is loaded from https://www.tripadvisor.com/data/graphql/ids
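Each hotel URL already carries the two IDs the GraphQL endpoint needs: the number after g is the city's geoId and the number after d is the hotel's locationId. A minimal sketch of extracting them, matching the regex used in the main script later (the d number in this example URL is made up for illustration):

import re

# g186338 is London's geoId; d1234567 stands in for a hotel's locationId.
url = 'https://www.tripadvisor.com/Hotel_Review-g186338-d1234567-Reviews-Some_Hotel-London_England.html'
geoId, locationId = re.findall(r'\d+', url)[:2]
print(geoId, locationId)  # 186338 1234567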
This step is implemented with the Scrapy framework, mainly as practice; the requests library would work just as well.
import scrapy


class TripadivisorItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # hotel name
    hotel = scrapy.Field()
    # hotel URL
    href = scrapy.Field()
    # review count
    commentNum = scrapy.Field()
import scrapy
import bs4
from ..items import TripadivisorItem


class TripSpider(scrapy.Spider):
    # spider name
    name = 'trip'
    # restrict the domains the spider may crawl
    allowed_domains = ['www.tripadvisor.com']
    # the first listing page has no oa segment
    start_urls = ['https://www.tripadvisor.com/Hotels-g186338-London_England-Hotels.html']
    # 4364 hotels in total, 30 per page
    pages = int(4364 / 30) + 1
    for page in range(1, pages):
        # oa is the starting index of the page, so it grows by 30 per page
        url = 'https://www.tripadvisor.com/Hotels-g186338-oa' + str(page * 30) + '-London_England-Hotels.html'
        start_urls.append(url)

    # extract each hotel's name, URL and review count from a listing page
    def parse(self, response):
        bs = bs4.BeautifulSoup(response.text, 'html.parser')
        datas = bs.find_all('div', class_='meta_listing')
        for data in datas:
            item = TripadivisorItem()
            data = data.find('div', class_='main_col')
            # hotel name
            item['hotel'] = data.find('div', class_='listing_title').find('a').text
            # hotel URL
            item['href'] = 'https://www.tripadvisor.com' + data.find('div', class_='listing_title').find('a')['href']
            # review count
            item['commentNum'] = data.find('a', class_='review_count').text
            yield item
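Assuming a standard Scrapy project layout (the project name tripadivisor is used here, matching the item import above), the spider is started from the project root with:

scrapy crawl trip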
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import openpyxl


class TripadivisorPipeline(object):
    # pipeline that writes every item into an Excel workbook

    def __init__(self):
        # runs once when the pipeline is instantiated
        self.wb = openpyxl.Workbook()  # create the workbook
        self.ws = self.wb.active  # get the active worksheet
        self.ws.append(['Hotel', 'URL', 'Review count'])  # header row

    def process_item(self, item, spider):
        # collect hotel name, hotel URL and review count into one row
        line = [item['hotel'], item['href'], item['commentNum']]
        # append the row to the worksheet
        self.ws.append(line)
        # hand the item back to the engine so any later pipelines can process it
        return item

    def close_spider(self, spider):
        # called when the spider finishes running
        self.wb.save('./all.xlsx')  # save the file
        self.wb.close()  # close the file
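As the template comment says, the pipeline must be enabled in settings.py; a minimal sketch, assuming the project package is named tripadivisor:

# settings.py: enable the Excel pipeline (300 is an arbitrary priority)
ITEM_PIPELINES = {
    'tripadivisor.pipelines.TripadivisorPipeline': 300,
}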
# default request headers
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    "referer": "https://www.tripadvisor.com/Tourism-g60763-New_York_City_New_York-Vacations.html",
    "user-agent": USER_AGENT,  # fill in your own user-agent string
}
# do not obey robots.txt
ROBOTSTXT_OBEY = False
This step also uses Scrapy; apart from the spider itself, everything matches the previous step, so only the spider is shown.
import scrapy
import bs4
import openpyxl
from ..items import StarItem


class StarSpider(scrapy.Spider):
    name = 'star'
    allowed_domains = ['tripadvisor.com']
    start_urls = []
    # read the hotel URLs collected in the previous step from the Excel file
    wb = openpyxl.load_workbook('./all.xlsx')
    sheet = wb[wb.sheetnames[0]]
    rows = sheet.max_row
    cols = sheet.max_column
    for i in range(2, rows + 1):
        cellValue = sheet.cell(row=i, column=2).value
        start_urls.append(cellValue)

    def parse(self, response):
        item = StarItem()
        bs = bs4.BeautifulSoup(response.text, 'html.parser')
        # hotel name
        item['hotel'] = bs.find('h1', id='HEADING').text
        # hotel URL
        item['url'] = response.url
        # hotel star rating
        try:
            item['star'] = bs.find('svg', class_='JXZuC')['aria-label'][0:3]
        except:
            item['star'] = 'None'
        # number of English reviews
        languages = bs.find_all('li', class_='ui_radio XpoVm')
        item['reviews'] = 0
        for language in languages:
            value = language.find('input')['value']
            # the extracted count is wrapped in parentheses; a regex can strip
            # them (or they can be removed later in Excel)
            if value == 'en':
                item['reviews'] = language.find('span', class_='POjZy').text
        yield item
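A minimal sketch of the parenthesis-stripping mentioned in the comment above, assuming the extracted span text looks like "(1,234)":

import re

raw = '(1,234)'  # illustrative review-count text as scraped from the page
reviews = int(re.sub(r'[^\d]', '', raw))  # strips '(', ')' and ',' -> 1234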
Fetching the reviews with the requests library:
# Fetch review data for the hotels listed in the Excel file.
# Note: the gevent monkey patch must come before the other imports,
# otherwise it raises an error.
from gevent import monkey
monkey.patch_all()
import requests
import os
import json
import openpyxl
import gevent
import time
import re


# fetch one page of reviews for one hotel
def getComment(geoId, locationId, page, star):
    try:
        post_url = 'https://www.tripadvisor.com/data/graphql/ids'
        data = [{"query": "0eb3cf00f96dd65239a88a6e12769ae1",
                 "variables": {"interaction": {"productInteraction": {
                     "interaction_type": "CLICK",
                     "site": {"site_name": "ta", "site_business_unit": "Hotels",
                              "site_domain": "www.tripadvisor.com"},
                     "pageview": {"pageview_request_uid": "b3ad9a52-d1c6-4bbe-8eae-a19f04fd67ff",
                                  "pageview_attributes": {"location_id": locationId, "geo_id": geoId,
                                                          "servlet_name": "Hotel_Review"}},
                     "user": {"user_agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36",
                              "site_persistent_user_uid": "web390a.218.17.207.101.187546A8056",
                              "unique_user_identifiers": {"session_id": "F61F132D22034DC242255F44CFE7A54C"}},
                     "search": {},
                     "item_group": {"item_group_collection_key": "b3ad9a52-d1c6-4bbe-8eae-a19f04fd67ff"},
                     "item": {"product_type": "Hotels", "item_id_type": "ta-location-id",
                              "item_id": locationId,
                              "item_attributes": {"element_type": "number", "action_name": "REVIEW_NAV",
                                                  "page_number": page, "offset": (page - 1) * 10,
                                                  "limit": 10}}}}}},
                {"query": "ea9aad8c98a6b21ee6d510fb765a6522",
                 "variables": {"locationId": locationId, "offset": (page - 1) * 10,
                               "filters": [{"axis": "LANGUAGE", "selections": ["en"]}],
                               "prefs": None, "initialPrefs": {}, "limit": 10,
                               "filterCacheKey": "locationReviewFilters_10810215",
                               "prefsCacheKey": "locationReviewPrefs_10810215",
                               "needKeywords": False,
                               "keywordVariant": "location_keywords_v2_llr_order_30_en"}},
                {"query": "dd297ef79164a42dba1997b10f33d055",
                 "variables": {"locationId": locationId, "application": "HOTEL_DETAIL",
                               "currencyCode": "HKD", "pricingMode": "BASE_RATE",
                               "sessionId": "F61F132D22034DC242255F44CFE7A54C",
                               "pageviewUid": "b3ad9a52-d1c6-4bbe-8eae-a19f04fd67ff",
                               "travelInfo": {"adults": 2, "rooms": 1, "checkInDate": "2023-04-18",
                                              "checkOutDate": "2023-04-19", "childAgesPerRoom": [],
                                              "usedDefaultDates": False},
                               "requestNumber": 2, "filters": None,
                               "route": {"page": "Hotel_Review",
                                         "params": {"detailId": locationId, "geoId": geoId,
                                                    "offset": "r10"}}}}]
        # the POST body must be JSON
        data = json.dumps(data)
        headers = {
            "user-agent": USER_AGENT,  # fill in your own user-agent string
            "content-type": "application/json; charset=UTF-8",
            "origin": "https://www.tripadvisor.com",
            "x-requested-by": "TNI1625!ADcEQn+9K+sw7mHgZbsyGI2UftS4iOyyNcdidQPAc+vtAMBvJBsHrS9UBz+Q8f+v5FCuxfo8nOBnILfs1y6pgcNquOYSBOwj3GtzKolFduhNO6O8lTRGC4Eyiv2wQEKhghYw3/0e4t12H4q6zCgiTy3gXUu6p6bZ6FOT8OyQCRVH",
        }
        try:
            response = requests.post(post_url, headers=headers, data=data, timeout=30)
        except:
            time.sleep(3)
            print('Retrying request')
            response = requests.post(post_url, headers=headers, data=data, timeout=30)
        # control characters that openpyxl refuses to write
        ILLEGAL_CHARACTERS_RE = re.compile(r'[\000-\010]|[\013-\014]|[\016-\037]')
        datas = response.json()[1]["data"]["locations"][0]['reviewListPage']['reviews']
        for data in datas:
            item = {}
            item['pages'] = pages
            item['page'] = page
            # hotel name
            item['hotel'] = response.json()[1]["data"]["locations"][0]["name"]
            # city id
            item['geoId'] = geoId
            # city the hotel is in
            item['city'] = data['location']['additionalNames']['geo']
            # hotel star rating
            item['star'] = star
            # reviewer id
            try:
                item['displayName'] = data['userProfile']['displayName']
            except:
                item['displayName'] = 'None'
            # strip illegal characters
            item['displayName'] = ILLEGAL_CHARACTERS_RE.sub(r'', item['displayName'])
            # reviewer's home location
            try:
                address = data["userProfile"]["hometown"]["location"]
                if address != None:
                    item['address'] = address['additionalNames']['long']
                    # strip illegal characters
                    item['address'] = ILLEGAL_CHARACTERS_RE.sub(r'', item['address'])
                else:
                    item['address'] = 'None'
            except:
                item['address'] = 'None'
            # reviewer's total review count and total helpful votes
            userProfile = data['userProfile']["contributionCounts"]
            if userProfile != None:
                # total reviews contributed by the reviewer
                item['contribution'] = userProfile['sumAllUgc']
                # total helpful votes received by the reviewer
                item['helpfulVotes'] = userProfile['helpfulVote']
            else:
                item['contribution'] = 'None'
                item['helpfulVotes'] = 'None'
            # helpful votes received by this review
            item['helpVote'] = data['helpfulVotes']
            # review date
            item['publishedDate'] = data['publishedDate']
            # stay date and trip type
            tripInfo = data['tripInfo']
            if tripInfo != None:
                item['stayDate'] = tripInfo['stayDate']
                item['tripType'] = tripInfo['tripType']
            else:
                item['stayDate'] = 'None'
                item['tripType'] = 'None'
            # overall rating
            item['rating'] = data["rating"]
            # per-aspect ratings: value, location, service, rooms, cleanliness, sleep quality
            item['value'] = 'None'
            item['location'] = 'None'
            item['service'] = 'None'
            item['rooms'] = 'None'
            item['cleanliness'] = 'None'
            item['sleepQuality'] = 'None'
            additionalRatings = data['additionalRatings']
            if additionalRatings != []:
                for rating in additionalRatings:
                    if rating["ratingLabel"] == "Value":
                        item['value'] = rating["rating"]
                    elif rating["ratingLabel"] == "Location":
                        item['location'] = rating["rating"]
                    elif rating["ratingLabel"] == "Service":
                        item['service'] = rating["rating"]
                    elif rating["ratingLabel"] == "Rooms":
                        item['rooms'] = rating["rating"]
                    elif rating["ratingLabel"] == "Cleanliness":
                        item['cleanliness'] = rating["rating"]
                    elif rating["ratingLabel"] == "Sleep Quality":
                        item['sleepQuality'] = rating["rating"]
            # number of photos attached to the review
            item['imgNum'] = len(data['photoIds'])
            # review text
            item['comment'] = data['text']
            item['comment'] = ILLEGAL_CHARACTERS_RE.sub(r'', item['comment'])
            line = [item['hotel'], item['star'], item['city'], item['displayName'],
                    item['address'], item['contribution'], item['helpfulVotes'],
                    item['helpVote'], item['publishedDate'], item['stayDate'],
                    item['tripType'], item['rating'], item['value'], item['location'],
                    item['service'], item['rooms'], item['cleanliness'],
                    item['sleepQuality'], item['imgNum'], item['comment']]
            reviewsList.append(line)
        print(item['hotel'] + ' reviews page ' + str(page))
    except (requests.exceptions.ConnectionError, requests.exceptions.Timeout):
        print('Request timed out, retrying')
        getComment(geoId, locationId, page, star)
    except:
        print('Request failed')
        # record the failed hotel and page so they can be retried later
        requestsList.append([geoId, locationId, page])


# save one hotel's reviews into the folder for its star rating
def storage(header, geoId, reviewsList, star, hotel):
    city = 'London'
    wb = openpyxl.Workbook()
    sheet = wb.active
    sheet.title = "commentInfo"
    sheet.append(header)
    for reviewList in reviewsList:
        sheet.append(reviewList)
    foldername = f'./data/{city}/{star}-star'
    if not os.path.exists(foldername):
        os.makedirs(foldername)
    wb.save(f'./data/{city}/{star}-star/{hotel}.xlsx')


if __name__ == "__main__":
    try:
        # read the hotel URLs, star ratings and review counts from star.xlsx
        wb = openpyxl.load_workbook('star.xlsx')
        sheet = wb[wb.sheetnames[0]]
        rows = sheet.max_row
        cols = sheet.max_column
        # requests that failed (hotel and page), for retrying later
        requestsList = []
        # hotels whose data failed to save, for debugging
        storeList = []
        for i in range(1, rows + 1):
            cellValue = sheet.cell(row=i, column=2).value
            idList = re.findall(r'\d+', cellValue)
            # city id
            geoId = idList[0]
            # hotel id
            locationId = idList[1]
            # hotel star rating
            star = sheet.cell(row=i, column=3).value
            # number of English reviews
            reviews = int(sheet.cell(row=i, column=4).value)
            # limit how many hotels are crawled: skip hotels with fewer
            # than 1000 English reviews
            if reviews < 1000:
                continue
            # number of review pages (10 reviews per page)
            pages = int(reviews / 10) + 1
            taskList = []
            reviewsList = []
            # coroutines speed the crawl up enormously
            for page in range(1, pages + 1):
                task = gevent.spawn(getComment, geoId, locationId, page, star)
                taskList.append(task)
            gevent.joinall(taskList)
            # save the results
            header = ['Hotel name', 'Star rating', 'City', 'Reviewer id',
                      'Reviewer location', 'Reviewer contributions',
                      'Reviewer helpful votes', 'Helpful votes on this review',
                      'Review date', 'Date of stay', 'Trip type',
                      'Overall rating', 'Value', 'Location', 'Service',
                      'Rooms', 'Cleanliness', 'Sleep Quality',
                      'Photo count', 'Review text']
            try:
                # only save if any reviews actually came back
                if reviewsList != []:
                    # use the hotel name (with '/' replaced) as the file name
                    storage(header, geoId, reviewsList, star,
                            reviewsList[0][0].replace('/', '-'))
                else:
                    storeList.append([locationId])
            except OSError:
                # the hotel name may not be a legal file name; fall back to the geoId
                if reviewsList != []:
                    storage(header, geoId, reviewsList, star, geoId)
                else:
                    print(str(geoId) + ' failed to save')
                    storeList.append([locationId])
            except:
                storeList.append([locationId])
    except:
        print(requestsList)
        print(storeList)
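Since requestsList records exactly which pages failed, a follow-up pass can re-fetch them after the main run. A minimal sketch, assuming each entry is extended to carry the star rating as a fourth field (the original code stores only geoId, locationId and page):

# Retry sketch: re-request the pages recorded in requestsList.
retryList = requestsList[:]
requestsList = []  # getComment re-appends anything that fails again
for geoId, locationId, page, star in retryList:
    getComment(geoId, locationId, page, star)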
The original plan was to implement everything with Scrapy, but when fetching the review data the Scrapy version never achieved real concurrency, making the crawl slow (roughly 10,000 reviews per hour), and I could not find a fix at the time. Switching to the requests library with gevent coroutines brought a large speedup (3,285,780 review records in about 7 hours). I will optimize the Scrapy version once I know the framework better.
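For reference, Scrapy's concurrency is normally tuned through settings rather than spider code; these are standard Scrapy settings, though whether they resolve the slowdown seen here is untested:

# settings.py: standard Scrapy concurrency knobs (values are illustrative)
CONCURRENT_REQUESTS = 32            # parallel requests overall
CONCURRENT_REQUESTS_PER_DOMAIN = 16  # parallel requests per domain
DOWNLOAD_DELAY = 0                  # no artificial delay between requests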
Beyond that, there are further directions in which this could be optimized: