Required modules: requests, BeautifulSoup (bs4), and pandas, plus the lxml parser that BeautifulSoup is configured to use below.
Target site: Qunar travel - https://travel.qunar.com/place/
A city page, for example: https://travel.qunar.com/p-cs300148-haikou
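Qunar embeds a numeric id in these URLs (the city code in p-cs300148-haikou, the POI id in attraction links), and the comment scraper in the last script recovers the id by keeping only the digit characters. A minimal illustration of that trick, using the city URL above:

```python
url = 'https://travel.qunar.com/p-cs300148-haikou'
city_code = ''.join(ch for ch in url if ch.isdigit())
print(city_code)  # 300148
```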
```python
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import pandas as pd
import requests

def crawer_travel_static_url(url):
    """Fetch a static page and return a parsed BeautifulSoup object."""
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
    req = requests.get(url, headers=headers)
    content = req.text
    soup = BeautifulSoup(content, 'lxml')
    return soup

def crawer_travel_city_id():
    """Collect every city name and link from the Qunar place index."""
    url = 'http://travel.qunar.com/place/'
    soup = crawer_travel_static_url(url)
    cat_url = []
    cat_name = []
    sub_list = soup.find_all('div', attrs={'class': 'sub_list'})

    for sub in sub_list:
        for a in sub.find_all('a'):
            cat_name.append(a.text)
            cat_url.append(a.attrs['href'])
    return cat_name, cat_url

city_name_list, city_url_list = crawer_travel_city_id()
city = pd.DataFrame({'city_name': city_name_list, 'city_code': city_url_list})
city.to_csv('travel_city.csv', encoding='utf_8_sig', index=False)
```
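A quick way to check the output and feed it into the next step is to read the CSV back with pandas. A sketch, assuming the scraped hrefs are full city links like the Haikou example above; if Qunar returns scheme-relative links (//travel.qunar.com/...), prefix them with https: first:

```python
import pandas as pd

city = pd.read_csv('travel_city.csv')
print(city.head())

# Normalize scheme-relative links before reuse; pass the bare city link
# to the next script, which appends '-jingdian' itself.
city_links = [u if u.startswith('http') else 'https:' + u for u in city['city_code']]
```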
Next, scrape each city's top attractions; the collected links look like: https://travel.qunar.com/p-oi5740424-qiloulaojie
```python
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import pandas as pd
import requests

def crawer_travel_url_content(url):
    """Fetch a static page and return a parsed BeautifulSoup object."""
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
    req = requests.get(url, headers=headers)
    content = req.text
    bsObj = BeautifulSoup(content, 'lxml')
    return bsObj

def crawer_travel_attraction_url(url):
    # Maximum number of attractions listed for this city
    maxnum = crawer_travel_url_content(url + '-jingdian').find('p', {'class': 'nav_result'}).find('span').text
    # Keep only the digits from the count text
    maxnum = int(''.join([x for x in maxnum if x.isdigit()]))

    url = url + '-jingdian-1-'
    cat_url = []
    cat_name = []

    # Take only the top 10 attractions here: 10 per listing page, page numbering
    # starts at 1, and range(1, page) below visits pages 1 .. page-1.
    page = 2
    # If the requested range exceeds what the city actually has, shrink it
    if (page - 1) * 10 > maxnum:
        page = int(((maxnum + 10) / 10) + 1)

    for i in range(1, page):
        url1 = url + str(i)
        bsObj = crawer_travel_url_content(url1)
        bs = bsObj.find_all('a', attrs={'data-beacon': 'poi', 'target': '_blank'})
        for j in range(0, len(bs)):
            if bs[j].text != '':
                cat_name.append(bs[j].text)
                cat_url.append(bs[j].attrs['href'])
    print(cat_name, cat_url)
    print(len(cat_name))
    print(len(cat_url))
    return cat_name, cat_url

# Example: Haikou. Pass the bare city link; the function appends '-jingdian' itself.
url = 'https://travel.qunar.com/p-cs300148-haikou'
city_name_list, city_url_list = crawer_travel_attraction_url(url)
city = pd.DataFrame({'city_name': city_name_list, 'city_code': city_url_list})
city.to_csv('travel_attraction.csv', encoding='utf_8_sig')
```
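The page arithmetic above is a ceiling division in disguise: since range(1, page) visits pages 1 through page-1, page must end up one past the number of 10-item pages needed. A sketch of an arguably clearer equivalent (math.ceil fetches one page fewer than the original formula when maxnum is an exact multiple of 10, which is the correct count):

```python
import math

maxnum = 25                            # hypothetical attraction count
pages_needed = math.ceil(maxnum / 10)  # 10 attractions per listing page
page = pages_needed + 1                # range(1, page) visits pages 1..pages_needed
print(page)                            # 4 -> pages 1, 2, 3
```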
Finally, collect user reviews for each attraction. Comments are served by an API endpoint that returns JSON whose data field is an HTML fragment, so each response is first decoded with json.loads and then parsed with BeautifulSoup:

```python
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import pandas as pd
import json
import requests
import time

def get_static_url_content(url):
    """Fetch a static page and return a parsed BeautifulSoup object."""
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
    req = requests.get(url, headers=headers)
    content = req.text
    bsObj = BeautifulSoup(content, 'lxml')
    return bsObj

def get_jd_comment(url):
    # Maximum number of comments for this attraction
    maxnum = get_static_url_content(url).find('span', {'class': 'e_nav_comet_num'}).text
    maxnum = int(maxnum)

    # The POI id is the digit sequence embedded in the attraction URL
    poi = ''.join([x for x in url if x.isdigit()])

    cat_user_id = []
    cat_user_name = []
    cat_jd_poi = []
    cat_score = []
    cat_user_comment = []
    cat_comment_time = []

    url = 'http://travel.qunar.com/place/api/html/comments/poi/' + poi + '?poiList=true&sortField=1&rank=0&pageSize=50&page='
    # Cap at 101 pages for now; with pageSize=50 that is at most 100*50 comments
    page = 101
    if (page - 1) * 50 > maxnum:
        page = int(((maxnum + 50) / 50) + 1)
    for i in range(1, page):
        url1 = url + str(i)
        json_str = requests.get(url1, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}).text
        try:
            # The API returns JSON whose 'data' field is an HTML fragment
            json_data = json.loads(json_str)['data']
        except Exception:
            continue
        bsObj = BeautifulSoup(json_data, 'lxml')
        bs = bsObj.find_all('li', {'class': 'e_comment_item clrfix'})

        for j in range(0, len(bs)):
            try:
                # User id (digits of the profile link) and display name
                user = bs[j].find('div', {'class': 'e_comment_usr_name'}).find('a')
                cat_user_id.append(''.join([x for x in user.attrs['href'] if x.isdigit()]))
                cat_user_name.append(user.text)

                cat_jd_poi.append(poi)

                # The rating is the digit sequence embedded in the inner <span> markup
                score = ''.join([x for x in str(bs[j].find('span', {'class': 'total_star'}).find('span')) if x.isdigit()])
                cat_score.append(score)

                # The comment body may span several <p> tags
                a = bs[j].find('div', {'class': 'e_comment_content'}).find_all('p')
                cat_user_comment.append(''.join(x.text for x in a))

                cat_comment_time.append(bs[j].find('div', {'class': 'e_comment_add_info'}).find('li').text)
            except Exception:
                print('problem at i=', i, 'j=', j)
        print('finished poi=', poi, ' ', i, '/', page - 1)
        time.sleep(3)

    return cat_user_id, cat_user_name, cat_jd_poi, cat_score, cat_comment_time, cat_user_comment

# Example: West Lake (Xihu)
url = 'http://travel.qunar.com/p-oi708952-xihu'
cat_user_id, cat_user_name, cat_jd_poi, cat_score, cat_comment_time, cat_user_comment = get_jd_comment(url)
city = pd.DataFrame({'user_id': cat_user_id, 'user_name': cat_user_name, 'jd_poi': cat_jd_poi, 'score': cat_score, 'time': cat_comment_time, 'comment': cat_user_comment})
city.to_csv('travel_comment.csv', encoding='utf_8_sig')
```
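Once the comments are on disk, a quick sanity check with pandas confirms the scrape worked. This is just an illustrative follow-up, assuming the column layout written above:

```python
import pandas as pd

comments = pd.read_csv('travel_comment.csv')
print(len(comments), 'comments scraped')

# Scores were scraped as digit strings, so cast before aggregating
comments['score'] = pd.to_numeric(comments['score'], errors='coerce')
print('mean score:', comments['score'].mean())
```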