
[Python] Scraping Qunar Travel Attraction Data

Required modules: BeautifulSoup (bs4), requests, and pandas, with lxml as the HTML parser.
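
If any of them are missing, one pip command covers everything used below:

pip install beautifulsoup4 requests pandas lxml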

Target site: Qunar - https://travel.qunar.com/place/

1. Crawl the city ID links

For example: https://travel.qunar.com/p-cs300148-haikou

# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import pandas as pd
import requests

def crawer_travel_static_url(url):
    # Fetch a page and return it as a parsed BeautifulSoup object
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
    req = requests.get(url, headers=headers)
    content = req.text
    soup = BeautifulSoup(content, 'lxml')
    return soup

def crawer_travel_city_id():
    # Collect the name and URL of every city listed on the place index page
    url = 'http://travel.qunar.com/place/'
    soup = crawer_travel_static_url(url)
    cat_url = []
    cat_name = []
    sub_list = soup.find_all('div', attrs={'class': 'sub_list'})
    for i in range(0, len(sub_list)):
        a_attr = sub_list[i].find_all('a')
        for j in range(0, len(a_attr)):
            cat_name.append(a_attr[j].text)
            cat_url.append(a_attr[j].attrs['href'])
    return cat_name, cat_url

city_name_list, city_url_list = crawer_travel_city_id()
city = pd.DataFrame({'city_name': city_name_list, 'city_code': city_url_list})
city.to_csv('travel_city.csv', encoding='utf_8_sig', index=False)
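
The city_code column saved above holds full city URLs. If you also want the bare numeric city ID (for example, 300148 for Haikou), a small regex extracts it. This is a minimal sketch, assuming the 'p-cs<digits>-<pinyin>' URL format shown above; extract_city_id is just an illustrative helper name:

import re
import pandas as pd

city = pd.read_csv('travel_city.csv')

def extract_city_id(href):
    # e.g. 'https://travel.qunar.com/p-cs300148-haikou' -> '300148'
    m = re.search(r'p-cs(\d+)-', str(href))
    return m.group(1) if m else None

# Add an id column alongside the crawled names and URLs
city['city_id'] = city['city_code'].apply(extract_city_id)
print(city.head())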

2. Crawl the attraction ID links for a city

For example: https://travel.qunar.com/p-oi5740424-qiloulaojie

# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import pandas as pd
import requests

def crawer_travel_url_content(url):
    # Fetch a page and return it as a parsed BeautifulSoup object
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
    req = requests.get(url, headers=headers)
    content = req.text
    bsObj = BeautifulSoup(content, 'lxml')
    return bsObj

def crawer_travel_attraction_url(url):
    # Total number of attractions in this city
    maxnum = crawer_travel_url_content(url + '-jingdian').find('p', {'class': 'nav_result'}).find('span').text
    # Extract the digits
    maxnum = int(''.join([x for x in maxnum if x.isdigit()]))
    url = url + '-jingdian-1-'
    cat_url = []
    cat_name = []
    # Take the top 10 attractions here: 10 per page, and page starts at 1
    page = 2
    # Check we do not go past the available pages
    if (page - 1) * 10 > maxnum:
        page = int(((maxnum + 10) / 10) + 1)
    for i in range(1, page):
        url1 = url + str(i)
        bsObj = crawer_travel_url_content(url1)
        bs = bsObj.find_all('a', attrs={'data-beacon': 'poi', 'target': '_blank'})
        for j in range(0, len(bs)):
            # Skip anchors with empty text (e.g. image-only links)
            if bs[j].text != '':
                cat_name.append(bs[j].text)
                cat_url.append(bs[j].attrs['href'])
    print(cat_name, cat_url)
    print(len(cat_name))
    print(len(cat_url))
    return cat_name, cat_url

# Haikou as an example; pass the bare city URL,
# since the function appends '-jingdian' itself
url = 'https://travel.qunar.com/p-cs300148-haikou'
city_name_list, city_url_list = crawer_travel_attraction_url(url)
city = pd.DataFrame({'city_name': city_name_list, 'city_code': city_url_list})
city.to_csv('travel_attraction.csv', encoding='utf_8_sig')
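
Step 2 above only crawls Haikou. To cover every city saved in step 1, you can loop over travel_city.csv. This is a sketch, assuming crawer_travel_attraction_url() from the block above is in scope, the saved hrefs are bare city URLs like the Haikou example, and travel_attraction_all.csv is just a placeholder file name:

import time
import pandas as pd

cities = pd.read_csv('travel_city.csv')
all_names, all_urls = [], []
for city_url in cities['city_code']:
    try:
        names, urls = crawer_travel_attraction_url(city_url)
        all_names.extend(names)
        all_urls.extend(urls)
    except Exception as e:
        # Skip cities whose attraction page fails to load or parse
        print('skipping', city_url, e)
    time.sleep(3)  # pause between cities to be polite to the server
result = pd.DataFrame({'attraction_name': all_names, 'attraction_url': all_urls})
result.to_csv('travel_attraction_all.csv', encoding='utf_8_sig', index=False)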

3. Crawl detailed attraction information (user reviews)

# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import pandas as pd
import json
import requests
import time

def get_static_url_content(url):
    # Fetch a page and return it as a parsed BeautifulSoup object
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
    req = requests.get(url, headers=headers)
    content = req.text
    bsObj = BeautifulSoup(content, 'lxml')
    return bsObj

def get_jd_comment(url):
    # Total number of comments for this attraction
    maxnum = get_static_url_content(url).find('span', {'class': 'e_nav_comet_num'}).text
    maxnum = int(maxnum)
    # The POI id is the numeric part of the attraction URL
    poi = ''.join([x for x in url if x.isdigit()])
    cat_user_id = []
    cat_user_name = []
    cat_jd_poi = []
    cat_score = []
    cat_user_comment = []
    cat_comment_time = []
    url = 'http://travel.qunar.com/place/api/html/comments/poi/' + poi + '?poiList=true&sortField=1&rank=0&pageSize=50&page='
    # Page count is capped at 101 for now; with pageSize=50 this crawls at most 100*50 comments
    page = 101
    if (page - 1) * 50 > maxnum:
        page = int(((maxnum + 50) / 50) + 1)
    for i in range(1, page):
        url1 = url + str(i)
        json_str = requests.get(url1, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}).text
        try:
            json_data = json.loads(json_str)['data']
        except Exception:
            continue
        # The API returns an HTML fragment inside the JSON 'data' field
        bsObj = BeautifulSoup(json_data, 'lxml')
        bs = bsObj.find_all('li', {'class': 'e_comment_item clrfix'})
        for j in range(0, len(bs)):
            try:
                user = bs[j].find('div', {'class': 'e_comment_usr_name'}).find('a')
                cat_user_id.append(''.join([x for x in user.attrs['href'] if x.isdigit()]))
                cat_user_name.append(user.text)
                cat_jd_poi.append(poi)
                # Extract the digits from the inner star span's HTML
                # (the rating is encoded in its width style)
                score = ''.join([x for x in str(bs[j].find('span', {'class': 'total_star'}).find('span')) if x.isdigit()])
                cat_score.append(score)
                a = bs[j].find('div', {'class': 'e_comment_content'}).find_all('p')
                cat_user_comment.append(''.join(x.text for x in a))
                cat_comment_time.append(bs[j].find('div', {'class': 'e_comment_add_info'}).find('li').text)
            except Exception:
                print('problem at i=', i, 'j=', j)
        print('finished poi=', poi, ' ', i, '/', page - 1)
        time.sleep(3)
    return cat_user_id, cat_user_name, cat_jd_poi, cat_score, cat_comment_time, cat_user_comment

# West Lake (xihu) as an example
url = 'http://travel.qunar.com/p-oi708952-xihu'
cat_user_id, cat_user_name, cat_jd_poi, cat_score, cat_comment_time, cat_user_comment = get_jd_comment(url)
city = pd.DataFrame({'user_id': cat_user_id, 'user_name': cat_user_name, 'jd_poi': cat_jd_poi, 'score': cat_score, 'time': cat_comment_time, 'comment': cat_user_comment})
city.to_csv('travel_comment.csv', encoding='utf_8_sig')
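
Once travel_comment.csv exists, a few lines suffice for a quick sanity check. One caveat: the score field is extracted from the star span's width style, so it is most likely on a 0-100 scale (100 meaning five stars) rather than 0-5. A minimal sketch:

import pandas as pd

comments = pd.read_csv('travel_comment.csv')
# Scores were scraped as digit strings; coerce them to numbers
comments['score'] = pd.to_numeric(comments['score'], errors='coerce')
print('reviews scraped:', len(comments))
print('average score:', comments['score'].mean())
print(comments[['user_name', 'score', 'time']].head())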

