
Dianping (大众点评) Crawler in Practice: Scraping Shop Listing Information


1. Introduction

  • In everyday life we often run into the same question: what should we eat? In an unfamiliar city, or even just in an unfamiliar neighbourhood, the sheer number of restaurants makes it hard to decide. With the growth of the internet, restaurant-review platforms such as Dianping have emerged, offering a huge amount of restaurant information and user reviews. Yet even on such a platform, the volume of data still makes it hard to quickly find the places that best match our taste.

2. Scraping Targets

  • Data to collect: shop name, cuisine type, location, rating, number of reviews, and average spend per person. An illustrative record is sketched below.
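
  • For concreteness, one row of the final spreadsheet is expected to cover roughly the following fields (the values here are invented purely for illustration):

    # A single illustrative record (made-up values) covering the fields listed above.
    example_record = {
        "店名": "某某火锅店",    # shop name
        "美食类型": "火锅",      # cuisine type
        "地点": "香洲区",        # location
        "评分": 4.6,             # rating
        "评价人数": 1024,        # number of reviews
        "人均消费": 86,          # average spend per person (CNY)
    }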

3. Preparation

  • Version: Python 3.x or later
  • Packages used: requests, selenium, re, bs4, tqdm, subprocess, time, random, bag (a self-written helper package, available from the author on request; a minimal stand-in is sketched after the JSON files below)
  • JSON files: the full JSON files are too large, so only part of the data is shown here

    # city.json
    {
      "郑州": "https://www.dianping.com/zhengzhou",
      "珠海": "https://www.dianping.com/zhuhai",
      "张家口": "https://www.dianping.com/zhangjiakou"
    }

    # menu.json
    {
      "美食": "https://www.dianping.com/{}/ch10",
      "丽人": "https://www.dianping.com/{}/beauty",
      "周边游": "https://www.dianping.com/{}/ch35"
    }
    """menu.json is generated automatically by the code later in this article, in the format shown above."""

    # cookies.json
    [{}]
    """The cookies themselves are not shown here for privacy reasons.
    The steps below walk through how to obtain a usable set of cookies automatically
    and save it locally so that it can be loaded whenever needed."""
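
  • Since bag is the author's private helper package, below is a minimal stand-in sketch of the handful of helpers the scripts in this article rely on. The signatures are inferred purely from how they are called later (read_json, save_json, save_excel, session.create_session); the real package may differ, and web_debug() is sketched separately in step 1 of the next section.

    # bag_stub.py -- an assumed, minimal stand-in for the private `bag` package
    # (only the helpers used in this article; save_excel needs pandas + openpyxl).
    import json
    import requests
    import pandas as pd


    class Bag:
        @staticmethod
        def read_json(path):
            """Load a JSON file and return the parsed object."""
            with open(path, encoding='utf-8') as f:
                return json.load(f)

        @staticmethod
        def save_json(obj, path):
            """Write an object to disk as pretty-printed JSON."""
            with open(path, 'w', encoding='utf-8') as f:
                json.dump(obj, f, ensure_ascii=False, indent=2)

        @staticmethod
        def save_excel(rows, path):
            """Dump a list of row tuples to an .xlsx file."""
            pd.DataFrame(rows).to_excel(path, index=False)


    class session:
        # lower-case class name so that `bag.session.create_session()` resolves
        @staticmethod
        def create_session():
            """Return a requests.Session with a browser-like User-Agent."""
            s = requests.Session()
            s.headers['User-Agent'] = 'Mozilla/5.0'
            return s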

4. Crawler Implementation

  1. Use Selenium to obtain logged-in cookies

    @echo off
    cd "C:\Program Files\Google\Chrome\Application"
    start chrome.exe --remote-debugging-port=9222 --user-data-dir="C:\selenium\AutomationProfile"

    #!/usr/bin/env python3
    # coding:utf-8
    import subprocess
    import bag
    import time
    import random

    # The commented-out block below writes run_chrome.bat; it only needs to run once.
    # batch_file_content = r'''
    # @echo off
    # cd "C:\Program Files\Google\Chrome\Application"
    # start chrome.exe --remote-debugging-port=9222 --user-data-dir="C:\selenium\AutomationProfile"
    # '''
    #
    # with open('run_chrome.bat', 'w') as f:
    #     f.write(batch_file_content)

    subprocess.Popen('run_chrome.bat', shell=True)   # launch Chrome with remote debugging enabled
    web = bag.Bag.web_debug()                        # attach Selenium to the debugging port
    web.get(r'https://www.dianping.com/')
    time.sleep(random.randint(5, 10))                # leave time to log in / pass verification
    cookie = web.get_cookies()
    web.close()
    bag.Bag.save_json(cookie, r'./cookies.json')     # save the cookies locally for reuse
    • Create a new text file, paste in the first code block above, and change its extension to .bat. The point of doing this is that subprocess can then launch the file and control Chrome programmatically.

    • Run the Python script and a usable cookies.json will be generated automatically.
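
    • bag.Bag.web_debug() presumably does little more than attach Selenium to the Chrome instance that run_chrome.bat started with --remote-debugging-port=9222. If you do not have the bag package, a rough equivalent with plain Selenium looks like this (the address matches the port in the .bat file):

    # Assumed equivalent of bag.Bag.web_debug(): attach Selenium to the
    # already-running Chrome started with --remote-debugging-port=9222.
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options


    def web_debug(address='127.0.0.1:9222'):
        options = Options()
        options.add_experimental_option('debuggerAddress', address)  # reuse the existing browser session
        return webdriver.Chrome(options=options)


    # Usage mirrors the script above:
    # web = web_debug()
    # web.get('https://www.dianping.com/')
    # cookies = web.get_cookies()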

  2. Choose the category to scrape and generate menu.json

    #!/usr/bin/env python3
    # coding:utf-8
    import bag
    from bs4 import BeautifulSoup
    import re

    session = bag.session.create_session()
    for cookie in bag.Bag.read_json(r'./cookies.json'):
        session.cookies.set(cookie['name'], cookie['value'])


    # Enter the name of the city to scrape
    def choose_city():
        js_data = bag.Bag.read_json('./city.json')
        choose = input('Enter a city name: ')
        judge = js_data.get(choose)  # check whether the entered city exists
        # pattern = re.compile(r'<a.*?data-click-title="first".*?href="(.*?)".*?>(.*?)</a>', re.S)
        pattern = re.compile(r'<a.*?href="(.*?)".*?>(.*?)</a>', re.S)
        dic = {}
        if judge:
            resp = session.get(judge)
            html = BeautifulSoup(resp.text, 'lxml')
            soup = html.findAll('span', class_='span-container')
            for info in soup:
                data = re.findall(pattern, str(info))
                mid: list = data[0][0].split('/')
                mid[-2] = '{}'                              # replace the city slug with a placeholder
                dic[data[0][1]] = 'https:' + '/'.join(mid)  # note: '/'.join, so the URL keeps its slashes
        else:
            print('Invalid input!')
            choose_city()
            return
        print(dic)  # the result generated from the input
        '''Enter a city name: 珠海
        {
            "美食": "https://www.dianping.com/{}/ch10",
            "休闲娱乐": "https://www.dianping.com/{}/ch30",
            "结婚": "https://www.dianping.com/{}/wedding",
            "电影演出赛事": "https://www.dianping.com/{}/movie",
            "丽人": "https://www.dianping.com/{}/beauty",
            "酒店": "https://www.dianping.com/{}/hotel",
            "亲子": "https://www.dianping.com/{}/baby",
            "周边游": "https://www.dianping.com/{}/ch35",
            "运动健身": "https://www.dianping.com/{}/ch45",
            "购物": "https://www.dianping.com/{}/ch20",
            "家装": "https://www.dianping.com/{}/home",
            "学习培训": "https://www.dianping.com/{}/education",
            "生活服务": "https://www.dianping.com/{}/ch80",
            "医疗健康": "https://www.dianping.com/{}/ch85",
            "爱车": "https://www.dianping.com/{}/ch65",
            "宠物": "https://www.dianping.com/{}/ch95"
        }'''
        bag.Bag.save_json(dic, r'./menu.json')


    if __name__ == '__main__':
        choose_city()
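
    • The values stored in menu.json are URL templates; choose.py below fills the {} placeholder with the city slug taken from city.json, for example:

    # How the templates in menu.json are consumed (mirrors the logic in choose.py below):
    template = "https://www.dianping.com/{}/ch10"                  # value stored under "美食"
    city_slug = "https://www.dianping.com/zhuhai".split('/')[-1]   # -> 'zhuhai'
    print(template.format(city_slug))                              # https://www.dianping.com/zhuhai/ch10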
  3. Complete code

    # chooses.py (imported as `chooses` by get_shop.py)
    #!/usr/bin/env python3
    # coding:utf-8
    import bag


    def choose_city():
        session = bag.session.create_session()
        for cookie in bag.Bag.read_json(r'./cookies.json'):
            session.cookies.set(cookie['name'], cookie['value'])
        session.headers['Connection'] = 'close'
        js_data = bag.Bag.read_json('./city.json')
        choose = input('Enter a city name: ')
        judge = js_data.get(choose)
        if judge:
            city = judge.split('/')[-1]                  # e.g. 'zhuhai'
            choose_1 = input('Enter the category to scrape: ')
            js_data1 = bag.Bag.read_json('./menu.json')
            judge1 = js_data1.get(choose_1)
            if judge1:
                return judge1.format(city), session      # listing URL for the chosen city + category
            else:
                print('Not implemented yet......')
                return None
        else:
            print('Invalid input!')
            return None
    # get_shop.py
    #!/usr/bin/env python3
    # coding:utf-8
    import bag
    import chooses
    import re
    from bs4 import BeautifulSoup
    from tqdm import tqdm
    import requests

    proxies = {
        "http": "http://{}:{}",
    }


    def check():
        """Pick a working proxy from the local pool and write it into `proxies`."""
        url_ = r'https://www.dianping.com/zhuhai/ch10'
        ip_ls = bag.Bag.read_json('../代理ip/IP地址.json')
        index = 0
        if len(ip_ls) == 0:
            print('All proxy IPs have expired')
            exit()
        for ip_address in ip_ls:
            proxies_ = {
                "http": "http://{}:{}".format(ip_address[0], ip_address[1]),
            }
            try:
                resp = session.get(url_, proxies=proxies_, timeout=5)
            except requests.exceptions.RequestException:
                index += 1
                continue
            if resp.status_code == 200:
                proxies['http'] = proxies['http'].format(ip_address[0], ip_address[1])  # keep this working proxy
                bag.Bag.save_json(ip_ls[index:], r'../代理ip/IP地址.json')              # drop the dead entries
                print(f'[{index}] switched to a new proxy')
                return
            index += 1


    url, session = chooses.choose_city()


    def get_types():  # collect the sub-category links from the listing page
        check()
        pattern = re.compile(r'<a.*?href="(.*?)".*?<span>(.*?)</span></a>', re.S)
        if bool(url):
            resp = session.get(url, proxies=proxies)
            html = BeautifulSoup(resp.text, 'lxml')
            soup = html.findAll('div', id='classfy')
            links = re.findall(pattern, str(soup))
            return links
        else:
            check()
            get_types()


    def get_shop():
        links = get_types()
        pattern = re.compile(r'<div class="tit">.*?<a.*?data-shopid="(.*?)".*?href="(.*?)".*?title="(.*?)"'
                             r'(?:.*?<div class="star_icon">.*?<span class="(.*?)"></span>.*?<b>(.*?)</b>)?'
                             r'(?:.*?<b>(.*?)</b>)?'
                             r'(?:.*?<div class="tag-addr">.*?<span class="tag">(.*?)</span>.*?<em class="sep">.*?<span class="tag">(.*?)</span>)?',
                             re.S)
        number = re.compile(r'data-ga-page="(.*?)"', re.S)
        result = []
        if not bool(links):
            print('Failed to fetch the category links')
            return
        for link in links:  # first page of each sub-category
            try:
                resp = session.get(link[0], proxies=proxies)
                page = [int(i) for i in re.findall(number, resp.text)]
                page_num = sorted(page, reverse=True)[0]
                html = BeautifulSoup(resp.text, 'lxml')
                soup = html.findAll('li', class_='')
                for i in soup:
                    for j in re.findall(pattern, str(i)):
                        result.append(j)
                if page_num >= 2:  # pages after the first
                    for count in tqdm(range(page_num)[1:]):
                        try:
                            resp1 = session.get(link[0] + 'p{}'.format(count + 1), proxies=proxies)
                            html1 = BeautifulSoup(resp1.text, 'lxml')
                            soup1 = html1.findAll('li', class_='')
                            for k in soup1:
                                info = pattern.search(str(k))
                                if info:
                                    groups = list(info.groups())
                                    for i in range(len(groups)):
                                        if not groups[i]:
                                            groups[i] = 'null'
                                    result.append(tuple(groups))
                        except requests.exceptions.RequestException as e:
                            print(e)
                            check()
                        except Exception as e:
                            print(e)
                            continue
                else:
                    pass
            except requests.exceptions.RequestException as e:
                print(e)
                check()
            except Exception as e:
                print(e)
                check()
        return result


    end = get_shop()
    bag.Bag.save_excel(end, './商店.xlsx')
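
    • check() reads the proxy pool from '../代理ip/IP地址.json' and indexes each entry as ip_address[0] / ip_address[1], so the file is presumably just a list of [host, port] pairs, along these lines (addresses invented for illustration):

    # Assumed format of ../代理ip/IP地址.json, inferred from how check() indexes the entries.
    # The hosts and ports below are placeholders, not real proxies.
    ip_pool = [
        ["123.45.67.89", "8080"],
        ["98.76.54.32", "3128"],
    ]
    # check() builds "http://host:port" from each pair and keeps the first one that
    # returns HTTP 200 for a test listing page, then removes the dead entries from the file.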

5. Results

6. Summary

  1. Combining Selenium with requests for data collection avoids a lot of convoluted work reverse-engineering the site's protections.
  2. Dianping's anti-crawling measures are fairly thorough. To avoid getting your IP blacklisted while scraping, use proxy IPs; how to obtain and use them can easily be looked up online.