JSON files: the complete files are too large to include, so only a small sample of each is shown here.
- # city.json
- {
-     "郑州": "https://www.dianping.com/zhengzhou",
-     "珠海": "https://www.dianping.com/zhuhai",
-     "张家口": "https://www.dianping.com/zhangjiakou"
- }
-
-
- # menu.json
- {
-     "美食": "https://www.dianping.com/{}/ch10",
-     "丽人": "https://www.dianping.com/{}/beauty",
-     "周边游": "https://www.dianping.com/{}/ch35"
- }
- """menu.json is generated automatically by the code further below, in the format shown above."""
-
- # cookies.json
- [{}]
- """The actual cookies are omitted here for privacy reasons.
- The steps below show how to obtain a usable set of cookies automatically,
- save them locally, and load them whenever they are needed."""
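
Throughout this post the `bag` package is the author's own helper library. If you do not have it, the two JSON helpers used here (`bag.Bag.read_json` and `bag.Bag.save_json`) can be approximated with the standard `json` module. The sketch below is an assumption about what they do, not the library's actual code:

- #!/usr/bin/env python3
- # coding:utf-8
- # Minimal stand-ins for bag.Bag.read_json / bag.Bag.save_json (assumed to be thin json wrappers).
- import json
-
-
- def read_json(path):
-     # Load a file such as city.json, menu.json or cookies.json
-     with open(path, 'r', encoding='utf-8') as f:
-         return json.load(f)
-
-
- def save_json(data, path):
-     # Write JSON back with readable formatting, keeping the Chinese keys unescaped
-     with open(path, 'w', encoding='utf-8') as f:
-         json.dump(data, f, ensure_ascii=False, indent=4)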

- :: run_chrome.bat
- @echo off
- cd "C:\Program Files\Google\Chrome\Application"
- start chrome.exe --remote-debugging-port=9222 --user-data-dir="C:\selenium\AutomationProfile"
- #!/usr/bin/env python3
- # coding:utf-8
- # Start Chrome in remote-debugging mode, open dianping.com, and save the session cookies.
- import subprocess
- import bag
- import time
- import random
-
- # The .bat launcher can also be written out from Python instead of being created by hand:
- # batch_file_content = r'''
- # @echo off
- # cd "C:\Program Files\Google\Chrome\Application"
- # start chrome.exe --remote-debugging-port=9222 --user-data-dir="C:\selenium\AutomationProfile"
- # '''
- #
- # with open('run_chrome.bat', 'w') as f:
- #     f.write(batch_file_content)
-
- subprocess.Popen('run_chrome.bat', shell=True)  # launch Chrome with the debugging port open
-
- web = bag.Bag.web_debug()  # attach Selenium to the already-running browser
-
- web.get(r'https://www.dianping.com/')
- time.sleep(random.randint(5, 10))  # give the site time to set its cookies
- cookie = web.get_cookies()
-
- web.close()
-
- bag.Bag.save_json(cookie, r'./cookies.json')

Create a new text file, copy the batch commands above into it, and save it with a .bat extension. The point of doing this is that, once the launcher is a batch file, it can be started from Python with subprocess. Run the Python script above and a usable set of cookies will be generated and saved to cookies.json automatically.
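
`bag.Bag.web_debug()` is part of the author's helper library. It presumably attaches a Selenium WebDriver to the Chrome instance started on port 9222 instead of launching a new browser; a minimal sketch of that attachment (an assumption about the helper, not its actual code) would look like this:

- #!/usr/bin/env python3
- # coding:utf-8
- # Sketch of attaching Selenium to a Chrome started with --remote-debugging-port=9222.
- # This approximates what bag.Bag.web_debug() appears to do; it is not the helper's real implementation.
- from selenium import webdriver
-
-
- def web_debug(port: int = 9222):
-     options = webdriver.ChromeOptions()
-     # Reuse the already-running debugging session instead of opening a new browser window
-     options.add_experimental_option('debuggerAddress', f'127.0.0.1:{port}')
-     return webdriver.Chrome(options=options)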
- #!/usr/bin/env python3
- # coding:utf-8
- import bag
- from bs4 import BeautifulSoup
- import re
-
- session = bag.session.create_session()
-
- # Load the previously saved cookies into the requests session
- for cookie in bag.Bag.read_json(r'./cookies.json'):
-     session.cookies.set(cookie['name'], cookie['value'])
-
-
- # Ask for the city to crawl and build menu.json from its category links
- def choose_city():
-     js_data = bag.Bag.read_json('./city.json')
-     choose = input('输入城市名:')
-     judge = js_data.get(choose)  # check whether the entered city exists
-
-     # pattern = re.compile(r'<a.*?data-click-title="first".*?href="(.*?)".*?>(.*?)</a>', re.S)
-     pattern = re.compile(r'<a.*?href="(.*?)".*?>(.*?)</a>', re.S)
-
-     dic = {}
-
-     if judge:
-         resp = session.get(judge)
-         html = BeautifulSoup(resp.text, 'lxml')
-         soup = html.findAll('span', class_='span-container')
-         for info in soup:
-             data = re.findall(pattern, str(info))
-             mid: list = data[0][0].split('/')
-             mid[-2] = '{}'  # replace the city segment with a placeholder
-             dic[data[0][1]] = 'https:' + '/'.join(mid)
-     else:
-         print('无效输入!')
-         choose_city()
-         return
-
-     print(dic)  # the menu generated for the chosen city
-     '''输入城市名:珠海
-     {
-         "美食": "https://www.dianping.com/{}/ch10",
-         "休闲娱乐": "https://www.dianping.com/{}/ch30",
-         "结婚": "https://www.dianping.com/{}/wedding",
-         "电影演出赛事": "https://www.dianping.com/{}/movie",
-         "丽人": "https://www.dianping.com/{}/beauty",
-         "酒店": "https://www.dianping.com/{}/hotel",
-         "亲子": "https://www.dianping.com/{}/baby",
-         "周边游": "https://www.dianping.com/{}/ch35",
-         "运动健身": "https://www.dianping.com/{}/ch45",
-         "购物": "https://www.dianping.com/{}/ch20",
-         "家装": "https://www.dianping.com/{}/home",
-         "学习培训": "https://www.dianping.com/{}/education",
-         "生活服务": "https://www.dianping.com/{}/ch80",
-         "医疗健康": "https://www.dianping.com/{}/ch85",
-         "爱车": "https://www.dianping.com/{}/ch65",
-         "宠物": "https://www.dianping.com/{}/ch95"
-     }'''
-
-     bag.Bag.save_json(dic, r'./menu.json')
-
-
- if __name__ == '__main__':
-     choose_city()
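
The key step in `choose_city()` is turning a protocol-relative category link into a reusable URL template by swapping the city segment for a `{}` placeholder. A standalone illustration of that split/replace/join transformation (the `href` value is just an example):

- #!/usr/bin/env python3
- # coding:utf-8
- # Standalone illustration of the URL templating step used in choose_city()
- href = '//www.dianping.com/zhuhai/ch10'  # example href as scraped from the page
-
- mid = href.split('/')   # ['', '', 'www.dianping.com', 'zhuhai', 'ch10']
- mid[-2] = '{}'          # swap the city segment for a format placeholder
- template = 'https:' + '/'.join(mid)
-
- print(template)                      # https://www.dianping.com/{}/ch10
- print(template.format('zhengzhou'))  # https://www.dianping.com/zhengzhou/ch10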

- # choose.py
- #!/usr/bin/env python3
- # coding:utf-8
- import bag
-
-
- def choose_city():
-     session = bag.session.create_session()
-
-     # Load the saved cookies into the session
-     for cookie in bag.Bag.read_json(r'./cookies.json'):
-         session.cookies.set(cookie['name'], cookie['value'])
-
-     session.headers['Connection'] = 'close'
-     js_data = bag.Bag.read_json('./city.json')
-     choose = input('输入城市名:')
-     judge = js_data.get(choose)
-
-     if judge:
-         city = judge.split('/')[-1]  # e.g. "zhuhai" from the city's homepage URL
-         choose_1 = input('输入爬取类型:')
-         js_data1 = bag.Bag.read_json('./menu.json')
-         judge1 = js_data1.get(choose_1)
-         if judge1:
-             return judge1.format(city), session  # fill the city into the URL template
-         else:
-             print('开发中......')
-             return None, None
-     else:
-         print('无效输入!')
-         return None, None
-
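
With the fix above, `choose_city()` returns `(None, None)` on an invalid selection, so callers should check for that before using the result. A short usage sketch (assuming the file is saved as choose.py):

- #!/usr/bin/env python3
- # coding:utf-8
- # Usage sketch for choose.py: unpack the template URL and the prepared session
- import choose
-
- url, session = choose.choose_city()
- if url is None:
-     raise SystemExit('no valid selection')
-
- resp = session.get(url)
- print(resp.status_code, url)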

- # get_shop.py
- #!/usr/bin/env python3
- # coding:utf-8
- import bag
- import choose
- import re
- from bs4 import BeautifulSoup
- from tqdm import tqdm
- import requests
-
-
- proxies = {
-     "http": "http://{}:{}",
- }
-
-
- def check():
-     # Test the proxy pool against a known page and switch to the first proxy that still works
-     url_ = r'https://www.dianping.com/zhuhai/ch10'
-     ip_ls = bag.Bag.read_json('../代理ip/IP地址.json')
-     index = 0
-     if len(ip_ls) == 0:
-         print('IP地址全部失效')
-         exit()
-     for ip_address in ip_ls:
-         proxies_ = {
-             "http": "http://{}:{}".format(ip_address[0], ip_address[1]),
-         }
-         try:
-             resp = session.get(url_, proxies=proxies_, timeout=10)
-         except requests.exceptions.RequestException:
-             index += 1
-             continue
-
-         if resp.status_code == 200:
-             proxies['http'] = 'http://{}:{}'.format(ip_address[0], ip_address[1])  # use this proxy for all later requests
-             bag.Bag.save_json(ip_ls[index:], r'../代理ip/IP地址.json')  # drop the proxies that already failed
-             print(f'[{index}] 更换ip成功')
-             return
-         index += 1
-
-
- url, session = choose.choose_city()
-
-
- def get_types():  # fetch the category links for the chosen city and menu entry
-     check()
-     pattern = re.compile(r'<a.*?href="(.*?)".*?<span>(.*?)</span></a>', re.S)
-     if bool(url):
-         resp = session.get(url, proxies=proxies)
-         html = BeautifulSoup(resp.text, 'lxml')
-         soup = html.findAll('div', id='classfy')
-         links = re.findall(pattern, str(soup))
-         return links
-     else:
-         # choose_city() returned no URL; let get_shop() report the failure instead of retrying forever
-         return None
-
-
- def get_shop():
-     links = get_types()
-     pattern = re.compile(r'<div class="tit">.*?<a.*?data-shopid="(.*?)".*?href="(.*?)".*?title="(.*?)"'
-                          r'(?:.*?<div class="star_icon">.*?<span class="(.*?)"></span>.*?<b>(.*?)</b>)?'
-                          r'(?:.*?<b>(.*?)</b>)?'
-                          r'(?:.*?<div class="tag-addr">.*?<span class="tag">(.*?)</span>.*?<em class="sep">.*?<span class="tag">(.*?)</span>)?',
-                          re.S)
-     number = re.compile(r'data-ga-page="(.*?)"', re.S)
-
-     result = []
-
-     if not bool(links):
-         print('获取异常')
-         return
-
-     for link in links:  # first page of each category
-         try:
-             resp = session.get(link[0], proxies=proxies)
-             page = [int(i) for i in re.findall(number, resp.text)]
-             page_num = sorted(page, reverse=True)[0]  # highest page index = total number of pages
-             html = BeautifulSoup(resp.text, 'lxml')
-
-             soup = html.findAll('li', class_='')
-             for i in soup:
-                 for j in re.findall(pattern, str(i)):
-                     result.append(j)
-             if page_num >= 2:  # pages after the first
-                 for count in tqdm(range(page_num)[1:]):
-                     try:
-                         resp1 = session.get(link[0] + 'p{}'.format(count + 1), proxies=proxies)
-                         html1 = BeautifulSoup(resp1.text, 'lxml')
-                         soup1 = html1.findAll('li', class_='')
-                         for k in soup1:
-                             info = pattern.search(str(k))
-                             if info:
-                                 groups = list(info.groups())
-                                 for i in range(len(groups)):
-                                     if not groups[i]:
-                                         groups[i] = 'null'  # fill optional fields that did not match
-                                 result.append(tuple(groups))
-                     except requests.exceptions.RequestException as e:
-                         print(e)
-                         check()  # the proxy probably died, rotate to the next one
-                     except Exception as e:
-                         print(e)
-                         continue
-             else:
-                 pass
-         except requests.exceptions.RequestException as e:
-             print(e)
-             check()
-         except Exception as e:
-             print(e)
-             check()
-     return result
-
-
- end = get_shop()
- bag.Bag.save_excel(end, './商店.xlsx')
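
`bag.Bag.save_excel` is again the author's helper. Assuming it simply writes the list of row tuples to a single worksheet, an equivalent with pandas could look like the sketch below; the column names are illustrative guesses based on the regex groups, not the helper's real schema:

- #!/usr/bin/env python3
- # coding:utf-8
- # Stand-in for bag.Bag.save_excel, assuming it dumps a list of row tuples into one sheet.
- import pandas as pd
-
-
- def save_excel(rows, path):
-     # Column names are guesses matching the eight regex groups in get_shop()
-     columns = ['shop_id', 'link', 'name', 'star_class', 'review_count', 'avg_price', 'category', 'district']
-     df = pd.DataFrame(rows, columns=columns)
-     df.to_excel(path, index=False)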
