赞
踩
采用 Python + Selenium + browsermob-proxy 抓包提取拼多多订单数据:通过 Selenium 模拟登录拼多多后进入订单页,提取订单数据。browsermob-proxy 是一个代理工具,能抓取网页所有的访问细节。从其下载地址下载后,使用前还需要配置 Java 环境,并在 Python 中安装 browsermob-proxy:
pip install browsermob-proxy
打开拼多多,注意查看浏览器开发者工具的 Network 面板,可以发现存储订单数据的 JSON 文件。
注意,这个文件只存储从第十一个订单开始的数据,页面每下滑十个订单刷新一次;前十个订单的数据在网页源代码的 window.rawData 中,在控制台输入 window.rawData 也可以看见。那么我们要做的就是:用 Selenium 登录并跳转到订单页面,从源代码中获取前十个订单的数据,再从 order_list_v3 中获取剩余订单的数据。
启动browsermob-proxy
from browsermobproxy import Server

# Path to the browsermob-proxy launcher script; adjust to the local install
# location.
server = Server(r'D:\browsermob-proxy-2.1.4-bin\browsermob-proxy-2.1.4\bin\browsermob-proxy.bat')
server.start()  # start the proxy server
proxy = server.create_proxy()  # create the proxy the browser will be routed through
启动selenium
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Location of the chromedriver executable; adjust to the local install.
chrome_driver = r'C:\Users\dell\AppData\Local\Google\Chrome\Application\chromedriver.exe'

chrome_options = Options()
# Works around the "your connection is not private" certificate warning.
chrome_options.add_argument('--ignore-certificate-errors')
# Route all browser traffic through the browsermob proxy so it can be captured.
chrome_options.add_argument('--proxy-server={0}'.format(proxy.proxy))
driver = webdriver.Chrome(executable_path=chrome_driver, options=chrome_options)

# Override navigator.webdriver on every new document before page scripts run.
# The getter returns `undefined`; the previous `Chrome` was an unresolved
# JavaScript identifier, so reading the property threw a ReferenceError.
driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
    "source": """
        Object.defineProperty(navigator, 'webdriver', {
          get: () => undefined
        })
    """
})
driver.implicitly_wait(10)
driver.maximize_window()
driver.get("http://yangkeduo.com/")
selenium登录拼多多跳转到全订单页
# Click the orders entry; this redirects to the login page.
driver.find_element_by_xpath('//*[@id="main"]/div/div[2]/div[4]/div/div[1]/div[5]').click()
driver.find_element_by_class_name('phone-login').click()
driver.find_element_by_id('user-mobile').send_keys('手机号')
driver.find_element_by_id('code-button').click()
# The verification code is typed in by the operator on the console.
code = input('清输入验证码')
driver.find_element_by_id('input-code').send_keys(code)
driver.find_element_by_id('submit-button').click()  # log in
启动监听,获取源代码数据
import json

# Start a new HAR capture, including response bodies.
proxy.new_har('order', options={'captureContent': True})

# window.rawData holds the first ten orders. Its JSON layout differs from the
# order_list_v3 responses, so it is stored and parsed separately.
# The result must be bound to `data`; the original discarded it and then
# dumped an undefined name.
data = driver.execute_script('return JSON.stringify(window.rawData)')

# `data` is already a JSON string, so the file contains a JSON-encoded string
# and a reader has to decode it twice.
# Write mode rather than append: a second document appended to the same file
# would make it unparseable by json.load.
with open('top_ten.json', 'w') as f:
    json.dump(data, f)
循环滚动到底
from selenium.common.exceptions import NoSuchElementException
def isElementPresent():
    """Report whether an element with class ``loading-text`` is on the page.

    The scroll loop treats the presence of this element as the signal that
    the order list has been scrolled to the end.

    Returns:
        bool: True when the element is found, False when the lookup raises
        NoSuchElementException.
    """
    try:
        driver.find_element_by_xpath('//*[@class = "loading-text"]')
    except NoSuchElementException:
        return False
    return True
import time

# Keep scrolling to the bottom of the page until the end-of-list marker
# appears, pausing after each scroll.
while not isElementPresent():
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
    time.sleep(2)
从代理抓取的包中提取数据并保存
# Fetch the HAR captured by the proxy; the original loop read `result`
# without ever defining it.
result = proxy.har

# Collect the body of every response whose URL hits the order data endpoint.
order_pages = []
for entry in result['log']['entries']:
    url = entry['request']['url']
    if "order_list_v3" not in url:
        continue
    # Some captured entries may carry no body text; skip them instead of
    # raising KeyError.
    content = entry['response']['content'].get('text')
    if content:
        order_pages.append(content)

# Append one response per line. Nothing is created when no response matched,
# as before.
if order_pages:
    with open('data.json', 'a') as f:
        f.writelines(page + '\n' for page in order_pages)
整体思路就是这样的,最后就是处理数据提取输出Excel表,欢迎指出错误。
# Order statuses that are left out of the exported results.
_EXCLUDED_STATUSES = ('交易已取消', '未发货,退款成功', '已发货,退款成功')

# Source key names, in the order expected by _order_detail:
# (order sn, order amount, goods list, goods name, goods price, goods number,
#  status prompt).
_RAW_DATA_FIELDS = ('orderSn', 'orderAmount', 'orderGoods', 'goodsName',
                    'goodsPrice', 'goodsNumber', 'orderStatusPrompt')
_ORDER_LIST_FIELDS = ('order_sn', 'order_amount', 'order_goods', 'goods_name',
                      'goods_price', 'goods_number', 'order_status_prompt')


def _order_detail(order, fields):
    """Build the flat export record for one order, using its first goods entry."""
    sn, amount, goods, name, price, number, status = fields
    first_goods = order[goods][0]
    return {
        'order_sn': order[sn],
        'order_amount': order[amount],
        'goods_name': first_goods[name],
        'goods_price': first_goods[price],
        'goods_number': first_goods[number],
        'order_status_prompt': order[status],
    }


def get_data():
    """Load the saved order data and return the orders to export.

    Reads ``<phone_number>top_ten.json`` (the dumped ``window.rawData``) and
    ``<phone_number>data.json`` (one JSON document per line). Orders whose
    status prompt is one of the cancelled/refunded statuses are dropped.

    Returns:
        list[dict]: one record per kept order with the keys ``order_sn``,
        ``order_amount``, ``goods_name``, ``goods_price``, ``goods_number``
        and ``order_status_prompt``.
    """
    detail_list = []

    with open(phone_number.get() + 'top_ten.json', 'r', encoding='utf-8') as f1:
        # The file holds a JSON-encoded string that itself contains JSON,
        # so it is decoded twice.
        d = json.loads(json.load(f1))
    for order in d['ordersStore']['orders']:
        detail = _order_detail(order, _RAW_DATA_FIELDS)
        if detail['order_status_prompt'] not in _EXCLUDED_STATUSES:
            detail_list.append(detail)

    with open(phone_number.get() + 'data.json', 'r') as f:
        try:
            for item in jsonlines.Reader(f):
                for order in item['orders']:
                    detail = _order_detail(order, _ORDER_LIST_FIELDS)
                    if detail['order_status_prompt'] not in _EXCLUDED_STATUSES:
                        detail_list.append(detail)
        except Exception:
            # Best effort: stop at the first line that cannot be processed and
            # keep what was collected. Narrowed from a bare `except:` so that
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            print('没有更多订单了')
    return detail_list


def export_excel(export):
    """Write the order records to ``<phone_number>.xlsx``.

    Args:
        export: iterable of dicts as returned by ``get_data``.
    """
    # Fixed column order for the sheet.
    order = ['order_sn', 'order_amount', 'goods_name', 'goods_price',
             'goods_number', 'order_status_prompt']
    # Passing `columns` fixes the order and keeps the headers even when
    # `export` is empty, where selecting the columns afterwards raised KeyError.
    pf = pd.DataFrame(list(export), columns=order)
    # Replace the column names with their Chinese headers.
    columns_map = {
        'order_sn': '订单编号',
        'order_amount': '订单数额',
        'goods_name': '商品名称',
        'goods_price': '商品价格',
        'goods_number': '商品数量',
        'order_status_prompt': '订单状态',
    }
    pf = pf.rename(columns=columns_map)
    pf = pf.fillna(' ')
    # The context manager saves and closes the workbook. ExcelWriter.save()
    # and the to_excel `encoding` argument were removed in pandas 2.0.
    with pd.ExcelWriter(phone_number.get() + '.xlsx') as writer:
        pf.to_excel(writer, index=False)
欢迎指正错误。
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。