赞
踩
我自己写的dcd爬虫,这个网站比较简单。看了看别人的程序,觉得用起来挺别扭,就自己捣鼓了一天。弄出来了。
这个网站没有反爬,有一些是动态网页,有一些是静态。
首先,获取销量排行榜前300的车型。
- import os
- import json
- import requests
- from parsel import Selector
-
-
# ---------------------------------------------------------#
# ---- * Get the top 300 / top 100 best-selling cars * ----#
# ---------------------------------------------------------#


# Sales-ranking JSON endpoint (the site has no anti-scraping measures).
url = "https://www.dongchedi.com/motor/pc/car/rank_data"
# A desktop Chrome user-agent is the only header this endpoint needs.
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.64 Safari/537.36",
}
-
-
def get_param(page):
    """Build the query-string parameters for one ranking page.

    `page` is the row offset (a string such as "0", "10", "20", ...);
    every other parameter is fixed: 10 rows per page, sales rank
    (rank_data_type 11), city 烟台.
    """
    query = dict(
        aid="1839",
        app_name="auto_web_pc",
        city_name="烟台",
        count="10",
        offset=page,
        month="",
        new_energy_type="",
        rank_data_type="11",
        brand_id="",
        price="",
        manufacturer="",
        outter_detail_type="",
        nation="0",
    )
    return query
-
-
def get_response(pageNum):
    """GET one 10-row page of the sales ranking.

    pageNum is the zero-based page index; it becomes the endpoint's
    `offset` (pageNum * 10).  Raises requests.HTTPError on a non-2xx
    status, otherwise returns the Response (body already loaded, so
    .json() works after the connection is released).
    """
    params = get_param(str(pageNum * 10))

    # FIX: add a timeout so a stalled connection cannot hang the crawl
    # forever.  NOTE(review): verify=False disables TLS certificate
    # checking — kept for parity with the rest of the script, but
    # consider removing it.
    with requests.get(url=url, headers=headers, params=params,
                      verify=False, timeout=30) as resp:
        resp.raise_for_status()
        print(resp.status_code)
        return resp
-
-
# Pull 30 pages x 10 rows = the top-300 ranking, keeping each page's JSON.
data_list = []
for page in range(30):
    print(f"销量前{page * 10} 的车")
    data_list.append(get_response(page).json())
获取之后,就能访问该车型,一般一个车型有好多款式,我的目的是想比较一些车型的尺寸,所以一个车型就选第一种款式,访问进入该车型第一种款式的参数配置,这样把参数下载下来,放到一个文件里,就可以比较现在卖的车的尺寸情况。
第二部分,我尝试了一下动态请求车型的价格。不过这一部分后面数据分析没有用到。
-
# -- Flatten the 30 JSON pages into one table: series name, series id, --
# -- and the id of each series' first on-sale car style.               --
import jsonpath

name_list = jsonpath.jsonpath(data_list, "$..series_name")
id_list = jsonpath.jsonpath(data_list, "$..series_id")
first_list = jsonpath.jsonpath(data_list, "$..online_car_ids")

# First on-sale car id of each series (None when the series has none).
car_id_list = [ids[0] if ids else None for ids in first_list]

import pandas as pd

df = pd.DataFrame({
    "name": name_list,
    "series": id_list,
    "first_id": car_id_list,
})

# FIX: the original inspected the missing ids with
# `df[df['first_id'] == None]`, but `== None` is element-wise and always
# False in pandas, so that filter was silently empty; isna() is correct.
df[df['first_id'].isna()]

# Drop series without an on-sale car before saving.
df2 = df.dropna()

# Save, then re-read: the written index comes back as an extra first
# column, which the rename below labels 'rank'.
df2.to_csv("Pythn-Anlys-138/dcd/top300cars.csv")
df = pd.read_csv("Pythn-Anlys-138/dcd/top300cars.csv")
df.columns = ['rank', 'name', 'series', 'first_id']
df.to_csv("Pythn-Anlys-138/dcd/top300cars.csv")
-
# ---------------------------------------------------------#
# ---- * 价格 * ----#
# ---------------------------------------------------------#

# REPL leftover: shows the last id bound by the loop above.
first_id
-
def get_price(car_id):
    """Fetch dealer price quotes for one car id and save them to
    Pythn-Anlys-138/dcd/<car_id>.json.

    car_id must be a string (it is joined into the file name and sent
    as the `car_ids` query parameter).  Raises requests.HTTPError on a
    non-2xx response.
    """
    wk_dir = "Pythn-Anlys-138/dcd"
    fname = car_id + ".json"
    url = "https://www.dongchedi.com/motor/pc/car/series/car_dealer_price"
    params = {
        "aid": "1839",
        "app_name": "auto_web_pc",
        "car_ids": car_id,
        "city_name": "烟台"
    }

    # FIX: the original held a garbled placeholder ("。。。") inside a
    # local `headers` dict — a syntax error; reuse the module-level
    # browser headers instead.  A timeout is added so the request
    # cannot hang indefinitely.
    with requests.get(url=url, headers=headers, params=params,
                      verify=False, timeout=30) as resp:
        resp.raise_for_status()
        rj = resp.json()
    with open(os.path.join(wk_dir, fname), 'w', encoding="utf-8") as f:
        f.write(json.dumps(rj, ensure_ascii=False))
    print(f"保存文件成功 {car_id} !!!")
-
# The endpoint wants the id as a string; jsonpath yielded an int.
first_id = str(first_id)
get_price(first_id)
这一部分后期没什么用,代码也很乱。
第三部分,获取某一车型的第一种款式的参数。
-
- # ---------------------------------------------------------#
- # ---- * 参数配置 * ----#
- # ---------------------------------------------------------#
-
- from parsel import Selector
-
def get_detail_page(id):
    """Return the HTML of the parameter/spec page for one car id.

    The spec page is static, so a plain GET with the module-level
    browser headers is enough.  `id` must be a string (it is joined
    into the URL).  Raises requests.HTTPError on a non-2xx response.
    """
    url = "https://www.dongchedi.com/auto/params-carIds-" + id
    # FIX: the original held a garbled placeholder ("。。。") inside a
    # local `headers` dict — a syntax error; reuse the module-level
    # headers.  A timeout is added so the request cannot hang forever.
    with requests.get(url=url, headers=headers, verify=False, timeout=30) as resp:
        resp.raise_for_status()
        return resp.text
-
# --- REPL-style exploration of one spec page (superseded by parse_detail below) ---

# NOTE(review): `id` here is the builtin function unless a car id string was
# bound in the interactive session — as a plain script this line raises.
html = get_detail_page(id)

html

selector = Selector(html)

# Every spec row on the page is a div carrying a data-row-anchor attribute.
selector.css('div[data-row-anchor]')
len(selector.css('div[data-row-anchor]'))

all_rows = selector.css('div[data-row-anchor]')

# Each row: first cell holds the spec label, second cell holds its value.
dct_list = []
for row in all_rows:
    dct_item = {}
    label = row.css('div:nth-child(1) label::text').get()
    value = row.css('div:nth-child(2) div::text').get()
    dct_item[label] = value
    dct_list.append(dct_item)

dct_list

first_row = all_rows[0]
-
def parse_detail(id):
    """Download and parse one car's spec page.

    Returns {"id": <car id>, "detail": [{label: value}, ...]} where each
    single-entry dict is one spec row of the page.
    """
    rows = Selector(get_detail_page(id)).css('div[data-row-anchor]')
    detail_rows = [
        {
            row.css('div:nth-child(1) label::text').get():
                row.css('div:nth-child(2) div::text').get()
        }
        for row in rows
    ]
    return {
        "id": id,
        "detail": detail_rows,
    }
-
# One-off check of the parser (interactive; `id` must be bound in the session).
dct_detail = parse_detail(id)
dct_detail


# NOTE(review): `first_id_list` is never defined in this file — the ids were
# collected as `car_id_list` above; this bare reference raises NameError.
first_id_list
-
def save_detail(id, dct_detail):
    """Write one parsed spec dict to Pythn-Anlys-138/dcd/<id>_dcd_detail.json.

    Chinese text is kept readable in the file (ensure_ascii=False).
    """
    out_path = os.path.join("Pythn-Anlys-138/dcd", id + "_dcd_detail.json")
    with open(out_path, 'w', encoding='utf8') as out:
        json.dump(dct_detail, out, ensure_ascii=False)
    print(f"Detail file {id} saved!!!")
-
# FIX: `first_id_list` was never defined in this file; the first-style ids
# collected earlier live in `car_id_list` (None for series with no on-sale
# car), and the URL builder needs string ids.
for fid in car_id_list:
    if fid is None:
        continue  # series with no on-sale car style — nothing to fetch
    dct_detail = parse_detail(str(fid))
    save_detail(str(fid), dct_detail)
-
-
最后,下载了一些json文件。后期做了一些数据整理。做成了数据表是这样的。
结果还不错。
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。