当前位置:   article > 正文

我的dcd爬虫-Python

我的dcd爬虫-Python

我自己写的dcd爬虫,这个网站比较简单。看了看别人的程序,觉得用起来挺别扭,就自己捣鼓了一天。弄出来了。

这个网站没有反爬,有一些是动态网页,有一些是静态。

首先,获取销量排行榜前300的车型。

  1. import os
  2. import json
  3. import requests
  4. from parsel import Selector
  5. # ---------------------------------------------------------#
  6. # ---- * 获得车辆销售排行榜前300、100的车 * ----#
  7. # ---------------------------------------------------------#
# Sales-ranking endpoint and the request headers shared by the calls below.
url = "https://www.dongchedi.com/motor/pc/car/rank_data"
headers = {
    # Desktop Chrome UA; the site serves the same JSON without it, but it is
    # kept to look like a normal browser request.
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.64 Safari/537.36",
}
  12. def get_param(page):
  13. params = {
  14. "aid": "1839",
  15. "app_name": "auto_web_pc",
  16. "city_name": "烟台",
  17. "count": "10",
  18. "offset": page,
  19. "month": "",
  20. "new_energy_type": "",
  21. "rank_data_type": "11",
  22. "brand_id": "",
  23. "price": "",
  24. "manufacturer": "",
  25. "outter_detail_type": "",
  26. "nation": "0"
  27. }
  28. return params
  29. def get_response(pageNum):
  30. params = get_param(str(pageNum * 10))
  31. with requests.get(url=url, headers=headers, params=params, verify=False) as resp:
  32. resp.raise_for_status()
  33. print(resp.status_code)
  34. return resp
  35. data_list = []
  36. for i in range(30):
  37. print(f"销量前{i * 10} 的车")
  38. response = get_response(i)
  39. data_list.append(response.json())

获取之后,就能访问该车型。一般一个车型有好多款式,我的目的是想比较一些车型的尺寸,所以每个车型只选第一种款式,访问该款式的参数配置页,把参数下载下来放到一个文件里,就可以比较现在在售车型的尺寸情况。

第二部分,我尝试了一下动态请求车型的价格。不过这一部分后面数据分析没有用到。

  1. len(data_list)
  2. import jsonpath
  3. data_list[0]['data']['list'][0]['series_name']
  4. name_list = jsonpath.jsonpath(data_list, "$..series_name")
  5. id_list = jsonpath.jsonpath(data_list, "$..series_id")
  6. id_list
  7. first_list = jsonpath.jsonpath(data_list, "$..online_car_ids")
  8. first_list[0][0]
  9. car_id_list = []
  10. for ls in first_list:
  11. if ls:
  12. first_id = ls[0]
  13. else:
  14. first_id = None
  15. car_id_list.append(first_id)
  16. len(car_id_list)
  17. import pandas as pd
  18. df = pd.DataFrame({
  19. "name": name_list,
  20. "series": id_list,
  21. "first_id":car_id_list
  22. })
  23. df
  24. df[df['first_id'] == None]
  25. df2 = df.dropna()
  26. df.shape
  27. df2.shape
  28. df2.to_csv("Pythn-Anlys-138/dcd/top300cars.csv")
  29. df = pd.read_csv("Pythn-Anlys-138/dcd/top300cars.csv")
  30. df.keys()
  31. df.columns
  32. df.columns = ['rank', 'name', 'series', 'first_id']
  33. df.to_csv("Pythn-Anlys-138/dcd/top300cars.csv")
  34. # ---------------------------------------------------------#
  35. # ---- * 价格 * ----#
  36. # ---------------------------------------------------------#
  37. first_id
  38. def get_price(car_id):
  39. import json
  40. import os
  41. wk_dir = "Pythn-Anlys-138/dcd"
  42. # fpath = wk_dir + "/" + car_id + ".csv"
  43. fname = car_id + ".json"
  44. url = "https://www.dongchedi.com/motor/pc/car/series/car_dealer_price"
  45. headers = {
  46. 。。。
  47. }
  48. params = {
  49. "aid": "1839",
  50. "app_name": "auto_web_pc",
  51. "car_ids": car_id,
  52. "city_name": "烟台"
  53. }
  54. with requests.get(url=url, headers=headers, params=params, verify=False) as resp:
  55. resp.raise_for_status()
  56. # print(resp.json())
  57. rj = resp.json()
  58. with open(os.path.join(wk_dir, fname), 'w', encoding="utf-8") as f:
  59. f.write(json.dumps(rj, ensure_ascii=False))
  60. print(f"保存文件成功 {car_id} !!!")
  61. first_id = str(first_id)
  62. get_price(first_id)

这一部分后期没什么用,代码也很乱。

第三部分,获取某一车型的第一种款式的参数。

  1. # ---------------------------------------------------------#
  2. # ---- * 参数配置 * ----#
  3. # ---------------------------------------------------------#
  4. from parsel import Selector
  5. def get_detail_page(id):
  6. url = "https://www.dongchedi.com/auto/params-carIds-" + id
  7. headers = {
  8. 。。。
  9. }
  10. with requests.get(url=url, headers=headers, verify=False) as resp:
  11. resp.raise_for_status()
  12. # print(resp.text)
  13. return resp.text
  14. html = get_detail_page(id)
  15. html
  16. selector = Selector(html)
  17. selector.css('div[data-row-anchor]')
  18. len(selector.css('div[data-row-anchor]'))
  19. all_rows = selector.css('div[data-row-anchor]')
  20. dct_list = []
  21. for row in all_rows:
  22. dct_item = {}
  23. label = row.css('div:nth-child(1) label::text').get()
  24. value = row.css('div:nth-child(2) div::text').get()
  25. dct_item[label] = value
  26. dct_list.append(dct_item)
  27. dct_list
  28. first_row = all_rows[0]
  29. def parse_detail(id):
  30. html = get_detail_page(id)
  31. selector = Selector(html)
  32. all_rows = selector.css('div[data-row-anchor]')
  33. dct_list = []
  34. for row in all_rows:
  35. dct_item = {}
  36. label = row.css('div:nth-child(1) label::text').get()
  37. value = row.css('div:nth-child(2) div::text').get()
  38. dct_item[label] = value
  39. dct_list.append(dct_item)
  40. dct_detail = {
  41. "id":id,
  42. "detail":dct_list
  43. }
  44. return dct_detail
# Single-id smoke test of parse_detail (interactive transcript; the bare
# names below were REPL echoes and have no effect when run as a script).
dct_detail = parse_detail(id)
dct_detail
first_id_list
  48. def save_detail(id, dct_detail):
  49. fname = id + "_dcd_detail.json"
  50. with open(os.path.join("Pythn-Anlys-138/dcd", fname), 'w', encoding='utf8') as f:
  51. f.write(json.dumps(dct_detail, ensure_ascii=False))
  52. print(f"Detail file {id} saved!!!")
  53. for fid in first_id_list:
  54. dct_detail = parse_detail(fid)
  55. save_detail(fid, dct_detail)

最后,下载了一些json文件。后期做了一些数据整理,做成的数据表是这样的。

 结果还不错。

声明:本文内容由网友自发贡献,转载请注明出处:【wpsshop】
推荐阅读
相关标签
  

闽ICP备14008679号