1. Overview of the hands-on development process [crawler techniques]
· Development environment installation and setup
· Fetching car brand data
· Fetching car series data
· Batch import of the car data
2. What this chapter delivers:
· Web crawling
Get familiar with an open-source crawler framework and learn how to scrape dynamically rendered page content
Understand the key techniques behind web crawlers
· Data import
Learn how to use Neo4j's graphical interface to batch-create data nodes, create indexes, and create relationships
· Data assets
207 car brand records
1219 car series records
31768 car model records
plus a working "smart key" that unlocks data on the web
Install requests-html and wire it into the code below (for example `pip install requests-html fake-useragent`).
Development environment: working with requests-html
Steps: page element analysis, brand data scraping, structuring the data
Page: Autohome car price list — "2022最新汽车价格_汽车报价大全_汽车之家" (汽车之家)
The page lists a large number of brands. Inspecting the menu behind them, over on the far right, reveals its structure: the brand information is stored as a list of URLs.
Once the interactive page is rendered and crawled, each value can be pulled out of the markup based on where it is stored.
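Before the full script, a minimal sketch of that idea (the `.cartree ...` CSS selector comes from the page analysis above and is an assumption about Autohome's markup at the time of writing):

# Minimal check of the brand-menu selector; adjust the CSS selector
# if Autohome has changed its layout since this was written.
from requests_html import HTMLSession

session = HTMLSession()
r = session.get('https://car.autohome.com.cn/')
r.html.render()                          # execute the page's JavaScript
for a in r.html.find('.cartree ul li h3 a')[:5]:
    print(a.text, a.attrs.get('href'))   # e.g. "奥迪(28)" plus a relative link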
The full code is as follows:
from requests_html import HTMLSession
import os
import csv
import random

session = HTMLSession()
url = 'https://car.autohome.com.cn/'
# example series page: https://car.autohome.com.cn/config/series/3862.html

USER_AGENTS = [
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 LBBROWSER",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
    "Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
    "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10"
]

# Brand list (exploratory version: just print what the selector finds)
def get_bank():
    response = session.get(url, headers={"User-Agent": random.choice(USER_AGENTS)})
    response.html.render()
    banks = response.html.find('.cartree ul li h3')
    for bank in banks:
        bk = bank.text            # e.g. "奥迪(28)"
        start = bk.find("(")
        end = bk.find(")")
        print(bk[0:start])        # brand name
        print(bk[start + 1:end])  # model count
    print(banks)

# Car brands: write name, model count and brand URL for every brand to CSV
def get_url():
    # current working directory
    current_dir = os.path.abspath('.')
    print(current_dir)
    # the CSV file to write (Windows-style path, as in the original)
    file_name = os.path.join(current_dir, "data\\bank2.csv")
    print(file_name)
    # open the CSV for writing
    with open(file_name, 'wt', newline='') as csvfile1:
        # header row
        header = ['bank', 'count', 'url']
        writer = csv.writer(csvfile1)
        writer.writerow(header)
        # fetch the page and render its JavaScript, with a random User-Agent
        response = session.get(url, headers={"User-Agent": random.choice(USER_AGENTS)})
        response.html.render()
        banks = response.html.find('.cartree ul li h3 a')
        for bank in banks:
            bk = bank.text
            start = bk.find("(")
            end = bk.find(")")
            # brand name
            bank1 = bk[0:start]
            # model count
            num = bk[(start + 1):end]
            # URL where this brand's data lives
            href = bank.attrs.get("href")
            if href is None:              # skip anchors without a link
                continue
            url2 = url.rstrip('/') + href  # href starts with "/"; avoid a double slash
            print(url2)
            save2csv(writer, bank1, num, url2)
        # the with-block closes the file; no explicit close() needed

# append one row to the CSV file
def save2csv(writer, bank, num, url):
    writer.writerow([bank, num, url])

if __name__ == '__main__':
    print("Processing started")
    get_url()
    # get_bank()
    print("Processing finished")
The code runs! [Throw some confetti — but only some.]
There is a problem with the result, though: no data is actually fetched.
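One way to narrow this down (a debugging sketch of mine, not part of the original script): dump the rendered HTML and count the selector's matches. Note that the very first call to render() downloads a Chromium build via pyppeteer, which can itself fail on a restricted network.

# Debugging sketch: save the rendered page so the CSS selector can be
# checked against what the site actually serves.
from requests_html import HTMLSession

session = HTMLSession()
r = session.get('https://car.autohome.com.cn/')
r.html.render(sleep=2)   # give the page's JavaScript extra time to finish
with open('rendered.html', 'w', encoding='utf-8') as f:
    f.write(r.html.html)
print(len(r.html.find('.cartree ul li h3')))  # 0 means the selector is stale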
from requests_html import HTMLSession
import os
import csv
import random
import time

urlroot = "https://car.autohome.com.cn"
# example brand page: https://car.autohome.com.cn/price/brand-33.html

USER_AGENTS = [
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 LBBROWSER",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
    "Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
    "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10"
]

# Series list: visit every brand URL collected above and scrape its series
def get_bank():
    current_dir = os.path.abspath('.')
    print(current_dir)
    file_name1 = os.path.join(current_dir, "data\\type.csv")  # output: series per brand
    file_name2 = os.path.join(current_dir, "data\\bank.csv")  # input: the brand list

    with open(file_name1, 'wt', newline='') as csvfile1:
        writer = csv.writer(csvfile1)
        # read every brand row and its URL, keeping the brand name
        with open(file_name2, 'r') as csvfile2:
            reader = csv.reader(csvfile2)
            next(reader, None)  # skip the header row written by the first script
            for row in reader:
                # random browser User-Agent for every request
                headers = {"User-Agent": random.choice(USER_AGENTS)}
                session = HTMLSession()

                column1 = row[0]  # column 1: brand name
                column3 = row[2]  # column 3: brand page URL
                # crawl the brand pages one by one
                response = session.get(column3, headers=headers)
                response.html.render()
                print("URL=", column3)

                # structure worked out while analysing the downloaded page
                banks = response.html.find('.cartree ul li dd a')
                for bank in banks:
                    bk = bank.text
                    # search from the right: a series name may itself contain "("
                    start = bk.rfind("(")
                    end = bk.rfind(")")
                    bank1 = bk[0:start]        # series name, e.g. 奥迪A4L
                    num = bk[(start + 1):end]  # model count
                    href = bank.attrs.get("href")
                    if href is None:           # skip anchors without a link
                        continue
                    url2 = urlroot + href
                    print(column1 + " " + bank1 + " " + num, url2)
                    # strip trailing whitespace left around the name
                    save2csv(writer, column1, bank1.rstrip(), num, url2)
                csvfile1.flush()
                print(banks)
                time.sleep(1)  # pause between brand pages

# append one row to the CSV file
def save2csv(writer, bank, serise, num, url):
    try:
        writer.writerow([bank, serise, num, url])
    except IOError:
        print("Error: could not open or write the file")

if __name__ == '__main__':
    # get_url()
    get_bank()
    print("Processing finished")
The data fetched is still empty. [My guess: the structure my scraper targets no longer matches — the site has probably updated its layout.]
With the processed data in hand, move on to the import:
· brand and series data: put both CSV tables into Neo4j's import directory (LOAD CSV's file:/// URLs resolve there)
Startup command: neo4j.bat console
Browser UI: http://localhost:7474/
## Brand data import:
LOAD CSV WITH HEADERS FROM "file:///bank.csv" AS line
CREATE (:Bank {name: line.bank, count: line.count})
With only two slashes ("file://bank.csv") this fails with Neo.ClientError.Statement.ExternalResourceFailed: on Windows, file: must be followed by three slashes (on Linux an absolute path already starts with /, so the URL again ends up as file:///).
For this common import error, see the CSDN post "neo4j导入.csv文件时常见问题之Neo.ClientError.Statement.ExternalResourceFailed解决方法" on Ray Mond's blog.
If you need to delete all nodes and the node graph: MATCH (n) DETACH DELETE n
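The chapter outline also promises index creation. A sketch in Neo4j 4.x syntax (mine, not from the original post; 3.x servers use CREATE INDEX ON :Bank(name) instead, and the index names here are illustrative):

// Index the name properties so the MATCH in the relationship import
// further below does not have to scan every node.
CREATE INDEX bank_name IF NOT EXISTS FOR (b:Bank) ON (b.name);
CREATE INDEX serise_name IF NOT EXISTS FOR (s:Serise) ON (s.name);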
## Series data import:
LOAD CSV WITH HEADERS FROM "file:///serise.csv" AS line
CREATE (p:Serise {name: line.serise, count: line.count})
After creating the structure, something is still off: the nodes show up as numbers rather than the car categories shown in the video.
[Fix] Step 1: add header rows to the bank and type tables, matching the column names used in the Neo4j CREATE statements;
Step 2: change the .csv files' encoding: open each one in Notepad and choose UTF-8 as the encoding when saving it again (a scripted version follows).
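Step 2 can also be scripted. A minimal sketch, assuming the files were originally saved as GBK (the usual ANSI default on a Chinese-locale Windows; adjust the source encoding if yours differs):

# Re-encode the CSVs as UTF-8 so LOAD CSV reads the Chinese names correctly.
for name in ('bank.csv', 'type.csv'):
    with open(name, 'r', encoding='gbk') as src:   # assumed original encoding
        data = src.read()
    with open(name, 'w', encoding='utf-8') as dst:
        dst.write(data)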
## Brand/series relationship import
Create the subtype relationship between brands and series:
LOAD CSV WITH HEADERS FROM "file:///type.csv" AS line
MATCH (entity1:Bank {name: line.bank}), (entity2:Serise {name: line.serise})
CREATE (entity1)-[:Subtype {type: line.relation}]->(entity2)
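To confirm the edges landed, a quick sanity-check query in the browser UI (again mine, not from the original post):

// Show a sample of brand-to-series relationships
MATCH (b:Bank)-[:Subtype]->(s:Serise)
RETURN b.name, s.name LIMIT 25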
That wraps up what this section set out to learn: data import and relationship creation.