---------------------------------------------------------------------------------------------
[Copyright notice: this article is the author's original work; please credit the source when reposting]
Article source: https://blog.csdn.net/sdksdk0/article/details/82381198
Author: 朱培  ID: sdksdk0
--------------------------------------------------------------------------------------------
This article uses the Scrapy framework with Python 3.6 to crawl Ctrip (携程) for scenic spots in Henan province, collecting each spot's name, address, province/city/county, description, and image URL. A search on Ctrip gives the Henan listing page at http://piao.ctrip.com/dest/u-_ba_d3_c4_cf/s-tickets/P1/, which we use as the starting URL, and the scraped data is saved into a MySQL database.
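Before building anything, you can optionally sanity-check the start page in scrapy shell, using the same listing-container XPath that the spider in step 6 relies on (if this returns an empty list, the page layout has changed since the post was written):

scrapy shell "http://piao.ctrip.com/dest/u-_ba_d3_c4_cf/s-tickets/P1/"
>>> response.xpath("//div[@id='searchResultContainer']//div[@class='searchresult_product04']")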
1. Create the Scrapy project
scrapy startproject ctrip
2. Create the spider; first change into the ctrip directory
scrapy genspider scenic "ctrip.com"
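After these two commands the project layout should look roughly like this (the standard layout Scrapy generates; start.py is added by hand in step 8):

ctrip/
    scrapy.cfg
    start.py            (added in step 8)
    ctrip/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py
            scenic.py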
3. In settings.py:
BOT_NAME = 'ctrip'

SPIDER_MODULES = ['ctrip.spiders']
NEWSPIDER_MODULE = 'ctrip.spiders'

# Do not honor robots.txt for this crawl
ROBOTSTXT_OBEY = False

DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
}

# Enable the random User-Agent middleware defined in step 4
DOWNLOADER_MIDDLEWARES = {
    'ctrip.middlewares.UserAgentDownloadMiddleware': 543,
}

# Enable the MySQL pipeline defined in step 7
ITEM_PIPELINES = {
    'ctrip.pipelines.DBPipeline': 300,
}
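A download delay is not part of the original settings, but if you want to be gentler on the site you can optionally add one:

DOWNLOAD_DELAY = 1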
4. In middlewares.py
import random


class UserAgentDownloadMiddleware(object):
    # A small pool of User-Agent strings to rotate through
    USER_AGENTS = [
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
        "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
        "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
        "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
        "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5"
    ]

    def process_request(self, request, spider):
        # Pick a random User-Agent for every outgoing request
        user_agent = random.choice(self.USER_AGENTS)
        request.headers['User-Agent'] = user_agent
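To confirm the middleware really is rotating agents, one optional check (not in the original code) is to log the header that was actually sent, from inside the spider's parse method:

self.logger.info("User-Agent used: %s", response.request.headers.get('User-Agent'))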
5. items.py
import scrapy


# One scraped scenic spot
class ScenicItem(scrapy.Item):
    province = scrapy.Field()
    city = scrapy.Field()
    county = scrapy.Field()
    name = scrapy.Field()
    scenic_url = scrapy.Field()
    image_url = scrapy.Field()
    address = scrapy.Field()
    descript = scrapy.Field()
    code = scrapy.Field()
6. scenic.py
# -*- coding: utf-8 -*-
import scrapy
import re
from ctrip.items import ScenicItem


class ScenicSpider(scrapy.Spider):
    name = 'scenic'
    allowed_domains = ['ctrip.com']
    start_urls = ['http://piao.ctrip.com/dest/u-_ba_d3_c4_cf/s-tickets/P1/']
    count = 0

    def parse(self, response):
        # Each scenic spot on the listing page sits in a searchresult_product04 block
        trs = response.xpath("//div[@id='searchResultContainer']//div[@class='searchresult_product04']")

        for tr in trs:
            ctrip_url = tr.xpath(".//div[1]/a/@href").get()
            # The numeric id of the spot is embedded in the detail URL (".../t/t<id>.html")
            c1_url = ctrip_url.split("t/t")
            scenic_num = c1_url[1].split(".")
            scenic_num = scenic_num[0]
            # Absolute detail-page URL; the pipeline de-duplicates on this field
            scenic_url = response.urljoin(ctrip_url)
            image_url = tr.xpath(".//div[1]/a/img/@src").get()
            address = tr.xpath(".//div[1]/div[@class='adress']//text()").get().strip()
            address = re.sub(r"地址:", "", address)
            descript = tr.xpath(".//div[1]/div[@class='exercise']//text()").get().strip()
            descript = re.sub(r"特色:", "", descript)
            name = tr.xpath(".//div[1]//h2/a/text()").get().strip()

            # Split the address into province / city / county(district) with a regex cascade
            cityinfo = address
            province = "河南省"
            city = ""
            county = ""
            if "省" in cityinfo:
                matchObj = re.match(r'(.*)[?省](.+?)市(.+?)([县]|[区])', cityinfo, re.M | re.I)
                if matchObj:
                    province = matchObj.group(1) + "省"
                    city = matchObj.group(2) + "市"
                    if "县" in cityinfo:
                        county = matchObj.group(3) + "县"
                    else:
                        county = matchObj.group(3) + "区"
                else:
                    matchObj2 = re.match(r'(.*)[?省](.+?)市(.+?)市', cityinfo, re.M | re.I)
                    matchObj1 = re.match(r'(.*)[?省](.+?)市', cityinfo, re.M | re.I)
                    if matchObj2:
                        city = matchObj2.group(2) + "市"
                        county = matchObj2.group(3) + "市"
                    elif matchObj1:
                        city = matchObj1.group(2) + "市"
                    else:
                        matchObj1 = re.match(r'(.*)[?省](.+?)([县]|[区])', cityinfo, re.M | re.I)
                        if matchObj1:
                            if "县" in cityinfo:
                                county = matchObj1.group(2) + "县"
                            else:
                                county = matchObj1.group(2) + "区"
            else:
                # Addresses without an explicit province: try city + county/district first
                matchObj = re.match(r'(.+?)市(.+?)([县]|[区])', cityinfo, re.M | re.I)
                if matchObj:
                    city = matchObj.group(1) + "市"
                    if "县" in cityinfo:
                        county = matchObj.group(2) + "县"
                    else:
                        county = matchObj.group(2) + "区"
                else:
                    matchObj = re.match(r'(.+?)市', cityinfo, re.M | re.I)
                    if matchObj:
                        city = matchObj.group(1) + "市"
                    else:
                        matchObj = re.match(r'(.+?)县', cityinfo, re.M | re.I)
                        if matchObj:
                            county = matchObj.group(1) + "县"

            # Assign a simple running code (A1, A2, ...) to each item
            self.count += 1
            code = "A" + str(self.count)

            item = ScenicItem(name=name, province=province, city=city, county=county, address=address,
                              descript=descript, scenic_url=scenic_url, image_url=image_url, code=code)
            yield item

        # Follow the "next page" link until there is none
        next_url = response.xpath('//*[@id="searchResultContainer"]/div[11]/a[11]/@href').get()
        if next_url:
            yield scrapy.Request(url=response.urljoin(next_url), callback=self.parse)
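The address-parsing cascade above is easiest to understand with a concrete input. A quick standalone check of the province/city/city branch (the sample address is purely illustrative):

import re

address = "河南省郑州市登封市嵩山少林寺"
m = re.match(r'(.*)[?省](.+?)市(.+?)市', address)
print(m.group(1) + "省", m.group(2) + "市", m.group(3) + "市")
# prints: 河南省 郑州市 登封市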
7. pipelines.py, which saves the data to MySQL
import pymysql


# Pipeline that stores scraped items in MySQL
class DBPipeline(object):
    def __init__(self):
        # Connect to the database
        self.connect = pymysql.connect(
            host='localhost',
            port=3306,
            db='edu_demo',
            user='root',
            passwd='123456',
            charset='utf8',
            use_unicode=True)

        # Cursor used for all queries and inserts
        self.cursor = self.connect.cursor()

    def process_item(self, item, spider):
        try:
            # De-duplication: skip spots whose ctrip_url is already stored
            self.cursor.execute(
                """select * from a_scenic where ctrip_url = %s""",
                (item['scenic_url'],))
            repetition = self.cursor.fetchone()

            if repetition:
                # Already present, do nothing
                pass
            else:
                # Insert the new record
                self.cursor.execute(
                    """insert into a_scenic(code, province, city, county, name, description, ctrip_url, image_url, address, type)
                       values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)""",
                    (item['code'],
                     item['province'],
                     item['city'],
                     item['county'],
                     item['name'],
                     item['descript'],
                     item['scenic_url'],
                     item['image_url'],
                     item['address'], '1'))

                # Commit the insert
                self.connect.commit()

        except Exception as error:
            # Print the error and keep crawling
            print(error)
        return item
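The pipeline assumes a table named a_scenic already exists in the edu_demo database; the original post does not show its schema. A minimal sketch that matches the columns used above (column types and lengths are assumptions) could look like this:

CREATE TABLE a_scenic (
    id INT AUTO_INCREMENT PRIMARY KEY,
    code VARCHAR(20),
    province VARCHAR(50),
    city VARCHAR(50),
    county VARCHAR(50),
    name VARCHAR(255),
    description TEXT,
    ctrip_url VARCHAR(500),
    image_url VARCHAR(500),
    address VARCHAR(500),
    type VARCHAR(10)
) DEFAULT CHARSET=utf8;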
8. start.py
from scrapy import cmdline

cmdline.execute("scrapy crawl scenic".split())
9. Run start.py and the crawl starts.
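Equivalently, you can launch the spider straight from the project root without start.py:

scrapy crawl scenic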