
Scraping Ctrip Scenic-Spot Data with the Scrapy Framework in Python

---------------------------------------------------------------------------------------------
[Copyright notice: this article is the author's original work; please credit the source when reposting]
Source: https://blog.csdn.net/sdksdk0/article/details/82381198

Author: 朱培      ID: sdksdk0
---------------------------------------------------------------------------------------------

This article uses the Scrapy framework with Python 3.6 to crawl scenic-spot data for Henan Province from Ctrip: the spot name, address, province/city/county, description, image URL, and so on. Searching the site gives the Henan listing page http://piao.ctrip.com/dest/u-_ba_d3_c4_cf/s-tickets/P1/, which is used as the starting URL, and the scraped data is stored in a MySQL database.
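
Aside from Scrapy itself, the database pipeline in step 7 needs the pymysql driver. Assuming a plain pip-based Python 3.6 environment, the two third-party packages can be installed with something like:

pip install scrapy pymysql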

 

1. Create the Scrapy project

scrapy startproject ctrip

2. Create the spider; first cd into the ctrip directory, then run:

scrapy genspider scenic "ctrip.com"
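
After these two commands the project layout should look roughly like the tree below (the generated template files can vary a little between Scrapy versions; start.py is added by hand in step 8):

ctrip/
    scrapy.cfg
    start.py
    ctrip/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py
            scenic.py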

3. In settings.py:

BOT_NAME = 'ctrip'

SPIDER_MODULES = ['ctrip.spiders']
NEWSPIDER_MODULE = 'ctrip.spiders'

ROBOTSTXT_OBEY = False

DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
}

DOWNLOADER_MIDDLEWARES = {
    'ctrip.middlewares.UserAgentDownloadMiddleware': 543,
}

ITEM_PIPELINES = {
    'ctrip.pipelines.DBPipeline': 300,
}

4. In middlewares.py:

import random


class UserAgentDownloadMiddleware(object):
    # Pool of User-Agent strings; a random one is attached to every outgoing request
    USER_AGENTS = [
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
        "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
        "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
        "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
        "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
    ]

    def process_request(self, request, spider):
        user_agent = random.choice(self.USER_AGENTS)
        request.headers['User-Agent'] = user_agent

5. items.py:

import scrapy


class ScenicItem(scrapy.Item):
    province = scrapy.Field()
    city = scrapy.Field()
    county = scrapy.Field()
    name = scrapy.Field()
    scenic_url = scrapy.Field()
    image_url = scrapy.Field()
    address = scrapy.Field()
    descript = scrapy.Field()
    code = scrapy.Field()

6. scenic.py:

# -*- coding: utf-8 -*-
import scrapy
import re
from ctrip.items import ScenicItem


class ScenicSpider(scrapy.Spider):
    name = 'scenic'
    allowed_domains = ['ctrip.com']
    start_urls = ['http://piao.ctrip.com/dest/u-_ba_d3_c4_cf/s-tickets/P1/']
    count = 0

    def parse(self, response):
        # each search-result block on the listing page
        trs = response.xpath("//div[@id='searchResultContainer']//div[@class='searchresult_product04']")
        for tr in trs:
            ctrip_url = tr.xpath(".//div[1]/a/@href").get()
            # numeric id taken from the detail-page URL (kept for reference, not used below)
            c1_url = ctrip_url.split("t/t")
            scenic_num = c1_url[1].split(".")
            scenic_num = scenic_num[0]
            scenic_url = ""
            image_url = tr.xpath(".//div[1]/a/img/@src").get()
            # strip the "地址:" / "特色:" prefixes from the address and description
            address = tr.xpath(".//div[1]/div[@class='adress']//text()").get().strip()
            address = re.sub(r"地址:", "", address)
            descript = tr.xpath(".//div[1]/div[@class='exercise']//text()").get().strip()
            descript = re.sub(r"特色:", "", descript)
            name = tr.xpath(".//div[1]//h2/a/text()").get().strip()
            # split the address into province / city / county with a cascade of regexes
            cityinfo = address
            province = "河南省"
            city = ""
            county = ""
            if "省" in cityinfo:
                matchObj = re.match(r'(.*)[?省](.+?)市(.+?)([县]|[区])', cityinfo, re.M | re.I)
                if matchObj:
                    province = matchObj.group(1) + "省"
                    city = matchObj.group(2) + "市"
                    if "县" in cityinfo:
                        county = matchObj.group(3) + "县"
                    else:
                        county = matchObj.group(3) + "区"
                else:
                    matchObj2 = re.match(r'(.*)[?省](.+?)市(.+?)市', cityinfo, re.M | re.I)
                    matchObj1 = re.match(r'(.*)[?省](.+?)市', cityinfo, re.M | re.I)
                    if matchObj2:
                        city = matchObj2.group(2) + "市"
                        county = matchObj2.group(3) + "市"
                    elif matchObj1:
                        city = matchObj1.group(2) + "市"
                    else:
                        matchObj1 = re.match(r'(.*)[?省](.+?)([县]|[区])', cityinfo, re.M | re.I)
                        if matchObj1:
                            if "县" in cityinfo:
                                county = matchObj1.group(2) + "县"
                            else:
                                county = matchObj1.group(2) + "区"
            else:
                matchObj = re.match(r'(.+?)市(.+?)([县]|[区])', cityinfo, re.M | re.I)
                if matchObj:
                    city = matchObj.group(1) + "市"
                    if "县" in cityinfo:
                        county = matchObj.group(2) + "县"
                    else:
                        county = matchObj.group(2) + "区"
                else:
                    matchObj = re.match(r'(.+?)市', cityinfo, re.M | re.I)
                    if matchObj:
                        city = matchObj.group(1) + "市"
                    else:
                        matchObj = re.match(r'(.+?)县', cityinfo, re.M | re.I)
                        if matchObj:
                            county = matchObj.group(1) + "县"
            self.count += 1
            # sequential record code: A1, A2, A3, ...
            code = "A" + str(self.count)
            item = ScenicItem(name=name, province=province, city=city, county=county,
                              address=address, descript=descript, scenic_url=scenic_url,
                              image_url=image_url, code=code)
            yield item
        # follow the "next page" link and parse it with the same callback
        next_url = response.xpath('//*[@id="searchResultContainer"]/div[11]/a[11]/@href').get()
        if next_url:
            yield scrapy.Request(url=response.urljoin(next_url), callback=self.parse)
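
The province/city/county splitting above is easier to follow in isolation. Below is a minimal sketch of the first regex branch, run against a made-up address string (the address is hypothetical, purely for illustration):

import re

cityinfo = "河南省洛阳市栾川县某某路1号"  # hypothetical address, not real Ctrip data

matchObj = re.match(r'(.*)[?省](.+?)市(.+?)([县]|[区])', cityinfo, re.M | re.I)
if matchObj:
    province = matchObj.group(1) + "省"
    city = matchObj.group(2) + "市"
    county = matchObj.group(3) + ("县" if "县" in cityinfo else "区")
    print(province, city, county)  # -> 河南省 洛阳市 栾川县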

7. pipelines.py, which saves the data to a MySQL database:

import pymysql


# Pipeline that stores items in MySQL
class DBPipeline(object):
    def __init__(self):
        # connect to the database
        self.connect = pymysql.connect(
            host='localhost',
            port=3306,
            db='edu_demo',
            user='root',
            passwd='123456',
            charset='utf8',
            use_unicode=True)
        # cursor used for all queries and inserts
        self.cursor = self.connect.cursor()

    def process_item(self, item, spider):
        try:
            # deduplication check: is there already a row with this URL?
            # (note: the spider currently leaves scenic_url empty, so this only
            # filters duplicates once that field is actually populated)
            self.cursor.execute(
                """select * from a_scenic where ctrip_url = %s""",
                (item['scenic_url'],))
            repetition = self.cursor.fetchone()
            if repetition:
                # duplicate record, skip it
                pass
            else:
                # insert the new record
                self.cursor.execute(
                    """insert into a_scenic(code, province, city, county, name, description, ctrip_url, image_url, address, type)
                    values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)""",
                    (item['code'],
                     item['province'],
                     item['city'],
                     item['county'],
                     item['name'],
                     item['descript'],
                     item['scenic_url'],
                     item['image_url'],
                     item['address'], '1'))
                # commit the statement
                self.connect.commit()
        except Exception as error:
            # log the error and keep processing
            print(error)
        return item
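
The pipeline assumes the edu_demo database and the a_scenic table already exist. The column names come from the INSERT statement above, but the types and lengths below are assumptions, so adjust them to your own data. A one-off setup script could look roughly like this:

import pymysql

# One-off setup: create the a_scenic table that DBPipeline writes to.
# Column names match the INSERT in pipelines.py; types/lengths are assumed.
connect = pymysql.connect(host='localhost', port=3306, db='edu_demo',
                          user='root', passwd='123456', charset='utf8')
cursor = connect.cursor()
cursor.execute("""
    CREATE TABLE IF NOT EXISTS a_scenic (
        id INT AUTO_INCREMENT PRIMARY KEY,
        code VARCHAR(20),
        province VARCHAR(50),
        city VARCHAR(50),
        county VARCHAR(50),
        name VARCHAR(200),
        description TEXT,
        ctrip_url VARCHAR(500),
        image_url VARCHAR(500),
        address VARCHAR(500),
        type VARCHAR(10)
    ) DEFAULT CHARSET=utf8
""")
connect.commit()
cursor.close()
connect.close()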

8. start.py:

from scrapy import cmdline

cmdline.execute("scrapy crawl scenic".split())

9. Run start.py to start the crawl.
