赞
踩
items:
class GiteeItem(scrapy.Item):
    """Container for one repository entry scraped from the Gitee explore page."""

    # URL of the repository.
    link = scrapy.Field()
    # Text of the repository's description element.
    desc = scrapy.Field()
db:
- import emoji
- import pymysql
-
# Module-level MySQL connection and cursor, created once at import time and
# reused by every insert in this module.
# NOTE(review): the credentials are hard-coded; consider loading them from
# configuration or environment variables before deploying.
connect = pymysql.connect(
    host='localhost',
    user='root',
    password='root',
    # `database` is the documented keyword; `db` is only a deprecated alias.
    database='mindsa',
    # utf8mb4 is requested so 4-byte characters can be transmitted.
    charset='utf8mb4',
)
cursor = connect.cursor()
-
-
def insertGitee(item):
    """Insert one scraped repository row into the ``gitee`` table.

    Both fields are passed through ``emoji.demojize`` before being stored.

    Args:
        item: Mapping-like object exposing ``link`` and ``desc`` keys. A
            value of ``None`` is stored as SQL NULL instead of crashing.

    Raises:
        Exception: Any error raised while executing or committing is
            re-raised after the transaction has been rolled back.
    """
    # demojize() only accepts strings, so leave missing values untouched.
    link, desc = (
        None if value is None else emoji.demojize(value)
        for value in (item['link'], item['desc'])
    )
    try:
        # Placeholders let the driver quote the values, so quotes in scraped
        # text can neither break the statement nor inject SQL.
        cursor.execute(
            "INSERT INTO gitee(link,`desc`) VALUES (%s,%s)",
            (link, desc),
        )
        connect.commit()
    except Exception:
        # Do not leave the shared connection stuck in a failed transaction.
        connect.rollback()
        raise
pipelines:
class GiteePipeline:
    """Item pipeline that persists every scraped item via ``insertGitee``."""

    def process_item(self, item, spider):
        """Store ``item`` in the database and hand it on unchanged.

        Args:
            item: The scraped item to persist.
            spider: The spider that produced the item (unused here).

        Returns:
            The same ``item`` object that was passed in.
        """
        insertGitee(item)
        # Scrapy passes the return value to the next pipeline stage, so the
        # item must be returned rather than implicitly returning None.
        return item
settings:
# Enabled item pipelines, keyed by import path and mapped to their order value.
ITEM_PIPELINES = {
    'myscrapy.pipelines.GiteePipeline': 300,
}
GiteeSprider:
- import scrapy
-
- from myscrapy.items import GiteeItem
-
-
class GiteeSprider(scrapy.Spider):
    """Spider that walks the Gitee explore listing page by page.

    Yields one ``GiteeItem`` per repository entry and follows the
    pagination "next" link until there is none.
    """

    name = 'gitee'
    # Scrapy reads ``allowed_domains`` (a list) to filter off-site requests.
    allowed_domains = ['gitee.com']
    # Misspelt original attribute, retained only so existing references to
    # it keep working; Scrapy itself does not read it.
    allow_domains = 'gitee.com'
    start_urls = ['https://gitee.com/explore/all']

    def parse(self, response, **kwargs):
        """Extract repository items from one listing page.

        Args:
            response: The downloaded listing page.
            **kwargs: Extra callback keyword arguments (unused).

        Yields:
            A ``GiteeItem`` for every repository entry that has a link,
            followed by a request for the next page when one exists.
        """
        # Absolute XPath that selects every repository entry on the page.
        repo_nodes = response.xpath(
            '//div[@class="ui relaxed divided items explore-repo__list"]'
            '//div[@class="item"]'
        )
        for node in repo_nodes:
            # Nested queries must be relative: use ``.//`` rather than ``//``,
            # otherwise the search restarts from the document root.
            href = node.xpath('.//h3/a/@href').get()
            if href is None:
                # Entry without a link; nothing useful to store.
                continue
            item = GiteeItem()
            # urljoin yields a full URL including the scheme, resolved
            # against the page URL.
            item['link'] = response.urljoin(href)
            item['desc'] = node.xpath('.//div[@class="project-desc"]/text()').get()
            yield item

        # Several attribute conditions in one predicate are joined with ``and``.
        next_href = response.xpath(
            '//div[@class="ui tiny pagination menu"]//a[@class="icon item" and @rel="next"]/@href'
        ).get()

        if next_href is not None:
            # A next page exists: keep crawling. follow() resolves relative
            # links against the current page URL.
            yield response.follow(next_href, callback=self.parse)
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。