1. Install the Scrapy framework
pip install scrapy
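Once the install finishes, you can confirm Scrapy is available from the command line (a quick check, not part of the original steps; it prints the installed version):
scrapy version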
2. Create a folder named scrapy01 on drive E and open a command-prompt window in that folder
3. Create the project: scrapy startproject <project name>
scrapy startproject first_scrapy
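For reference, scrapy startproject generates the standard project skeleton; the files edited in the following steps (items.py, settings.py, pipelines.py and the spiders folder) all live inside it:
first_scrapy/
    scrapy.cfg
    first_scrapy/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py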
4. Open the scrapy01 folder with PyCharm
5. In items.py, define the fields that will hold the scraped data
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class FirstScrapyItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()      # book title
    price = scrapy.Field()      # price
    author = scrapy.Field()     # author
    date = scrapy.Field()       # publication date
    publisher = scrapy.Field()  # publisher
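An Item object is filled in like a dict, which is exactly how the spider below uses it; note that assigning a key that was not declared as a Field raises a KeyError. A minimal, illustrative sketch:

item = FirstScrapyItem()
item['title'] = 'some book title'   # key must match a declared Field
print(item['title'])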

6. Create the spider test.py in the spiders folder; the code is as follows:
# author:WN
# datetime:2019/11/3 15:29
import scrapy
from .. import items


class MySpider(scrapy.Spider):
    # spider name, used by "scrapy crawl"
    name = "mySpider"

    def start_requests(self):
        for num in range(1, 101):
            url = "http://search.dangdang.com/?key=Python&act=input&page_index=%d" % num
            # yield a Request; the downloaded response is handed to self.parse
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        try:
            data = response.text
            # Scrapy locates data with XPath through a Selector object
            select = scrapy.Selector(text=data)
            book_data = select.xpath("//ul[@class='bigimg']/li")
            # extract the individual fields for each book
            for book in book_data:
                item = items.FirstScrapyItem()
                title = book.xpath("./a/img/@alt").extract_first()
                price = book.xpath("./p[@class='price']/span[@class='search_now_price']/text()").extract_first()
                author = book.xpath("./p[@class='search_book_author']/span/a/@title").extract_first()
                date = book.xpath("./p[@class='search_book_author']/span[2]/text()").extract_first()
                publisher = book.xpath("./p[@class='search_book_author']/span/a[@name='P_cbs']/text()").extract_first()
                item['title'] = title.strip() if title else ''
                item['price'] = price.lstrip('¥') if price else ''
                item['author'] = author if author else ''
                item['date'] = date.strip() if date else ''
                item['publisher'] = publisher if publisher else ''
                yield item
        except Exception as e:
            print(e)
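To see how the Selector and these XPath expressions behave without hitting the site, you can run them against a hand-written snippet. The HTML below is an assumed, heavily simplified mock of one Dangdang search-result <li>, trimmed to just the attributes the spider reads; the real page contains much more markup.

import scrapy

# Simplified mock of one result <li> (assumed markup, illustration only)
html = """
<ul class='bigimg'>
  <li>
    <a href='#'><img alt=' Example Python Book ' src='cover.jpg'/></a>
    <p class='price'><span class='search_now_price'>¥52.40</span></p>
    <p class='search_book_author'>
      <span><a title='Some Author'>Some Author</a></span>
      <span> /2019-01-01</span>
      <span><a name='P_cbs'>Some Publisher</a></span>
    </p>
  </li>
</ul>
"""

book = scrapy.Selector(text=html).xpath("//ul[@class='bigimg']/li")[0]
print(book.xpath("./a/img/@alt").extract_first().strip())                                       # Example Python Book
print(book.xpath("./p[@class='price']/span[@class='search_now_price']/text()").extract_first().lstrip('¥'))  # 52.40
print(book.xpath("./p[@class='search_book_author']/span[2]/text()").extract_first().strip())    # /2019-01-01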

7. Add the following setting in settings.py so that the items yielded in test.py are pushed to the pipeline class in pipelines.py
# Register the item pipeline: '<project name>.pipelines.<class name>'
# 300 is the pipeline's priority; it can be any integer from 0 to 1000,
# and lower values run earlier when several pipelines are enabled
ITEM_PIPELINES = {
    'first_scrapy.pipelines.FirstScrapyPipeline': 300,
}
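If more pipelines were added later, these integers decide the order items pass through. A hypothetical example (JsonExportPipeline is not part of this project, shown only to illustrate the ordering):

ITEM_PIPELINES = {
    'first_scrapy.pipelines.FirstScrapyPipeline': 300,   # lower value, runs first
    'first_scrapy.pipelines.JsonExportPipeline': 400,    # hypothetical extra pipeline, runs second
}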
8. Write the code in pipelines.py. First create the MySQL database book and the table books:
create database book;
use book;
set character_set_results=gbk;
create table books(
    bTitle varchar(256) primary key,
    bPrice varchar(50),
    bAuthor varchar(50),
    bDate varchar(32),
    bPublisher varchar(256)
);
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql


class FirstScrapyPipeline(object):
    # called once when the spider starts
    def open_spider(self, spider):
        print('opened')
        try:
            # connect to the database
            self.con = pymysql.connect(host='localhost', port=3306, user='root',
                                       password='root', db='book', charset='utf8')
            # create a cursor
            self.cursor = self.con.cursor()
            self.opened = True
            self.count = 0
        except Exception as e:
            print(e)
            self.opened = False

    # called once when the spider closes
    def close_spider(self, spider):
        if self.opened:
            self.con.commit()
            self.con.close()
            self.opened = False
        print("close")
        print("Total books scraped:", self.count)

    def process_item(self, item, spider):
        try:
            print(item['title'])
            print(item['price'])
            print(item['author'])
            print(item['date'])
            print(item['publisher'])
            if self.opened:
                self.cursor.execute(
                    'insert into books(bTitle,bPrice,bAuthor,bDate,bPublisher) values (%s,%s,%s,%s,%s)',
                    (item['title'], item['price'], item['author'], item['date'], item['publisher'])
                )
                self.count += 1
        except Exception as err:
            print(err)
        return item
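After a crawl you can quickly check that the rows actually landed in MySQL. A small sketch, assuming the same local credentials as the pipeline above:

import pymysql

# connect with the same settings the pipeline uses and count the stored rows
con = pymysql.connect(host='localhost', port=3306, user='root',
                      password='root', db='book', charset='utf8')
cursor = con.cursor()
cursor.execute('select count(*) from books')
print(cursor.fetchone()[0], 'rows in books')
con.close()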

9. Run the project
(1) From the command line: scrapy crawl <spider name> -s LOG_ENABLED=False (the trailing option suppresses the log output)
scrapy crawl mySpider -s LOG_ENABLED=False
(2) Create run.py in the folder one level above spiders; running this file runs the project without opening a command window. The code is as follows:
# author:WN
# datetime:2019/11/3 15:36
from scrapy import cmdline
# Run the crawl command programmatically, so no separate command window is needed
# (scrapy crawl <spider name>, with LOG_ENABLED=False to hide the debug output)
cmdline.execute("scrapy crawl mySpider -s LOG_ENABLED=False".split())
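An equivalent approach is to use Scrapy's Python API directly instead of re-parsing a command line; run.py must still sit inside the project so scrapy.cfg and the project settings can be found:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# load the project settings (including ITEM_PIPELINES) and start the spider by name
process = CrawlerProcess(get_project_settings())
process.crawl("mySpider")
process.start()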