Local environment: Python 3, Windows 10
My video walkthrough: https://www.bilibili.com/video/BV1xh411C7Xc/
a. Install pymongo
pip install pymongo
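
To confirm the install worked, you can print the driver version from the command line:

python -c "import pymongo; print(pymongo.version)"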
b. Basic operations
import pymongo

myclient = pymongo.MongoClient("mongodb://localhost:27017/")  # connect with a MongoClient object
collist = myclient.list_database_names()  # list all database names
mydb = myclient["runoobdb"]  # database name
mycol = mydb["sites"]        # collection (similar to a table in SQL)

# insert a single document
mydict = { "_id": 1, "name": "RUNOOB", "cn_name": "菜鸟教程"}
mycol.insert_one(mydict)

# insert several documents at once
mylist = [
    { "_id": 2, "name": "Google", "address": "Google 搜索"},
    { "_id": 3, "name": "Facebook", "address": "脸书"},
    { "_id": 4, "name": "Taobao", "address": "淘宝"},
    { "_id": 5, "name": "Zhihu", "address": "知乎"}
]
mycol.insert_many(mylist)

# query: hide _id, show name and address
for x in mycol.find({}, { "_id": 0, "name": 1, "address": 1 }):
    print(x)

# delete all documents
mycol.delete_many({})
print(list(mycol.find()))
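
Updates follow the same pattern as the calls above; a minimal sketch reusing the same runoobdb/sites names:

import pymongo

mycol = pymongo.MongoClient("mongodb://localhost:27017/")["runoobdb"]["sites"]
mycol.insert_one({"_id": 1, "name": "RUNOOB"})
# match on a field, then $set adds/overwrites fields on the first match
mycol.update_one({"name": "RUNOOB"}, {"$set": {"visited": True}})
print(mycol.find_one({"name": "RUNOOB"}))  # {'_id': 1, 'name': 'RUNOOB', 'visited': True}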

scrapy startproject tutorial
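
startproject generates a skeleton roughly like this (key files only):

tutorial/
    scrapy.cfg          # deploy configuration
    tutorial/           # the project's Python package
        items.py        # item definitions (edited below)
        pipelines.py    # item pipelines (edited below)
        settings.py     # project settings (edited below)
        spiders/        # spider modules go here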
Add a quotes_spider.py file under the tutorial/spiders directory:
import scrapy
from ..items import TutorialItem

class QuotesSpider(scrapy.Spider):
    name = "quotes"
    # alternatively, just list the start URLs:
    # start_urls = ['http://quotes.toscrape.com/page/1/']

    def start_requests(self):
        url_bone = 'http://quotes.toscrape.com/page/{}/'
        for i in range(1, 3):  # the loop must terminate, otherwise requests are generated forever
            url = url_bone.format(i)
            print('url: {}'.format(url))
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        div_list = response.xpath('/html/body/div/div[2]/div[1]/div')[:3]
        for div in div_list:
            item = TutorialItem()  # a fresh item per quote, so earlier yields are not overwritten
            item['words'] = div.xpath('./span[1]/text()').extract_first()
            item['person'] = div.xpath('./span[2]/small/text()').extract_first()
            yield item
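
The absolute XPath above breaks easily if the page layout shifts. A sketch of a drop-in replacement for the parse method of QuotesSpider, using the CSS classes on quotes.toscrape.com (div.quote, span.text, small.author):

    def parse(self, response):
        for div in response.css('div.quote'):
            item = TutorialItem()
            item['words'] = div.css('span.text::text').get()    # .get() is the newer alias for extract_first()
            item['person'] = div.css('small.author::text').get()
            yield item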

Two ways to find an XPath:
a. Press F12 to inspect, locate the element, then right-click → Copy → Copy XPath
b. Use the Chrome extension XPath Helper (you can also verify an XPath interactively, as shown below)
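
Before hard-coding an XPath into the spider, you can test it in Scrapy's interactive shell:

scrapy shell "http://quotes.toscrape.com/page/1/"
>>> response.xpath('/html/body/div/div[2]/div[1]/div/span[1]/text()').extract_first()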
Define the item fields in tutorial/items.py:

import scrapy

class TutorialItem(scrapy.Item):
    # define the fields for your item here like:
    words = scrapy.Field()   # the text of the quote
    person = scrapy.Field()  # the person who said it
In tutorial/settings.py, change or add:

ROBOTSTXT_OBEY = False  # change to False
LOG_LEVEL = 'ERROR'     # only print logs on errors
DOWNLOAD_DELAY = 1      # wait 1 second between downloads
# default request headers
DEFAULT_REQUEST_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0",
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
}
# enable the pipeline
ITEM_PIPELINES = {
    'tutorial.pipelines.TutorialPipeline': 300,
}
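
If a fixed DOWNLOAD_DELAY is too coarse, Scrapy's built-in AutoThrottle extension adjusts the delay based on server load; a minimal settings sketch:

AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 1   # initial delay in seconds
AUTOTHROTTLE_MAX_DELAY = 10    # upper bound when the server responds slowly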
In tutorial/pipelines.py, write each item into MongoDB:

import pymongo

class TutorialPipeline(object):
    def __init__(self):
        super().__init__()
        myclient = pymongo.MongoClient("mongodb://localhost:27017/")
        mydb = myclient["runoobdb"]
        self.mycol = mydb["quotes"]

    def process_item(self, item, spider):
        print(item)
        self.mycol.insert_one(dict(item))
        return item
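
Opening the client in __init__ works, but Scrapy also calls open_spider/close_spider hooks on pipelines, which lets the connection be closed cleanly; an alternative sketch of the same pipeline:

import pymongo

class TutorialPipeline(object):
    def open_spider(self, spider):
        # called once when the spider starts
        self.myclient = pymongo.MongoClient("mongodb://localhost:27017/")
        self.mycol = self.myclient["runoobdb"]["quotes"]

    def close_spider(self, spider):
        # called once when the spider finishes
        self.myclient.close()

    def process_item(self, item, spider):
        self.mycol.insert_one(dict(item))
        return item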
Create a main.py to launch the spider from Python:

from scrapy import cmdline

cmdline.execute('scrapy crawl quotes'.split())
cd to the directory containing main.py (xx/tutorial/tutorial) and run:
python main.py
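
After the run finishes, you can check that the quotes landed in MongoDB (same connection settings as the pipeline):

import pymongo

mycol = pymongo.MongoClient("mongodb://localhost:27017/")["runoobdb"]["quotes"]
print(mycol.count_documents({}))  # with the spider above: up to 3 quotes per page x 2 pages
for doc in mycol.find({}, {"_id": 0}):
    print(doc)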
References:
https://blog.csdn.net/qq_41837900/article/details/96489994
Site to crawl: http://quotes.toscrape.com/page/1/
Project code: https://download.csdn.net/download/GreatXiang888/15108875