1. Command prompt (cmd)
- scrapy startproject yaowen
- cd yaowen
- scrapy genspider yw www.gov.cn
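startproject creates the project skeleton and genspider adds a spider stub. The files edited in the following steps live in the standard Scrapy layout (abridged):
- yaowen/
-     scrapy.cfg              # project config
-     yaowen/
-         items.py            # step 2
-         pipelines.py        # step 4
-         settings.py         # step 5
-         spiders/
-             yw.py           # step 3 (created by genspider)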
2. items.py
- import scrapy
- class YaowenItem(scrapy.Item):
-     title = scrapy.Field()    # headline text
-     date = scrapy.Field()     # publication date
-     url = scrapy.Field()      # absolute link to the article
-     neirong = scrapy.Field()  # article summary text (neirong = "content")
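A scrapy.Item works like a dict restricted to its declared fields; a quick illustrative check (not part of the original code):
- item = YaowenItem(title='示例', date='2022-11-01')
- print(item['title'])    # 示例
- # item['author'] = 'x'  # would raise KeyError: only declared Fields are allowed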
3. yw.py (the spider)
- import scrapy
- import requests
- from yaowen.items import YaowenItem
- class YwSpider(scrapy.Spider):
-     name = 'yw'
-     allowed_domains = ['www.gov.cn']
-     start_urls = ['http://www.gov.cn/xinwen/']
-     def parse(self, response):
-         # Each <dl> under this div is one headline entry on the listing page.
-         total = response.xpath('//div[@class="zl_channel_body zl_channel_bodyxw"]/dl')
-         for b in total:
-             item = YaowenItem()
-             # extract() already returns lists of str, so they can be joined directly.
-             title1 = b.xpath('./dd/h4/a/text()').extract()
-             date1 = b.xpath('./dd/h4/span/text()').extract()
-             new_url1 = b.xpath('./dd/h4/a/@href').extract()
-             neirong1 = b.xpath('./dd/p/text()').extract()
-             new_url = ''.join(new_url1)
-             # Resolve the (possibly relative) href against the page URL.
-             # The original joined against 'http://gov.cn', which drops both the
-             # 'www' host and the '/xinwen/' path; response.urljoin handles it.
-             new_full_url = response.urljoin(new_url)
-             item['title'] = ' '.join(title1)
-             item['date'] = ' '.join(date1)
-             item['url'] = new_full_url
-             item['neirong'] = ' '.join(neirong1)
-             yield item
-     def get_content(self, url):
-         # Fetches a detail page with requests; note this helper is defined but
-         # never called in this spider (see the callback-based sketch below).
-         header = {
-             "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36"
-         }
-         cont = requests.get(url, headers=header)
-         # Decoding as GB2312 follows the original; verify the page's real charset.
-         content = cont.content.decode('gb2312', errors='ignore')
-         return content
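Since get_content is never invoked, a more idiomatic way to pull the full article text is to let Scrapy follow each link and finish the item in a second callback. A minimal sketch: the parse_detail name and the detail-page XPath are assumptions, not from the original.
-     # Inside YwSpider: instead of "yield item" at the end of parse(),
-     # hand the half-filled item to a second callback:
-     #     yield scrapy.Request(new_full_url, callback=self.parse_detail,
-     #                          cb_kwargs={'item': item})
-     def parse_detail(self, response, item):
-         # Hypothetical XPath; inspect the real article page for its container.
-         paragraphs = response.xpath('//div[@class="pages_content"]//text()').extract()
-         item['neirong'] = ' '.join(p.strip() for p in paragraphs)
-         yield item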
4. pipelines.py (saving to MySQL)
- from itemadapter import ItemAdapter
- import pymysql
- # Store items into MySQL
- class MysqlPipeline(object):
-     def __init__(self):
-         self.conn = pymysql.connect(host='localhost', user='root', password='zhangrui2580456',
-                                     database='shiyanzuoye', port=3306, charset='utf8')
-         self.cursor = self.conn.cursor()  # cursor object
-     def process_item(self, item, spider):
-         # Parameterized query instead of str.format: avoids breaking on quotes
-         # in the scraped text and prevents SQL injection.
-         self.cursor.execute(
-             'INSERT INTO zuoyeTable (title, date, url, neirong) VALUES (%s, %s, %s, %s)',
-             (item['title'], item['date'], item['url'], item['neirong']))
-         self.conn.commit()
-         return item  # hand the item on to any later pipeline
-     def close_spider(self, spider):
-         self.conn.close()
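The INSERT above assumes a zuoyeTable that already exists in the shiyanzuoye database. Only the column names come from the pipeline; the column types and this one-off setup script are assumptions:
- # One-off table setup sketch (types are assumed, not from the original):
- import pymysql
- conn = pymysql.connect(host='localhost', user='root', password='your_password',
-                        database='shiyanzuoye', port=3306, charset='utf8')
- with conn.cursor() as cur:
-     cur.execute('''
-         CREATE TABLE IF NOT EXISTS zuoyeTable (
-             id INT AUTO_INCREMENT PRIMARY KEY,
-             title VARCHAR(255),
-             date VARCHAR(64),
-             url VARCHAR(512),
-             neirong TEXT
-         )
-     ''')
- conn.commit()
- conn.close()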
5. settings.py
- BOT_NAME = 'yaowen'
- SPIDER_MODULES = ['yaowen.spiders']
- NEWSPIDER_MODULE = 'yaowen.spiders'
- COOKIES_ENABLED = False
- ITEM_PIPELINES = {
-     'yaowen.pipelines.MysqlPipeline': 300,
-     # MongodbPipeline is registered here but its code is left to the
-     # follow-up post mentioned in step 8.
-     'yaowen.pipelines.MongodbPipeline': 400,
- }
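Only an excerpt of settings.py is shown above. Two settings that crawls like this commonly need, but which are not in the original excerpt (treat them as assumptions):
- # Assumed additions, not in the original excerpt:
- ROBOTSTXT_OBEY = False  # otherwise Scrapy may skip pages disallowed by robots.txt
- DOWNLOAD_DELAY = 1      # throttle requests out of politeness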
6. start.py (launcher script; create start.py in the yaowen project root)
- from scrapy import cmdline
- def main():
-     # Equivalent to typing "scrapy crawl yw" in a terminal.
-     cmdline.execute(['scrapy', 'crawl', 'yw'])
- if __name__ == '__main__':
-     main()
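With this file in place, the crawl can be started from an IDE or shell with `python start.py`, run from the directory containing scrapy.cfg so Scrapy can locate the project settings.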
7. Results
(Result screenshots from the original post are not reproduced here.)
8. Storing the data in MongoDB, along with paginated (page-by-page) crawling, will be covered in a follow-up post using the "论文发表网" (paper-publishing) site as the example.