赞
踩
def parse(self, response): # print(response.text) uri = 'https://www.liepin.com' h3List = response.css('h3') print(len(h3List)) del h3List[len(h3List) - 3:len(h3List)] print(len(h3List)) # # test # url = h3List[1].css('a::attr("href")').extract_first() # job_url = str(url) # 遍历该页面的职位详情url for index, h3 in enumerate(h3List): url = h3.css('a::attr("href")').extract_first() job_url = str(url) if 'https://www.liepin.com' not in str(url): job_url = 'https://www.liepin.com' + str(url) # job_url 是列表中岗位的详情页url # 发起request 爬取详情页面 print(index) print(job_url) yield scrapy.Request(job_url, callback=self.get_job_info) # 当前页爬取完毕,下一页 a_selector_list = response.css('.pagerbar a::attr("href")').re('</span>.*?<a href="(.*?)">下一页</a>') next = uri+str(a_selector_list) print('*****/*/*/*/*/*/*/*/*/*/*下一页') yield scrapy.Request(next, callback=self.parse)
一个列表页面:里面有多条项目,每一个项目点击进去是详情页面。列表页面下方有下一页,该页所有的项目爬取完了,想继续”下一页“,
yield scrapy.Request(next, callback=self.parse),这一步就执行不到
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。