赞
踩
1.程序如下
import json

import requests
from lxml import etree

# Site root; the relative movie links scraped from listing pages are appended to it.
Base_download = 'http://www.9rmb.com'
# Browser-like User-Agent sent with every request.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'}
# Seconds to wait for the server before giving up; without a timeout
# requests.get() can block forever on a stalled connection.
_REQUEST_TIMEOUT = 10


def spider():
    """Crawl listing pages 1-7 and append one JSON line per movie to 11.txt.

    Pages or movies whose HTTP request fails are reported on stdout and
    skipped, so a single network error does not abort the whole crawl.

    Returns:
        list[dict]: the movie records that were scraped and written.
    """
    base_url = 'http://www.9rmb.com/type/1/{}.html'  # template for each listing page
    movies = []
    # Open the output once (append mode, as before) instead of once per movie.
    with open('11.txt', 'a', encoding='utf-8') as f:
        for num in range(1, 8):
            join_url = base_url.format(num)  # build the listing page URL
            try:
                # Materialise the iterator so it can be both printed and looped over.
                detail_urls = list(get_detail_urls(join_url))
            except requests.RequestException as exc:
                print('skip listing page {}: {}'.format(join_url, exc))
                continue
            print(detail_urls)
            for urls in detail_urls:
                try:
                    movie = parse_detail_page(urls)  # extract one movie's fields
                except requests.RequestException as exc:
                    print('skip movie page {}: {}'.format(urls, exc))
                    continue
                movies.append(movie)
                print(movie)
                # ensure_ascii=False keeps non-ASCII text readable in the UTF-8 file.
                json.dump(movie, f, ensure_ascii=False)
                f.write('\n')
    return movies


def get_detail_urls(urls):
    """Return an iterator of absolute movie-page URLs found on one listing page.

    Args:
        urls: URL of a listing page.

    Returns:
        A lazy iterator (``map`` object) of ``Base_download`` + each scraped href.

    Raises:
        requests.RequestException: if the listing page cannot be fetched.
    """
    r1 = requests.get(url=urls, headers=headers, timeout=_REQUEST_TIMEOUT)
    r1_element = etree.HTML(r1.text)
    detail_url = r1_element.xpath('//div[@class="movie-item"]/a/@href')
    # The hrefs are site-relative, so prefix the site root.
    return map(lambda url: Base_download + url, detail_url)


def _first_text(element, path):
    """Return the first XPath match for *path* under *element*, or None if there is none."""
    matches = element.xpath(path)
    return matches[0] if matches else None


def parse_detail_page(urls):
    """Fetch one movie page and extract its title, actors, country and score.

    Args:
        urls: absolute URL of a movie detail page.

    Returns:
        dict with keys ``title``, ``actors``, ``country`` and ``evaluate``.
        A field whose node is missing from the page is ``None`` instead of
        raising ``IndexError``.

    Raises:
        requests.RequestException: if the page cannot be fetched.
    """
    # Decode manually and drop undecodable bytes rather than trusting the
    # response's declared encoding.
    resp = requests.get(urls, headers=headers, timeout=_REQUEST_TIMEOUT).content.decode('utf-8', 'ignore')
    resp_element = etree.HTML(resp)
    movie = {}
    movie['title'] = _first_text(resp_element, '//div[@class="col-md-12"]/h1/text()')  # movie name
    movie['actors'] = _first_text(resp_element, '//td[@id="casts"]/text()')  # cast
    movie['country'] = _first_text(resp_element, '//tr[4]/td[2]/text()')  # country
    movie['evaluate'] = _first_text(resp_element, '//a[@class="score"]/text()')  # rating
    return movie


if __name__ == '__main__':
    # spider() appends, so clear the previous run's output BEFORE crawling.
    # Truncating after spider() (as the script did before) erased the fresh results.
    with open('11.txt', 'w', encoding='utf-8'):
        pass
    spider()
编写过程参考了:
https://blog.csdn.net/qq_43515464/article/details/102969930
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。