赞
踩
import re
import time
import urllib.request
from multiprocessing import Pool

import bs4
class getLink(object):
    """Crawl movie listing pages and record two download links per movie.

    For every listing page the crawler collects the elements whose CSS class
    is ``main_top``, reads the movie name and ``pid`` from the first ``<a>``
    inside each element, builds the two download URLs for that ``pid`` and
    appends everything to ``down.txt``.
    """

    def __init__(self, url):
        """Remember the listing pages to crawl.

        Args:
            url: Iterable of listing-page URL strings; ``main`` visits them
                in iteration order.
        """
        self.url = url

    def main(self):
        """Crawl every page in ``self.url`` and write the results to ``down.txt``.

        The file is created (or overwritten) in the current working
        directory. Progress is echoed to stdout. Any exception raised while
        fetching or parsing propagates to the caller, but the output file is
        still closed so that everything written so far reaches the disk.
        """
        i = 0  # running movie counter across all pages
        # Mode "w" already empties the file, so no explicit truncate() is
        # needed; the with-block guarantees the handle is flushed and closed.
        with open("down.txt", "w", encoding='utf-8') as downFile:
            for page, urlSingle in enumerate(self.url, start=1):
                # Fetch first: a page that fails to load gets no header line.
                result = self.getResult(urlSingle)
                print("第%d" % (page) + "页")
                downFile.write("第%d" % (page) + "页\n")
                for rs in result:
                    pid, Name = self.getInfo(rs)
                    DownUrl0, DownUrl1 = self.getDownUrl(pid)
                    i += 1
                    print("*******************************************")
                    print("正在爬取第%d" % (i) + "个 " + "电影名称: " + Name)
                    downFile.write("--------")
                    downFile.write("第%d" % (i) + "个" + Name + "\n")
                    downFile.write("英语中字: " + DownUrl0 + "\n")
                    downFile.write("中英双字: " + DownUrl1 + "\n")

    def getResult(self, url):
        """Download one listing page and return its ``main_top`` elements.

        Args:
            url: Address of the listing page.

        Returns:
            The result of ``find_all(class_="main_top")`` on the parsed page.

        Raises:
            Whatever ``urllib.request.urlopen`` raises on a failed request,
            and ``UnicodeDecodeError`` if the body is not valid UTF-8.
        """
        # NOTE(review): the trailing "Name" in the User-Agent looks like a
        # paste artifact; it is kept unchanged because it is sent on the wire.
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36Name"
        }
        request = urllib.request.Request(url, headers=headers)
        # Close the HTTP response as soon as the body has been read instead of
        # leaving the connection to the garbage collector.
        with urllib.request.urlopen(request) as response:
            html = response.read().decode('utf-8')
        bs = bs4.BeautifulSoup(html, "lxml")
        return bs.find_all(class_="main_top")

    def getInfo(self, result):
        """Extract the movie ``pid`` and display name from one listing element.

        Args:
            result: An element exposing ``find('a')``; the returned anchor
                must provide ``getText()`` and ``get('href')``.

        Returns:
            A ``(pid, Name)`` tuple of strings.

        Raises:
            IndexError: If the href does not contain enough ``.`` / ``/``
                separated pieces for the positions read below.
        """
        anchor = result.find('a')  # look the link up once, use it twice
        Name = anchor.getText()
        href = anchor.get('href')
        # The pid is the fifth '/'-separated piece of the third '.'-separated
        # piece of the href.
        # presumably hrefs look like http://www.<site>.cc/a/b/c/<pid>.html — confirm
        str1 = href.split('.')
        str2 = str1[2].split('/')
        pid = str2[4]
        return pid, Name

    def getDownUrl(self, pid):
        """Build the two download-page URLs for a movie.

        Args:
            pid: Movie identifier as a string.

        Returns:
            A ``(DownUrl0, DownUrl1)`` tuple: the same ``newdown`` endpoint
            with ``linkn=0`` and ``linkn=1`` respectively.
        """
        DownUrl0 = "http://www.dexiazai.cc/newdown/?pid=" + pid + "&linkn=0"
        DownUrl1 = "http://www.dexiazai.cc/newdown/?pid=" + pid + "&linkn=1"
        return DownUrl0, DownUrl1
if __name__ == '__main__':
    # Build the address of every listing page to crawl.
    # NOTE(review): numbering starts at PageNo=0 — confirm the site's first
    # page is not PageNo=1.
    url = [
        "http://www.dexiazai.cc" + "/plus/list.php?tid=50&PageNo=" + str(i)
        for i in range(1467)
    ]
    Link = getLink(url)
    # The previous code did `pool.map_async(Link.main())`: that ran the whole
    # crawl synchronously in this process and then raised TypeError, because
    # map_async received None as the function and no iterable. The worker pool
    # never did any work, so run the crawl directly and exit cleanly.
    Link.main()

python基础学习路线:点击打开链接
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。