当前位置:   article > 正文

Python爬取De下载站相关代码

Python爬取De下载站相关代码
Python爬取De下载站相关代码。因为没有设置代理，爬到约800页时请求被目标站点封禁，后续需要加上代理支持。
  1. import urllib.request
  2. import bs4
  3. import re
  4. import time
  5. from multiprocessing import Pool
  6. class getLink(object):
  7. def __init__(self,url):
  8. self.url = url
  9. def main(self):
  10. downFile = open("down.txt", "w", encoding='utf-8')
  11. downFile.truncate()
  12. i = 0
  13. page = 1
  14. for urlSingle in self.url:
  15. result = self.getResult(urlSingle)
  16. print("第%d" % (page) + "页")
  17. downFile.write("第%d" % (page) + "页\n")
  18. page += 1
  19. for rs in result:
  20. pid, Name = self.getInfo(rs)
  21. DownUrl0, DownUrl1 = self.getDownUrl(pid)
  22. i += 1
  23. print("*******************************************")
  24. print("正在爬取第%d" % (i) + "个 " + "电影名称: " + Name)
  25. downFile.write("--------")
  26. downFile.write("第%d" % (i) + "个" + Name + "\n")
  27. downFile.write("英语中字: " + DownUrl0 + "\n")
  28. downFile.write("中英双字: " + DownUrl1 + "\n")
  29. def getResult(self,url):
  30. #shift+tab 同时左移
  31. headers = {
  32. "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36Name"
  33. }
  34. html = urllib.request.Request(url, headers=headers)
  35. response = urllib.request.urlopen(html).read().decode('utf-8')
  36. # 获取pid与电影name
  37. # 设置正则匹配规则pat1
  38. bs = bs4.BeautifulSoup(response, "lxml")
  39. result = bs.find_all(class_="main_top")
  40. return result
  41. def getInfo(self,result):
  42. # 获取名字
  43. Name = result.find('a').getText()
  44. # 获取href
  45. href = result.find('a').get('href')
  46. # 获取pid
  47. str1 = href.split('.')
  48. str2 = str1[2].split('/')
  49. pid = str2[4]
  50. return pid, Name
  51. def getDownUrl(self,pid):
  52. DownUrl0 = "http://www.dexiazai.cc/newdown/?pid=" + pid + "&linkn=0"
  53. DownUrl1 = "http://www.dexiazai.cc/newdown/?pid=" + pid + "&linkn=1"
  54. return DownUrl0, DownUrl1
  55. if __name__ == '__main__':
  56. pool = Pool(4)
  57. url = []
  58. for i in range(1467):
  59. url.append("http://www.dexiazai.cc" + "/plus/list.php?tid=50&PageNo=" + str(i))
  60. Link = getLink(url)
  61. #Link.main()
  62. pool.map_async(Link.main())
  63. pool.close()
  64. pool.join()

python基础学习路线:点击打开链接

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/小蓝xlanll/article/detail/109060
推荐阅读
相关标签
  

闽ICP备14008679号