
Downloading magnet links from a website with a Python crawler: how to scrape window.open (magnet)

The design takes three steps:

1. Get the URLs from the site's star list pages.

2. Get the serial numbers of each star's works.

3. Look up the magnet link for each serial number.


Step 1: Get the works-collection URLs from the site's star list

#coding=utf8
import requests
import re
import xlwt
import time
from bs4 import BeautifulSoup

# Create an Excel workbook to store the scraped data
myfile = xlwt.Workbook()
table = myfile.add_sheet(u"信息", cell_overwrite_ok=True)
table.write(0, 0, u"名字")  # column 0: name
table.write(0, 1, u"链接")  # column 1: link

user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.2) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27'
headers = {'User-Agent': user_agent}

class geturl():
    def __init__(self, page):
        self.page = page

    def get_url(self):
        for p in range(1, self.page + 1):
            url = 'https://avso.pw/cn/actresses/page/' + str(p)
            r = requests.get(url, headers=headers)
            soup = BeautifulSoup(r.text, 'html.parser')
            # each list page holds 50 entries, so compute this page's starting row
            i = (p - 1) * 50 + 1
            for tag in soup.find_all(href=re.compile("https://avso.pw/cn/star")):
                table.write(i, 1, tag.attrs['href'])
                i += 1
            j = (p - 1) * 50 + 1
            for tag in soup.find_all(class_='photo-info'):
                for gg in tag.find_all('span'):
                    table.write(j, 0, gg.string)
                    j += 1
            print u"完成读取第%s页信息" % p  # finished reading page p

test = geturl(2)
test.get_url()
# xlwt writes the legacy .xls format, so save with a matching extension
filename = str(time.strftime('%Y%m%d%H%M%S', time.localtime())) + "url.xls"
myfile.save(filename)
print u"完成%s的url备份" % time.strftime('%Y%m%d%H%M%S', time.localtime())  # URL backup done
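Note that Step 2 below opens the workbook under the fixed name url.xls, while this script saves it under a timestamped name. Renaming the file by hand works; a one-line copy right after saving does the same (a minimal sketch, not part of the original script):

import shutil
shutil.copy(filename, 'url.xls')  # give step 2 the fixed filename it expects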
Step 2: Get the serial numbers of each star's works

#coding=utf8
import requests
import re
import xlrd
import xlwt
import time
import ConfigParser  # Python 2 module; named configparser on Python 3
from bs4 import BeautifulSoup

user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.2) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27'
headers = {'User-Agent': user_agent}

myfile = xlwt.Workbook()
wtable = myfile.add_sheet(u"信息", cell_overwrite_ok=True)
wtable.write(0, 0, u"名字")  # name
wtable.write(0, 1, u"链接")  # link
wtable.write(0, 2, u"番号")  # serial number

class getserial():
    def get_serial(self):
        # url.xls is the output of step 1 (renamed from its timestamped filename)
        data = xlrd.open_workbook('url.xls')
        table = data.sheets()[0]
        nrows = table.nrows
        for j in range(nrows):
            try:
                # liao.ini stores the next writable row so an interrupted run can resume
                cf = ConfigParser.ConfigParser()
                cf.read("liao.ini")
                p = cf.getint('num', 'p')
                if j == 0:
                    continue  # skip the header row
                url = table.cell(j, 1).value
                r = requests.get(url, headers=headers)
                soup = BeautifulSoup(r.text, 'html.parser')
                i = 0
                # the site wraps serial numbers in <date> tags; every second one is the serial
                for tag in soup.find_all('date'):
                    if i % 2 == 0:
                        wtable.write(p, 2, tag.string)
                        wtable.write(p, 0, table.cell(j, 0).value)
                        wtable.write(p, 1, table.cell(j, 1).value)
                        p += 1
                    i += 1
                print j
                cf.set("num", "p", p)
                cf.write(open("liao.ini", "w"))
            except:
                # on any error, dump what has been collected so far
                filename = str(time.strftime('%Y%m%d%H%M%S', time.localtime())) + "serial.xls"
                myfile.save(filename)
                print u"出现异常自动保存%s的番号备份" % time.strftime('%Y%m%d%H%M%S', time.localtime())

test = getserial()
test.get_serial()
# xlwt writes the legacy .xls format, so save with a matching extension
filename = str(time.strftime('%Y%m%d%H%M%S', time.localtime())) + "serial.xls"
myfile.save(filename)
print u"完成%s的番号备份" % time.strftime('%Y%m%d%H%M%S', time.localtime())
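The resume file liao.ini is read but never created by the script, so it has to exist before the first run. Given the cf.getint('num', 'p') call, a minimal starting file looks like this, with p pointing at the first data row under the header:

[num]
p = 1

After each processed star the script writes the updated p back, so a crashed run picks up where it left off instead of overwriting earlier rows.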

Step 3: Look up the magnet link for each serial number

#coding=utf8
import requests
import re
import xlrd
import xlwt
import time
import ConfigParser
import threading
from bs4 import BeautifulSoup

user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.87 Safari/537.36'
headers = {
    'Accept': 'text/css,*/*;q=0.1',
    'Accept-Encoding': 'gzip, deflate, sdch, br',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'User-Agent': user_agent,
}

class getlink():
    def get_link(self, conf, excel):
        myfile = xlwt.Workbook()
        wtable = myfile.add_sheet(u"信息", cell_overwrite_ok=True)
        wtable.write(0, 0, u"名字")        # name
        wtable.write(0, 1, u"番号")        # serial number
        wtable.write(0, 2, u"文件大小")    # file size
        wtable.write(0, 3, u"文件更新日期") # file update date
        wtable.write(0, 4, u"链接")        # detail-page link
        wtable.write(0, 5, u"磁力链接")    # magnet link
        data = xlrd.open_workbook(excel)
        table = data.sheets()[0]
        nrows = table.nrows
        for j in range(nrows):
            try:
                # each worker has its own resume file (same [num]/p format as liao.ini)
                cf = ConfigParser.ConfigParser()
                cf.read(conf)
                p = cf.getint('num', 'p')
                if j == 0:
                    continue  # skip the header row
                serial = table.cell(j, 2).value
                url = 'https://btso.pw/search/' + serial
                r = requests.get(url, headers=headers, timeout=30)
                soup = BeautifulSoup(r.text, 'html.parser')
                for tag in soup.find_all('div', class_='row'):
                    for gg in tag.find_all(class_='col-sm-2 col-lg-1 hidden-xs text-right size'):
                        print gg.string
                        wtable.write(p, 0, table.cell(j, 0).value)
                        wtable.write(p, 1, table.cell(j, 2).value)
                        wtable.write(p, 2, gg.string)
                    for aa in tag.find_all(class_='col-sm-2 col-lg-2 hidden-xs text-right date'):
                        print aa.string
                        wtable.write(p, 3, aa.string)
                    for xx in tag.find_all(href=re.compile("https://btso.pw/magnet/detail/hash")):
                        print xx.attrs['href']
                        wtable.write(p, 4, xx.attrs['href'])
                        # follow the detail page; the magnet link sits in <textarea id="magnetLink">
                        r1 = requests.get(xx.attrs['href'], headers=headers, timeout=30)
                        soup1 = BeautifulSoup(r1.text, 'html.parser')
                        for tag1 in soup1.find_all('textarea', id='magnetLink'):
                            print tag1.string
                            wtable.write(p, 5, tag1.string)
                        p += 1
                cf.set("num", "p", p)
                cf.write(open(conf, "w"))
            except:
                filename = str(time.strftime('%Y%m%d%H%M%S', time.localtime())) + "link.xls"
                myfile.save(filename)
                print u"出现异常自动保存%s的磁力链接备份" % time.strftime('%Y%m%d%H%M%S', time.localtime())
        filename = str(time.strftime('%Y%m%d%H%M%S', time.localtime())) + "link.xls"
        myfile.save(filename)
        print u"自动保存%s的磁力链接备份" % time.strftime('%Y%m%d%H%M%S', time.localtime())

if __name__ == '__main__':
    test = getlink()
    threads = []
    for n in range(1, 7):
        t = threading.Thread(target=test.get_link,
                             args=('link%d.ini' % n, 'serial%d.xls' % n))
        threads.append(t)
    # start all six threads before joining any of them; joining inside the
    # start loop (as in the original) would run the shards one at a time
    for t in threads:
        t.setDaemon(True)
        t.start()
    for t in threads:
        t.join()
    print u"完成所有线程"  # all threads finished
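The six worker threads each expect their own input shard (serial1.xls through serial6.xls) and their own resume file (link1.ini through link6.ini, same [num] / p = 1 format as liao.ini). The article does not show how these are produced; a minimal sketch that deals the rows of a combined serial.xls out round-robin into six shards might look like this (the file names are assumptions matching the thread arguments above):

#coding=utf8
import xlrd
import xlwt

data = xlrd.open_workbook('serial.xls')
table = data.sheets()[0]

# one workbook per worker thread, each with the same header row
books, sheets, rows = [], [], []
for n in range(6):
    book = xlwt.Workbook()
    sheet = book.add_sheet(u"信息", cell_overwrite_ok=True)
    for col, title in enumerate([u"名字", u"链接", u"番号"]):
        sheet.write(0, col, title)
    books.append(book)
    sheets.append(sheet)
    rows.append(1)  # next writable row in each shard

# deal the data rows out round-robin, skipping the header
for j in range(1, table.nrows):
    n = (j - 1) % 6
    for col in range(3):
        sheets[n].write(rows[n], col, table.cell(j, col).value)
    rows[n] += 1

for n in range(6):
    books[n].save('serial%d.xls' % (n + 1))
    # each worker also needs a fresh resume file before its first run
    with open('link%d.ini' % (n + 1), 'w') as f:
        f.write('[num]\np = 1\n')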



Drop a magnet link into Thunder (Xunlei) and the download starts.

A look at the final Excel output: [screenshot omitted; the columns are name, serial number, file size, update date, link, and magnet link]



