The design has three steps:
1. Get the page address of each star from the actress list
2. Get the serial numbers of each star's works
3. Look up the magnet links from the serial numbers
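Each step below is a separate script, and the scripts hand their results to one another through Excel workbooks. A rough sketch of the hand-off (file names follow the scripts; splitting the serial workbook into serial1.xls ... serial6.xls is how step 3 runs across several threads):

# Step 1 writes a url workbook            - star name, star page URL
# Step 2 reads url.xls, writes a serial workbook   - name, URL, serial number (progress kept in liao.ini)
# Step 3 reads serial1.xls ... serial6.xls, writes link workbooks
#        - name, serial, file size, date, detail page URL, magnet link
#          (progress kept in link1.ini ... link6.ini)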
Step 1: Getting the works page address of each star in the site's actress list

#coding=utf8
import requests
import re
import xlrd
import xlwt
import time
from bs4 import BeautifulSoup

# Create an Excel workbook to hold the results: one row per star,
# name in column 0 and the star's page URL in column 1.
myfile = xlwt.Workbook()
table = myfile.add_sheet(u"信息", cell_overwrite_ok=True)
table.write(0, 0, u"名字")
table.write(0, 1, u"链接")

user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.2) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27'
headers = {'User-Agent': user_agent}

class geturl():
    def __init__(self, page):
        self.page = page

    def get_url(self):
        for p in range(1, self.page + 1):
            url = 'https://avso.pw/cn/actresses/page/' + str(p)
            r = requests.get(url, headers=headers)
            html = r.text
            soup = BeautifulSoup(html, 'html.parser')

            # Each listing page shows 50 stars, so page p starts at output
            # row (p-1)*50 + 1 (row 0 is the header row).
            i = (p - 1) * 50 + 1
            for tag in soup.find_all(href=re.compile("https://avso.pw/cn/star")):
                table.write(i, 1, tag.attrs['href'])
                i += 1

            j = (p - 1) * 50 + 1
            for tag in soup.find_all(class_='photo-info'):
                for gg in tag.find_all('span'):
                    table.write(j, 0, gg.string)
                    j += 1
            print u"完成读取第%s页信息" % p

test = geturl(2)
test.get_url()
# xlwt writes the legacy .xls format; step 2 below expects this file renamed to url.xls.
filename = str(time.strftime('%Y%m%d%H%M%S', time.localtime())) + "url.xls"
myfile.save(filename)
print u"完成%s的url备份" % time.strftime('%Y%m%d%H%M%S', time.localtime())
Step 2: Getting the serial numbers of each star's works
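This script reads the star page URLs back out of url.xls and keeps its current output row in a small counter file, liao.ini, so that an interrupted run can resume where it stopped. The script does not create that file itself; a minimal liao.ini matching the section and option names it reads would look like this (starting at row 1, since row 0 holds the column headers):

[num]
p = 1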

#coding=utf8
import requests
import re
import xlrd
import xlwt
import time
import ConfigParser
from bs4 import BeautifulSoup

user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.2) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27'
headers = {'User-Agent': user_agent}

# Output workbook: star name, star page URL and work serial number.
myfile = xlwt.Workbook()
wtable = myfile.add_sheet(u"信息", cell_overwrite_ok=True)
wtable.write(0, 0, u"名字")
wtable.write(0, 1, u"链接")
wtable.write(0, 2, u"番号")

class getserial():

    def get_serial(self):
        data = xlrd.open_workbook('url.xls')
        table = data.sheets()[0]
        nrows = table.nrows
        for j in range(nrows):
            try:
                # liao.ini keeps the next free output row, so a restarted run
                # does not overwrite rows written earlier.
                cf = ConfigParser.ConfigParser()
                cf.read("liao.ini")
                p = cf.getint('num', 'p')
                if j == 0:
                    # Row 0 of url.xls is the header row.
                    continue
                else:
                    url = table.cell(j, 1).value
                    r = requests.get(url, headers=headers)
                    html = r.text
                    soup = BeautifulSoup(html, 'html.parser')
                    i = 0

                    # The star page marks serial numbers and release dates in
                    # alternating <date> tags; the even-indexed ones are the serials.
                    for tag in soup.find_all('date'):
                        if i % 2 == 0:
                            wtable.write(p, 2, tag.string)
                            wtable.write(p, 0, table.cell(j, 0).value)
                            wtable.write(p, 1, table.cell(j, 1).value)
                            p += 1
                        i += 1
                    print j
                    cf.set("num", "p", p)
                    cf.write(open("liao.ini", "w"))
            except:
                # On any error, save whatever has been collected so far.
                filename = str(time.strftime('%Y%m%d%H%M%S', time.localtime())) + "serial.xls"
                myfile.save(filename)
                print u"出现异常自动保存%s的番号备份" % time.strftime('%Y%m%d%H%M%S', time.localtime())

test = getserial()
test.get_serial()
filename = str(time.strftime('%Y%m%d%H%M%S', time.localtime())) + "serial.xls"
myfile.save(filename)
print u"完成%s的番号备份" % time.strftime('%Y%m%d%H%M%S', time.localtime())

Step 3: Looking up magnet links by serial number

#coding=utf8
import requests
import re
import xlrd
import xlwt
import time
import ConfigParser
import threading
from bs4 import BeautifulSoup

user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.87 Safari/537.36'
headers = {
    'Accept': 'text/css,*/*;q=0.1',
    'Accept-Encoding': 'gzip, deflate, sdch, br',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'User-Agent': user_agent,
}

class getlink():
    def get_link(self, conf, excel):
        # Each worker thread gets its own output workbook, counter file (conf)
        # and input serial workbook (excel).
        myfile = xlwt.Workbook()
        wtable = myfile.add_sheet(u"信息", cell_overwrite_ok=True)
        wtable.write(0, 0, u"名字")
        wtable.write(0, 1, u"番号")
        wtable.write(0, 2, u"文件大小")
        wtable.write(0, 3, u"文件更新日期")
        wtable.write(0, 4, u"链接")
        wtable.write(0, 5, u"磁力链接")
        data = xlrd.open_workbook(excel)
        table = data.sheets()[0]
        nrows = table.nrows
        for j in range(nrows):
            try:
                cf = ConfigParser.ConfigParser()
                cf.read(conf)
                p = cf.getint('num', 'p')
                if j == 0:
                    # Row 0 of the serial workbook is the header row.
                    continue
                else:
                    # Search the torrent site for this serial number.
                    serial = table.cell(j, 2).value
                    url = 'https://btso.pw/search/' + serial
                    r = requests.get(url, headers=headers, timeout=30)
                    html = r.text
                    soup = BeautifulSoup(html, 'html.parser')

                    # Each search result is a div.row holding the file size, the
                    # date and a link to the magnet detail page.
                    for tag in soup.find_all('div', class_='row'):
                        for gg in tag.find_all(class_='col-sm-2 col-lg-1 hidden-xs text-right size'):
                            print gg.string
                            wtable.write(p, 0, table.cell(j, 0).value)
                            wtable.write(p, 1, table.cell(j, 2).value)
                            wtable.write(p, 2, gg.string)

                        for aa in tag.find_all(class_='col-sm-2 col-lg-2 hidden-xs text-right date'):
                            print aa.string
                            wtable.write(p, 3, aa.string)

                        for xx in tag.find_all(href=re.compile("https://btso.pw/magnet/detail/hash")):
                            print xx.attrs['href']
                            wtable.write(p, 4, xx.attrs['href'])
                            # Fetch the detail page; the magnet link sits in a
                            # textarea with id="magnetLink".
                            r1 = requests.get(xx.attrs['href'], headers=headers, timeout=30)
                            html1 = r1.text
                            soup1 = BeautifulSoup(html1, 'html.parser')
                            for tag1 in soup1.find_all('textarea', id='magnetLink'):
                                print tag1.string
                                wtable.write(p, 5, tag1.string)
                            p += 1
                    cf.set("num", "p", p)
                    cf.write(open(conf, "w"))
            except:
                # On any error, save whatever has been collected so far.
                filename = str(time.strftime('%Y%m%d%H%M%S', time.localtime())) + "link.xls"
                myfile.save(filename)
                print u"出现异常自动保存%s的磁力链接备份" % time.strftime('%Y%m%d%H%M%S', time.localtime())
        filename = str(time.strftime('%Y%m%d%H%M%S', time.localtime())) + "link.xls"
        myfile.save(filename)
        print u"自动保存%s的磁力链接备份" % time.strftime('%Y%m%d%H%M%S', time.localtime())
if __name__ == '__main__':
    test = getlink()
    threads = []
    # Six worker threads, each with its own counter file and its own slice of
    # the serial list (serial1.xls ... serial6.xls).
    for n in range(1, 7):
        t = threading.Thread(target=test.get_link, args=('link%d.ini' % n, 'serial%d.xls' % n))
        threads.append(t)
    # Start all workers first, then wait for them; joining inside the start loop
    # would run the threads one after another instead of in parallel.
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    print u"完成所有线程"
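Before this script can run, the serial backup from step 2 has to be split into serial1.xls ... serial6.xls, one slice per thread (the post does not show how the split is done), and the six counter files link1.ini ... link6.ini have to exist with the same [num] / p layout as liao.ini above. A small one-off helper for the counter files (my own addition, not part of the original scripts) could look like this:

#coding=utf8
# One-off helper: create the six counter files link1.ini ... link6.ini that the
# worker threads expect, each starting at row 1 because row 0 of every output
# workbook holds the column headers.
import ConfigParser

for n in range(1, 7):
    cf = ConfigParser.ConfigParser()
    cf.add_section('num')
    cf.set('num', 'p', '1')
    with open('link%d.ini' % n, 'w') as f:
        cf.write(f)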
Take a look at the final Excel file:
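A quick way to spot-check one of the saved workbooks is to read it back with xlrd; the file name below is only an example, use whichever timestamped ...link.xls the run actually produced:

#coding=utf8
# Print the first few rows of a saved link workbook to verify its contents.
import xlrd

book = xlrd.open_workbook('20161201120000link.xls')
sheet = book.sheets()[0]
for row in range(min(sheet.nrows, 5)):
    print [sheet.cell(row, col).value for col in range(sheet.ncols)]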