当前位置:   article > 正文

python - 1024 Spider_1024bt

1024bt

python3 urlretrieve 


  1. import urllib.request
  2. url = 'http://i3.hunantv.com/p1/20150906/1637244570C.jpg'
  3. file_name = 'F:\\img\\ym.jpg'
  4. ...
  5. # Download the file from `url` and save it locally under `file_name`:
  6. urllib.request.urlretrieve(url, file_name)

python md5 加密


  1. import hashlib
  2. import time
  3. m = hashlib.md5()
  4. m.update(str(time.time()).encode())
  5. md5value=m.hexdigest()
  6. print(md5value)
运行

python base64 加密,encodeURIComponent


  1. import time
  2. from urllib import parse
  3. tm = time.time()
  4. print(str(tm))
  5. tm = str(tm)[0:10]
  6. print(tm)
  7. import base64
  8. print(base64.b64encode(tm.encode()).decode())
  9. b64 = base64.b64encode(tm.encode()).decode()
  10. print(parse.quote(b64)) # encodeURIComponent
  11. '''
  12. MTUwMjQyODkwMw==
  13. MTUwMjQyNTE1Mw==
  14. '''
运行


草榴 Spider


config


  1. PROXIES = { "http": "http://xxx:xxxx", "https": "http://xxx:xxxx", }
  2. MONGO_URL = '127.0.0.1'
  3. MONGO_DB = 'caoliu'
  4. CLSQ = 'http://xxx.com/' # 地址就不留了
  5. CLSQ_DOWNLOAD = 'http://www.xxx.com/download.php?'
  6. BT_PATH = 'F:\\1024bt\\{}\\'
运行


viidii


  1. from selenium import webdriver
  2. from selenium.webdriver.support.ui import WebDriverWait
  3. from selenium.webdriver.support import expected_conditions as EC
  4. from selenium.webdriver.common.by import By
  5. from pyquery import PyQuery as pq
  6. def get_b64(art_hash):
  7. browser = webdriver.PhantomJS(service_args=['--load-images=false', '--disk-cache=true'])
  8. browser.set_window_size(1400, 900)
  9. wait = WebDriverWait(browser, 10)
  10. browser.get('http://www.viidii.info/?http://www______rmdown______com/link______php?hash={}'.format(art_hash))
  11. wait.until(
  12. EC.presence_of_element_located((By.CSS_SELECTOR, 'input[type=hidden]'))
  13. )
  14. html = browser.page_source
  15. doc = pq(html)
  16. b64 = doc.find('input[type=hidden]').attr('value')
  17. return b64


spider


  1. import requests
  2. import pymongo
  3. import os
  4. import re
  5. # import base64
  6. import time
  7. import viidii
  8. from pyquery import PyQuery as pq
  9. from Config import *
  10. from urllib import parse
  11. # db
  12. client = pymongo.MongoClient(MONGO_URL)
  13. db = client[MONGO_DB]
  14. global table
  15. # 时间
  16. import datetime
  17. today = datetime.date.today()
  18. one_day = datetime.timedelta(1)
  19. yesterday = today - one_day
  20. before_day = yesterday - one_day
  21. yesterday_str = yesterday.strftime('%Y-%m-%d')
  22. before_day_str = before_day.strftime('%Y-%m-%d')
  23. # 计数器
  24. global count
  25. def insert_to_mongo(info):
  26. global count
  27. global table
  28. query_info = {'art_name': info['art_name']}
  29. if table.find(query_info).limit(1).count() == 0:
  30. table.insert(info)
  31. count += 1
  32. print('成功插入第 ', count, ' 数据:', info)
  33. else:
  34. print('重复:', info)
  35. '''
  36. 1. 插入mongo 之前,info 中被加入一条_id字段
  37. 2. python 传递参数
  38. 对于不可变对象作为函数参数,相当于C系语言的值传递;
  39. 对于可变对象作为函数参数,相当于C系语言的引用传递。
  40. 3. 所以如果新插入的info 如果不flush,mongo 将会报错
  41. '''
  42. def art_bt_hash(url):
  43. '''
  44. 得到hash 码
  45. :param url:
  46. :return:
  47. '''
  48. doc = requests.get(url=url, proxies=PROXIES).content.decode('gbk')
  49. hash = re.search(r'rmdown\.com/link\.php\?hash=(.*?)<', doc, re.S).group(1) # 正则直接匹配
  50. return hash
  51. def art_item(tr):
  52. '''
  53. 得到一条(一件艺术品),获取这个art 的hash 码,对info 进行封装
  54. :param tr:
  55. :return:
  56. '''
  57. info = {}
  58. info['art_name'] = tr.find('h3').text()
  59. hash_url = tr.find('.tal > h3 > a').attr('href')
  60. art_url = '{}{}'.format(CLSQ, hash_url)
  61. info['art_url'] = art_url
  62. try:
  63. info['art_hash'] = art_bt_hash(art_url)
  64. except AttributeError as e:
  65. print(e.args)
  66. return
  67. except UnicodeDecodeError as e:
  68. print(e.args)
  69. return
  70. info['art_time'] = yesterday_str
  71. info['art_flag'] = '0'
  72. insert_to_mongo(info=info)
  73. def next_tags(**kwargs):
  74. '''
  75. 翻页
  76. :param kwargs:
  77. :return:
  78. '''
  79. base = kwargs['base']
  80. page_num = kwargs['page_num']
  81. url = '{}&page={}'.format(base, page_num)
  82. print(url)
  83. try:
  84. doc = requests.get(url=url, proxies=PROXIES).content
  85. except requests.exceptions.ContentDecodingError as e:
  86. print(e.args)
  87. time.sleep(2)
  88. next_tags(base=base, page_num=page_num)
  89. return
  90. html = pq(doc)
  91. trs = html.find('#ajaxtable > tbody:nth-child(2) > tr').items()
  92. for tr in trs:
  93. art_time = tr.find('div[class=f10]').text()
  94. if art_time == before_day_str:
  95. return
  96. if art_time == yesterday_str:
  97. art_item(tr)
  98. next_tags(base=base, page_num=page_num+1)
  99. # 得到 hash 码,然后放入mongodb
  100. def art_tags(**kwargs):
  101. url = kwargs['url']
  102. print(url)
  103. doc = requests.get(url=url, proxies=PROXIES).content
  104. html = pq(doc)
  105. trs = html.find('#ajaxtable > tbody:nth-child(2) > tr').items()
  106. flag = False
  107. for tr in trs:
  108. if flag:
  109. art_time = tr.find('div[class=f10]').text()
  110. if art_time == before_day_str:
  111. return
  112. if art_time == yesterday_str:
  113. art_item(tr)
  114. if tr.text() == '普通主題':
  115. flag = True
  116. next_tags(base=url, page_num=2)
  117. def downloader(**kwargs):
  118. '''
  119. 下载器
  120. r = requests.get(url).content
  121. with open(file=path, mode='wb') as f:
  122. f.write(r)
  123. :param kwargs:
  124. :return:
  125. '''
  126. url = kwargs['url']
  127. hash = kwargs['hash']
  128. r = requests.get(url).content
  129. print(url, r)
  130. path = '{}{}.torrent'.format(BT_PATH.format(yesterday_str), hash)
  131. try:
  132. with open(file=path, mode='wb') as f:
  133. f.write(r)
  134. except FileNotFoundError as e:
  135. print(e.args)
  136. return False
  137. print('bt -> ', path)
  138. return True
  139. # bt 下载器,从mongodb 中得到hash,下载bt
  140. def art_bt_download(**kwargs):
  141. global table
  142. query_info = kwargs['query_info']
  143. for item in table.find(query_info):
  144. art_hash = item['art_hash']
  145. # stamp_base64 = parse.quote(base64.b64encode(str(time.time())[0:10].encode()).decode())
  146. stamp_base64 = parse.quote(viidii.get_b64(art_hash=art_hash))
  147. url = '{}ref={}&reff={}&submit=download'.format(CLSQ_DOWNLOAD, art_hash, stamp_base64)
  148. if downloader(url=url, hash=art_hash):
  149. update_data = {'$set' : {'art_flag' : '1'}}
  150. table.update(spec=item, document=update_data, upsert=False)
  151. else:
  152. table.remove(item)
  153. print('删除一条数据...')
  154. if __name__ == '__main__':
  155. global count
  156. # 创建文件夹
  157. if not os.path.exists(BT_PATH.format(yesterday_str)):
  158. os.makedirs(BT_PATH.format(yesterday_str))
  159. # 类别list
  160. url_dict = {'2' : '亞洲無碼原創區', '15' : '亞洲有碼原創區', '5' : '動漫原創區'}
  161. # 遍历
  162. for type, name in url_dict.items():
  163. count = 0
  164. global table
  165. table = db[name]
  166. # art_tags(url='{}thread0806.php?fid={}'.format(CLSQ, type))
  167. print('启动下载器...')
  168. # art_bt_download(query_info={'art_flag': '0'})
  169. art_bt_download(query_info={'art_time' : '2017-08-10'})




声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/酷酷是懒虫/article/detail/929131
推荐阅读
相关标签
  

闽ICP备14008679号