赞
踩
import os
import urllib.request

url = 'http://i3.hunantv.com/p1/20150906/1637244570C.jpg'
file_name = 'F:\\img\\ym.jpg'

# urlretrieve() opens `file_name` for writing but does not create missing
# parent folders, so make sure the target directory exists first.
target_dir = os.path.dirname(file_name)
if target_dir:
    os.makedirs(target_dir, exist_ok=True)

# Download the file from `url` and save it locally under `file_name`:
urllib.request.urlretrieve(url, file_name)
import hashlib
import time

# Hash the textual form of the current Unix timestamp with MD5 and print
# the hexadecimal digest.
m = hashlib.md5(str(time.time()).encode())
md5value = m.hexdigest()
print(md5value)
'运行
import base64
import time
from urllib import parse

# Current Unix time as a float, printed in full first.
tm = time.time()
print(str(tm))

# Keep only the first ten characters of its string form.
tm = str(tm)[:10]
print(tm)

# Base64-encode the shortened timestamp once and reuse the result.
b64 = base64.b64encode(tm.encode()).decode()
print(b64)

# Percent-encode the base64 text (the original note likens this to
# JavaScript's encodeURIComponent).
print(parse.quote(b64))

# Sample base64 values, kept as a bare string literal:
'''
MTUwMjQyODkwMw==
MTUwMjQyNTE1Mw==
'''
'运行
# Proxy mapping (scheme -> proxy address); host and port are placeholders.
PROXIES = {
    "http": "http://xxx:xxxx",
    "https": "http://xxx:xxxx",
}

# MongoDB server address and database name.
MONGO_URL = '127.0.0.1'
MONGO_DB = 'caoliu'

# Forum base URL (the real address is deliberately not published) and the
# prefix of the torrent download endpoint.
CLSQ = 'http://xxx.com/'
CLSQ_DOWNLOAD = 'http://www.xxx.com/download.php?'

# Output directory template; `{}` is filled in with a folder name.
BT_PATH = 'F:\\1024bt\\{}\\'
'运行
- from selenium import webdriver
- from selenium.webdriver.support.ui import WebDriverWait
- from selenium.webdriver.support import expected_conditions as EC
- from selenium.webdriver.common.by import By
-
- from pyquery import PyQuery as pq
-
def get_b64(art_hash):
    """Return the hidden-input value from the viidii page for `art_hash`.

    The page is rendered in a PhantomJS browser (image loading disabled,
    disk cache enabled) because the hidden `<input>` is looked up in the
    rendered page source.

    :param art_hash: hash string substituted into the page URL.
    :return: the `value` attribute of the `input[type=hidden]` element.
    :raises selenium.common.exceptions.TimeoutException: if no hidden input
        shows up within the 10 second wait.
    """
    browser = webdriver.PhantomJS(service_args=['--load-images=false', '--disk-cache=true'])
    try:
        browser.set_window_size(1400, 900)
        wait = WebDriverWait(browser, 10)

        browser.get('http://www.viidii.info/?http://www______rmdown______com/link______php?hash={}'.format(art_hash))

        # Block until the hidden input is present in the DOM.
        wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'input[type=hidden]'))
        )

        doc = pq(browser.page_source)
        return doc.find('input[type=hidden]').attr('value')
    finally:
        # Always shut the browser down; this function is called once per
        # item, so a leaked PhantomJS process per call adds up quickly.
        browser.quit()
- import requests
- import pymongo
- import os
- import re
- # import base64
- import time
- import viidii
-
- from pyquery import PyQuery as pq
- from Config import *
-
- from urllib import parse
-
-
# Database handle: connect to the configured server and select the database.
client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]
# `table` (the active collection) is assigned later by the `__main__`
# section; a `global` statement at module level has no effect by itself.
global table

# Reference dates: yesterday is the day being processed, the day before it
# is the marker the page walkers stop at.
import datetime
one_day = datetime.timedelta(days=1)
today = datetime.date.today()
yesterday = today - one_day
before_day = yesterday - one_day

# Both dates as YYYY-MM-DD strings, the form they are compared in.
yesterday_str = yesterday.isoformat()
before_day_str = before_day.isoformat()

# Insert counter; like `table`, it is assigned by the `__main__` section.
global count
-
def insert_to_mongo(info):
    """Insert `info` into the active collection unless its title is known.

    A record counts as a duplicate when the collection already holds a
    document with the same `art_name`. On a successful insert the global
    `count` is incremented.

    Uses `find_one` / `insert_one` rather than `Cursor.count()` /
    `Collection.insert()`, which no longer exist in PyMongo 4.

    Notes carried over from the original author:
      1. Inserting into MongoDB adds an `_id` field to `info`.
      2. Python passes immutable arguments much like pass-by-value and
         mutable ones much like pass-by-reference, so the caller's dict is
         the one that gets modified.
      3. Consequently a fresh dict is needed for every insert; re-using an
         already inserted one makes MongoDB report an error.

    :param info: dict describing one item; must contain `art_name`.
    :return: None
    """
    global count
    global table
    query_info = {'art_name': info['art_name']}
    if table.find_one(query_info) is not None:
        print('重复:', info)
        return
    table.insert_one(info)
    count += 1
    print('成功插入第 ', count, ' 数据:', info)
-
def art_bt_hash(url):
    """Fetch a thread page and return the hash code embedded in it.

    The page is requested through the configured proxies, decoded as GBK,
    and searched for an `rmdown.com/link.php?hash=...` link; the text
    between `hash=` and the next `<` is returned.

    :param url: address of the thread page.
    :return: the captured hash string.
    :raises AttributeError: if the page contains no such link.
    :raises UnicodeDecodeError: if the body is not valid GBK.
    """
    response = requests.get(url=url, proxies=PROXIES)
    page_text = response.content.decode('gbk')
    # Pull the hash straight out of the markup with a regular expression.
    found = re.search(r'rmdown\.com/link\.php\?hash=(.*?)<', page_text, re.S)
    return found.group(1)
-
def art_item(tr):
    """Build the record for one listing row and store it.

    Reads the title and thread link from the row, resolves the thread's
    hash code, and passes the assembled record to `insert_to_mongo`. If
    the hash cannot be extracted or the page cannot be decoded, the error
    is printed and the row is skipped.

    :param tr: PyQuery object for one table row of the listing.
    :return: None
    """
    title = tr.find('h3').text()
    href = tr.find('.tal > h3 > a').attr('href')
    art_url = '{}{}'.format(CLSQ, href)

    try:
        art_hash = art_bt_hash(art_url)
    except (AttributeError, UnicodeDecodeError) as err:
        print(err.args)
        return

    record = {
        'art_name': title,
        'art_url': art_url,
        'art_hash': art_hash,
        'art_time': yesterday_str,
        'art_flag': '0',
    }
    insert_to_mongo(info=record)
-
def next_tags(**kwargs):
    """Walk the listing pages starting at `page_num` and collect yesterday's rows.

    Each page is fetched through the configured proxies. Rows dated
    yesterday are passed to `art_item`; the walk ends as soon as a row
    dated the day before yesterday is seen.

    Implemented as a loop instead of self-recursion so that neither a long
    listing nor repeated retries can exhaust the recursion limit. The walk
    also ends when a page yields no rows at all, which previously led to
    endless paging.

    :param kwargs:
        base: listing URL to which `&page=N` is appended.
        page_num: number of the first page to fetch.
        max_retries: optional, default 5; how many consecutive
            content-decoding failures are tolerated for one page before
            giving up.
    :return: None
    """
    base = kwargs['base']
    page_num = kwargs['page_num']
    max_retries = kwargs.get('max_retries', 5)
    failures = 0

    while True:
        url = '{}&page={}'.format(base, page_num)
        print(url)
        try:
            doc = requests.get(url=url, proxies=PROXIES).content
        except requests.exceptions.ContentDecodingError as e:
            print(e.args)
            failures += 1
            if failures > max_retries:
                return
            # Short pause, then retry the same page.
            time.sleep(2)
            continue
        failures = 0

        html = pq(doc)
        trs = list(html.find('#ajaxtable > tbody:nth-child(2) > tr').items())
        if not trs:
            # Nothing listed on this page: there is nothing further to walk.
            return

        for tr in trs:
            art_time = tr.find('div[class=f10]').text()
            if art_time == before_day_str:
                return
            if art_time == yesterday_str:
                art_item(tr)

        page_num += 1
-
# Collect the hash codes of a section and store them in MongoDB.
def art_tags(**kwargs):
    """Process the first listing page of a section, then continue from page 2.

    Rows are ignored up to and including the one whose text is exactly
    '普通主題'. After that marker, rows dated yesterday are passed to
    `art_item`, and a row dated the day before yesterday ends the whole
    run. Otherwise paging continues via `next_tags` at page 2.

    :param kwargs:
        url: address of the section's first listing page.
    :return: None
    """
    url = kwargs['url']
    print(url)
    page = pq(requests.get(url=url, proxies=PROXIES).content)
    rows = iter(page.find('#ajaxtable > tbody:nth-child(2) > tr').items())

    # Consume rows until the marker row has been passed.
    for row in rows:
        if row.text() == '普通主題':
            break

    # The same iterator now yields only the rows after the marker.
    for row in rows:
        art_time = row.find('div[class=f10]').text()
        if art_time == before_day_str:
            return
        if art_time == yesterday_str:
            art_item(row)

    next_tags(base=url, page_num=2)
-
def downloader(**kwargs):
    """Download `url` and save the body as `<hash>.torrent`.

    The file is written into the folder obtained by formatting `BT_PATH`
    with yesterday's date. The URL and the raw response body are printed
    before writing.

    :param kwargs:
        url: address to download.
        hash: hash code used as the file name.
    :return: True when the file was written, False when the target path
        could not be opened (FileNotFoundError).
    """
    url, torrent_hash = kwargs['url'], kwargs['hash']
    payload = requests.get(url).content
    print(url, payload)

    target_dir = BT_PATH.format(yesterday_str)
    path = '{}{}.torrent'.format(target_dir, torrent_hash)
    try:
        with open(file=path, mode='wb') as out:
            out.write(payload)
    except FileNotFoundError as err:
        print(err.args)
        return False
    else:
        print('bt -> ', path)
        return True
# Torrent downloader: read hash codes from MongoDB and download each torrent.
def art_bt_download(**kwargs):
    """Download the torrent for every stored record matching `query_info`.

    For each record the `reff` parameter is obtained from
    `viidii.get_b64`, the download URL is assembled, and `downloader` is
    called. A record whose download succeeded gets `art_flag` set to '1';
    a record whose download failed is deleted.

    Uses `update_one` / `delete_one` rather than `Collection.update()` /
    `Collection.remove()`, which no longer exist in PyMongo 4. Records are
    addressed by `_id` instead of by matching the whole document.

    :param kwargs:
        query_info: MongoDB filter selecting the records to process.
    :return: None
    """
    global table
    query_info = kwargs['query_info']
    for item in table.find(query_info):
        art_hash = item['art_hash']
        # An earlier variant built this value from the base64-encoded
        # current timestamp; it is now read from the viidii page.
        stamp_base64 = parse.quote(viidii.get_b64(art_hash=art_hash))
        url = '{}ref={}&reff={}&submit=download'.format(CLSQ_DOWNLOAD, art_hash, stamp_base64)
        record_filter = {'_id': item['_id']}
        if downloader(url=url, hash=art_hash):
            table.update_one(record_filter, {'$set': {'art_flag': '1'}}, upsert=False)
        else:
            table.delete_one(record_filter)
            print('删除一条数据...')
-
if __name__ == '__main__':
    # `count` and `table` are plain module globals here, so no `global`
    # statements are needed at this level.

    # Create the output folder for yesterday's date if it is missing.
    os.makedirs(BT_PATH.format(yesterday_str), exist_ok=True)

    # Section id -> section name; the name doubles as the collection name.
    url_dict = {'2' : '亞洲無碼原創區', '15' : '亞洲有碼原創區', '5' : '動漫原創區'}

    # `fid` instead of `type`, so the builtin is not shadowed module-wide.
    for fid, name in url_dict.items():
        count = 0
        table = db[name]
        # Crawl step, currently disabled:
        # art_tags(url='{}thread0806.php?fid={}'.format(CLSQ, fid))
        print('启动下载器...')
        # Alternative query selecting every record not yet downloaded:
        # art_bt_download(query_info={'art_flag': '0'})
        # NOTE(review): the date below is hard-coded while files are saved
        # into the folder for yesterday's date - confirm this is intended.
        art_bt_download(query_info={'art_time' : '2017-08-10'})
-
-
-
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。