'''
Steps:
  1. Grab the search URL and work out which of its query parameters change.
  2. Request the search URL and parse out the fields needed to build each song's URL.
  3. Request the song URL and parse out the download URL.
  4. Request the download URL and save the file into a folder,
     naming each file "<song name>.mp3".
The signature parameter is generated with MD5.
'''
import os
import requests
import time
from hashlib import md5
from urllib import parse
import json
import re


class KugouSongSpider(object):

    def __init__(self):
        # Search URL. Placeholders, in order:
        #   keyword                - song name entered by the user
        #   clienttime, mid, uuid  - 13-digit millisecond timestamp
        #   signature              - MD5-signed parameter (see get_sign)
        self.url = 'https://complexsearch.kugou.com/v2/search/song?callback=callback123&keyword={}&page=1&pagesize=30&bitrate=0&isfuzzy=0&tag=em&inputtype=0&platform=WebFilter&userid=0&clientver=2000&iscorrection=1&privilege_filter=0&srcappid=2919&clienttime={}&mid={}&uuid={}&dfid=-&signature={}'
        self.headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
            'cookie': 'kg_mid=ecc98921826e1bfe5f18c7e63a0b8d46; kg_dfid=0hO1xg4HydW64ZSjCx377ZFv; kg_dfid_collect=d41d8cd98f00b204e9800998ecf8427e; Hm_lvt_aedee6983d4cfc62f509129360d6bb3d=1623140158,1623465800,1624347175,1624352681; Hm_lpvt_aedee6983d4cfc62f509129360d6bb3d=1624362070',
            'accept-language': 'zh-CN,zh;q=0.9',
            'referer': 'https://www.kugou.com/'
        }
        self.word = input("Enter the name of the song to download: ")
        # 13-digit millisecond timestamp
        self.timec = str(time.time() * 1000)[:13]
        # Song-detail URL. Placeholders: hash, album_id, timestamp.
        self.two_url = 'https://wwwapi.kugou.com/yy/index.php?r=play/getdata&callback=jQuery191027654644883288304_1624443984782&hash={}&dfid=0hO1xg4HydW64ZSjCx377ZFv&mid=ecc98921826e1bfe5f18c7e63a0b8d46&platid=4&album_id={}&_={}'
        # hash/album_id pairs used to build each song's URL
        self.song_list = []
        # song titles
        self.FileName_list = []
        # singer names
        self.Songer_list = []

    def get_page(self, url):
        """Request a URL and strip the JSONP wrapper to return raw JSON."""
        res = requests.get(url=url, headers=self.headers)
        html = res.content.decode('utf-8')
        # Remove only the leading "callback123(" and the trailing ")",
        # so parentheses inside the JSON body are preserved.
        json_ = html[len('callback123('):-1]
        return json_

    def get_sign(self):
        """Build the signature: MD5 of the sorted query parameters wrapped in a fixed salt."""
        sign_text = [
            "NVPh5oo715z5DIWAeQlhMDsWXXQV4hwt",
            "bitrate=0",
            "callback=callback123",
            "clienttime={}".format(self.timec),
            "clientver=2000",
            "dfid=-",
            "inputtype=0",
            "iscorrection=1",
            "isfuzzy=0",
            "keyword={}".format(self.word),
            "mid={}".format(self.timec),
            "page=1",
            "pagesize=30",
            "platform=WebFilter",
            "privilege_filter=0",
            "srcappid=2919",
            "tag=em",
            "userid=0",
            "uuid={}".format(self.timec),
            "NVPh5oo715z5DIWAeQlhMDsWXXQV4hwt"
        ]
        string = ''.join(sign_text)
        s = md5()
        s.update(string.encode())
        # the hex digest of the concatenated string is the signature
        return s.hexdigest()

    def parse_page(self, html):
        """Parse the search result, print the song list, and let the user pick one."""
        python_json = json.loads(html)
        json_str = python_json['data']['lists']
        # sequence number shown next to each result, starting at 0
        s = 0
        for li in json_str:
            li_dict = {}
            li_dict['AlbumID'] = li['AlbumID']
            li_dict['FileHash'] = li['FileHash']
            li_dict['FileName'] = str(s) + '-------' + li['FileName']
            li_dict['SingerName'] = li['SingerName']
            s += 1
            self.song_list.append(li_dict)
            self.FileName_list.append(li_dict['FileName'])
            self.Songer_list.append(li_dict['SingerName'])
        # zip titles and singers into one dict and print the search results
        inf = dict(zip(self.FileName_list, self.Songer_list))
        inf = json.dumps(inf, ensure_ascii=False, indent=2)
        text = inf.replace('"', '').replace(':', '-------')
        print(text)
        number = int(input("Enter the number of the song [0-29]: "))
        file_hash = self.song_list[number]['FileHash']
        album_id = self.song_list[number]['AlbumID']
        self.get_down_page(file_hash, album_id)

    def get_down_page(self, file_hash, album_id):
        """Build and request the song-detail URL."""
        two_url = self.two_url.format(file_hash, album_id, self.timec)
        res = requests.get(url=two_url, headers=self.headers)
        json_str = res.content.decode('utf-8')
        self.parse_two_page(json_str)

    def parse_two_page(self, json_str):
        """Extract play_url and song_name from the JSONP response with regexes."""
        play_url = re.compile('jQuery191027654644883288304_1624443984782.*?"play_url":"(.*?)","authors":.*?', re.S)
        song_Name = re.compile('jQuery191027654644883288304_1624443984782.*?","song_name":(".*?"),"lyrics":', re.S)
        down_url = play_url.findall(json_str)
        Name = song_Name.findall(json_str)
        for name in Name:
            # name is a JSON string literal with \uXXXX escapes;
            # json.loads decodes it safely, unlike the original eval()
            SongName = json.loads(name)
            for u in down_url:
                # strip the escaping backslashes from play_url
                down_play_url = u.replace("\\", "")
                print(down_play_url)
                self.write_down_music(down_play_url, SongName)
                print(f"Song {SongName} downloaded and saved!")

    def write_down_music(self, down_play_url, SongName):
        """Download the song and save it; play_url points at an mp3, not an mp4."""
        os.makedirs('./酷狗歌曲', exist_ok=True)  # create the folder if missing
        filename = './酷狗歌曲/' + SongName + '.mp3'
        with open(filename, 'wb') as f:
            f.write(requests.get(url=down_play_url, headers=self.headers).content)

    def main(self):
        sign = self.get_sign()
        keyword = parse.quote(self.word)
        # assemble the search URL
        url = self.url.format(keyword, self.timec, self.timec, self.timec, sign)
        html = self.get_page(url)
        self.parse_page(html)


if __name__ == '__main__':
    spider = KugouSongSpider()
    spider.main()
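The signing scheme inside get_sign() is easy to miss in the class: the query parameters are listed as alphabetically sorted "key=value" strings, sandwiched between two copies of a fixed salt, concatenated, and hashed. A standalone sketch of that pattern (the salt is copied from the code above; the parameter dict is whatever the search URL carries):

from hashlib import md5

SALT = "NVPh5oo715z5DIWAeQlhMDsWXXQV4hwt"

def kugou_signature(params):
    # sort "key=value" pairs, wrap them in the salt, then take the hex MD5
    parts = sorted('{}={}'.format(k, v) for k, v in params.items())
    raw = SALT + ''.join(parts) + SALT
    return md5(raw.encode()).hexdigest()

# usage: kugou_signature({'keyword': 'hello', 'page': 1, 'clienttime': timec, ...})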
'''
Steps:
  1. Find the detail page and its URL.
  2. Analyze it: the data comes back from a POST request.
  3. Locate where sign is produced and study how it is built.
  4. Reproduce sign with MD5:
       sign = md5("fanyideskweb" + e + i + "Tbh5E8=q6U3EXe+&L[4c@")
     where i is a 13-digit millisecond timestamp plus one random digit (0-9)
     and e is the text to translate.
'''
from fake_useragent import UserAgent
import requests
import time
from hashlib import md5
import random


def get_data(i):
    # salt: 13-digit timestamp plus one random digit; ts: bare timestamp
    salt = str(int(time.time() * 1000)) + str(random.randint(0, 9))
    ts = str(int(time.time() * 1000))
    string = "fanyideskweb" + i + salt + "Tbh5E8=q6U3EXe+&L[4c@"
    s = md5()
    s.update(string.encode())
    sign = s.hexdigest()
    return salt, ts, sign


def Youdao(i, to):
    salt, ts, sign = get_data(i)
    url = 'https://fanyi.youdao.com/translate_o?smartresult=dict&smartresult=rule'
    headers = {
        'user-agent': UserAgent().random,
        'Cookie': 'OUTFOX_SEARCH_USER_ID=115997245@10.108.160.101; OUTFOX_SEARCH_USER_ID_NCOO=303186368.56868434; JSESSIONID=aaaPc_J_dz8H9dKB-xxOx; ___rl__test__cookies=1623889930051',
        'Host': 'fanyi.youdao.com',
        'Origin': 'https://fanyi.youdao.com',
        'Referer': 'https://fanyi.youdao.com/'
    }
    data = {
        'i': i,
        'from': 'AUTO',
        'to': to,
        'smartresult': 'dict',
        'client': 'fanyideskweb',
        'salt': salt,
        'sign': sign,
        'lts': ts,
        'bv': '4f7ca50d9eda878f3f40fb696cce4d6d',
        'doctype': 'json',
        'version': '2.1',
        'keyfrom': 'fanyi.web',
        'action': 'FY_BY_REALTlME'
    }
    res = requests.post(url, headers=headers, data=data).json()
    result = res['translateResult'][0][0]['tgt']
    return result


if __name__ == '__main__':
    i = input("Enter the text to translate:\t")
    # the web endpoint expects zh-CHS for simplified Chinese
    to = input("Choose the target language (English: en  Japanese: ja  Korean: ko  Chinese: zh-CHS)\t")
    result = Youdao(i, to)
    print("Translation: " + result)
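Since get_data() mixes timestamp bookkeeping with the hashing, the formula from the header is worth isolating so it can be checked without touching the network. A minimal sketch (the salt suffix string is copied from the code above; the sample inputs are illustrative):

from hashlib import md5

def youdao_sign(text, salt):
    # mirrors the JS: n.md5("fanyideskweb" + e + i + "Tbh5E8=q6U3EXe+&L[4c@")
    raw = "fanyideskweb" + text + salt + "Tbh5E8=q6U3EXe+&L[4c@"
    return md5(raw.encode()).hexdigest()

# the server recomputes this hash from the salt it receives, which is why
# salt, lts, and sign must travel together and stay mutually consistent
print(youdao_sign("hello", "16238899300510"))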
# -*- coding = utf-8 -*-
# @Time :2022/5/1-17:27
# @Author : Dong
# @File : 虎牙直播.py
# @Software : PyCharm
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
import pymysql


class HuYa(object):

    def my_sql(self):
        """Open the MySQL connection used to store the scraped rows."""
        my = pymysql.connect(host='127.0.0.1', user='dongxizhi', passwd='dongxizhi', db='dongxizhi')
        return my

    def Firefox(self, my):
        """Walk the Huya live-list pages with Firefox and insert each room into MySQL."""
        url = 'https://www.huya.com/l'
        driver = webdriver.Firefox()
        driver.get(url)
        item = {}
        page = 1
        cursor = my.cursor()
        while True:
            print("-----" + str(page) + "-----")
            page += 1
            html = driver.page_source
            # room title, viewer count, and streamer name for every card on the page
            title = driver.find_elements(By.XPATH, '//ul[@class="live-list clearfix"]/li/a[2]')
            try:
                num = driver.find_elements(By.XPATH, '//ul/li/span/span[3]/i[2]')
                name = driver.find_elements(By.XPATH, '//ul/li/span/span[1]/i')
            except Exception:
                print("Error!!!")
                break
            print(len(name))
            for ti, nu, na in zip(title, num, name):
                try:
                    item["broadcast"] = ti.text
                    item["counts"] = nu.text
                    item["anchor"] = na.text
                    # parameterized insert instead of string concatenation,
                    # which also sidesteps quoting/injection problems
                    sql = "insert into zhibo(broadcast,counts,anchor) values(%s,%s,%s)"
                    cursor.execute(sql, (item["broadcast"], item["counts"], item["anchor"]))
                    my.commit()
                    print("Row written!")
                except Exception:
                    print("Encoding error!!!")
            if html.find('class="laypage_next"') != -1:
                # "next page" arrow, located by class instead of an absolute path
                driver.find_element(By.XPATH, '//a[@class="laypage_next"]').click()
                time.sleep(2)  # give the next page time to render
            else:
                break
        driver.quit()
        my.close()

    def main(self):
        my = self.my_sql()
        self.Firefox(my)


if __name__ == '__main__':
    spider = HuYa()
    spider.main()
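The spider assumes the zhibo table already exists. A one-time setup sketch matching the INSERT above (the column types and lengths are assumptions, not taken from the original):

import pymysql

# column names mirror "insert into zhibo(broadcast,counts,anchor)";
# the types and lengths below are assumptions
ddl = """
CREATE TABLE IF NOT EXISTS zhibo (
    id        INT AUTO_INCREMENT PRIMARY KEY,
    broadcast VARCHAR(255),
    counts    VARCHAR(64),
    anchor    VARCHAR(128)
) DEFAULT CHARSET=utf8mb4
"""
conn = pymysql.connect(host='127.0.0.1', user='dongxizhi', passwd='dongxizhi', db='dongxizhi')
with conn.cursor() as cur:
    cur.execute(ddl)
conn.commit()
conn.close()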
# -*- coding = utf-8 -*-
# @Time :2021/5/19 15:07
# @Author : dongxizhi
# @File : dict.py
# @Software : PyCharm
import requests
import asyncio
import aiohttp

# Example chapter URL:
# https://boxnovel.baidu.com/boxnovel/content?gid=4306063500&data=%7B"fromaction"%3A"dushu"%7D&cid=11348571


async def download(c_id):
    """Fetch one chapter body asynchronously."""
    cid = c_id
    url = f'https://boxnovel.baidu.com/boxnovel/content?gid=4306063500&data=%7B"fromaction"%3A"dushu"%7D&cid={cid}'
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as res:
            dit = await res.text()
            print(dit)


async def get_Search(url):
    """Fetch the chapter list (a blocking requests call), then download chapters concurrently."""
    res = requests.get(url)
    json_str = res.json()
    tasks = []
    for item in json_str['data']['chapter']['chapterInfo']:
        c_id = item['chapter_id']
        # wrap each coroutine in a Task; passing bare coroutines to
        # asyncio.wait() is deprecated since Python 3.8
        tasks.append(asyncio.create_task(download(c_id)))
    await asyncio.wait(tasks)


if __name__ == '__main__':
    b_id = 4306063500
    url = f'https://boxnovel.baidu.com/boxnovel/wiseapi/chapterList?bookid={b_id}&pageNum=1&order=asc&site='
    asyncio.run(get_Search(url))
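One detail worth improving: download() opens a fresh ClientSession per chapter, which discards connection pooling. A sketch of the usual refactor, sharing a single session and collecting results with gather (same endpoint as above; the chapter IDs are whatever chapterList returns):

import asyncio
import aiohttp

async def fetch_chapter(session, c_id):
    # same content endpoint as above, but reusing the caller's session
    url = f'https://boxnovel.baidu.com/boxnovel/content?gid=4306063500&data=%7B"fromaction"%3A"dushu"%7D&cid={c_id}'
    async with session.get(url) as res:
        return await res.text()

async def fetch_all(chapter_ids):
    async with aiohttp.ClientSession() as session:
        # gather keeps results in input order and propagates exceptions
        return await asyncio.gather(*(fetch_chapter(session, cid) for cid in chapter_ids))

# usage: texts = asyncio.run(fetch_all([11348571]))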
import time
import aiohttp
import asyncio
from lxml import etree

urls = [
    'https://www.baidu.com/',
    'https://www.hao123.com/?src=from_pc'
]
headers = {
    'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.41 Mobile Safari/537.36'
}


async def get_request(url):
    # create a session object
    async with aiohttp.ClientSession() as session:
        # send the GET request; get/post accept url, headers,
        # params/data, and proxy='http://ip:port'
        async with session.get(url=url, headers=headers) as response:
            # text() returns the body as a string, read() returns bytes
            page_text = await response.text()
            return page_text


def parse(t):
    # callback: pull the page text out of the finished task, then run the XPath
    page_txt = t.result()
    html = etree.HTML(page_txt)
    parse_text = html.xpath('//*[@id="aging-total-page"]/text()')
    print(parse_text)


if __name__ == '__main__':
    start = time.time()
    tasks = []
    for url in urls:
        c = get_request(url)
        task = asyncio.ensure_future(c)
        # parse() fires as soon as each task completes
        task.add_done_callback(parse)
        tasks.append(task)
    loop = asyncio.get_event_loop()
    loop.run_until_complete(asyncio.wait(tasks))
    print(time.time() - start)
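The ensure_future/add_done_callback style shown above predates the high-level asyncio API and emits deprecation warnings on current Python. A sketch of the same fetch-then-parse flow with asyncio.run() and gather(), using the urls, headers, and XPath defined above:

import asyncio
import aiohttp
from lxml import etree

async def crawl(urls, headers):
    async with aiohttp.ClientSession(headers=headers) as session:
        async def fetch(url):
            async with session.get(url) as resp:
                return await resp.text()
        # results arrive in the same order as urls
        pages = await asyncio.gather(*(fetch(u) for u in urls))
    for page in pages:
        html = etree.HTML(page)
        print(html.xpath('//*[@id="aging-total-page"]/text()'))

# usage: asyncio.run(crawl(urls, headers))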