当前位置:   article > 正文

通过GitHub探索Python爬虫技术_python爬虫案例代码github

python爬虫案例代码github

1. 在 GitHub 上搜索想要爬取的内容对应的爬虫案例。

2. 找到最近更新的项目。(最近更新的项目一般都可以直接运行)

3. 选择适合自己的项目。经目前测试,下图中画红圈的项目是可以运行的。

4. 为了方便大家查看,这里把代码粘贴出来。

  # Code of the first circled project in the screenshot: save a NetEase Cloud
  # Music lyric as an .lrc file.
  import os
  import re

  import requests

  # Interactive loop: each pass asks for one song and writes its lyric to
  # ./OutLyric/<id>.lrc. There is no exit command; stop the script with Ctrl+C.
  while True:
      music_id = input("请输入歌曲id或歌曲链接: ")
      if music_id.startswith("http"):
          # A song link is expected to carry the numeric id as "id=<digits>".
          id_match = re.search(r"id=(\d+)", music_id)
          if id_match is None:
              # Calling .group() on a failed match would raise AttributeError,
              # so report the bad link and ask again instead.
              print("链接中没有找到歌曲id")
              continue
          music_id = id_match.group(1)

      try:
          response = requests.get(
              url="https://music.163.com/api/song/lyric",
              params={"id": music_id, "lv": 1, "kv": 1, "tv": -1},
              timeout=10,  # requests waits forever unless a timeout is given
          )
          get_lyric = response.json()
      except (requests.RequestException, ValueError) as error:
          # Network failure or a body that is not JSON: report it and keep
          # the prompt loop alive.
          print(error)
          continue
      print(get_lyric)

      # Read both lyric fields defensively: if the "lrc"/"tlyric" object is
      # absent or null, chaining .get() on it would raise AttributeError.
      original_lyric = (get_lyric.get("lrc") or {}).get("lyric") or ""
      translated_lyric = (get_lyric.get("tlyric") or {}).get("lyric") or ""

      if original_lyric == "":
          print("该歌曲没有歌词")
      else:
          os.makedirs("./OutLyric", exist_ok=True)
          with open(f"./OutLyric/{music_id}.lrc", "w", encoding="utf-8") as save_lyric:
              if translated_lyric == "":
                  save_lyric.write(original_lyric)
              else:
                  # Drop tag lines whose bracket holds no digits (for example
                  # "[by:someone]"), keeping only the timestamped lines of
                  # the translated lyric.
                  zh_cn_lyric = re.sub(r'\[[^0-9]*:[^0-9.]*]\n', '', translated_lyric)
                  save_lyric.write(f'{original_lyric}\n{zh_cn_lyric}')
          print(f"下载成功,可将该文件重命名至与歌曲相同的名字使用,lrc文件保存至./OutLyric/{music_id}.lrc")
  # Code of the second circled project in the screenshot
  #!/usr/bin/env python
  # -*- coding:utf-8 -*-
  import csv
  import os
  import re

  import requests as rq
  from bs4 import BeautifulSoup as BS
  from requests import exceptions

  # Running count of processed songs: main() increments it once per song and
  # getMusicMsg() stores the current value in the CSV 'song_num' column.
  SONG_NUM = 0
  def getMusic(ID, path, num, timeout=30):
      """Download one song through the outer-link URL and save it to *path*.

      Args:
          ID: Song id as a string; it is concatenated into the request URL.
          path: Destination file path, opened in binary write mode.
          num: Sequence label (a string) prefixed to the progress messages.
          timeout: Seconds to wait for the server before giving up. Without
              it a stalled connection would block the whole run.

      Returns:
          None. Any failure (HTTP error, network error, unwritable path) is
          printed rather than raised, so a caller looping over many songs
          keeps going.
      """
      cloud = 'http://music.163.com/song/media/outer/url?id='
      kv = {
          'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36'}
      try:
          url = cloud+ID+'.mp3'
          # Announce the download before the request: rq.get() only returns
          # once the whole body has been fetched.
          print(num+"、歌曲正在下载...")
          tmp = rq.get(url, headers=kv, timeout=timeout)
          tmp.raise_for_status()
          # The with-block closes the file; no explicit close() is needed.
          with open(path, 'wb') as f:
              f.write(tmp.content)
          print(num+"、歌曲下载成功!")
      except Exception as e:
          # Deliberate best-effort: report the problem and let the caller
          # move on to the next song.
          print(e)
  def getMusicText(ID, path, num, timeout=30):
      """Fetch one song's lyric from the lyric API and write it to *path*.

      Args:
          ID: Song id as a string; it is concatenated into the request URL.
          path: Destination text file, written as UTF-8.
          num: Sequence label (a string) prefixed to the progress messages.
          timeout: Seconds to wait for the server before giving up.

      Returns:
          None. Failures are printed rather than raised. When the response
          carries no lyric text, a notice is printed and no file is written.
      """
      muTextUrl = 'http://music.163.com/api/song/lyric?id=' + ID + '&lv=1&kv=1&tv=-1'
      headers = {
          'Referer': 'https://music.163.com',
          'Host': 'music.163.com',
          'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36',
          'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9'
      }
      try:
          res = rq.get(muTextUrl, headers=headers, timeout=timeout)
          res.raise_for_status()
          # SECURITY: this used to run eval() on the raw response body (with
          # true/false/null defined as locals). The body arrives over plain
          # http, so eval() would execute whatever the peer sends. Parse it
          # as JSON instead.
          lrc_dict = res.json()
          music_lyric = (lrc_dict.get('lrc') or {}).get('lyric')
          if not music_lyric:
              # Nothing to save; avoid creating an empty lyric file.
              print(num+"、该歌曲没有歌词")
              return
          print(num+"、歌词正在下载...")
          # The with-block closes the file; no explicit close() is needed.
          with open(path, 'w', encoding="utf-8") as f:
              f.write(music_lyric)
          print(num+"、歌词下载成功!")
      except Exception as e:
          # Deliberate best-effort: report the problem and let the caller
          # move on to the next song.
          print(e)
  def create_csv_head():
      """Write the header row of ./music/musicMsg.csv if the file is still empty.

      The file is opened in append mode so existing rows are kept. Writing
      the header unconditionally would add another header line in the middle
      of the data on every rerun, so the write is skipped once the file has
      content. The ./music directory must already exist.
      """
      csv_path = "./music/musicMsg.csv"
      if os.path.exists(csv_path) and os.path.getsize(csv_path) > 0:
          return
      headers = ['song_num', 'song_name', 'singer', 'song_duration']
      with open(csv_path, "a", newline="", encoding="utf-8") as f:
          writer = csv.DictWriter(f, fieldnames=headers)
          # Human-readable (Chinese) titles for the four columns.
          head = {'song_num': '榜单序号', 'song_name': '歌曲名称',
                  'singer': '歌手', 'song_duration': '歌曲时长'}
          writer.writerow(head)
  def save_musicMsg(music_dict):
      """Append one song record to ./music/musicMsg.csv.

      Args:
          music_dict: Mapping with the keys 'song_num', 'song_name', 'singer'
              and 'song_duration'; the values are written in that column
              order.
      """
      columns = ['song_num', 'song_name', 'singer', 'song_duration']
      with open("./music/musicMsg.csv", "a", newline="", encoding="utf-8") as csv_file:
          csv.DictWriter(csv_file, fieldnames=columns).writerow(music_dict)
  def split_Msg(msg):
      """Return the text between the first and second double quote in *msg*.

      Raises:
          IndexError: if *msg* contains no double quote at all.
      """
      # Only the piece at index 1 is needed, so two splits are enough.
      return msg.split('"', 2)[1]
  def _meta_content(soup, prop):
      """Return the ``content`` attribute of ``<meta property=prop>``, or None."""
      tag = soup.find('meta', {'property': prop})
      if tag is None:
          return None
      return tag.get('content')


  def getMusicMsg(ID, timeout=30):
      """Scrape one song page, append its record to the CSV, and build a folder name.

      The artist, title and duration are read from the page's
      ``og:music:artist``, ``og:title`` and ``music:duration`` meta tags. The
      values are taken from the ``content`` attribute directly. Re-serialising
      the tag with ``str()`` and splitting on quotes depends on attribute order
      and returns HTML-escaped text.

      Args:
          ID: Song id as a string; it is concatenated into the page URL.
          timeout: Seconds to wait for the server before giving up.

      Returns:
          ``"<singer>-<song name>"`` with path-hostile characters removed or
          replaced, or None when the page cannot be fetched or parsed.
          Failures are printed rather than raised.
      """
      global SONG_NUM  # only read here; main() is what increments it
      song_url = 'https://music.163.com/song?id=' + ID
      headers = {
          'Referer': 'https://music.163.com',
          'Host': 'music.163.com',
          'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36',
          'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9'
      }
      try:
          with rq.session() as session:
              res = session.get(song_url, headers=headers, timeout=timeout)
          soup = BS(res.content, 'lxml')

          singer = _meta_content(soup, 'og:music:artist')
          song_name = _meta_content(soup, 'og:title')
          song_duration = _meta_content(soup, 'music:duration')
          if singer is None or song_name is None or song_duration is None:
              print('未能从歌曲页面解析出歌曲信息: ' + song_url)
              return None

          # The duration is converted from a count of seconds to "MM:SS".
          minutes, seconds = divmod(int(song_duration), 60)
          song_duration = "%02d:%02d" % (minutes, seconds)

          # The CSV keeps the names exactly as scraped; sanitising below only
          # affects the returned folder name.
          music_dict = {
              'song_num': SONG_NUM,
              'song_name': song_name,
              'singer': singer,
              'song_duration': song_duration
          }
          save_musicMsg(music_dict)

          # Song name: drop '/', '\' and ':'.
          song_name = song_name.replace('/', '').replace('\\', '').replace(':', '')
          # Singer: '/' and '\' become '&' (they separate artists); ':' is dropped.
          singer = singer.replace('/', '&').replace('\\', '&').replace(':', '')

          dirName = singer+'-'+song_name
          print(dirName)
          return dirName
      except Exception as e:
          # Deliberate best-effort: report the problem and signal failure
          # with None so the caller can skip this song.
          print(e)
          return None
  def getMusicList(timeout=30):
      """Scrape the toplist page and map each listed song to its id.

      Args:
          timeout: Seconds to wait for the server before giving up.

      Returns:
          A dict whose keys are ``"<song name>_<position>"`` (position starts
          at 1, which keeps keys unique when two songs share a name) and whose
          values are song id strings. An empty dict is returned, with a printed
          notice, when the page has no ``<ul class="f-hide">`` element.
      """
      headers = {
          'Referer': 'https://music.163.com',
          'Host': 'music.163.com',
          'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36',
          'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9'
      }
      base_url = 'https://music.163.com/discover/toplist'
      with rq.session() as session:
          response = session.get(base_url, headers=headers, timeout=timeout)
      soup = BS(response.content, "lxml")

      # The song links are read from the <ul class="f-hide"> element.
      song_list = soup.find('ul', {'class': 'f-hide'})
      if song_list is None:
          # Without this guard the next line fails with AttributeError on None.
          print('未能在榜单页面中找到歌曲列表')
          return {}
      ls = song_list.find_all('a')

      songID_dic = {}  # key: song name plus position, value: song id
      print('一共有'+str(len(ls))+'首歌')
      for a, music in enumerate(ls, start=1):
          ID = str(music['href'].replace('/song?id=', ''))
          name = music.text+'_'+str(a)
          songID_dic[name] = ID
          print("Name:{:30}\tID{:^10}".format(name, ID))
      print('一共有'+str(len(songID_dic))+'')
      return songID_dic
  def main():
      """Download audio and lyric for every song on the toplist into ./music.

      For each song this records its metadata in the CSV, creates one folder
      named after the singer and song, and saves the audio twice (as .mp3 and
      .m4a) plus the lyric as .txt. A song whose metadata cannot be read, or
      whose folder cannot be created, is skipped with a printed notice.
      """
      global SONG_NUM
      songID_dic = getMusicList()
      rootDir = 'music'
      if os.path.exists(rootDir):
          print(rootDir+"文件夹已存在")
      else:
          os.mkdir(rootDir)
          print("创建文件夹"+rootDir)
      create_csv_head()

      for item, song_id in songID_dic.items():
          # Keys are "<song name>_<position>". Split from the right so a song
          # name that itself contains '_' is not cut short.
          item_clear = item.rsplit('_', 1)[0]
          SONG_NUM += 1

          dirName = getMusicMsg(song_id)
          if dirName is None:
              # getMusicMsg() returns None on failure; slicing it below would
              # raise TypeError and abort the whole run.
              print(str(SONG_NUM)+"、获取歌曲信息失败,已跳过")
              continue
          # NOTE(review): this tests the second-to-last character, not the
          # last one; presumably meant to avoid a trailing '.' in a folder
          # name. Kept as written.
          if dirName[-2:-1] == '.':
              dirName = dirName.replace('.', '·')

          musicDir = './'+rootDir+'/' + dirName
          if os.path.exists(musicDir):
              print(musicDir+"文件夹已存在")
          else:
              try:
                  os.mkdir(musicDir)
              except OSError as e:
                  # One folder name the file system rejects should not stop
                  # the remaining downloads.
                  print(e)
                  continue
              print("创建文件夹"+musicDir)

          # Same separator clean-up getMusicMsg() applies to the song name,
          # so the file name cannot point into another directory.
          item_clear = item_clear.replace('/', '').replace('\\', '').replace(':', '')
          if len(item_clear) > 75:
              item_clear = item_clear[:70]+'···'
          elif '.' in item_clear:
              item_clear = item_clear.replace('.', '·')
          print(item_clear, end=" \n")

          mp3_path = musicDir+'/'+item_clear+'.mp3'
          m4a_path = musicDir+'/'+item_clear+'.m4a'
          lyric_path = musicDir+'/'+item_clear+'.txt'
          num = str(SONG_NUM)
          print('='*50)
          # The same audio is fetched twice and stored under both extensions.
          getMusic(song_id, mp3_path, num)
          getMusic(song_id, m4a_path, num)
          print('*'*50)
          getMusicText(song_id, lyric_path, num)
          print('='*50)
  # Script entry point: run the full toplist download only when this file is
  # executed directly.
  if __name__ == "__main__":
      main()
      # Disabled calls left over from trying individual helpers by hand:
      # getMusicList()
      # getMusicText("1994955842", "path")
      # getMusicMsg("1998931166")

 

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/羊村懒王/article/detail/496683
推荐阅读
相关标签
  

闽ICP备14008679号