
Python: Crawling the URLs and the CSS/JS File Addresses a Single Web Page Needs to Load

Extracting every JS, CSS, and resource address from a page's HTML with Python.

Straight to the code.

Below is the Python scraping script I use myself, with sensitive details masked (the xxxxxx.com hosts are placeholders):

#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""
@author: Andy
@file: xxx.py
@time: 5:50 PM
@desc: scrape article data into the blog
"""
import os
import re
import urllib.error
import urllib.request
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
}

def get_content():
    """Yield the attachment links (url, name) found on a forum question page."""
    url = 'http://ask.xxxx.com/question/xxxx'  # target URL
    response = requests.get(url, headers=headers).text.replace('<i class="fa fa-paperclip"></i>', '')
    soup = BeautifulSoup(response, 'lxml')
    # Raw string so that \s reaches the regex engine intact
    pattern = re.compile(r'<a\shref="(http://ask.apelearn.com/file.*?)".*?>(.*?)</a>', re.S)
    for item in soup.find_all('a'):
        result = re.findall(pattern, str(item))
        if result:
            for i in result:
                url, name = i
                yield {
                    'url': url,
                    'name': name
                }

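# Hypothetical usage of get_content() (not in the original post):
#
#     for item in get_content():
#         print(item['name'], '->', item['url'])
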
def mkdir(path):
    # Strip surrounding whitespace and any trailing backslash
    path = path.strip()
    path = path.rstrip("\\")
    # Create the directory only if it does not already exist
    if not os.path.exists(path):
        os.makedirs(path)
        print(path + ' created')
        return True
    else:
        print(path + ' already exists')
        return False

def getUrl(html):
    """Collect the <img src> and <script src> addresses from raw HTML."""
    patternjs = '<script src="(.*?)"'
    patternimg = '<img src="(.*?)"'
    href = re.compile(patternimg, re.S).findall(html)
    href += re.compile(patternjs, re.S).findall(html)
    return href

def getCssUrl(html):
    """Collect the <link href> addresses (stylesheets and friends) from raw HTML."""
    patterncss = '<link href="(.*?)"'
    href = re.compile(patterncss, re.S).findall(html)
    return href

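# Not in the original post: the regexes above assume src/href is the first,
# double-quoted attribute on the tag. A more tolerant sketch using
# BeautifulSoup, which is already a dependency here:
def get_asset_urls(html):
    soup = BeautifulSoup(html, 'lxml')
    urls = [img['src'] for img in soup.find_all('img', src=True)]
    urls += [script['src'] for script in soup.find_all('script', src=True)]
    urls += [link['href'] for link in soup.find_all('link', href=True)]
    return urls
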
# Download one page: mirror its assets, rewrite its links, save it locally
def download_html(root_path, url):
    a = urlparse(url)
    file_path = a.path
    file_name = os.path.basename(file_path)
    _, file_suffix = os.path.splitext(file_name)
    if file_suffix != '.html':
        file_name_real = file_name + '.html'
    else:
        file_name_real = file_name
    file_path_real = file_path.replace(file_name, '')
    file_path_reals = file_path_real.replace('/', "\\")
    all_file_path_real = root_path + file_path_reals + file_name_real
    headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
    resp = requests.get(url, headers=headers)  # the original named this "re", shadowing the regex module
    resp.encoding = "utf-8"
    for item1 in getUrl(resp.text):
        download_commonimgjs(root_path, item1)
    for item2 in getCssUrl(resp.text):
        download_css(root_path, item2)
    # Point absolute links at the new host before saving
    new_text = resp.text.replace('https://www.xxxxxx.com', 'http://www.xxxxx.com')
    new_texts = new_text.replace('xxxxxx.com', '3cinno.shanhubei.com')
    with open(all_file_path_real, "w+", encoding="utf-8") as html_file:
        html_file.write(new_texts)

def download_commonimgjs(root_path, url):
    # Resolve root-relative URLs against the source host
    if url[:1] == "/":
        imgurl = "https://www.xxxxxx.com" + url
    else:
        imgurl = url
    a = urlparse(imgurl)
    file_path = a.path
    file_name = os.path.basename(file_path)
    match_url = file_path.replace(file_name, '')
    match_url_new = match_url.replace('/', "\\")
    newmkpath = root_path + match_url_new
    # Skip assets that were already downloaded
    if os.path.isfile(newmkpath + file_name):
        return
    mkdir(newmkpath)
    try:
        opener = urllib.request.build_opener()
        opener.addheaders = [('User-agent',
                              'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36')]
        urllib.request.install_opener(opener)
        urllib.request.urlretrieve(imgurl, newmkpath + file_name)
    except urllib.error.HTTPError:
        print('error downloading ' + imgurl)

def download_img(root_path, url):
    if url[:1] == "/":
        imgurl = "https://www.xxxxxx.com" + url
    else:
        imgurl = url
    a = urlparse(imgurl)
    file_path = a.path
    file_name = os.path.basename(file_path)
    match_url = file_path.replace(file_name, '')
    match_url_new = match_url.replace('/', "\\")
    newmkpath = root_path + match_url_new
    mkdir(newmkpath)
    opener = urllib.request.build_opener()
    opener.addheaders = [('User-agent',
                          'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36')]
    urllib.request.install_opener(opener)
    urllib.request.urlretrieve(imgurl, newmkpath + file_name)

def download_js(root_path, url):
    if url[:1] == "/":
        imgurl = "https://www.xxxxxx.com" + url
    else:
        imgurl = url
    a = urlparse(imgurl)
    file_path = a.path
    file_name = os.path.basename(file_path)
    match_url = file_path.replace(file_name, '')
    match_url_new = match_url.replace('/', "\\")
    newmkpath = root_path + match_url_new
    mkdir(newmkpath)
    opener = urllib.request.build_opener()
    opener.addheaders = [('User-agent',
                          'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36')]
    urllib.request.install_opener(opener)
    urllib.request.urlretrieve(imgurl, newmkpath + file_name)

def download_css(root_path, url):
    if url[:1] == "/":
        imgurl = "https://www.xxxxxx.com" + url
    else:
        imgurl = url
    a = urlparse(imgurl)
    file_path = a.path
    file_name = os.path.basename(file_path)
    _, file_suffix = os.path.splitext(file_name)
    # Only keep <link href> targets that are actually stylesheets
    if file_suffix != '.css':
        return
    match_url = file_path.replace(file_name, '')
    match_url_new = match_url.replace('/', "\\")
    newmkpath = root_path + match_url_new
    if os.path.isfile(newmkpath + file_name):
        return
    mkdir(newmkpath)
    try:
        opener = urllib.request.build_opener()
        opener.addheaders = [('User-agent',
                              'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36')]
        urllib.request.install_opener(opener)
        urllib.request.urlretrieve(imgurl, newmkpath + file_name)
    except urllib.error.HTTPError:
        print('error downloading ' + imgurl)

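# Not in the original post: download_commonimgjs/download_img/download_js/
# download_css differ only in their suffix checks and error handling.
# A minimal consolidated sketch, assuming the same host and local layout:
def download_asset(root_path, url, required_suffix=None):
    asset_url = "https://www.xxxxxx.com" + url if url.startswith("/") else url
    parsed = urlparse(asset_url)
    file_name = os.path.basename(parsed.path)
    if required_suffix and os.path.splitext(file_name)[1] != required_suffix:
        return
    local_dir = root_path + parsed.path.replace(file_name, '').replace('/', "\\")
    if os.path.isfile(local_dir + file_name):
        return
    mkdir(local_dir)
    try:
        urllib.request.urlretrieve(asset_url, local_dir + file_name)
    except urllib.error.HTTPError:
        print('error downloading ' + asset_url)
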
def get_xml():
    url = 'https://www.xxxxxx.com/sitemap-1.xml'
    headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
    res = requests.get(url, headers=headers)
    res.encoding = "utf-8"
    # Write the regex to match your own article-URL format; yours may differ
    # from mine. Note the original lazy pattern '\S*?' matched nothing past
    # the prefix, so match up to whitespace or the closing XML tag instead.
    r = re.compile(r'https://www.xxxxxx.com/[^<\s]+')
    big = re.findall(r, res.text)
    for i in big:
        print(i)

def main():
    # get_content()
    # url = r'https://www.xxxxxx.com/news/xxxx-proje-20711498'
    url = r'https://www.xxxxxx.com/uploads/20218080/logo202107221507387902092.png'
    # Local directory to mirror into
    root_path = "F:\\Project-cz\\shanhubei\\3cinno"
    # download_img(root_path, url)
    # htmlurl = r'https://www.xxxxxx.com/3d-clear-led-dmx-ball'
    # download_html(root_path, htmlurl)
    cssurl = r'https://www.xxxxxx.com/images/m184/black/style.css'
    # download_css(root_path, cssurl)
    # demourl = 'https://www.xxxxxx.com/Content/kcim/js/layim-public.js?t=20190404'
    # demo(demourl)
    get_xml()

def demo(url):
    """Show how urlparse splits a URL into scheme, host, path, and suffix."""
    a = urlparse(url)
    file_path = a.path
    print(a.scheme)
    print(a.hostname)
    print('a.file_path=' + file_path)
    file_name = os.path.basename(file_path)
    print('file_name=' + file_name)
    _, file_suffix = os.path.splitext(file_name)
    print('a.file_suffix=' + file_suffix)

if __name__ == '__main__':
    main()
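
For reference, here is how the pieces could be wired together to mirror every page listed in the sitemap. This is a sketch of my own, not part of the original script: it reuses the placeholder host, sitemap URL, and local path from above, and assumes download_html() is suitable for each page URL the sitemap yields.

def mirror_site():
    # Sketch only: feed every sitemap entry into download_html()
    root_path = "F:\\Project-cz\\shanhubei\\3cinno"
    res = requests.get('https://www.xxxxxx.com/sitemap-1.xml', headers=headers)
    res.encoding = "utf-8"
    for page_url in set(re.findall(r'https://www.xxxxxx.com/[^<\s]+', res.text)):
        download_html(root_path, page_url)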

Source: http://www.shanhubei.com/archives/2491.html

