赞
踩
from lxml import etree
import requests

# Browser-like User-Agent so the target site does not reject the
# requests as coming from a bot/script.
header = {'User-Agent':
          'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36 Edg/126.0.0.0'}
-
# 1. Fetch a single quote page and extract its title and body text.
def get_content(url, header):
    """Download one quote page and return its title and text.

    Parameters
    ----------
    url : str
        Absolute URL of the quote page.
    header : dict
        HTTP headers (User-Agent) forwarded to ``requests.get``.

    Returns
    -------
    tuple[str, str]
        ``(title, content)`` — the ``<article><h1>`` title and the
        ``#print-area`` paragraphs joined with newlines.
    """
    # timeout so a stalled server cannot hang the whole crawl
    r = requests.get(url, headers=header, timeout=10)
    # Use the detected encoding; without this, Chinese pages whose
    # charset is not declared in the HTTP headers come back garbled.
    r.encoding = r.apparent_encoding

    html = etree.HTML(r.text)
    title = html.xpath('//article/h1/text()')[0]
    result = html.xpath('//div[@id="print-area"]/p/text()')
    # First paragraph is skipped — presumably boilerplate rather than
    # quote text (original behavior preserved; confirm against the site).
    content = "\n".join(result[1:])
    return title, content
# 2. Collect the links to individual quote pages from the index page.
import os

base_url = "https://www.fenzhiwu.com/lizhigeyan/rensheng/"
# BUG FIX: the original called requests.get(base_url, header), which
# binds `header` to the positional `params` argument (query string),
# so the User-Agent was never actually sent. Pass it as `headers=`.
response = requests.get(base_url, headers=header, timeout=10)
response.encoding = response.apparent_encoding

base_html = etree.HTML(response.text)

urls = base_html.xpath('//div[@class="uk-width-medium-4-5"]/h2/a/@href')

# 3. Download each quote and save it as 格言/<title>.txt.
# Create the output directory first — open() below raises
# FileNotFoundError on a fresh checkout otherwise.
os.makedirs('格言', exist_ok=True)
for url in urls:
    url = 'https:' + url  # hrefs on the index are protocol-relative
    title, content = get_content(url, header)
    with open(f'格言/{title}.txt', 'w', encoding='utf-8') as f:
        f.write(title + '\n\n')
        f.write(content + '\n\n')
    print(f'已下载...{title}')
print('下载完成!!!!!!')
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。