题目要求我们使用 XPath 爬取某个网站,并将结果保存为 CSV 文件。
代码如下,仅供参考:
- # -*- coding: UTF-8 -*-
- # 开发人员:萌狼蓝天
- # 博客:Https://mllt.cc
- # 笔记:Https://cnblogs.com/mllt
- # 哔哩哔哩/微信公众号:萌狼蓝天
- # 开发时间:2022/10/5
- import pandas as pd
- import requests
- import lxml.html
-
# Scrape one Tianya BBS list page with XPath and save the posts to a CSV file.

# Page to scrape and the base URL used to resolve relative post links.
LIST_URL = "http://bbs.tianya.cn/list-no02-1.shtml"
BASE_URL = "http://bbs.tianya.cn/"
OUTPUT_PATH = "result.csv"
COLUMNS = ["序号", "标题", "链接", "作者", "点击", "回复", "更新时间"]
REQUEST_HEADERS = {
    "User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 10.0; WOW64; Trident/7.0; .NET4.0C; .NET4.0E; Tablet PC 2.0; wbx 1.0.0; wbxapp 1.0.0; Zoom 3.6.0)",
    "X-Amzn-Trace-Id": "Root=1-628b672d-4d6de7f34d15a77960784504",
}
# Seconds to wait for the server before giving up instead of hanging forever.
REQUEST_TIMEOUT = 10
# Row window kept from the original script: <tbody> 2..8 of each table and
# at most the first 10 <tr> of each of them.
# NOTE(review): presumably the first <tbody> is the header row - confirm
# against the live page before widening this window.
FIRST_TBODY = 2
LAST_TBODY = 8
ROWS_PER_TBODY = 10

_LAYOUT_CHARS = str.maketrans("", "", "\t\r\n")


def clean_text(text: str) -> str:
    """Return *text* with tab, carriage-return and newline characters removed.

    Spaces are kept, so values such as ``"10-05 12:30"`` stay intact.
    """
    return text.translate(_LAYOUT_CHARS)


def first_or_empty(values) -> str:
    """Return the cleaned first item of an XPath result list, or ``""``.

    Indexing ``[0]`` directly raises ``IndexError`` as soon as one cell is
    missing; an empty string keeps the rest of the row usable.
    """
    return clean_text(values[0]) if values else ""


def build_link(href: str) -> str:
    """Resolve a post ``href`` against ``BASE_URL``.

    ``urljoin`` avoids the double slash that plain concatenation produces for
    root-relative hrefs and leaves absolute URLs untouched.
    """
    from urllib.parse import urljoin

    return urljoin(BASE_URL, clean_text(href))


def fetch_page(url: str = LIST_URL) -> str:
    """Download *url* and return its body decoded as UTF-8.

    Raises:
        requests.RequestException: on timeout, connection failure or an HTTP
            error status.
    """
    response = requests.get(url, headers=REQUEST_HEADERS, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return response.content.decode("utf-8")


def parse_row(row):
    """Extract one post from a ``<tr>`` element.

    Returns a dict with every column except the sequence number, or ``None``
    when the first cell contains no link (such a row is not a post).
    """
    anchors = row.xpath("./td[1]/a")
    if not anchors:
        return None
    return {
        "标题": first_or_empty(row.xpath("./td[1]/a/text()")),
        "链接": build_link(anchors[0].get("href", "")),
        "作者": first_or_empty(row.xpath("./td[2]/a/text()")),
        "点击": first_or_empty(row.xpath("./td[3]/text()")),
        "回复": first_or_empty(row.xpath("./td[4]/text()")),
        "更新时间": first_or_empty(row.xpath("./td[5]/text()")),
    }


def parse_table(table):
    """Return the post records found in one ``<table>`` element.

    The XPath is relative (``./``) so only rows of *table* are read; a path
    starting with ``//`` searches the whole document regardless of the
    element it is called on. Sequence numbers restart at 1 for each table.
    """
    row_path = (
        f"./tbody[position() >= {FIRST_TBODY} and position() <= {LAST_TBODY}]"
        f"/tr[position() <= {ROWS_PER_TBODY}]"
    )
    parsed = (parse_row(row) for row in table.xpath(row_path))
    posts = [post for post in parsed if post is not None]
    return [{"序号": number, **post} for number, post in enumerate(posts, start=1)]


def build_frame(records):
    """Build the result DataFrame from a list of record dicts.

    The frame is created in one step because ``DataFrame.append`` was removed
    in pandas 2.0 (and copied the whole frame on every call before that).
    """
    return pd.DataFrame(records, columns=COLUMNS)


def main():
    """Fetch the list page, parse every table and write ``result.csv``."""
    code = fetch_page()
    print("-------------------------------------------------获取源码-----------------------------------")
    selector = lxml.html.fromstring(code)
    print("-------------------------------------------------获取关键部分-----------------------------------")
    lists = selector.xpath('//div[@class="mt5"]/table')
    print("-------------------------------------------------获取单独部分-----------------------------------")
    print(len(lists))

    records = []
    for table in lists:
        for record in parse_table(table):
            records.append(record)
            # Progress line for each scraped post.
            print(record["标题"], record["链接"], record["作者"])

    csv_data = build_frame(records)
    print(csv_data)
    csv_data.to_csv(OUTPUT_PATH)


if __name__ == "__main__":
    main()
往期文章