赞
踩
- # !/usr/bin/env python
- # -*-coding:utf-8 -*-
- import os.path
- import bs4,shutil,time
- from pandas.core.frame import DataFrame
-
def get_html_tabledata(htmlpath, tableindex: int = 0):
    """Read one table from an HTML file as a list of row dictionaries.

    The first row of the selected table supplies the dictionary keys; every
    following row becomes one dictionary mapping header text to cell text.
    Both ``th`` and ``td`` cells are read, and cell text is stripped of
    surrounding whitespace.

    :param htmlpath: path of the HTML file; it is read as UTF-8
    :param tableindex: zero-based position of the table among all ``table``
        elements found in the document, defaults to 0
    :return: list of dicts, one per data row; an empty list when the table
        contains no rows
    :raises IndexError: if the document has no table at ``tableindex``
    """
    # Read-only mode is enough: the file is never written, and 'r+' would
    # fail needlessly on files without write permission.
    with open(htmlpath, 'r', encoding='UTF-8') as f:
        content = f.read()
    # Drop byte-order marks, which str.strip() does not treat as whitespace.
    content = content.strip().replace('\ufeff', '')
    soup = bs4.BeautifulSoup(content, 'lxml')  # parse the HTML

    # Select the requested table and collect its rows.
    table = soup.find_all("table")[tableindex]
    table_rows = table.find_all("tr")
    if not table_rows:
        # No header row means there is nothing to build keys from.
        return []

    # The first row provides the dictionary keys.
    keys = [cell.get_text().strip()
            for cell in table_rows[0].find_all(['th', 'td'])]

    tabledata = []
    for table_row in table_rows[1:]:
        cells = table_row.find_all(['th', 'td'])
        # zip() pairs cells with headers and stops at the shorter of the two,
        # so a row with more cells than headers no longer raises IndexError;
        # the surplus cells are ignored.
        tabledata.append(
            {key: cell.get_text().strip() for key, cell in zip(keys, cells)}
        )
    return tabledata
-
def html_to_excel(htmlpath, excelpath, tableindex: int = 0):
    """Save the selected table of an HTML file to an Excel file.

    :param htmlpath: path of the HTML file to read
    :param excelpath: path of the Excel file to write
    :param tableindex: index of the table to export, defaults to 0
    """
    # The list of row dictionaries becomes a DataFrame, which is written
    # with its column names as the header row and without the row index.
    DataFrame(get_html_tabledata(htmlpath, tableindex)).to_excel(
        excelpath,
        index=False,
        header=True,
    )
-
if __name__ == '__main__':
    # Demo run: load the table at index 1 from a local HTML file, order the
    # rows by the value of their '关键字' column and print them.
    htmlpath = r'C:\Users\yhen\Downloads\2022-06-17T13_51_06+0800.html'
    tabledata = get_html_tabledata(htmlpath, 1)
    tabledata.sort(key=lambda row: row['关键字'])
    print(tabledata)

Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。