赞
踩
在本教程中,我们将使用Python编写一个功能强大的桌面应用程序,用于数据清洗、压缩、合并excel文件。该程序基于PySide6库和其他辅助库进行开发,提供了直观的用户界面和易于使用的功能。下面我们将对代码进行分块介绍。(底部附项目完整代码)
在程序的开头,我们导入了各种必要的库和模块,包括在数据处理中使用的bag模块、图形界面相关的PySide6库、文件处理相关的zipfile、xlsxwriter和openpyxl,以及其他辅助库
DataCleaning类是我们应用程序的核心组件之一,用于实现数据清洗功能。它包含了选择文件、保存文件、开始清洗和清洗附加选项等功能。具体而言,该类包括以下方法:
CompressionDecompression类用于实现文件压缩和解压功能。它包括选择文件、保存文件、压缩文件和解压文件等方法。具体而言,该类包括以下方法:
SelectDirectory类实现了合并表格的功能。它包括选择文件夹、选择保存文件和运行合并操作等方法。具体而言,该类包括以下方法:
Layout类是整个应用程序的主布局类,用于显示各个功能的按钮和界面。它包含了数据清洗、压缩、合并和关键词抽取四个子界面,并提供了切换子界面的功能
在程序的最后,我们创建了一个QApplication对象,实例化了Layout类,并显示应用程序的主窗口。
通过以上分块介绍,我们已经了解了整个应用程序的结构和各个功能的实现方式
- #!/usr/bin/env python3
- # coding:utf-8
- import re
- import sys
- import time
- import os
- from datetime import datetime
- import bag
- from PySide6.QtGui import QFont, QPainter, QPen, QPalette, QColor, QIntValidator
- from PySide6.QtWidgets import QApplication, QMainWindow, QLabel, QPushButton, QFrame, QFileDialog, QLineEdit, \
- QProgressBar, QMessageBox, QStackedWidget, QVBoxLayout, QTextBrowser
- from PySide6.QtCore import Qt, QJsonDocument
- from tqdm import tqdm
- import zipfile
- import xlsxwriter
- import openpyxl
- import emoji
- import json
-
-
- session = bag.session.create_session()
-
-
- def judge(file_path): # 加载数据
- try:
- suffix = file_path.split('.')[-1]
- suffix_dic = {'xlsx': bag.Bag.read_excel, 'csv': bag.Bag.read_csv, 'json': bag.Bag.read_json}
- data = suffix_dic.get(suffix)(file_path)
- return data
- except AttributeError:
- print('未选择文件')
- exit()
-
-
- def set_text_browser_background_color(text_browser, color): # 设置背景颜色
- palette = text_browser.palette()
- palette.setColor(QPalette.Base, color)
- text_browser.setPalette(palette)
-
-
- def set_text_browser_foreground_color(text_browser, color): # 设置字体颜色
- palette = text_browser.palette()
- palette.setColor(QPalette.Text, color)
- text_browser.setPalette(palette)
-
-
- def unzip_file(zip_path, extract_path):
- with zipfile.ZipFile(zip_path, 'r') as zip_ref:
- zip_ref.extractall(extract_path)
-
-
- # noinspection PyTypeChecker
- class SeparatorLine(QFrame):
- def paintEvent(self, event):
- painter = QPainter(self)
- painter.setPen(QPen(self.palette().color(QPalette.Dark).darker(150), 2))
- painter.drawLine(self.width() / 2, 0, self.width() / 2, self.height())
-
-
- # noinspection PyTypeChecker
- class SeparatorLine1(QFrame):
- def paintEvent(self, event):
- painter = QPainter(self)
- painter.setPen(QPen(self.palette().color(QPalette.Dark).darker(150), 2))
- painter.drawLine(0, self.height() / 2, self.width(), self.height() / 2)
-
-
- class Layout(QMainWindow):
- def __init__(self):
- super().__init__()
-
- self.data_cleaning = None
- self.Compression = None
- self.file_directory = None
- self.keywords = None
- self.ppt_order = None
-
- # 定义标题与窗口大小
- self.setWindowTitle("数据清洗")
- self.setGeometry(100, 100, 1280, 720)
-
- self.stacked_widget = QStackedWidget(self)
- self.setCentralWidget(self.stacked_widget)
-
- self.label = QLabel(datetime.now().strftime("%Y-%m-%d"), self)
- self.label.setGeometry(10, 5, 130, 50)
- self.label.setFont(QFont("等线", 18)) # 设置字体和字号
-
- self.clean_data = QPushButton('数据清洗', self)
- self.clean_data.setFont(QFont('等线', 11))
- self.clean_data.setGeometry(155, 10, 130, 30)
- self.clean_data.clicked.connect(self.show_data_cleaning)
-
- self.zip_data = QPushButton('压缩/解压', self)
- self.zip_data.setFont(QFont('等线', 11))
- self.zip_data.setGeometry(300, 10, 130, 30)
- self.zip_data.clicked.connect(self.show_zip_page)
-
- self.zip_data = QPushButton('合并excel', self)
- self.zip_data.setFont(QFont('等线', 11))
- self.zip_data.setGeometry(445, 10, 130, 30)
- self.zip_data.clicked.connect(self.select_directory)
-
- self.line = SeparatorLine(self)
- self.line.setGeometry(150, 0, 2, 10000) # 添加垂直分割线
-
- self.line_horizontal = SeparatorLine1(self)
- # noinspection PyTypeChecker
- self.line_horizontal.setGeometry(150, self.height() / 8, 10000, 2) # 添加水平分割线
-
- def show_data_cleaning(self):
- if not self.data_cleaning:
- self.data_cleaning = DataCleaning()
- self.stacked_widget.addWidget(self.data_cleaning)
- self.stacked_widget.setCurrentWidget(self.data_cleaning)
-
- def show_zip_page(self):
- if not self.Compression:
- self.Compression = CompressionDecompression()
- self.stacked_widget.addWidget(self.Compression)
- self.stacked_widget.setCurrentWidget(self.Compression)
-
- def select_directory(self):
- if not self.file_directory:
- self.file_directory = SelectDirectory()
- self.stacked_widget.addWidget(self.file_directory)
- self.stacked_widget.setCurrentWidget(self.file_directory)
-
-
- # 清洗
- class DataCleaning(Layout):
- def __init__(self):
- super().__init__()
- self.save_path = None
- self.file_path = None
- self.layout = QVBoxLayout()
-
- self.progress_bar = QProgressBar(self)
- self.progress_bar.setGeometry(155, 235, 668, 28)
- self.progress_bar.setVisible(False)
-
- self.button_select = QPushButton("打开", self)
- self.button_select.setFont(QFont('等线', 11))
- self.button_select.setGeometry(10, 60, 130, 30)
- self.button_select.clicked.connect(self.select_file)
-
- self.button_save = QPushButton("另存为", self)
- self.button_save.setFont(QFont('等线', 11))
- self.button_save.setGeometry(10, 95, 130, 30)
- self.button_save.clicked.connect(self.save_file)
-
- self.button_run = QPushButton("开始清洗", self)
- self.button_run.setFont(QFont('等线', 11))
- self.button_run.setGeometry(10, 130, 130, 30)
- self.button_run.clicked.connect(self.run)
-
- self.button = QPushButton("清洗附加选项", self)
- self.button.setFont(QFont('等线', 11))
- self.button.setGeometry(155, 60, 130, 30)
- self.button.clicked.connect(self.button_clicked)
-
- self.label_1_1 = QLabel(self)
- self.label_1_1.setGeometry(290, 50, 700, 50)
- self.label_1_1.setFont(QFont("等线", 14)) # 设置字体和字号
- self.label_1_1.setVisible(False)
-
- self.input_text = QLineEdit(self) # 创建一个输入框
- self.input_text.setGeometry(155, 125, 130, 30)
- self.input_text.setVisible(False)
- self.input_text.setValidator(QIntValidator())
-
- self.label_info = QLabel("清洗指定列", self) # 创建一个标签
- self.label_info.setFont(QFont('等线', 11))
- self.label_info.setGeometry(155, 95, 130, 30) # 设置标签位置和大小
- self.label_info.setVisible(False) # 初始时隐藏标签
-
- self.label_info_1 = QLabel("根据指定列去重", self)
- self.label_info_1.setFont(QFont('等线', 11))
- self.label_info_1.setGeometry(155, 155, 130, 30)
- self.label_info_1.setVisible(False)
-
- self.input_text_1 = QLineEdit(self)
- self.input_text_1.setGeometry(155, 185, 130, 30)
- self.input_text_1.setVisible(False)
- self.input_text_1.setValidator(QIntValidator())
-
- # 指示灯
- self.red_light = QLabel(self)
- self.red_light.setGeometry(10, 170, 20, 20)
- self.red_light.setStyleSheet("background-color: red")
- self.red_light.setVisible(False)
-
- self.green_light = QLabel(self)
- self.green_light.setGeometry(35, 170, 20, 20)
- self.green_light.setStyleSheet("background-color: green")
- self.green_light.setVisible(False)
-
- self.line_horizontal_1 = SeparatorLine1(self)
- self.line_horizontal_1.setGeometry(150, 225, 10000, 2)
- self.line_horizontal_1.setVisible(False)
-
- def button_clicked(self):
- self.label_1_1.setText('(温馨提示,下方的输入框只能输入数字,以下参数非必须参数,可不填)')
- self.input_text.setVisible(not self.input_text.isVisible()) # 切换输入框的可见性
- self.label_info.setVisible(self.input_text.isVisible())
-
- self.input_text_1.setVisible(not self.input_text_1.isVisible()) # 切换输入框的可见性
- self.label_info_1.setVisible(self.input_text_1.isVisible())
-
- self.label_1_1.setVisible(not self.label_1_1.isVisible())
- self.label_1_1.setVisible(self.label_1_1.isVisible())
-
- def select_file(self): # 选择文件
- file_dialog = QFileDialog()
- file_path, _ = file_dialog.getOpenFileName(self, "选择文件", "", "All Files (*)")
- if file_path:
- self.file_path = file_path
-
- def save_file(self): # 保存文件
- file_dialog = QFileDialog()
- file_dialog.setWindowTitle("另存为") # 修改对话框标题
- file_dialog.setNameFilters(["Text Files (*.txt)", "CSV Files (*.csv)",
- "Excel Files (*.xlsx *.xls)", "JSON Files (*.json *.jsonl)"])
- file_dialog.selectNameFilter("Excel Files (*.xlsx")
- if file_dialog.exec():
- self.save_path = file_dialog.selectedFiles()[0]
- if os.path.exists(self.save_path):
- # noinspection PyUnresolvedReferences
- result = QMessageBox.warning(self, "警告", "文件已存在,是否覆盖?",
- QMessageBox.Yes | QMessageBox.No)
- # noinspection PyUnresolvedReferences
- if result == QMessageBox.Yes:
- bag.Bag.save_excel([], self.save_path)
- else:
- self.save_path = ''
- else:
- bag.Bag.save_excel([], self.save_path)
-
- def run(self): # 开始清洗
- resp = judge(self.file_path)
- total = len(resp)
-
- def clean_1(ls, col_number, col_name):
- self.red_light.setVisible(True)
- result = []
- for i, info in enumerate(tqdm(ls)):
- mid = []
- for value in info[col_number].split('\n'):
- if re.sub(r'\s', '', value):
- text = value.lstrip(',.?!;:,。?;:')
- text1 = text.strip()
- text2 = emoji.replace_emoji(text1, replace='<emoji>')
- mid.append(text2)
- else:
- pass
- info[col_number] = '\n'.join(mid)
- result.append(info)
-
- # 更新进度条的值
- progress = int((i + 1) / total * 100)
- self.progress_bar.setValue(progress)
- QApplication.processEvents() # 刷新界面
-
- sign = []
- new_result = [item for item in result if item[col_name] not in sign and not sign.append(item[col_name])]
-
- self.red_light.setVisible(False)
- self.green_light.setVisible(True)
-
- if bool(self.save_path):
- bag.Bag.save_excel(new_result, self.save_path)
- else:
- self.save_file()
- bag.Bag.save_excel(new_result, self.save_path)
- time.sleep(1)
- self.green_light.setVisible(False)
-
- def clean_2(ls, col_number):
- self.red_light.setVisible(True)
- result = []
- for i, info in enumerate(tqdm(ls)):
- mid = []
- for value in info[col_number].split('\n'):
- if re.sub(r'\s', '', value):
- text = value.lstrip(',.?!;:,。?;:')
- text1 = text.strip()
- text2 = emoji.replace_emoji(text1, replace='<emoji>')
- mid.append(text2)
- else:
- pass
- info[col_number] = '\n'.join(mid)
- result.append(info)
-
- progress = int((i + 1) / total * 100)
- self.progress_bar.setValue(progress)
- QApplication.processEvents() # 刷新界面
-
- self.red_light.setVisible(False)
- self.green_light.setVisible(True)
-
- if bool(self.save_path):
- bag.Bag.save_excel(result, self.save_path)
- else:
- self.save_file()
- bag.Bag.save_excel(result, self.save_path)
- time.sleep(1)
- self.green_light.setVisible(False)
-
- def clean_3(ls, col_name):
- self.red_light.setVisible(True)
- result = []
- for i, info in enumerate(tqdm(ls)):
- mid1 = []
- for info1 in info:
- mid = []
- for value in info1.split('\n'):
- if re.sub(r'\s', '', value):
- text = value.lstrip(',.?!;:,。?;:')
- text1 = text.strip()
- text2 = emoji.replace_emoji(text1, replace='<emoji>')
- mid.append(text2)
- else:
- pass
- mid1.append('\n'.join(mid))
- result.append(mid1)
-
- progress = int((i + 1) / total * 100)
- self.progress_bar.setValue(progress)
- QApplication.processEvents() # 刷新界面
-
- sign = []
- new_result = [item for item in result if item[col_name] not in sign and not sign.append(item[col_name])]
-
- self.red_light.setVisible(False)
- self.green_light.setVisible(True)
-
- if bool(self.save_path):
- bag.Bag.save_excel(new_result, self.save_path)
- else:
- self.save_file()
- bag.Bag.save_excel(new_result, self.save_path)
- time.sleep(1)
- self.green_light.setVisible(False)
-
- def clean_4(ls):
- self.red_light.setVisible(True)
- result = []
- for i, info in enumerate(tqdm(ls)):
- mid1 = []
- for info1 in info:
- mid = []
- for value in info1.split('\n'):
- if re.sub(r'\s', '', value):
- text = value.lstrip(',.?!;:,。?;:')
- text1 = text.strip()
- text2 = emoji.replace_emoji(text1, replace='<emoji>')
- mid.append(text2)
- else:
- pass
- mid1.append('\n'.join(mid))
- result.append(mid1)
-
- progress = int((i + 1) / total * 100)
- self.progress_bar.setValue(progress)
- QApplication.processEvents() # 刷新界面
-
- self.red_light.setVisible(False)
- self.green_light.setVisible(True)
-
- if bool(self.save_path):
- bag.Bag.save_excel(result, self.save_path)
- else:
- self.save_file()
- bag.Bag.save_excel(result, self.save_path)
-
- time.sleep(1)
- self.green_light.setVisible(False)
-
- additional_options = self.input_text.text() # 获取输入框的文本内容
- additional_options_1 = self.input_text_1.text()
- col_num = additional_options
- tag_name = additional_options_1
- """判断是否夹带清洗附加条件"""
- target = bool(col_num)
- target_1 = bool(tag_name)
-
- # 进度条
- self.progress_bar.setVisible(True)
- self.line_horizontal_1.setVisible(True)
-
- # 处理异常
- try:
- if target and target_1: # 清洗指定列以及根据某某标签去重
- clean_1(resp, int(col_num)-1, int(tag_name)-1)
- elif target and not target_1: # 清洗子指定列,不去重
- clean_2(resp, int(col_num)-1)
- elif not target and target_1: # 清洗全文再根据某某列去重
- clean_3(resp, int(tag_name)-1)
- else: # 清洗全文,不去重
- clean_4(resp)
- QMessageBox.information(self, "完成", "数据清洗完成")
- except Exception as e:
- error_message = str(e)
- QMessageBox.critical(self, "错误", error_message)
-
-
- # 压缩、解压
- class CompressionDecompression(Layout):
- def __init__(self):
- super().__init__()
- self.layout = QVBoxLayout()
-
- self.file_path = None
-
- self.button_compress = QPushButton("压缩文件", self)
- self.button_compress.setFont(QFont('等线', 11))
- self.button_compress.setGeometry(10, 60, 130, 30)
- self.button_compress.clicked.connect(self.on_button_clicked)
-
- self.button_extract = QPushButton("解压文件", self)
- self.button_extract.setFont(QFont('等线', 11))
- self.button_extract.setGeometry(10, 95, 130, 30)
- self.button_extract.clicked.connect(self.on_button_clicked1)
-
- def on_button_clicked(self):
- selected_file = bool(self.file_path)
- if selected_file:
- extract_path = QFileDialog.getExistingDirectory(self, "另存为")
- if bool(extract_path):
- unzip_file(selected_file, extract_path)
- QMessageBox.information(self, 'success', '解压成功!')
- else:
- QMessageBox.critical(self, 'error', '解压失败')
- else:
- self.select_file()
- extract_path = QFileDialog.getExistingDirectory(self, "另存为")
- if bool(extract_path):
- unzip_file(self.file_path, extract_path)
- QMessageBox.information(self, 'success', '解压成功!')
- else:
- QMessageBox.critical(self, 'error', '解压失败')
-
- def on_button_clicked1(self):
- selected_file = bool(self.file_path)
- if selected_file:
- extract_path = QFileDialog.getExistingDirectory(window, "选择文件夹", options=QFileDialog.ShowDirsOnly)
- if bool(extract_path):
- self.compress_to_zip(selected_file, extract_path)
- QMessageBox.information(self, 'success', '压缩成功!')
- else:
- QMessageBox.critical(self, 'error', '压缩失败')
- else:
- self.select_folder()
- extract_path = QFileDialog.getExistingDirectory(window, "选择文件夹", options=QFileDialog.ShowDirsOnly)
- if bool(extract_path):
- self.compress_to_zip(self.file_path, extract_path)
- QMessageBox.critical(self, 'success', '压缩成功!')
- else:
- QMessageBox.critical(self, 'error', '压缩失败')
-
- def select_file(self): # 选择文件
- file_dialog = QFileDialog()
- file_path, _ = file_dialog.getOpenFileName(self, "选择文件", "", "All Files (*)")
- if file_path:
- self.file_path = file_path
-
- def select_folder(self):
- file_path, _ = QFileDialog.getExistingDirectory(window, "选择文件夹", options=QFileDialog.ShowDirsOnly)
- if file_path:
- self.file_path = file_path
-
- def compress_to_zip(self, file_path, zip_path):
- with zipfile.ZipFile(zip_path, 'w') as zipf:
- zipf.write(file_path, arcname='compressed_file.txt')
-
-
- # 合并表格
- # noinspection PyMethodMayBeStatic
- class SelectDirectory(Layout):
- def __init__(self):
- super().__init__()
- self.layout = QVBoxLayout()
- self.button_extract = QPushButton("选择合并文件路径", self)
- self.button_extract.setFont(QFont('等线', 11))
- self.button_extract.setGeometry(10, 60, 130, 30)
- self.button_extract.clicked.connect(self.choose_folder)
-
- self.progress_bar = QProgressBar(self)
- self.progress_bar.setGeometry(155, 100, 668, 28)
- self.progress_bar.setVisible(False)
-
- self.red_light = QLabel(self)
- self.red_light.setGeometry(10, 100, 20, 20)
- self.red_light.setStyleSheet("background-color: red")
- self.red_light.setVisible(False)
-
- self.green_light = QLabel(self)
- self.green_light.setGeometry(35, 100, 20, 20)
- self.green_light.setStyleSheet("background-color: green")
- self.green_light.setVisible(False)
-
- def choose_folder(self):
- # noinspection PyUnresolvedReferences
- self.red_light.setVisible(True)
- self.progress_bar.setVisible(True)
- folder = QFileDialog.getExistingDirectory(window, "选择文件夹", options=QFileDialog.ShowDirsOnly)
- save_path = folder+'\\'+'合并结果.xlsx'
- # 清空前一次合并结果
- if os.path.isfile(save_path):
- os.remove(save_path)
- else:
- pass
- file_name = os.listdir(folder)
- consolidated_number = file_name # 合并索引,后期需要可优化
- total = len(consolidated_number)
- workbook = xlsxwriter.Workbook(save_path, options={'strings_to_urls': False})
- worksheet = workbook.add_worksheet('Sheet1')
- count_1 = 0
- n = 0
- for i, _ in enumerate(tqdm(consolidated_number)):
- path = folder + '\\' + _.replace('\n', '')
- book = openpyxl.load_workbook(path)
- sheet = book['Sheet1']
- rows = sheet.max_row
- columns = sheet.max_column
- a = 1
- if n == 0:
- a = 0
- for k in range(a, rows):
- for j in range(columns):
- worksheet.write(n, j, str(sheet.cell(k + 1, j + 1).value))
- n = n + 1
- count_1 += 1
- book.close()
- # 更新进度条的值
- progress = int((i + 1) / total * 100)
- self.progress_bar.setValue(progress)
- QApplication.processEvents() # 刷新界面
- workbook.close()
- self.red_light.setVisible(False)
- self.green_light.setVisible(True)
- QMessageBox.information(self, "success", f'合并完成,合并文件路径:{folder}/合并文件.xlsx')
- time.sleep(3)
- self.green_light.setVisible(False)
-
- @staticmethod
- def save_excel(_ls, path, _sheet_name='Sheet1', batch_size=10000):
- _book = xlsxwriter.Workbook(path, options={'strings_to_urls': False})
- _sheet = _book.add_worksheet(_sheet_name)
- batch_size = batch_size
- for i in range(0, len(_ls), batch_size):
- batch_data = _ls[i:i + batch_size]
- for _i in range(len(batch_data)):
- for __i in range(len(batch_data[_i])):
- _sheet.write(_i + i, __i, json.dumps(batch_data[_i][__i], ensure_ascii=False, indent=2))
- _book.close()
-
-
- if __name__ == "__main__":
- app = QApplication(sys.argv)
- window = Layout()
- window.show()
- sys.exit(app.exec())
-

最后,如果你觉得本教程对你有所帮助,不妨点赞并关注我的CSDN账号。我会持续为大家带来更多有趣且实用的教程和资源。谢谢大家的支持!
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。