赞
踩
对于做数据治理、数据集成平台都会涉及到数据标准化、mdm主数据管理,说白了就是字典对照,那么对于一些又多又杂的数据字典(如医疗中的诊断、检验项目)人工对照实在是耗时耗力耗眼睛。
针对上面的问题,本文介绍了以下几点内容:
1.python第三方库 fuzzywuzzy
2.python打包可执行文件
fuzzywuzzy是一个Python库,用于模糊字符串匹配,可以帮助你在处理文本数据时,找到最相似的字符串。它使用Levenshtein距离算法来计算两个字符串之间的相似度。
fuzzywuzzy库提供了两个主要的函数:fuzz 和 process
fuzz 函数用于计算两个字符串之间的相似度,它返回一个介于0和100之间的整数,表示两个字符串的相似度,示例如下:
- from fuzzywuzzy import fuzz
-
- string1 = "apple"
- string2 = "appel"
- similarity = fuzz.ratio(string1, string2)
- print(similarity) # 输出: 91
process函数用于在一个字符串列表中查找与给定字符串最相似的字符,示例如下:
- from fuzzywuzzy import process
-
- string = "apple"
- choices = ["appel", "banana", "orange", "pear"]
-
- result = process.extractOne(string, choices)
- print(result) # 输出: ('appel', 91)
在dic_map表中存有检验项目标准字典,我将源数据字典存储在了source_lab_item表中
1.连接数据库查询dic_map、source_lab_item表的数据
- import pymysql
-
-
- mydb = pymysql.connect(
- host="127.0.0.1",
- port=3306,
- user="user",
- password="pwd",
- database="database_name"
- )
- mycursor = mydb.cursor()
- mycursor.execute("SELECT lab_item FROM dic_map")
- result_a = mycursor.fetchall()
- mycursor.execute("SELECT source_lab_item FROM source_lab_item")
- result_b = mycursor.fetchall()
2.遍历表dic_map中的每一条数据,使用fuzzywuzzy库中的ratio函数计算表dic_map中的数据与表source_lab_item中的数据的相似度,取相似度最高的值存放在matched_item变量中。
- for item_a in result_a:
- max_ratio = 0
- matched_item = ""
- for item_b in result_b:
- ratio = fuzz.ratio(item_a[0], item_b[0])
- if ratio > max_ratio:
- max_ratio = ratio
- matched_item = item_b[0]
3.建个临时表mdm_lab_item(lab_item, source_lab_item) ,将匹配结果存放在该表中
- sql = "INSERT INTO mdm_lab_item (lab_item, source_lab_item) VALUES (%s, %s)"
- val = (item_a[0], matched_item)
- mycursor.execute(sql, val)
- mydb.commit()
4.上代码
- from fuzzywuzzy import fuzz
- import pandas as pd
- import pymysql
-
-
- mydb = pymysql.connect(
- host="127.0.0.1",
- port=3306,
- user="user",
- password="pwd",
- database="database_name"
- )
-
- # 查询表A
- mycursor = mydb.cursor()
- mycursor.execute("SELECT lab_item FROM dic_map where type = 'lab'")
- result_a = mycursor.fetchall()
- mycursor.execute("SELECT source_lab_item FROM source_lab_item")
- result_b = mycursor.fetchall()
-
- for item_a in result_a:
- max_ratio = 0
- matched_item = ""
- for item_b in result_b:
- ratio = fuzz.ratio(item_a[0], item_b[0])
- if ratio > max_ratio:
- max_ratio = ratio
- matched_item = item_b[0]
- # 将最接近的匹配结果存放在表C
- sql = "INSERT INTO mdm_lab_item (lab_item, source_lab_item) VALUES (%s, %s)"
- val = (item_a[0], matched_item)
- mycursor.execute(sql, val)
- mydb.commit()
1.安装pyinstaller库,win+R cmd
pip install pyinstaller
2.将.py文件放在单独的文件夹下,我这里放在D盘的demo(在其他目录下太乱了怕你找不到)
同样的在winddows命令窗口切换到该路径下,执行
pyinstaller -F -p D:\python\venv\Lib\site-packages 检验项目相似度匹配.py
- p 后边的路径是需要用到的第三方库,例如本次使用的pymysql、fuzzywuzzy、pandas
3.执行后能看到生成了两个文件夹build、dist,在dist中有.exe可执行文件,将整个文件夹demo打包后,放在其他没有python环境的电脑也能执行了
4.自动对照的数据示例:
补充:我在后边完善了下,总的代码如下
- import sys
- from PyQt5.QtWidgets import QApplication, QWidget, QLabel, QLineEdit, QPushButton, QVBoxLayout,QGridLayout ,QMessageBox
- import pymysql
- from fuzzywuzzy import fuzz
-
- class MySQLConnector(QWidget):
- def __init__(self):
- super().__init__()
- self.initUI()
-
- def initUI(self):
- # 创建标签和文本框
- self.ip_label = QLabel('IP地址')
- self.ip_edit = QLineEdit()
- self.port_label = QLabel('端口号')
- self.port_edit = QLineEdit()
- self.username_label = QLabel('用户名')
- self.username_edit = QLineEdit()
- self.password_label = QLabel('密码')
- self.password_edit = QLineEdit()
- self.dbname_label = QLabel('数据库名称')
- self.dbname_edit = QLineEdit()
- self.sourcetabname_label = QLabel('源字典表名')
- self.sourcetabname_edit = QLineEdit()
- self.sourcecolumn_label = QLabel('源字典字段名')
- self.sourcecolumn_edit = QLineEdit()
- self.systabname_label = QLabel('本地字典表名')
- self.systabname_edit = QLineEdit()
- self.syscolumn_label = QLabel('本地字典字段名')
- self.syscolumn_edit = QLineEdit()
-
- # 创建连接按钮
- self.connect_button = QPushButton('连接')
- self.connect_button.clicked.connect(self.connectMySQL)
- self.mate_button = QPushButton('匹配')
- self.mate_button.clicked.connect(self.get_approx_score)
-
- # 创建布局
- vbox = QGridLayout()
- vbox.addWidget(self.ip_label,0,0)
- vbox.addWidget(self.ip_edit,0,1)
- vbox.addWidget(self.port_label)
- vbox.addWidget(self.port_edit)
- vbox.addWidget(self.username_label)
- vbox.addWidget(self.username_edit)
- vbox.addWidget(self.password_label)
- vbox.addWidget(self.password_edit)
- vbox.addWidget(self.dbname_label)
- vbox.addWidget(self.dbname_edit)
- vbox.addWidget(self.connect_button)
- vbox.addWidget(self.mate_button)
- vbox.addWidget(self.sourcetabname_label)
- vbox.addWidget(self.sourcetabname_edit)
- vbox.addWidget(self.sourcecolumn_label)
- vbox.addWidget(self.sourcecolumn_edit)
- vbox.addWidget(self.systabname_label)
- vbox.addWidget(self.systabname_edit)
- vbox.addWidget(self.syscolumn_label)
- vbox.addWidget(self.syscolumn_edit)
-
- self.setLayout(vbox)
- self.setWindowTitle('匹配近似值')
- self.show()
-
- def connectMySQL(self):
- # 获取输入框的值
- ip = self.ip_edit.text()
- port = self.port_edit.text()
- username = self.username_edit.text()
- password = self.password_edit.text()
- dbname = self.dbname_edit.text()
-
- # 连接MySQL
- try:
- conn = pymysql.connect(host=ip, port=int(port), user=username, password=password, db=dbname)
- print('MySQL连接成功')
- msg_box = QMessageBox(QMessageBox.Question, '确定', 'MySQL连接成功')
- msg_box.exec_()
- except Exception as e:
- print('MySQL连接失败:', e)
- msg_box = QMessageBox(QMessageBox.Question, '确定', 'MySQL连接失败')
- msg_box.exec_()
-
- def get_approx_score(self):
- ip = self.ip_edit.text()
- port = self.port_edit.text()
- username = self.username_edit.text()
- password = self.password_edit.text()
- dbname = self.dbname_edit.text()
- sourcetab = self.sourcetabname_edit.text()
- sourcecolumn = self.sourcecolumn_edit.text()
- systab = self.systabname_edit.text()
- syscolumn = self.syscolumn_edit.text()
- mydb = pymysql.connect(
- host=ip,
- port=int(port),
- user=username,
- password=password,
- database=dbname
- )
- # 查询表A
- mycursor = mydb.cursor()
- mycursor.execute("SELECT "+ sourcecolumn+" FROM "+sourcetab)
- result_a = mycursor.fetchall()
- mycursor.execute("SELECT "+ syscolumn+" FROM "+systab)
- result_b = mycursor.fetchall()
-
- for item_a in result_a:
- max_ratio = 0
- matched_item = ""
- for item_b in result_b:
- ratio = fuzz.ratio(item_a[0], item_b[0])
- if ratio > max_ratio:
- max_ratio = ratio
- matched_item = item_b[0]
-
- # 将最接近的匹配结果存放在表C
- sql_create_table ="CREATE TABLE if not exists `get_valueapporx_sorce` (`source_value` varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci DEFAULT NULL,`value` varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci DEFAULT NULL, `sorce` varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci DEFAULT NULL)"
- mycursor.execute(sql_create_table)
- sql = "INSERT INTO get_valueapporx_sorce (value, source_value,sorce) VALUES (%s, %s, %s)"
- val = (item_a[0], matched_item, max_ratio)
- mycursor.execute(sql, val)
- mydb.commit()
- msg_box = QMessageBox(QMessageBox.Question, '确定', '生成完成!请在get_valueapporx_sorce表中查看')
- msg_box.exec_()
-
-
- if __name__ == '__main__':
- app = QApplication(sys.argv)
- ex = MySQLConnector()
- sys.exit(app.exec_())
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。