import pandas as pd
import numpy as np

# Load the dataset
data = {
    'have_house': ['yes', 'no', 'no', 'yes', 'no', 'no', 'yes', 'no', 'no', 'no'],
    'marital_status': ['single', 'married', 'single', 'married', 'divorced',
                       'married', 'divorced', 'single', 'married', 'single'],
    'annual_income': [125, 100, 70, 120, 95, 60, 220, 85, 75, 90],
    'late_payment': ['no', 'no', 'no', 'no', 'yes', 'no', 'no', 'yes', 'no', 'yes']
}
df = pd.DataFrame(data)

# Convert text attributes to numeric attributes
df['have_house'] = df['have_house'].apply(lambda x: 1 if x == 'yes' else 0)  # 1 if the person owns a house, otherwise 0
df['marital_status'] = df['marital_status'].map({'single': 0, 'married': 1, 'divorced': 2})  # map marital status to 0-2: single, married, divorced
df['late_payment'] = df['late_payment'].apply(lambda x: 1 if x == 'yes' else 0)  # 1 if the loan payment is late, otherwise 0

# Functions required by the C4.5 algorithm
def calc_entropy(data):
    """Compute the information entropy of a target column."""
    counts = data.value_counts()             # count the samples in each class
    probs = counts / len(data)               # probability of each class
    entropy = -sum(probs * np.log2(probs))   # entropy formula: -sum(p * log2(p))
    return entropy

def calc_conditional_entropy(data, feature, threshold):
    """Compute the conditional entropy after splitting on a threshold."""
    low_subset = data[data[feature] < threshold]['late_payment']    # target column for samples below the threshold
    high_subset = data[data[feature] >= threshold]['late_payment']  # target column for samples at or above the threshold
    prob_low = len(low_subset) / len(data)    # proportion of samples in the low branch
    prob_high = len(high_subset) / len(data)  # proportion of samples in the high branch
    entropy = prob_low * calc_entropy(low_subset) + prob_high * calc_entropy(high_subset)  # weighted entropy of the two branches
    return entropy

def calc_information_gain(data, feature):
    """Compute the information gain of a continuous feature and its best split point."""
    base_entropy = calc_entropy(data['late_payment'])  # entropy of the whole dataset
    sorted_values = sorted(data[feature].unique())     # sort the distinct values of the continuous feature
    thresholds = [(sorted_values[i] + sorted_values[i + 1]) / 2
                  for i in range(len(sorted_values) - 1)]  # candidate split points: midpoints of adjacent values
    info_gains = []
    for threshold in thresholds:
        cond_entropy = calc_conditional_entropy(data, feature, threshold)
        info_gain = base_entropy - cond_entropy
        info_gains.append(info_gain)
    best_threshold_index = int(np.argmax(info_gains))  # split point with the largest information gain
    return info_gains[best_threshold_index], thresholds[best_threshold_index]
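As a quick sanity check, here is a minimal usage sketch (assuming calc_information_gain returns the best gain and threshold pair, as completed above) that finds the best split point for the continuous annual_income attribute:

# Illustrative usage: best split for the continuous annual_income attribute
gain, threshold = calc_information_gain(df, 'annual_income')
print(f"annual_income: best threshold = {threshold}, information gain = {gain:.4f}")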