        2.1 缺失值处理

        2.2 异常值处理

        2.3 特征工程











        2.1 缺失值处理


  # Load the training and test sets, then stack them so every preprocessing
  # step below is applied to both at once.
  Train = pd.read_csv('E:\\data1.csv', delimiter=',')  # training set
  Test = pd.read_csv('E:\\data2.csv', delimiter=',')  # test set
  print('Train shape: ', Train.shape)
  print('Test shape: ', Test.shape)
  # Training rows come first, test rows after them; original row labels are kept.
  Combine = pd.concat([Train, Test])
  print('Combine shape: ', Combine.shape)


# Report how many values are missing in each column.
missing_per_column = Combine.isna().sum()
print(missing_per_column)

# Impute every column with its most frequent value: the first row of
# DataFrame.mode() holds one mode per column.
column_modes = Combine.mode().iloc[0, :]
Combine = Combine.fillna(value=column_modes)

        2.2 异常值处理





  # Anonymous feature D_11 holds three '*'-separated numbers; split them into
  # the float columns length, width and high.
  series1 = Combine['D_11'].str.split('*', expand=True)
  for position, dimension in enumerate(['length', 'width', 'high']):
      Combine[dimension] = series1[position].astype(float)




# Remove the variables that do not help model training.
Combine.drop(columns=['D_0', 'D_3', 'carid', 'D_6', 'D_11', 'D_14'], inplace=True)


        2.3 特征工程



        根据上图,挑选离散变量'carCode', 'color', 'country', 'maketype', 'oiltype', 'D_7', 'D_8', 'D_9', 'D_10', 'D_13'进行One-hot编码:


  def One_Hot(OneHotCol):
      """One-hot encode columns of the module-level ``Combine`` frame.

      Args:
          OneHotCol: list of column names in ``Combine`` to encode.

      Returns:
          An integer (0/1) DataFrame carrying ``Combine``'s index, with one
          column per (column, category) pair named
          ``<column>_<category lowercased>``.
      """
      ec = OneHotEncoder()
      # Fit and transform on the same DataFrame so the encoder sees identical
      # input in both steps and each column keeps its own dtype.
      encoded = ec.fit_transform(Combine[OneHotCol]).toarray()
      # Build the names from the encoder's own category order. Sorting the
      # formatted strings instead puts e.g. 'x_10' before 'x_2', while the
      # encoder orders the values themselves, which mislabels the columns.
      new_cols = [
          '{0}_{1}'.format(old_col, str(category).lower())
          for old_col, categories in zip(OneHotCol, ec.categories_)
          for category in categories
      ]
      OneHotCode = pd.DataFrame(encoded, columns=new_cols, index=list(Combine.index.values)).astype(int)
      return OneHotCode


  # Discrete variables selected for one-hot encoding.
  OneHotCol = ['carCode', 'color', 'country', 'maketype', 'oiltype', 'D_7', 'D_8', 'D_9', 'D_10', 'D_13']
  OneHotCode = One_Hot(OneHotCol)
  # Append the indicator columns to Combine.
  Combine = pd.concat([Combine, OneHotCode], axis=1)





  def date_proc(x):
      """Turn a compact date string into a dash-separated one.

      Args:
          x: string laid out as ``YYYYMM`` or ``YYYYMMDD``.

      Returns:
          ``'YYYY-M'`` for six-character input, otherwise ``'YYYY-M-<rest>'``
          where ``<rest>`` is everything after the sixth character. The month
          loses its leading zero, and a month of ``00`` becomes ``1``.

      Raises:
          ValueError: if characters 5-6 of ``x`` are not an integer.
      """
      # A month of 0 is replaced by 1.
      month = int(x[4:6]) or 1
      year_month = x[:4] + '-' + str(month)
      if len(x) == 6:
          return year_month
      return year_month + '-' + x[6:]


  def date_transform(df, fea_col):
      """Parse compact date columns and add calendar features, in place.

      For every column ``f`` in ``fea_col`` the values are cast to strings,
      normalised by ``date_proc`` and parsed with ``pd.to_datetime``; the
      parsed column replaces ``f``, and ``f_year``, ``f_month``, ``f_day`` and
      ``f_dayofweek`` are added.

      Args:
          df: DataFrame holding the date columns; it is modified in place.
          fea_col: iterable of date column names.

      Returns:
          The same ``df`` object.
      """
      for f in tqdm(fea_col):
          parsed = pd.to_datetime(df[f].astype('str').apply(date_proc))
          df[f] = parsed
          df[f + '_year'] = parsed.dt.year
          df[f + '_month'] = parsed.dt.month
          df[f + '_day'] = parsed.dt.day
          df[f + '_dayofweek'] = parsed.dt.dayofweek
      return df


        对'registerDate', 'tradeTime', 'licenseDate'这三个变量进行标准日期格式转换并提取日期特征:

  # Date columns to convert to datetimes and expand into calendar features.
  Date = ['registerDate', 'tradeTime', 'licenseDate']
  Combine = date_transform(Combine, Date)


  # Anonymous feature D_12: parse it as a date and derive year / month.
  # .copy() makes the filtered frame independent, so the assignments below do
  # not write into a slice of the old frame.
  Combine = Combine[Combine['D_12'].notna()].copy()
  D12Raw = Combine['D_12']
  if pd.api.types.is_float_dtype(D12Raw):
      # A float column would render as e.g. '201709.0'; cast to integers so
      # date_proc sees plain digits.
      D12Raw = D12Raw.astype('int64')
  # NOTE(review): assumes D_12 holds YYYYMM / YYYYMMDD codes, the layouts
  # date_proc handles; confirm against the data.
  # Parse Combine's own values (train and test rows) after date_proc has
  # normalised them. Previously the date_proc result was thrown away and the
  # raw Train['D_12'] values were parsed instead.
  Combine['D_12'] = pd.to_datetime(D12Raw.astype('str').apply(date_proc))
  Combine['D_12_year'] = Combine['D_12'].dt.year
  Combine['D_12_month'] = Combine['D_12'].dt.month



  # One-hot encode the extracted calendar features.
  OneHotCol2 = ['registerDate_year', 'registerDate_month', 'registerDate_dayofweek', 'tradeTime_year', 'tradeTime_month',
                'tradeTime_dayofweek', 'licenseDate_year', 'licenseDate_month', 'licenseDate_dayofweek', 'D_12_year',
                'D_12_month']
  OneHotCode2 = One_Hot(OneHotCol2)
  Combine = pd.concat([Combine, OneHotCode2], axis=1)



  # pd.datetime was removed from pandas; pd.Timestamp.now() is the supported
  # way to get the current time. It is sampled once so both "days until now"
  # features share the same reference instant.
  Now = pd.Timestamp.now()
  RegisterTime = pd.to_datetime(Combine['registerDate'], format='%Y%m%d', errors='coerce')
  TradeTime = pd.to_datetime(Combine['tradeTime'], format='%Y%m%d', errors='coerce')
  # Feature: days the car was in use (registration -> trade).
  Combine['used_time1'] = (TradeTime - RegisterTime).dt.days
  # Feature: days from the registration date until now.
  Combine['used_time2'] = (Now - RegisterTime).dt.days
  # Feature: days from the trade (listing) date until now.
  Combine['used_time3'] = (Now - TradeTime).dt.days


        编写日期分桶函数,并对'used_time1', 'used_time2', 'used_time3'进行分桶:

  # Binning helper.
  def cut_group(df, cols, num_bins=50):
      """Add a ``<col>_bin`` column holding the bin index of each value.

      The edges start at ``ceil(min - 1)`` and advance by
      ``floor(i * range / num_bins)`` for ``i = 0 .. num_bins + 1``, where
      ``range = int(max - min)``.

      Args:
          df: DataFrame to extend; it is modified in place.
          cols: names of the numeric columns to bin.
          num_bins: divisor used to derive the step between edges.

      Returns:
          The same ``df`` object.

      Raises:
          ValueError: if a column's range cannot be converted to an integer
              (e.g. it is NaN), or if ``pd.cut`` rejects the computed edges.
      """
      for col in cols:
          lowest = df[col].min()
          all_range = int(df[col].max() - lowest)
          first_edge = np.ceil(lowest - 1)
          # `edges`, not `bin`, so the builtin bin() is not shadowed.
          edges = [first_edge + np.floor(i * all_range / num_bins) for i in range(num_bins + 2)]
          # pd.cut intervals are right-closed: the first one is (edges[0], edges[1]].
          df[col + '_bin'] = pd.cut(df[col], edges, labels=False)
      return df
  # Bin the three day-count features: days in use, days since registration
  # and days since the trade (listing) date.
  CutCol = ['used_time1', 'used_time2', 'used_time3']
  Combine = cut_group(Combine, CutCol, 50)




  list1 = [1, 2, 4, 5, 7, 8, 9, 10, 12, 13]
  # Build the anonymous-feature names once instead of on every loop iteration.
  AnonymousCol = ['D_' + str(m) for m in list1]
  # Sum of every ordered pair of anonymous features (including a feature with itself).
  for i in AnonymousCol:
      for j in AnonymousCol:
          Combine[i + '+' + j] = Combine[i] + Combine[j]
  # Product of every named feature with every anonymous feature.
  for i in ['brand', 'serial', 'model', 'mileage', 'color', 'cityId', 'carCode', 'transferCount', 'seatings', 'country',
            'maketype', 'modelyear', 'displacement', 'gearbox', 'oiltype', 'newprice', 'length', 'width', 'high']:
      for j in AnonymousCol:
          Combine[i + '*' + j] = Combine[i] * Combine[j]



  AllCol = Combine.columns
  Train = Combine.iloc[:len(Train), :][AllCol]
  # Correlate numeric columns only: text and datetime columns have no Pearson
  # correlation and make DataFrame.corr() fail on current pandas.
  NumericTrain = Train.select_dtypes(include='number')
  # corrwith computes just the correlations with price rather than the full
  # matrix; sort_values orders them descending and keeps NaN entries at the
  # end, which sorted() on NaN values cannot guarantee.
  PriceCorr = NumericTrain.corrwith(NumericTrain['price']).sort_values(ascending=False)
  a = dict(PriceCorr)  # correlation of each variable with price
  asortlist = list(PriceCorr.items())  # (variable, correlation) pairs, strongest first
  for i in asortlist:
      print(i)



  # Feature-crossing helper.
  def cross_feature(df, fea_col, Nfea_col):
      """Attach per-group max / min / median statistics as new columns.

      For every grouping column ``i`` in ``fea_col`` and every value column
      ``j`` in ``Nfea_col``, the max, min and median of ``j`` within each group
      of ``i`` are left-merged back onto ``df`` as ``i_j_max``, ``i_j_min`` and
      ``i_j_median``.

      Args:
          df: input DataFrame.
          fea_col: columns to group by.
          Nfea_col: columns to aggregate.

      Returns:
          A new DataFrame with the extra columns. ``merge`` builds a fresh
          frame, so the caller's ``df`` is not modified.
      """
      for i in tqdm(fea_col):  # grouping features
          for j in tqdm(Nfea_col):  # features to aggregate
              # Named aggregation (output name -> function). Passing a renaming
              # dict to agg() on a single column is rejected by pandas >= 1.0.
              feat = df.groupby(i)[j].agg(**{
                  '{}_{}_max'.format(i, j): 'max',
                  '{}_{}_min'.format(i, j): 'min',
                  '{}_{}_median'.format(i, j): 'median',
              }).reset_index()
              df = df.merge(feat, on=i, how='left')
      return df
  # Cross the named and anonymous variables that correlate strongly with price.
  Cross_fea = ['newprice', 'displacement', 'width', 'length', 'maketype', 'maketype_3', 'modelyear']
  Cross_Nfea = ['D_1', 'D_10_3', 'D_7', 'D_7_5', 'D_10', 'D_4', 'D_12']
  Combine = cross_feature(Combine, Cross_fea, Cross_Nfea)


  # Split the combined frame back into training and test sets. The first
  # len(Train) rows are the training rows.
  InputCol = Combine.columns.drop('price')
  XTrain = Combine.iloc[:len(Train), :][InputCol]
  YTrain = Train['price']
  XTest = Combine.iloc[len(Train):, :][InputCol]
  print("XTrain shape: ", XTrain.shape)
  print("XTestshape: ", XTest.shape)


        对于高基数特征,可以使用平均数编码, 有监督地确定最适合这个定性特征的编码方式。高基数特征,简单地说就是一个特征有很多个取值。对于这些不适合使用One-hot编码的特征,有人提出了平均数编码这一概念。如果想详细了解平均数编码的话可以看一下这篇博客,我就是照搬这个大佬写的平均数编码的代码:

平均数编码:针对高基数定性特征(类别特征)的数据预处理/特征工程 https://blog.csdn.net/juzexia/article/details/78581462?spm=1001.2014.3001.5506

        平均数编码实现代码:

  class MeanEncoder:
      """Mean (target) encoder for high-cardinality categorical features.

      Each category is replaced by a blend of the overall target mean (the
      prior) and the category's own target mean. ``prior_weight_func`` maps a
      category's sample count to the weight given to the prior.
      """

      def __init__(self, categorical_features, n_splits=10, target_type='classification', prior_weight_func=None):
          """Configure the encoder.

          Args:
              categorical_features: list of column names to encode.
              n_splits: number of folds used by ``fit_transform``; ``transform``
                  divides the summed fold encodings by this number.
              target_type: ``'classification'``; any other value selects
                  regression.
              prior_weight_func: a callable mapping per-category counts to the
                  prior weight; or a dict with keys ``'k'`` and ``'f'`` for
                  ``1 / (1 + exp((n - k) / f))``; or ``None`` for ``k=2, f=1``.

          Raises:
              KeyError: if a dict ``prior_weight_func`` lacks ``'k'`` or ``'f'``.
          """
          self.categorical_features = categorical_features
          self.n_splits = n_splits
          self.learned_stats = {}

          if target_type == 'classification':
              self.target_type = target_type
              self.target_values = []
          else:
              self.target_type = 'regression'
              self.target_values = None

          if isinstance(prior_weight_func, dict):
              # A plain closure over k and f replaces the former eval() of a
              # source string; the formula is unchanged.
              k = prior_weight_func['k']
              f = prior_weight_func['f']
              self.prior_weight_func = lambda x: 1 / (1 + np.exp((x - k) / f))
          elif callable(prior_weight_func):
              self.prior_weight_func = prior_weight_func
          else:
              self.prior_weight_func = lambda x: 1 / (1 + np.exp((x - 2) / 1))

      @staticmethod
      def mean_encode_subroutine(X_train, y_train, X_test, variable, target, prior_weight_func):
          """Learn the encoding of one variable on X_train and apply it to both frames.

          Args:
              X_train: frame containing the column ``variable`` to learn from.
              y_train: target values aligned with ``X_train``.
              X_test: frame containing ``variable`` to encode.
              variable: name of the categorical column.
              target: class value to encode against (classification), or
                  ``None`` for regression.
              prior_weight_func: maps per-category sample counts to prior weights.

          Returns:
              ``(nf_train, nf_test, prior, col_avg_y)``: the encoded values for
              both frames as arrays, the prior, and the per-category table
              (indexed by category, one encoded column). Categories of
              ``X_test`` that are absent from ``X_train`` get the prior.
          """
          X_train = X_train[[variable]].copy()
          X_test = X_test[[variable]].copy()

          if target is not None:
              nf_name = '{}_pred_{}'.format(variable, target)
              X_train['pred_temp'] = (y_train == target).astype(int)  # classification
          else:
              nf_name = '{}_pred'.format(variable)
              X_train['pred_temp'] = y_train  # regression
          prior = X_train['pred_temp'].mean()

          col_avg_y = X_train.groupby(variable)['pred_temp'].agg(['mean', 'size']).rename(
              columns={'mean': 'mean', 'size': 'beta'})
          # beta starts as the category size and becomes the prior's weight.
          col_avg_y['beta'] = prior_weight_func(col_avg_y['beta'])
          col_avg_y[nf_name] = col_avg_y['beta'] * prior + (1 - col_avg_y['beta']) * col_avg_y['mean']
          col_avg_y.drop(['beta', 'mean'], axis=1, inplace=True)

          nf_train = X_train.join(col_avg_y, on=variable)[nf_name].values
          nf_test = X_test.join(col_avg_y, on=variable)[nf_name].fillna(prior).values
          return nf_train, nf_test, prior, col_avg_y

      def _encoded_columns(self):
          """Return (variable, target, output column name) for every encoded column."""
          if self.target_type == 'classification':
              return [(variable, target, '{}_pred_{}'.format(variable, target))
                      for variable, target in product(self.categorical_features, self.target_values)]
          return [(variable, None, '{}_pred'.format(variable)) for variable in self.categorical_features]

      def fit_transform(self, X, y):
          """Fit the encoder and return a copy of ``X`` with out-of-fold encodings.

          Args:
              X: DataFrame containing the categorical features.
              y: pandas Series of targets aligned with ``X``.

          Returns:
              A copy of ``X`` with one new column per encoded feature (and per
              class, for classification). Each row's value comes from
              statistics learned on the other folds.
          """
          X_new = X.copy()
          if self.target_type == 'classification':
              skf = StratifiedKFold(self.n_splits)
              self.target_values = sorted(set(y))
          else:
              skf = KFold(self.n_splits)

          specs = self._encoded_columns()
          self.learned_stats = {nf_name: [] for _, _, nf_name in specs}
          for variable, target, nf_name in specs:
              X_new.loc[:, nf_name] = np.nan
              for large_ind, small_ind in skf.split(y, y):
                  _, nf_small, prior, col_avg_y = MeanEncoder.mean_encode_subroutine(
                      X_new.iloc[large_ind], y.iloc[large_ind], X_new.iloc[small_ind], variable, target,
                      self.prior_weight_func)
                  # The column just added is the last one, hence position -1.
                  X_new.iloc[small_ind, -1] = nf_small
                  self.learned_stats[nf_name].append((prior, col_avg_y))
          return X_new

      def transform(self, X):
          """Encode new data with the per-fold statistics, averaged over folds.

          Args:
              X: DataFrame containing the categorical features.

          Returns:
              A copy of ``X`` with the encoded columns added; categories missing
              from a fold's table take that fold's prior.
          """
          X_new = X.copy()
          for variable, _, nf_name in self._encoded_columns():
              X_new[nf_name] = 0
              for prior, col_avg_y in self.learned_stats[nf_name]:
                  X_new[nf_name] += X_new[[variable]].join(col_avg_y, on=variable)[nf_name].fillna(prior)
              X_new[nf_name] /= self.n_splits
          return X_new


  MeanEncol = ['model', 'brand', 'registerDate', 'tradeTime']
  # Use target_type='regression' for regression and 'classification' for classification.
  MeanFit = MeanEncoder(MeanEncol, target_type='regression')
  XTrain = MeanFit.fit_transform(XTrain, YTrain)
  XTest = MeanFit.transform(XTest)



  # K-fold target encoding.
  # Common statistics for encoding a regression target: min, max, median,
  # mean, sum, standard deviation, skewness, kurtosis, mean absolute deviation.
  XTrain['price'] = Train['price']
  EncCol = []
  PriceStat = XTrain['price']
  StatDefaultDict = {
      'max': PriceStat.max(),
      'min': PriceStat.min(),
      'median': PriceStat.median(),
      'mean': PriceStat.mean(),
      'sum': PriceStat.sum(),
      'std': PriceStat.std(),
      'skew': PriceStat.skew(),
      'kurt': PriceStat.kurt(),
      # Series.mad() was removed from pandas; this is the same mean absolute deviation.
      'mad': (PriceStat - PriceStat.mean()).abs().mean(),
  }
  # Encode price with its max, min and mean.
  EncStat = ['max', 'min', 'mean']
  # 10 folds.
  KF = KFold(n_splits=10, shuffle=True, random_state=2022)
  for f in tqdm(['serial', 'brand', 'registerDate_year', 'tradeTime_year', 'mileage', 'model']):
      # Output column name -> statistic.
      EncDict = {'{}_target_{}'.format(f, stat): stat for stat in EncStat}
      for col in EncDict:
          # Float zeros: the columns receive float statistics below.
          XTrain[col] = 0.0
          XTest[col] = 0.0
          EncCol.append(col)
      for TrnIndex, ValIndex in KF.split(XTrain, YTrain):
          TrnX = XTrain.iloc[TrnIndex]
          # Named aggregation; a renaming dict on a single column is rejected
          # by pandas >= 1.0.
          EncDF = TrnX.groupby(f)['price'].agg(**EncDict).reset_index()
          ValX = XTrain.iloc[ValIndex][[f]].merge(EncDF, on=f, how='left')
          TestX = XTest[[f]].merge(EncDF, on=f, how='left')
          for col, stat in EncDict.items():
              # Categories unseen in the training folds take the global statistic.
              ValFilled = ValX[col].fillna(StatDefaultDict[stat])
              TestFilled = TestX[col].fillna(StatDefaultDict[stat])
              # KFold yields positions, so write by position, not by label.
              XTrain.iloc[ValIndex, XTrain.columns.get_loc(col)] = ValFilled.values
              # Test rows get the average of the per-fold encodings.
              XTest[col] += TestFilled.values / KF.n_splits
  # price was only needed to compute the encodings. Left in XTrain it would
  # leak the target into the features and give XTrain a column XTest lacks.
  XTrain.drop(columns='price', inplace=True)


  # Feature-matrix sizes after encoding.
  print("XTrain shape: ", XTrain.shape)
  print("XTest shape: ", XTest.shape)






        极差法归一化公式为 $x' = \dfrac{x - x_{\min}}{x_{\max} - x_{\min}}$。上式中,$x_{\max}$ 为样本数据最大值,$x_{\min}$ 为样本数据最小值。

        基于上述分析,我们先调用 sklearn 库中 preprocessing 模块中的 MinMaxScaler() 对特征进行归一化处理,接着调用 sklearn 库中 decomposition 模块中的 PCA 算法包对数据进行降维。其步骤为:


        对数据矩阵 $A$ 进行中心化:$B = A - \bar{A}$。其中,$B$ 为标准化后的矩阵,$\bar{A}$ 为矩阵 $A$ 的均值。









  # Min-max normalisation, fitted on the stacked training and test features.
  Scaler = MinMaxScaler()
  CombineValues = pd.concat([XTrain, XTest]).values  # built once, used for fit and transform
  CombineScaler = Scaler.fit_transform(CombineValues)
  print('CombineScaler shape: ', CombineScaler.shape)
  # Dimensionality reduction with PCA from sklearn.decomposition.
  # PCA cannot extract more components than min(n_samples, n_features), so
  # the requested 550 is capped by the data's shape.
  NComponents = min(550, *CombineScaler.shape)
  PCA = decomposition.PCA(n_components=NComponents)
  CombinePCA = PCA.fit_transform(CombineScaler)
  # The first len(XTrain) rows are the training rows, the rest the test rows.
  XTrainPCA = CombinePCA[:len(XTrain)]
  XTestPCA = CombinePCA[len(XTrain):]
  YTrain = Train['price'].values
  print('CombinePCA shape: ', CombinePCA.shape)



  def NN_model(input_dim):
      """Build the fully connected regression network (not compiled).

      Five softplus hidden layers of 300, 300, 64, 32 and 8 units, followed by
      a single-unit output layer with no activation.

      Args:
          input_dim: number of input features.

      Returns:
          A ``keras.models.Sequential`` model.
      """
      # Seeded Glorot-uniform initialiser, shared by all hidden layers.
      init = keras.initializers.glorot_uniform(seed=1)
      model = keras.models.Sequential()
      # Only the first layer declares the input size.
      model.add(Dense(units=300, use_bias=True, input_dim=input_dim, kernel_initializer=init, activation='softplus'))
      for units in (300, 64, 32, 8):
          model.add(Dense(units=units, use_bias=True, kernel_initializer=init, activation='softplus'))
      model.add(Dense(units=1))
      return model
  class Metric(Callback):
      """Callback that logs train/validation MAE and drives wrapped callbacks.

      After each epoch it writes ``trn_score`` and ``val_score`` into ``logs``
      and then forwards the event to the wrapped callbacks, so they can
      monitor those keys.
      """

      def __init__(self, model, callbacks, Combine):
          """
          Args:
              model: the model being trained.
              callbacks: callbacks that receive the forwarded train/epoch events.
              Combine: ``[(X_train, y_train), (X_val, y_val)]``.
          """
          super().__init__()
          self.model = model
          self.callbacks = callbacks
          self.Combine = Combine

      def on_train_begin(self, logs=None):
          for callback in self.callbacks:
              callback.on_train_begin(logs)

      def on_train_end(self, logs=None):
          for callback in self.callbacks:
              callback.on_train_end(logs)

      def _mae(self, features, targets):
          """Return the mean absolute error of the model's predictions on one split."""
          # ravel() flattens the prediction array without copying it element
          # by element into scalars.
          predicted = np.asarray(self.model.predict(features), dtype=float).ravel()
          expected = np.asarray(targets, dtype=float).ravel()
          return metrics.mean_absolute_error(expected, predicted)

      def on_epoch_end(self, batch, logs=None):
          """Score both splits, store the scores in ``logs`` and forward the event.

          ``batch`` and ``logs`` are passed through unchanged to each wrapped
          callback's ``on_epoch_end``.
          """
          if logs is None:
              logs = {}
          trn_s = self._mae(self.Combine[0][0], self.Combine[0][1])
          logs['trn_score'] = trn_s
          val_s = self._mae(self.Combine[1][0], self.Combine[1][1])
          logs['val_score'] = val_s
          print('trn_score', trn_s, 'val_score', val_s)
          for callback in self.callbacks:
              callback.on_epoch_end(batch, logs)













  def scheduler(epoch):
      """Learning-rate schedule: halve the rate every 20 epochs.

      Reads and updates the optimizer of the module-level ``model``.

      Args:
          epoch: epoch index passed in by ``LearningRateScheduler``.

      Returns:
          The optimizer's learning rate after any update.
      """
      # Every 20 epochs (but not at epoch 0) halve the learning rate.
      if epoch % 20 == 0 and epoch != 0:
          lr = K.get_value(model.optimizer.lr)
          K.set_value(model.optimizer.lr, lr * 0.5)
          print("lr changed to {}".format(lr * 0.5))
      return K.get_value(model.optimizer.lr)


  reduce_lr = LearningRateScheduler(scheduler)


  N = 10  # 10-fold cross-validation
  kfold = KFold(n_splits=N, shuffle=True)
  BSize = 2000
  MaxEpochs = 140
  RinPred = np.zeros((len(XTrainPCA),))  # out-of-fold predictions for the training set
  for fold, (trn_idx, val_idx) in enumerate(kfold.split(XTrainPCA, YTrain)):
      print('fold:', fold+1)
      X_train, y_train = XTrainPCA[trn_idx], YTrain[trn_idx]
      X_val, y_val = XTrainPCA[val_idx], YTrain[val_idx]
      model = NN_model(X_train.shape[1])
      # Initial learning rate 0.01.
      simple_adam = Adam(lr=0.01)
      model.compile(loss='mae', optimizer=simple_adam, metrics=['mae'])
      es = EarlyStopping(monitor='val_score', patience=10, verbose=1, mode='min', restore_best_weights=True, )
      es.set_model(model)
      metric = Metric(model, [es], [(X_train, y_train), (X_val, y_val)])
      # batch_size: samples used per weight update (one iteration).
      # epochs: full passes over the training data.
      # E.g. 1000 samples with batch_size=10 take 100 iterations per epoch.
      # `metric` must be in the callback list: it computes val_score and
      # forwards each epoch to EarlyStopping. Without it early stopping never runs.
      model.fit(X_train, y_train, batch_size=BSize, epochs=MaxEpochs,
                validation_data=(X_val, y_val),
                callbacks=[reduce_lr, metric], shuffle=True, verbose=1)
      # Flatten the prediction array to shape (n,).
      y_pred = np.asarray(model.predict(X_val), dtype=float).ravel()
      RinPred[val_idx] = y_pred
      np.set_printoptions(suppress=True)  # no scientific notation
      # Predicted used-car prices for the test set (data2).
      print(np.around(model.predict(XTestPCA), 2))
      # Evaluate prints its own report, so its result is not printed again.
      Evaluate(YTrain[val_idx], y_pred)




  # Model evaluation.
  def Evaluate(y_tre, y_pre):
      """Print and return the score ``0.2 * (1 - MAPE) + 0.8 * accuracy``.

      MAPE is the mean of ``|y_pre - y_tre| / y_tre``. Accuracy is the share of
      samples whose relative error is at most 0.05.

      Args:
          y_tre: true values (any 1-D array-like).
          y_pre: predicted values, same length as ``y_tre``.

      Returns:
          The combined score.

      Raises:
          ValueError: if ``y_tre`` is empty.
      """
      # Plain float arrays make the arithmetic positional, whatever index a
      # pandas input carries.
      actual = np.asarray(y_tre, dtype=float).ravel()
      predicted = np.asarray(y_pre, dtype=float).ravel()
      m = len(actual)
      if m == 0:
          raise ValueError('Evaluate needs at least one sample')
      Ape = np.abs(predicted - actual) / actual
      Mape = Ape.mean()
      Accuracy = np.count_nonzero(Ape <= 0.05) / m
      score = 0.2 * (1 - Mape) + 0.8 * Accuracy
      print('Mape:', Mape)
      print('Accuracy', Accuracy)
      print('score', score)
      return score





