赞
踩
# 导包
import pandas as pd
import numpy as np
# Load the training and test splits from their CSV files
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        'modified_bank-additional-train.csv',
        'modified_bank-additional-test.csv',
    )
)
# Show each dataset's dimensions as (rows, columns)
train_data.shape
(7873, 21)
test_data.shape
(3964, 21)
# Preview the first 4 rows: the first 4 features, the last 2 features, and the label column (y)
train_data.iloc[0:4, [0, 1, 2, 3, -3, -2, -1]]
| | age | job | marital | education | euribor3m | nr.employed | y |
---|---|---|---|---|---|---|---|
0 | 56 | services | married | high.school | 4.857 | 5191.0 | no |
1 | 54 | retired | married | basic.9y | 4.857 | 5191.0 | no |
2 | 35 | blue-collar | married | basic.6y | 4.857 | 5191.0 | no |
3 | 39 | management | single | basic.9y | 4.857 | 5191.0 | no |
# Combine the feature columns of both splits, training rows first; the last
# column of each frame (the label) is left out
all_features = pd.concat([split.iloc[:, :-1] for split in (train_data, test_data)])
all_features.shape
(11837, 20)
# Print a summary of the combined feature table: column names, non-null counts, dtypes and memory usage
all_features.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 11837 entries, 0 to 3963
Data columns (total 20 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   age             11837 non-null  int64
 1   job             11837 non-null  object
 2   marital         11837 non-null  object
 3   education       11837 non-null  object
 4   default         11837 non-null  object
 5   housing         11837 non-null  object
 6   loan            11837 non-null  object
 7   contact         11837 non-null  object
 8   month           11837 non-null  object
 9   day_of_week     11837 non-null  object
 10  duration        11837 non-null  int64
 11  campaign        11837 non-null  int64
 12  pdays           11837 non-null  int64
 13  previous        11837 non-null  int64
 14  poutcome        11837 non-null  object
 15  emp.var.rate    11837 non-null  float64
 16  cons.price.idx  11837 non-null  float64
 17  cons.conf.idx   11837 non-null  float64
 18  euribor3m       11837 non-null  float64
 19  nr.employed     11837 non-null  float64
dtypes: float64(5), int64(5), object(10)
memory usage: 1.9+ MB
# Count the missing (null) values in each column
all_features.isnull().sum()
age               0
job               0
marital           0
education         0
default           0
housing           0
loan              0
contact           0
month             0
day_of_week       0
duration          0
campaign          0
pdays             0
previous          0
poutcome          0
emp.var.rate      0
cons.price.idx    0
cons.conf.idx     0
euribor3m         0
nr.employed       0
dtype: int64
# Count the rows of the combined feature table that exactly repeat an earlier row
all_features.duplicated().sum()
1
# Preprocess the combined feature table: standardize the numeric columns,
# then one-hot encode the categorical ones.
# Numeric columns are picked by dtype kind (rather than "anything that is not
# object") so that string-typed columns are never standardized, whichever
# dtype pandas uses to store them.
numeric_features = all_features.select_dtypes(include='number').columns
# all_features was built with the training rows first, so its leading
# train_data.shape[0] rows are exactly the training split. The mean and
# standard deviation are taken from those rows only: computing them over
# train + test together would leak test-set statistics into the features the
# model is later evaluated on.
train_numeric = all_features[numeric_features].iloc[:train_data.shape[0]]
# A column that is constant within the training rows has std 0; dividing by 1
# instead keeps it at 0 after centering rather than producing NaN/inf.
numeric_scale = train_numeric.std().replace(0, 1)
all_features[numeric_features] = (
    all_features[numeric_features] - train_numeric.mean()
) / numeric_scale
# After standardization the training mean of every numeric feature is 0, so
# filling missing values with 0 amounts to mean imputation
all_features[numeric_features] = all_features[numeric_features].fillna(0)
# Turn categorical values into indicator (one-hot) columns; dummy_na=True also
# creates an indicator column for missing values in each encoded feature
all_features = pd.get_dummies(all_features, dummy_na=True)
all_features.shape
(11837, 73)
# Convert to NumPy arrays, splitting the combined feature table back into the
# two original splits (the first n_train rows are the training rows)
n_train = len(train_data)
train_features, test_features = (
    np.array(rows.values)
    for rows in (all_features.iloc[:n_train], all_features.iloc[n_train:])
)
# Labels come from the `y` column of each original frame
train_labels, test_labels = (
    np.array(frame['y'].values) for frame in (train_data, test_data)
)
# Fit a linear-kernel support vector classifier and evaluate it on the test split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
# (Alternative estimators left as commented-out imports in the original:
#  sklearn.linear_model.SGDClassifier, sklearn.neighbors.KNeighborsClassifier)
clf = SVC(kernel='linear', C=1).fit(train_features, train_labels)
predictions = clf.predict(test_features)
# Per-class precision / recall / F1 first, then the overall accuracy
print(classification_report(test_labels, predictions))
print("AC", accuracy_score(test_labels, predictions))
precision recall f1-score support
no 0.90 0.86 0.88 2420
yes 0.80 0.85 0.82 1544
accuracy 0.86 3964
macro avg 0.85 0.86 0.85 3964
weighted avg 0.86 0.86 0.86 3964
AC 0.8572149344096872
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。