Task for this stage: use binary classification to determine whether a person's annual income exceeds $50,000.
To complete this task you need: 1. basic Python; 2. fundamentals of machine learning.
Binary classification task: determine whether a person earns more than $50,000 a year.
The data were extracted by Barry Becker from the 1994 Census database. A set of reasonably clean records was obtained using the following conditions: ((AGE>16) && (AGI>100) && (AFNLWGT>1) && (HRSWK>0))
Two data files are provided: train.csv and test.csv. They contain the following attributes: age, workclass, fnlwgt, education, education-num, marital-status, occupation, relationship, race, sex, capital-gain, capital-loss, hours-per-week, native-country, and whether the person makes over 50K a year (income).
See the data files for more details.
For the datasets X_train, Y_train, and X_test:
Note: with one-hot encoding, an attribute such as sex, which takes only the two values male and female, is expanded into a binary vector. For any row describing one person, if that person's sex is male it is encoded as [1,0] (and [0,1] for female).
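For example, a minimal sketch of this encoding using pandas get_dummies (the toy values below are only for illustration; the reference code at the end of this page applies the same function to the real data):
import pandas as pd

# Toy frame with a single categorical column
df = pd.DataFrame({"sex": ["male", "female", "male"]})

# get_dummies expands the column into one indicator column per value
encoded = pd.get_dummies(df, columns=["sex"]).astype(int)
print(encoded)
# male   -> sex_female=0, sex_male=1  (i.e. [0, 1] in the column order female, male)
# female -> sex_female=1, sex_male=0
Note that get_dummies orders the new columns alphabetically, so the exact position of the "male" indicator depends on the column order.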
Predict the labels for the 16,281 records in test.csv.
Write the results to a csv file:
The first line must be id,label; the prediction results start from the second line.
Each line contains an id and the predicted label, separated by a comma,
following the format shown in the expected output below.
First load the data with numpy. An example of loading the training set:
import numpy as np
X_train_fpath = 'data/X_train'
X_train = np.genfromtxt(X_train_fpath, delimiter=',', skip_header=1)
Two normalization options are given here. An implementation of the first (min-max scaling) is shown below; please implement the second one, standardization to a normal distribution. You may choose which columns to normalize, for example [0, 1, 3, 4, 5, 7, 10, 12, 25, 26, 27, 28].
Code example:
def _normalize_column(X, train=True, specified_column=None, X_min=None, X_max=None):
    # Min-max scaling: rescale each specified column to the [0, 1] range
    if train:
        if specified_column is None:
            # If no columns are specified, normalize all columns
            specified_column = np.arange(X.shape[1])
        length = len(specified_column)
        X_max = np.reshape(np.max(X[:, specified_column], 0), (1, length))
        X_min = np.reshape(np.min(X[:, specified_column], 0), (1, length))

    X[:, specified_column] = np.divide(
        np.subtract(X[:, specified_column], X_min), np.subtract(X_max, X_min))

    return X, X_max, X_min
Z = (X − μ) / σ
Code example: X[:, specified_column] = np.divide(np.subtract(X[:, specified_column], X_mean), X_std)
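As a quick sanity check, a small sketch (with a toy matrix, for illustration only) of what this standardization does to each column:
import numpy as np

# Toy data: 3 samples, 2 columns
X = np.array([[1.0, 10.0],
              [2.0, 20.0],
              [3.0, 30.0]])
X_mean = np.mean(X, axis=0, keepdims=True)   # per-column mean
X_std = np.std(X, axis=0, keepdims=True)     # per-column standard deviation
Z = (X - X_mean) / X_std                     # each column now has mean 0 and std 1
print(Z)
# [[-1.22474487 -1.22474487]
#  [ 0.          0.        ]
#  [ 1.22474487  1.22474487]]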
Following the algorithm description, we implement the sigmoid, the probability function, and the inference (rounding) step:
np.clip(1 / (1.0 + np.exp(-z)), 1e-6, 1-1e-6)
_sigmoid(np.add(np.matmul(X, w), b))
np.round(get_prob(X, w, b))
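A minimal sketch wrapping these three expressions into the helper functions used below (the names _sigmoid, get_prob and infer match the reference code at the end of this page):
def _sigmoid(z):
    # Sigmoid clipped away from 0 and 1 so that log() stays finite in the loss
    return np.clip(1 / (1.0 + np.exp(-z)), 1e-6, 1 - 1e-6)

def get_prob(X, w, b):
    # Probability that each sample's income exceeds 50K: sigmoid(Xw + b)
    return _sigmoid(np.add(np.matmul(X, w), b))

def infer(X, w, b):
    # Round the probability to a hard 0/1 prediction
    return np.round(get_prob(X, w, b))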
If regularization is included, the weight gradient becomes: w_grad = -np.mean(np.multiply(pred_error.T, X.T), 1) + lamda * w
The gradient with respect to the bias b is:
∂L/∂b = ∑ₙ −(yⁿ − f_{w,b}(xⁿ))
That is, the bias gradient is the negative prediction error averaged over the batch: b_grad = -np.mean(pred_error)
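Putting the weight and bias gradients together, a sketch of a regularized gradient function (this mirrors gradient_regularization in the reference code at the end; here the mean-based form from the formulas above is used, whereas the reference code sums over the batch, which only rescales the gradient by the batch size):
def gradient_regularization(X, y, w, b, lamda):
    # Gradient of the regularized cross-entropy loss for one mini-batch
    y_pre = get_prob(X, w, b)   # predicted probabilities
    pred_error = y - y_pre      # prediction error per sample
    w_grad = -np.mean(np.multiply(pred_error.T, X.T), 1) + lamda * w
    b_grad = -np.mean(pred_error)
    return w_grad, b_grad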
When the amount of data is limited, cross-validation can improve training to some extent.
Example of splitting the data into a training part and a validation part (choose the split ratio yourself):
def train_dev_split(X, y, dev_size=0.25):
    # Split the data by dev_size for cross-validation
    train_len = int(round(len(X) * (1 - dev_size)))
    return X[0:train_len], y[0:train_len], X[train_len:], y[train_len:]
A function to shuffle the data:
def _shuffle(X, y):
    # Shuffle the samples (X and y stay aligned)
    randomize = np.arange(len(X))
    np.random.shuffle(randomize)
    return X[randomize], y[randomize]
Set the hyperparameters here (you may change them and observe the effect):
max_iter = 40          # maximum number of epochs
batch_size = 32        # number of samples per mini-batch
learning_rate = 0.01   # learning rate
for epoch in range(max_iter):
    X_train, y_train = _shuffle(X_train, y_train)

    step = 1  # used to decay the learning rate
    # Logistic regression with mini-batch gradient descent
    for i in range(int(np.floor(len(y_train) / batch_size))):
        X = X_train[i * batch_size:(i + 1) * batch_size]
        y = y_train[i * batch_size:(i + 1) * batch_size]

        # Compute the gradients
        w_grad, b_grad = gradient_regularization(X, y, w, b, lamda)

        # Update w and b
        w = w - learning_rate / np.square(step) * w_grad
        b = b - learning_rate / np.square(step) * b_grad

        step = step + 1
During each epoch, also record the training loss and the validation loss (and accuracies) as data for plotting (the training-set part is shown as an example):
# Loss and accuracy on the training set
y_train_pre = get_prob(X_train, w, b)
acc_train.append(accuracy(np.round(y_train_pre), y_train))
loss_train.append(compute_loss(
    y_train, y_train_pre, lamda, w) / num_train)
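compute_loss used above is the regularized cross-entropy; a sketch matching the reference code at the end of this page:
def _cross_entropy(y, y_pre):
    # Binary cross-entropy summed over the batch
    return -np.dot(y, np.log(y_pre)) - np.dot(1 - y, np.log(1 - y_pre))

def compute_loss(y, y_pre, lamda, w):
    # Cross-entropy plus the L2 regularization term
    return _cross_entropy(y, y_pre) + lamda * np.sum(np.square(w))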
The accuracy function:
def accuracy(y_pre, y):
    acc = np.sum(y_pre == y) / len(y_pre)
    return acc
Import the plotting package:
import matplotlib.pyplot as plt
%matplotlib inline
Plot the loss and accuracy curves for the training set and the validation set:
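A minimal sketch of these plots, assuming loss_train, loss_validation, acc_train and acc_validation are the lists recorded during training:
plt.plot(loss_train)
plt.plot(loss_validation)
plt.legend(['train', 'validation'])
plt.title('loss')
plt.show()

plt.figure()
plt.plot(acc_train)
plt.plot(acc_validation)
plt.legend(['train', 'validation'])
plt.title('accuracy')
plt.show()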
Read in the test data X_test, use the trained model to predict the income label, and write the results to the output.csv file.
X_test_fpath = 'data/X_test'
output_fpath = 'output.csv'
result = infer(X_test, w, b)
with open(output_fpath, 'w') as f:
    f.write("id,label\n")
    for i, v in enumerate(result):
        f.write("%d,%d\n" % (i + 1, v))
The following code shows how much each feature contributes to the prediction:
# Print the 10 features with the largest absolute weights
ind = np.argsort(np.abs(w))[::-1]
with open(X_test_fpath) as f:
    content = f.readline().rstrip("\n")
    features = np.array([x for x in content.split(',')])
for i in ind[0:10]:
    print(features[i], w[i])
Following the hints, fill in the code between the / Begin / and / End / markers in the editor on the right.
The datasets are located at /data/workspace/myshixun/step1/train.csv and /data/workspace/myshixun/step1/test.csv.
Complete the data loading and the model code. The results are stochastic, so the plotted figures may differ slightly between runs; this stage is graded by checking the saved output file.
The platform will test the code you write.
Expected output:
id, label
1 , 0.0
2 , 0.0
3 , 0.0
4 , 0.0
5 , 0.0
6 , 0.0
7 , 0.0
8 , 1.0
9 , 0.0
10 , 0.0
16282
id,label
Now start your task. Good luck!
import csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


def dataProcess_X(data):
    # The income and sex columns can be represented directly by a single binary
    # value, so they are not one-hot encoded
    if "income" in data.columns:
        Data = data.drop(["income", "sex"], axis=1)
    else:
        Data = data.drop(["sex"], axis=1)

    # Discrete (categorical) columns
    listObjectData = [
        col for col in Data.columns if Data[col].dtypes == "object"]
    # Continuous columns
    listNonObjectData = [
        col for col in Data.columns if col not in listObjectData]

    ObjectData = Data[listObjectData]
    NonObjectData = Data[listNonObjectData]

    # Insert the sex column: 0 for male, 1 for female
    NonObjectData.insert(0, "sex", (data["sex"] == " Female").astype(int))
    ObjectData = pd.get_dummies(ObjectData)  # one-hot encoding

    Data = pd.concat([NonObjectData, ObjectData], axis=1)  # merge continuous and categorical parts
    Data = Data.astype("int64")

    Data = (Data - Data.mean()) / Data.std()  # standardization
    return Data


def dataProcess_Y(data):
    # income label: 0 means <=50K, 1 means >50K
    return (data["income"] == " >50K").astype(int)


def normalize_column(X, train=True, specified_column=None, X_mean=None, X_std=None):
    # Standardize the specified columns to zero mean and unit variance
    if train:
        if specified_column is None:
            # If no columns are specified, normalize all columns
            specified_column = np.arange(X.shape[1])
        length = len(specified_column)
        X_mean = np.reshape(np.mean(X[:, specified_column], 0), (1, length))
        X_std = np.reshape(np.std(X[:, specified_column], 0), (1, length))

    X[:, specified_column] = np.divide(
        np.subtract(X[:, specified_column], X_mean), X_std)

    return X, X_mean, X_std

def _sigmoid(z):
    # Sigmoid clipped away from 0 and 1 for numerical stability
    return np.clip(1 / (1.0 + np.exp(-z)), 1e-6, 1 - 1e-6)


def get_prob(X, w, b):
    # Probability that income > 50K: sigmoid(Xw + b)
    return _sigmoid(np.add(np.matmul(X, w), b))


def infer(X, w, b):
    # Hard 0/1 prediction
    return np.round(get_prob(X, w, b))


def gradient(X, y, w, b):
    # Gradient of the cross-entropy loss
    y_pre = get_prob(X, w, b)
    pre_error = y - y_pre
    w_grad = -np.sum(np.multiply(pre_error, X.T), 1)
    b_grad = -np.sum(pre_error)
    return w_grad, b_grad


def gradient_regularization(X, y, w, b, lamda):
    # Gradient of the cross-entropy loss with L2 regularization
    y_pre = get_prob(X, w, b)
    pre_error = y - y_pre
    w_grad = -np.sum(np.multiply(pre_error, X.T), 1) + lamda * w
    b_grad = -np.sum(pre_error)
    return w_grad, b_grad


def _cross_entropy(y, y_pre):
    # Binary cross-entropy summed over the batch
    cross_entropy = -np.dot(y, np.log(y_pre)) - \
        np.dot((1 - y), np.log(1 - y_pre))
    return cross_entropy


def compute_loss(y, y_pre, lamda, w):
    # Cross-entropy plus L2 regularization term
    return _cross_entropy(y, y_pre) + lamda * np.sum(np.square(w))


def train_dev_split(X, y, dev_size=0.25):
    # Split the data by dev_size for cross-validation
    train_len = int(round(len(X) * (1 - dev_size)))
    return X[0:train_len], y[0:train_len], X[train_len:], y[train_len:]


def _shuffle(X, y):
    # Shuffle the samples
    randomize = np.arange(len(X))
    np.random.shuffle(randomize)
    return X[randomize], y[randomize]


def accuracy(y_pre, y):
    acc = np.sum(y_pre == y) / len(y_pre)
    return acc

# --------------- Begin --------------- #
# Load the training dataset
train_data = pd.read_csv("/data/workspace/myshixun/step1/train.csv")
# --------------- End --------------- #

# Drop one column so the training data (107 features) matches the test data (106 features)
X_train = dataProcess_X(train_data).drop(
    ['native_country_ Holand-Netherlands'], axis=1).values
y_train = dataProcess_Y(train_data).values

col = [0, 1, 3, 4, 5, 7, 10, 12, 25, 26, 27, 28]
X_train, X_mean, X_std = normalize_column(X_train, specified_column=col)

# Split the data into a training set and a validation set
X_train, y_train, X_dev, y_dev = train_dev_split(X_train, y_train)
num_train = len(y_train)  # training set size
num_dev = len(y_dev)      # validation set size

max_iter = 40          # maximum number of epochs
batch_size = 32        # number of samples per mini-batch
learning_rate = 0.01   # learning rate

loss_train = []        # training loss
loss_validation = []   # validation loss
acc_train = []         # training accuracy
acc_validation = []    # validation accuracy

w = np.zeros((X_train.shape[1],))
b = np.zeros((1,))

# Regularization
regularize = True
if regularize:
    lamda = 0.01
else:
    lamda = 0

# --------------- Begin --------------- #
# Complete the binary classification model
for epoch in range(max_iter):
    X_train, y_train = _shuffle(X_train, y_train)

    step = 1  # used to decay the learning rate
    # Logistic regression with mini-batch gradient descent
    for i in range(int(np.floor(len(y_train) / batch_size))):
        X = X_train[i * batch_size:(i + 1) * batch_size]
        y = y_train[i * batch_size:(i + 1) * batch_size]

        # Compute the gradients
        w_grad, b_grad = gradient_regularization(X, y, w, b, lamda)

        # Update w and b
        w = w - learning_rate / np.square(step) * w_grad
        b = b - learning_rate / np.square(step) * b_grad

        step = step + 1

    # Loss and accuracy on the training set
    y_train_pre = get_prob(X_train, w, b)
    acc_train.append(accuracy(np.round(y_train_pre), y_train))
    loss_train.append(compute_loss(
        y_train, y_train_pre, lamda, w) / num_train)

    # Loss and accuracy on the validation set
    y_dev_pre = get_prob(X_dev, w, b)
    acc_validation.append(accuracy(np.round(y_dev_pre), y_dev))
    loss_validation.append(compute_loss(
        y_dev, y_dev_pre, lamda, w) / num_dev)
# --------------- End --------------- #

test_data = pd.read_csv("/data/workspace/myshixun/step1/test.csv")
X_test = dataProcess_X(test_data)
features = X_test.columns.values
X_test = X_test.values

# Normalize the test data with the statistics computed on the training data
X_test, _, _ = normalize_column(
    X_test, train=False, specified_column=col, X_mean=X_mean, X_std=X_std)

result = infer(X_test, w, b)

# Print the 10 features with the largest absolute weights
ind = np.argsort(np.abs(w))[::-1]
for i in ind[0:10]:
    print(features[i], w[i])

with open("/data/workspace/myshixun/step1/predict_test.csv", "w+") as csvfile:
    csvfile.write("id,label\n")
    print("id, label")
    for i, label in enumerate(result):
        csvfile.write("%d,%d\n" % (i + 1, label))
        if i < 10:
            print(i + 1, ", ", label)

plt.plot(loss_train)
plt.plot(loss_validation)
plt.legend(['train', 'validation'])
plt.savefig('/data/workspace/myshixun/step1/img1/test.jpg')

plt.plot(acc_train)
plt.plot(acc_validation)
plt.legend(['train', 'validation'])
plt.savefig('/data/workspace/myshixun/step1/img1/test2.jpg')