# 赞
# 踩
# 数据预处理一般包括如下几个步骤:
import pandas as pd
import torch
'''
数据集:house_data.csv
任务:根据房间数和入户样式预测价格
'''
# Step 1: load the raw housing table from the CSV file -----------------------------
house_data_file = './house_data.csv'
data = pd.read_csv(filepath_or_buffer=house_data_file)
print(data)
# Sample output for the tutorial data set:
# NumRooms Alley Price
# 0 NaN Pave 12500
# 1 2.0 NaN 10600
# 2 4.0 grassland 17800
# 3 NaN NaN 14000
# Step 2: separate the feature columns from the target column ------------------------------------
# The first two columns are the model inputs; the third column is the value to predict.
inputs = data.iloc[:, :2]
outputs = data.iloc[:, 2]
print(inputs)
# Sample output:
# NumRooms Alley
# 0 NaN Pave
# 1 2.0 NaN
# 2 4.0 grassland
# 3 NaN NaN
print(outputs)
# Sample output:
# 0 12500
# 1 10600
# 2 17800
# 3 14000
# Name: Price, dtype: int64
# Step 3: replace missing numeric values with the column mean ---------------
# numeric_only=True restricts the mean to numeric columns. Without it,
# pandas >= 2.0 raises TypeError when it tries to average the string column
# `Alley`; older versions silently skipped such columns. String columns are
# therefore left untouched here and still contain NaN afterwards.
inputs = inputs.fillna(inputs.mean(numeric_only=True))
print(inputs)
# Sample output:
# NumRooms Alley
# 0 3.0 Pave
# 1 2.0 NaN
# 2 4.0 grassland
# 3 3.0 NaN
# Step 4: one-hot encode every distinct string value as its own column ----------------------------
# dummy_na=True adds a separate indicator column for missing values.
# dtype=int keeps the indicators as 0/1 integers: pandas >= 2.0 defaults to
# bool columns, which would print as True/False and leave the frame with
# mixed float/bool dtypes.
inputs = pd.get_dummies(inputs, dummy_na=True, dtype=int)
print(inputs)  # paved alley, grassland alley, and "alley missing" each get their own column
# Sample output:
# NumRooms Alley_Pave Alley_grassland Alley_nan
# 0 3.0 1 0 0
# 1 2.0 0 0 1
# 2 4.0 0 1 0
# 3 3.0 0 0 1
# Step 5: convert to PyTorch tensors -------------------------------------------------
# to_numpy(dtype=float) forces a homogeneous float64 array. Plain `.values`
# yields an object-dtype array whenever the frame mixes float and bool
# columns (the get_dummies default on pandas >= 2.0), and torch.tensor
# refuses to convert object arrays.
x = torch.tensor(inputs.to_numpy(dtype=float))
y = torch.tensor(outputs.to_numpy())
print(x)
# Sample output:
# tensor([[3., 1., 0., 0.],
# [2., 0., 0., 1.],
# [4., 0., 1., 0.],
# [3., 0., 0., 1.]], dtype=torch.float64)
print(y)
# Sample output:
# tensor([12500, 10600, 17800, 14000])
# At this point the data is ready to be fed into a model for training -----
# Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。