# 赞  (web-page residue: "like" button label)
# 踩  (web-page residue: "dislike" button label)
# -*- coding: UTF-8 -*-
import numpy as np
from sklearn import preprocessing
# Sample data: a 3 x 4 matrix of numeric values.
data = np.array(
    [
        [3, -1.5, 2, -5.4],
        [0, 4, -0.3, 2.1],
        [1, 3.3, -1.9, -4.3],
    ]
)
print("data:\n", data)

# Preprocessing: standardize the data with preprocessing.scale.
data_standardized = preprocessing.scale(data)
print("preprocessing.scale:\n", data_standardized)

# Per-column statistics (axis=0) of the standardized data.
column_means = data_standardized.mean(axis=0)
column_stds = data_standardized.std(axis=0)
# The feature means come out (almost exactly) 0 ...
print("Mean =", column_means)
# ... and the standard deviations are all 1.
print("Std deviation =", column_stds)
# Worked example: standardizing a single feature column by hand.
# Suppose one feature column holds ages: [17, 20, 23].
a = 17
b = 20
c = 23
# Compute the mean: (17 + 20 + 23) / 3 = 20
mean = (a + b + c) / 3
# Subtract the mean from every value:
#   17 - 20 = -3,  20 - 20 = 0,  23 - 20 = 3
# After centering, the mean of the column is 0.
a1 = a - mean
b1 = b - mean
c1 = c - mean
# Standard deviation of the centered values (np.std defaults to the
# population form, ddof=0).
s = np.std([a1, b1, c1])
# Dividing each centered value by the standard deviation gives the
# standardized column.
result = [a1 / s, b1 / s, c1 / s]
# Min-max scaling with a target feature range of (0, 1).
data_scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
# Fit on the data first, then transform the same data.
data_scaler.fit(data)
data_scaled = data_scaler.transform(data)
print("Min max scaled data =", data_scaled)
# Normalize the data with the L1 norm.
# The class is spelled `Normalizer`; `preprocessing.normalizer` does not
# exist and raises AttributeError.
data_scaler = preprocessing.Normalizer(norm='l1').fit(data)
data_normalized = data_scaler.transform(data)
print("Normalized data =", data_normalized)
# Binarize the data using a threshold of 1.4.
binarizer = preprocessing.Binarizer(threshold=1.4)
data_binarized = binarizer.transform(data)
print("Binarized data:\n", data_binarized)
encoder = preprocessing.OneHotEncoder()
encoder.fit([[0, 2, 1, 12], [1, 3, 5, 3], [2, 3, 2, 12], [1, 2, 4, 3]])
encoded_vector = encoder.transform([[2, 3, 5, 3]]).toarray()
print("Encoded vector =", encoded_vector)
from sklearn import preprocessing
# Label encoder: turns string labels into integer codes.
print("#" * 10, "标记编码器", "#" * 10)
label_encoder = preprocessing.LabelEncoder()
# Sample labels (with repeats) from which the classes are learned.
input_classes = ['audi', 'ford', 'audi', 'toyota', 'ford', 'bmw']
# Fit the encoder on the labels.
label_encoder.fit(input_classes)
# Print each learned class next to its position in classes_.
# The loop body must be indented; without it the file fails with
# IndentationError.
for i, item in enumerate(label_encoder.classes_):
    print(item, "\t=>\t", i)
# Convert a list of labels into their integer codes.
print("#" * 10, "标记转数字", "#" * 10)
labels = ['toyota', 'ford', 'audi']
encoded_labels = label_encoder.transform(labels)
print("Labels =", labels)
print("Encoded labels =", list(encoded_labels))
# Method 1: split directly with the library helper.
# NOTE(review): X and y are not defined anywhere in the visible source;
# presumably they are the feature and target collections loaded elsewhere
# - confirm before running.
from sklearn.model_selection import train_test_split
# Split into training and test sets (test_size=0.2, fixed random_state).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=7)
# Method 2: split by hand.
# Split into training and test sets at a fixed index: the first 100 items
# for training, the remainder for testing.
# NOTE(review): these four names are reassigned again by the 80/20 split below.
X_train, X_test, y_train, y_test = X[0:100], X[100:], y[0:100], y[100:]
# Take 80% of the samples as training data.
num_training = int(0.8 * len(X))
num_test = len(X) - num_training
# Training data, 80%. reshape takes (rows, columns):
#   rows:    number of samples
#   columns: number of features
# NOTE(review): the column count is hard-coded to 1, so X is assumed to
# hold a single feature per sample - TODO confirm.
X_train = np.array(X[:num_training]).reshape((num_training,1))
y_train = np.array(y[:num_training])
# Test data, the remaining 20%.
X_test = np.array(X[num_training:]).reshape((num_test,1))
y_test = np.array(y[num_training:])
# Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。