# dataset: archive.ics.uci.edu/ml/datasets/Adult
import os
import pandas as pd
adult_file = os.path.join(os.getcwd(), "Data/adult.data")
# adult.data ships without a header row, so the UCI column names are given explicitly.
adult = pd.read_csv(adult_file, header=None, names=["Age","Work-Class","fnlwgt","Education","Education-Num","Marital-Status","Occupation","Relationship","Race","Sex","Capital-gain","Capital-loss","Hours-per-week","Native-country","Earnings-Raw"])
adult.dropna(how='all',inplace=True)
# print(adult.columns)
# print(adult["Hours-per-week"].describe())
# print(adult["Work-Class"].unique())
# Create a derived feature: does this person work more than 40 hours per week?
adult["LongHours"] = adult["Hours-per-week"]>40
# print(adult.columns)
x = adult[["Age","Education-Num","Capital-gain","Capital-loss","Hours-per-week"]].values
# Note the leading space: fields in the raw adult.data file come with a space after each comma.
y = (adult["Earnings-Raw"]==' >50K').values
print(y)
from sklearn.feature_selection import SelectKBest,chi2
transformer =SelectKBest(score_func=chi2,k=3)
Xt_chi2 = transformer.fit_transform(x,y)
print(transformer.scores_)
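# Not in the original script: SelectKBest.get_support() returns a boolean mask of the
# k columns that were kept; mapping it back onto the column order used to build x
# above shows the names of the selected features.
feature_names = ["Age", "Education-Num", "Capital-gain", "Capital-loss", "Hours-per-week"]
print([name for name, kept in zip(feature_names, transformer.get_support()) if kept])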
from scipy.stats import pearsonr
import numpy as np
def multivariate_pearsonr(x, y):
    # Compute the Pearson correlation of each column with the target and return
    # (scores, p-values) in the format SelectKBest expects from a score function.
    scores, pvalues = [], []
    for column in range(x.shape[1]):
        cur_score, cur_p = pearsonr(x[:, column], y)
        scores.append(cur_score)
        pvalues.append(cur_p)
    return (np.array(scores), np.array(pvalues))
transformer = SelectKBest(score_func=multivariate_pearsonr,k=3)
Xt_pearsonr = transformer.fit_transform(x,y)
print(transformer.scores_)
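# A caveat (my note, not from the original script): pearsonr returns values in [-1, 1]
# and SelectKBest keeps the columns with the *largest* scores, so strongly negative
# correlations would be discarded. One possible variant is to rank by |r| instead:
def multivariate_pearsonr_abs(x, y):
    scores, pvalues = [], []
    for column in range(x.shape[1]):
        cur_score, cur_p = pearsonr(x[:, column], y)
        scores.append(abs(cur_score))  # use the absolute correlation for ranking
        pvalues.append(cur_p)
    return (np.array(scores), np.array(pvalues))
# transformer = SelectKBest(score_func=multivariate_pearsonr_abs, k=3)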
# Compare the chi2-selected and Pearson-selected feature sets with a decision tree.
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
clf = DecisionTreeClassifier(random_state=14)
score_chi2 = cross_val_score(clf,Xt_chi2,y,scoring='accuracy')
score_pearsonr = cross_val_score(clf,Xt_pearsonr,y,scoring='accuracy')
print("{0:.3f},{0:.3f}".format(np.mean(score_chi2*100),np.mean(score_pearsonr*100)))
# Second example: the Internet Advertisements dataset (ad.data), reduced with PCA.
import os
from collections import defaultdict
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
data_file = os.path.join(os.getcwd(),"Data/ad.data")
def convert_number(x):
    # Convert a value to float; anything non-numeric becomes NaN.
    try:
        return float(x)
    except ValueError:
        return np.nan
# Map every column index to a converter; the default (for any column not set
# explicitly below) is convert_number itself.
converters = defaultdict(lambda: convert_number)
# The last column is the class label: "ad." -> 1, everything else -> 0.
converters[1558] = lambda x: 1 if x.strip() == "ad." else 0
# Handle the "?" missing-value markers; all other values are converted to float so
# the feature columns end up numeric (the PCA step below needs a numeric array).
for i in range(0, 1558):
    converters[i] = lambda x: np.nan if x.strip() == "?" else float(x)
ads = pd.read_csv(data_file, header=None, converters=converters, low_memory=False)
print(ads[:5])
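# Sanity check (not in the original script): count the rows that contain at least
# one missing value before they are dropped below.
print(ads.isnull().any(axis=1).sum())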
ads.dropna(inplace=True)
X = ads.drop(1558,axis=1).values
y = ads[1558]
from sklearn.decomposition import PCA
pca = PCA(n_components=5)
Xd = pca.fit_transform(X)
np.set_printoptions(precision=5,suppress=True)
print(pca.explained_variance_ratio_)
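# Extra check (my addition): the cumulative share of variance captured by the
# five components together.
print(np.cumsum(pca.explained_variance_ratio_))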
clf = DecisionTreeClassifier(random_state=14)
scores_reduce = cross_val_score(clf,Xd,y,scoring='accuracy')
print(np.mean(scores_reduce*100))
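# For comparison (not in the original script): the same tree on all 1558 original
# columns, to see how much accuracy the 5-component projection gives up.
scores_full = cross_val_score(clf, X, y, scoring='accuracy')
print(np.mean(scores_full * 100))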
from matplotlib import pyplot as plt
classes = set(y)
colors = ['red','green']
for cur_class, color in zip(classes, colors):
    mask = (y == cur_class).values
    plt.scatter(Xd[mask, 0], Xd[mask, 1], marker='o', color=color, label=int(cur_class))
plt.legend()
plt.show()