赞
踩
# Print a concise summary of the DataFrame.
df.info()
# Count the missing values in each column.
df.isnull().sum()
# Show the 'name' column with missing values replaced by 0. The result is not
# assigned, so df itself is left unchanged.
# NOTE(review): elsewhere this file reads `df.Name` (capitalised) - confirm
# that a lowercase 'name' column actually exists.
df['name'].fillna(0)
# Fill missing Age values with 0 without calling fillna().
# NaN never compares equal to anything (not to None, not even to np.nan), so
# the masks `df['Age'] == None` and `df['Age'] == np.nan` select no rows;
# isnull() is the test that actually finds missing values.
# Indexing with .loc[mask, 'Age'] restricts the assignment to the Age column;
# a plain `df[mask] = 0` would overwrite every column of the matching rows.
df.loc[df['Age'].isnull(), 'Age'] = 0
# Show the rows that repeat an earlier row. DataFrame has no `duplicates()`
# method; `duplicated()` is the one that returns the boolean row mask.
df[df.duplicated()]
# drop_duplicates() returns a new DataFrame, so the result has to be assigned
# back; otherwise the file written below would still contain the duplicates.
df = df.drop_duplicates()
# Save the de-duplicated data.
df.to_csv('test_clear.csv')
# Three alternative ways to bin Age into five bands labelled 1-5. Each
# assignment overwrites the 'AgeBand' column written by the previous one.

# (a) Five equal-width bins.
df['AgeBand'] = pd.cut(df['Age'], bins=5, labels=[1, 2, 3, 4, 5])

# (b) Explicit left-closed bins: [0, 5), [5, 15), [15, 30), [30, 50), [50, 80).
df['AgeBand'] = pd.cut(
    df['Age'],
    bins=[0, 5, 15, 30, 50, 80],
    right=False,
    labels=[1, 2, 3, 4, 5],
)

# (c) Quantile bins with edges at the 10%, 30%, 50%, 70%, 90% and 100% quantiles.
# NOTE(review): ages below the 10% quantile fall outside every bin and come
# out as NaN - confirm that this is intended.
df['AgeBand'] = pd.qcut(
    df['Age'],
    q=[0.1, 0.3, 0.5, 0.7, 0.9, 1.],
    labels=[1, 2, 3, 4, 5],
)
2.2.1 查看类别文本变量名及种类
# Number of rows for each distinct value of 'Sex'.
df['Sex'].value_counts()
# The distinct values of 'Sex'.
df['Sex'].unique()
# How many distinct values 'Sex' has.
df['Sex'].nunique()
unique():以数组形式(numpy.ndarray)返回列的所有唯一值(特征的所有唯一值)
nunique():返回对象中唯一值的个数(默认不统计缺失值)
2.2.2 将文本变量 Sex、Cabin、Embarked 用数值变量 1、2、3、4、5 表示
# Two ways to encode Sex as a number; both suit columns that have only a few
# distinct categories. The second assignment overwrites the first.

# Method 1: replace each listed text value with its numeric code.
df['Sex_num'] = df['Sex'].replace(['male', 'female'], [1, 2])

# Method 2: look each value up in a dict. Like method 1, this is practical
# only when the number of categories is small.
df['Sex_num'] = df['Sex'].map({'male': 1, 'female': 2})
from sklearn.preprocessing import LabelEncoder

# Integer-encode text columns. Unlike a hand-written replace/map, this works
# no matter how many distinct categories a column has. Two approaches are
# shown; approach B runs last, so its result is what stays in the column.
for feat in ['Cabin', 'Ticket']:
    # Approach A: hand-built value -> code mapping.
    # enumerate() replaces zip(unique(), range(nunique())): unique() keeps NaN
    # while nunique() skips it, so the zip came up one code short and silently
    # dropped the last category whenever the column had missing values.
    label_dict = {value: code for code, value in enumerate(df[feat].unique())}
    df[feat + "_labelEncode"] = df[feat].map(label_dict)

    # Approach B: scikit-learn's LabelEncoder applied to the string form of
    # the column. This overwrites the column produced by approach A.
    lbl = LabelEncoder()
    df[feat + "_labelEncode"] = lbl.fit_transform(df[feat].astype(str))
2.2.3 将文本变量 Sex、Cabin、Embarked 用 one-hot 编码表示
# One-hot encode each listed column and append the indicator columns to df.
for feat in ["Age", "Embarked"]:
    # Disabled alternatives for Age that coarsen it before encoding:
    #   pd.get_dummies(df["Age"] // 6)
    #   pd.get_dummies(pd.cut(df['Age'], 5))
    x = pd.get_dummies(df[feat], prefix=feat)
    # Append the new indicator columns side by side with the existing ones.
    # (A disabled variant assigned the get_dummies result back to df[feat].)
    df = pd.concat([df, x], axis=1)
# Extract the first run of ASCII letters that is immediately followed by a
# period from each Name and store it as 'Title'. The pattern is a raw string
# so that `\.` reaches the regex engine as an escaped dot instead of being an
# invalid Python string escape (which triggers a warning on current Python).
df['Title'] = df.Name.str.extract(r'([A-Za-z]+)\.', expand=False)
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。