赞
踩
使用的数据集包含两列:name(姓名)和 sex(性别),共 45000 条记录,其中 name 列的值唯一。
代码实现:
"""Predict the gender of Chinese names with an NLTK naive Bayes classifier.

The script reads a CSV with a ``name`` column and a ``sex`` column, extracts
character features from the end of each name, and reports the accuracy of a
naive Bayes classifier under 10-fold cross-validation.
"""
import random
from pathlib import Path

import pandas as pd
from numpy import mean

current_path = Path.cwd()


def gender_features(name):
    """Extract the trailing-character features of a name.

    Args:
        name: The full name as a string. It is lower-cased before the
            features are taken.

    Returns:
        A dict of features:

        * empty name: ``{}``
        * 1 or 2 characters: ``{'last_name': <last char>}``
        * 3 or more characters: additionally ``'last2_name'`` (the
          second-to-last character) and ``'last12_name'`` (the last two
          characters together).
    """
    name = name.lower()
    # The original returned None for names shorter than two characters,
    # which is not a usable featureset; always return a dict instead.
    if not name:
        return {}
    features = {'last_name': name[-1]}
    if len(name) >= 3:
        features['last2_name'] = name[-2]
        features['last12_name'] = name[-2:]
    return features


def get_featuresets(X, y):
    """Pair every name's features with its label, in shuffled order.

    Args:
        X: Names; an object exposing ``.values`` and ``len()`` (the script
            passes a pandas Series).
        y: Labels aligned position-by-position with ``X``; same kind of
            object as ``X``.

    Returns:
        A list of ``(features, label)`` tuples, where ``features`` is the
        dict produced by ``gender_features``.

    Raises:
        ValueError: If ``X`` and ``y`` do not have the same length.
    """
    if len(X) != len(y):
        raise ValueError(
            'X and y must have the same length, got %d and %d' % (len(X), len(y))
        )
    labeled_names = list(zip(X.values, y.values))
    # Shuffle the (name, label) pairs before feature extraction.
    random.shuffle(labeled_names)
    # Run the feature extractor over every name.
    return [(gender_features(name), gender) for name, gender in labeled_names]


def main():
    """Run 10-fold cross-validation and print per-fold and mean accuracy."""
    # nltk and scikit-learn are needed only for training and evaluation;
    # importing them here keeps the feature helpers above importable
    # without those packages installed.
    import nltk
    from sklearn import model_selection

    df = pd.read_csv(Path(current_path, '中文姓名性别预测.csv'), encoding='utf8')

    # K-fold cross-validation: split the rows into 10 folds. kf.split()
    # yields (train indices, test indices) pairs.
    # NOTE(review): no shuffle is requested here, so the folds presumably
    # follow the CSV row order; if the file is sorted by label the folds
    # would be unbalanced -- confirm, and consider shuffle=True.
    kf = model_selection.KFold(n_splits=10)

    accuracy_list = []
    for train_idx, test_idx in kf.split(df):
        X_train = df['name'].iloc[train_idx]  # training names
        X_test = df['name'].iloc[test_idx]    # test names
        y_train = df['sex'].iloc[train_idx]   # training labels
        y_test = df['sex'].iloc[test_idx]     # test labels

        featuresets_train = get_featuresets(X_train, y_train)
        featuresets_test = get_featuresets(X_test, y_test)

        # Train a fresh naive Bayes classifier on this fold's training set.
        classifier = nltk.NaiveBayesClassifier.train(featuresets_train)
        accuracy_list.append(nltk.classify.accuracy(classifier, featuresets_test))

    print(accuracy_list)
    print(mean(accuracy_list))


if __name__ == '__main__':
    main()
赞
踩
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。