一文了解深度学习实战——预测篇_深度学习预测

作者：运维做开发 | 2024-08-17 17:09:30

踩

深度学习预测

本文将从四个案例 房价预测、泰坦尼克号生还预测、股票预测、影评情感预测 入手，让童鞋们从实战角度快速入门深度学习的预测部分！

房价预测

基于决策树回归器（DecisionTreeRegressor）

数据文件在这：
链接：https://pan.baidu.com/s/1mPr60cFUSc5m7pmF8Ju4vw 提取码：j2b0

#基于DecisionTreeRegressor预测北京房价

import numpy 
import pandas as pd
import matplotlib
import seaborn
from sklearn.model_selection import GridSearchCV, ShuffleSplit, train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import make_scorer
import tensorflow 
import numpy as np


#定义一堆函数
# 定义网格搜索最佳模型函数
def gridSearchVC_fit_model(X, y):
    """Grid-search the best ``max_depth`` for a decision-tree regressor.

    Args:
        X: Feature matrix used for fitting.
        y: Target values aligned with ``X``.

    Returns:
        The ``best_estimator_`` chosen by ``GridSearchCV``.

    Note:
        ``performance_metric`` is looked up at module level when this function
        runs; it is not defined in the visible part of this file, so it must
        be defined before the first call.
    """
    # Ten random re-splits, each holding out 20% of the rows for validation.
    # The fixed seed makes the splits repeatable between calls.
    cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)

    # Decision-tree regressor whose depth is being tuned
    regressor = DecisionTreeRegressor(random_state=0)

    # Candidate depths 1..10 inclusive. range() excludes its stop value, so
    # the stop has to be 11 for depth 10 to be tried.
    params = {"max_depth": list(range(1, 11))}

    # Wrap performance_metric() so GridSearchCV can use it as a scorer
    scoring_fnc = make_scorer(score_func=performance_metric)

    # estimator: the regressor; param_grid: the depths to try;
    # scoring: the scorer built above; cv: the shuffle-split generator
    grid = GridSearchCV(estimator=regressor, param_grid=params,
                        scoring=scoring_fnc, cv=cv)

    # Fit every candidate and keep the best one
    grid = grid.fit(X, y)

    return grid.best_estimator_


# 预测房屋价格
def PredictHousingPrice(X, y, fitter):
    """Fit a model ten times on a fixed 80/20 split and predict the test part.

    Args:
        X: Feature table.
        y: Target prices aligned with ``X``.
        fitter: Callable ``fitter(X_train, y_train)`` returning a fitted model
            that exposes ``predict``.

    Returns:
        Tuple ``(y_test, predictions)`` where ``predictions`` comes from the
        model fitted in the last round.
    """
    rounds = 10
    # One fixed split (seeded) shared by every round
    split = train_test_split(X, y, test_size=0.2, random_state=0)
    features_fit, features_eval, target_fit, target_eval = split

    latest_prediction = None
    for round_no in range(1, rounds + 1):
        # Re-fit from scratch and keep only the newest prediction
        fitted = fitter(features_fit, target_fit)
        latest_prediction = fitted.predict(features_eval)
        print("迭代第{}次。".format(round_no))
    return target_eval, latest_prediction
    

# 显示真实的房价和预测房价对比图
def plotVersusFigure(y_true_price, y_predict_price):
    """Overlay the true and the predicted prices in one line chart.

    Args:
        y_true_price: Sequence of true prices.
        y_predict_price: Sequence of predicted prices.
    """
    # The imports of this script only contain ``import matplotlib``, so
    # ``plt`` is bound here; without it every ``plt.*`` call below would
    # raise NameError.
    import matplotlib.pyplot as plt

    # 10x7 inch figure
    plt.figure(figsize=(10, 7))

    # X positions for the true prices: evenly spaced values from 1 to the
    # largest true price, one per sample, rounded to integers.
    X_show = np.rint(np.linspace(1,
                                 np.max(y_true_price),
                                 len(y_true_price))
                    ).astype(int)
    # 'o-' draws circle markers joined by a solid line; 'c' is cyan
    plt.plot(X_show, y_true_price, 'o-', color='c')

    # Same construction for the predicted prices, drawn on top in magenta
    X_show_predicted = np.rint(np.linspace(1,
                                           np.max(y_predict_price),
                                           len(y_predict_price))
                              ).astype(int)
    plt.plot(X_show_predicted, y_predict_price, 'o-', color='m')

    plt.title('Housing Prices Prediction')
    # Legend labels follow the order in which the two lines were plotted
    plt.legend(loc='lower right', labels=["True Prices", "Predicted Prices"])
    plt.xlabel("House's Price Tendency By Array")
    plt.ylabel("House's Price")
    plt.show()

#开搞！
# 根据北京的房价数据来预测
  
# Load the Beijing housing dataset from the working directory
df = pd.read_csv('bj_housing.csv')
# Bare expressions such as describe()/head() only show output in an
# interactive session (e.g. a notebook cell); elsewhere the result is discarded.
df.describe()

# 'Value' is the prediction target; every other column is a feature
bj_prices = df['Value']
bj_prices.head()
bj_features = df.drop('Value', axis=1)
bj_features.head()

# Split the data, fit through grid search and predict the held-out rows
y_true_bj_price, y_predict_bj_price = \
PredictHousingPrice(bj_features, bj_prices, gridSearchVC_fit_model)

# Preview the true prices (original index dropped) and the predictions
y_true_bj_price.reset_index().drop('index', axis=1).head()
pd.Series(y_predict_bj_price).head()

# Compare true and predicted Beijing housing prices in one figure
plotVersusFigure(y_true_bj_price, y_predict_bj_price)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128

基于Keras

# 使用Keras来预测波士顿的房价预测

import tensorflow as tf
from tensorflow import keras
import numpy as np

# Load the Boston housing data (train and test splits with their labels)
(train_data, train_labels), (test_data, test_labels) = \
keras.datasets.boston_housing.load_data()

# Shuffle the training set.
# np.random.random() draws one random float in [0.0, 1.0) per training label;
# np.argsort() turns those draws into a random permutation of row indices.
order = np.argsort(np.random.random(train_labels.shape))
train_data = train_data[order]
train_labels = train_labels[order]

# Standardise the features: subtract the per-column mean and divide by the
# per-column standard deviation.
# A model can converge without feature scaling, but training gets harder and
# the resulting model depends more heavily on the training data.
# The test set is scaled with the statistics of the training set.
mean = train_data.mean(axis=0)
std = train_data.std(axis=0)
train_data = (train_data - mean) / std
test_data = (test_data - mean) / std

print("train_data.shape: {}, train_labels.shape: {}."
      .format(train_data.shape, train_labels.shape)) 
print("test_data.shape: {}, test_labels.shape: {}."
      .format(test_data.shape, test_labels.shape)) 

# 创建模型函数
def build_model():
    """Create and compile the regression network for the housing data.

    The input width is read from the module-level ``train_data``.

    Returns:
        A compiled ``keras.Sequential`` model with two 64-unit ReLU layers and
        a single linear output, using MSE loss and reporting MAE.
    """
    n_features = train_data.shape[1]

    network = keras.Sequential()
    network.add(keras.layers.Dense(64, activation=tf.nn.relu,
                                   input_shape=(n_features,)))
    network.add(keras.layers.Dense(64, activation=tf.nn.relu))
    network.add(keras.layers.Dense(1))

    # NOTE(review): tf.train.RMSPropOptimizer looks like the TF 1.x API —
    # confirm the installed TensorFlow version.
    network.compile(loss='mse',
                    optimizer=tf.train.RMSPropOptimizer(0.001),
                    metrics=['mae'])
    return network

# Build the network once at module level so the cells below can train it
model = build_model()
# Print the layer-by-layer architecture of the model
model.summary()

# 自定义一个回调类，在每次epoch（代）结束时都会调用该函数
class PrintDot(keras.callbacks.Callback):
    """Progress callback: one dot per epoch, a new line every 100 epochs."""

    def on_epoch_end(self, epoch, logs):
        # Epochs 0, 100, 200, ... start a fresh row of dots
        prefix = '\n' if epoch % 100 == 0 else ''
        print(prefix + '.', end='')

# Number of passes over the training data
EPOCHS = 500

# Train the model: 20% of the training rows are held out for validation,
# the built-in progress output is silenced (verbose=0) and PrintDot prints
# the progress dots instead.
history = model.fit(train_data, train_labels, epochs=EPOCHS,
                    validation_split=0.2, verbose=0,
                    callbacks=[PrintDot()])

import matplotlib.pyplot as plt

# 绘制图来显示训练的误差历史
def plot_history(history):
    """Plot training and validation mean absolute error per epoch.

    Args:
        history: Object returned by ``model.fit``; its ``epoch`` list and the
            ``'mean_absolute_error'`` / ``'val_mean_absolute_error'`` entries
            of its ``history`` dict are read.
    """
    plt.figure()
    plt.xlabel('Epoch')
    plt.ylabel('Mean Abs Error [1000$]')

    # (history key, legend label) for each curve, drawn in this order
    curves = (
        ('mean_absolute_error', 'Train Loss'),
        ('val_mean_absolute_error', 'Val loss'),
    )
    for metric_key, legend_text in curves:
        plt.plot(history.epoch, np.array(history.history[metric_key]),
                 label=legend_text)

    plt.legend()
    plt.ylim([0, 5])
    plt.show()

# Show the error curves recorded during training
plot_history(history)



# Evaluate the model on the test set; labels are in units of 1000$, so the
# MAE is multiplied by 1000 before printing it in dollars.
[loss, mae] = model.evaluate(test_data, test_labels, verbose=0)
print("Testing set Mean Abs Error: ${:7.2f}".format(mae * 1000))

# Predict the test set and flatten the result to a 1-D array
test_predictions = model.predict(test_data).flatten()

# Scatter plot of predictions against true values with a y = x reference line
plt.scatter(test_labels, test_predictions)
plt.xlabel('True Values [1000$]')
plt.ylabel('Predictions [1000$]')
plt.axis('equal')
# Re-apply the current axis limits — presumably to pin them so the reference
# line drawn next does not rescale the axes; TODO confirm.
plt.xlim(plt.xlim())
plt.ylim(plt.ylim())
plt.plot([-100, 100], [-100, 100])
plt.show()

# Histogram of the error between predicted and true values
error = test_predictions - test_labels
plt.hist(error, bins=50)
plt.xlabel("Prediction Error [1000$]")
plt.ylabel("Count")
plt.show()


# 显示真实的房价和预测房价对比图
def plotVersusFigure(y_true_price, y_predict_price):
    """Draw true prices (cyan) and predicted prices (magenta) in one figure.

    Args:
        y_true_price: Sequence of true prices.
        y_predict_price: Sequence of predicted prices.
    """
    # 10x7 inch figure
    plt.figure(figsize=(10, 7))

    # True prices first, predictions second, so that the legend labels below
    # match the order of the plotted lines.
    for prices, colour in ((y_true_price, 'c'), (y_predict_price, 'm')):
        # X positions: evenly spaced values from 1 to the largest price in the
        # series, one per sample, rounded to integers.
        spaced = np.linspace(1, np.max(prices), len(prices))
        x_positions = np.rint(spaced).astype(int)
        # 'o-' = circle markers joined by a solid line
        plt.plot(x_positions, prices, 'o-', color=colour)

    plt.title('Housing Prices Prediction')
    plt.legend(loc='lower right', labels=["True Prices", "Predicted Prices"])
    plt.xlabel("House's Price Tendency By Array")
    plt.ylabel("House's Price")
    plt.show()

# Plot the true test labels against the model's predictions
plotVersusFigure(test_labels, test_predictions)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146

泰坦尼克号生还预测

提供1309行泰坦尼克号乘客数据，其中891行是训练数据，418行是测试数据，一共有12列，其中有一列表示乘客是否生还。
下面用sklearn（决策树、逻辑回归、梯度提升、多层感知机）和keras（DNN）实现乘客生还预测。

数据文件在这：
链接：https://pan.baidu.com/s/1o_FUa_4VxmqXVBMBGh4rog 提取码：apzg

基于Sklearn

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the data; 'Survived' is the label, every other column is a feature
features = pd.read_csv('titanic_dataset.csv')
y_train = features['Survived']
X_train = features.drop('Survived', axis=1)

# Preview the first five rows
X_train.head()
print("X_train.shape={}, y_train.shape={}".format(X_train.shape, y_train.shape))
X_train.info()

# Age distribution before imputation (missing values dropped for the plot)
sns.distplot(X_train['Age'].dropna(), hist=True, kde=True)
# Fill missing ages with the median age. The filled column is assigned back
# instead of calling replace(..., inplace=True) on the X_train['Age']
# selection: an in-place call on a selected column is not guaranteed to
# update the parent DataFrame.
X_train['Age'] = X_train['Age'].fillna(np.nanmedian(X_train['Age']))
sns.distplot(X_train['Age'], hist=True, kde=True)

# Cabin has too many missing values; dropping it should not hurt prediction
X_train.drop("Cabin", axis=1, inplace=True)

# Where did passengers board?
# S: Southampton, England
# C: Cherbourg-Octeville, France
# Q: Queenstown, Ireland
X_train.Embarked.value_counts()

# Boarding counts per port; missing ports are filled with 'S'
sns.countplot(x='Embarked', data=X_train)
X_train['Embarked'] = X_train['Embarked'].fillna('S')
# Show the rows whose Fare is missing
missing_fare = X_train['Fare'].isna()
X_train[missing_fare]
# Fares of third-class passengers who boarded at Southampton
pclass3_fares = X_train.query('Pclass == 3 & Embarked == "S"')['Fare']
# Median over the known fares only. Filling the gaps with 0 first would pull
# the median down, so NaN values are skipped instead.
median_fare = np.nanmedian(pclass3_fares)
# Write the median into every row whose Fare is missing, rather than into one
# hard-coded PassengerId.
X_train.loc[missing_fare, 'Fare'] = median_fare


# Encode Sex numerically: male -> 1, female -> 0
X_train['Sex'] = X_train['Sex'].replace(['male', 'female'], [1, 0])
X_train.isnull().sum()
print("X_train.shape={}, y_train.shape={}".format(X_train.shape, y_train.shape))

# One-hot encode the remaining non-numeric columns.
# NOTE(review): this also expands every other string column (presumably Name
# and Ticket) into one column per distinct value — confirm that is intended.
X_train = pd.get_dummies(X_train)
# Preview the first five rows after one-hot encoding
X_train.head()
print("X_train.shape={}, y_train.shape={}".format(X_train.shape, y_train.shape))

from sklearn.model_selection import train_test_split
# Seeded, shuffled 80/20 split
train_X, test_X, train_y, test_y = train_test_split(X_train, y_train, test_size=0.2, random_state=42, shuffle=True)
print("train_X.shape={}, train_y.shape={}".format(train_X.shape, train_y.shape))
print("test_X.shape={}, test_y.shape={}".format(test_X.shape, test_y.shape))


# 使用决策树预测模型
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.metrics import accuracy_score
# 创建决策树模型
def createDecisionTreeClassifier():
    """Fit a decision tree on the training split and report its quality.

    Reads the module-level ``train_X``/``train_y``/``test_X``/``test_y`` and
    prints training accuracy, test accuracy and test AUC.

    Returns:
        Tuple ``(fpr, tpr)`` of the ROC curve computed on the test split.
    """
    tree = DecisionTreeClassifier()
    tree.fit(train_X, train_y)

    accuracy_on_train = accuracy_score(train_y, tree.predict(train_X))
    accuracy_on_test = accuracy_score(test_y, tree.predict(test_X))
    print('The training accuracy is {}.'.format(accuracy_on_train))
    print('The test accuracy is {}'.format(accuracy_on_test))

    # Column 1 of predict_proba is the score used for the ROC curve and AUC
    positive_scores = tree.predict_proba(test_X)[:, 1]
    false_pos_rate, true_pos_rate, _ = metrics.roc_curve(test_y, positive_scores)
    auc_value = metrics.roc_auc_score(test_y, positive_scores)
    print('Decision Tree Classifier AUC is: {:.3f}'.format(auc_value))
    return false_pos_rate, true_pos_rate


fpr_dt, tpr_dt = createDecisionTreeClassifier()



# 创建逻辑回归预测模型
from sklearn.linear_model import LogisticRegression
def createLogisticRegressionModel():
    """Fit a logistic regression on the training split and report its quality.

    Reads the module-level ``train_X``/``train_y``/``test_X``/``test_y`` and
    prints training accuracy, test accuracy and test AUC.

    Returns:
        Tuple ``(fpr, tpr)`` of the ROC curve computed on the test split.
    """
    classifier = LogisticRegression()
    classifier.fit(train_X, train_y)

    accuracy_on_train = classifier.score(train_X, train_y)
    print('Logistic Regression Accuracy for training data is: {:.3f}'.format(accuracy_on_train))
    accuracy_on_test = classifier.score(test_X, test_y)
    print('Logistic Regression Accuracy for testing data is: {:.3f}'.format(accuracy_on_test))

    # The decision-function values serve as the ranking score for AUC and ROC
    margins = classifier.decision_function(test_X)
    auc_value = metrics.roc_auc_score(test_y, margins)
    print('Logistic Regression AUC is: {:.3f}'.format(auc_value))

    false_pos_rate, true_pos_rate, _ = metrics.roc_curve(test_y, margins)
    return false_pos_rate, true_pos_rate


fpr_lr, tpr_lr = createLogisticRegressionModel()



# 创建梯度提升模型
from sklearn.ensemble import GradientBoostingClassifier
def createGradientBoostingClassifierModel():
    """Fit a 500-estimator gradient boosting classifier and report its quality.

    Reads the module-level ``train_X``/``train_y``/``test_X``/``test_y`` and
    prints training accuracy, test accuracy and test AUC.

    Returns:
        Tuple ``(fpr, tpr)`` of the ROC curve computed on the test split.
    """
    booster = GradientBoostingClassifier(n_estimators=500)
    booster.fit(train_X, train_y)

    # Accuracy on both splits, reported in train-then-test order
    predicted_train = booster.predict(train_X)
    predicted_test = booster.predict(test_X)
    report = 'Gradient Boosting Accuracy for {} data is: {:.3f}'
    print(report.format('training', accuracy_score(train_y, predicted_train)))
    print(report.format('testing', accuracy_score(test_y, predicted_test)))

    # ROC curve and AUC from the column-1 probabilities
    positive_scores = booster.predict_proba(test_X)[:, 1]
    false_pos_rate, true_pos_rate, _ = metrics.roc_curve(test_y, positive_scores)
    auc_value = metrics.roc_auc_score(test_y, positive_scores)
    print('Gradient Boosting Classifier AUC is: {:.3f}'.format(auc_value))
    return false_pos_rate, true_pos_rate


fpr_gb, tpr_gb = createGradientBoostingClassifierModel()



# 创建多层感知器的预测模型
from sklearn.neural_network import MLPClassifier
def createMLPClassifierModel():
    """Fit a multi-layer perceptron classifier and report its quality.

    Reads the module-level ``train_X``/``train_y``/``test_X``/``test_y`` and
    prints training accuracy, test accuracy and test AUC.

    Returns:
        Tuple ``(fpr, tpr)`` of the ROC curve computed on the test split.
    """
    settings = dict(hidden_layer_sizes=128, batch_size=64,
                    max_iter=1000, solver="adam")
    perceptron = MLPClassifier(**settings)
    perceptron.fit(train_X, train_y)

    predicted_train = perceptron.predict(train_X)
    predicted_test = perceptron.predict(test_X)
    train_score = accuracy_score(train_y, predicted_train)
    test_score = accuracy_score(test_y, predicted_test)
    print('Neural Network classifier  Accuracy for training data is: {:.3f}'.format(train_score))
    print('Neural Network classifier  Accuracy for testing data is: {:.3f}'.format(test_score))

    # ROC curve and AUC from the column-1 probabilities
    positive_scores = perceptron.predict_proba(test_X)[:, 1]
    false_pos_rate, true_pos_rate, _ = metrics.roc_curve(test_y, positive_scores)
    auc_value = metrics.roc_auc_score(test_y, positive_scores)
    print('Neural Network Classifier AUC is: {:.3f}'.format(auc_value))
    return false_pos_rate, true_pos_rate


fpr_nn, tpr_nn = createMLPClassifierModel()

# Draw the ROC curves of all four models on one set of axes
fig = plt.figure(figsize = (20,10))
ax = fig.add_subplot(111)
ax1 = ax.plot(fpr_dt, tpr_dt, c='c', lw=2, label="Decision Tree")
ax2 = ax.plot(fpr_lr, tpr_lr, c='y', lw=2, label="Logistic Regression")
ax3 = ax.plot(fpr_gb, tpr_gb, c='r', lw=2, label="Gradient Boosting")
ax4 = ax.plot(fpr_nn, tpr_nn, c='b', lw=2, label="Neural Network")

ax.grid()
# Each ax.plot() call returns a list of lines; concatenate them into one list
lns = ax1 + ax2 + ax3 + ax4
# legend() treats a single positional argument as the list of labels, so the
# handles must be passed together with their labels.
ax.legend(lns, [line.get_label() for line in lns], loc=0)
plt.show()

train_X.shape

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160

基于Keras

# Keras的神经网络模型来预测
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras import utils as np_utils

# Load the data; 'Survived' is the label, every other column is a feature
features = pd.read_csv('titanic_dataset.csv')
y_train = features['Survived']
X_train = features.drop('Survived', axis=1)

# Fill missing ages with the median age. The filled column is assigned back
# instead of calling replace(..., inplace=True) on the X_train['Age']
# selection: an in-place call on a selected column is not guaranteed to
# update the parent DataFrame.
X_train['Age'] = X_train['Age'].fillna(np.nanmedian(X_train['Age']))
sns.distplot(X_train['Age'], hist=True, kde=True)
# Cabin has too many missing values; dropping it should not hurt prediction
X_train.drop("Cabin", axis=1, inplace=True)

X_train.Embarked.value_counts()
# Boarding counts per port; missing ports are filled with 'S'
sns.countplot(x='Embarked', data=X_train)
X_train['Embarked'] = X_train['Embarked'].fillna('S')
# Show the rows whose Fare is missing
missing_fare = X_train['Fare'].isna()
X_train[missing_fare]
# Fares of third-class passengers who boarded at Southampton
pclass3_fares = X_train.query('Pclass == 3 & Embarked == "S"')['Fare']
# Median over the known fares only. Filling the gaps with 0 first would pull
# the median down, so NaN values are skipped instead.
median_fare = np.nanmedian(pclass3_fares)
# Write the median into every row whose Fare is missing, rather than into one
# hard-coded PassengerId.
X_train.loc[missing_fare, 'Fare'] = median_fare


# Encode Sex numerically: male -> 1, female -> 0
X_train['Sex'] = X_train['Sex'].replace(['male', 'female'], [1, 0])
X_train.isnull().sum()
print("X_train.shape={}, y_train.shape={}".format(X_train.shape, y_train.shape))

# One-hot encode the remaining non-numeric columns.
# NOTE(review): this also expands every other string column (presumably Name
# and Ticket) into one column per distinct value — confirm that is intended.
X_train = pd.get_dummies(X_train)
# Preview the first five rows after one-hot encoding
X_train.head()
print("X_train.shape={}, y_train.shape={}".format(X_train.shape, y_train.shape))

from sklearn.model_selection import train_test_split
# Seeded, shuffled 80/20 split
train_X, test_X, train_y, test_y = train_test_split(X_train, y_train, test_size=0.2, random_state=42, shuffle=True)
print("train_X.shape={}, train_y.shape={}".format(train_X.shape, train_y.shape))
print("test_X.shape={}, test_y.shape={}".format(test_X.shape, test_y.shape))

def createKerasModel(X, y):
    """Build, compile and train a small dense network for survival prediction.

    Args:
        X: Feature DataFrame; ``X.shape[1]`` sets the input width and
            ``X.values`` is what gets passed to ``fit``.
        y: Integer class labels; they are one-hot encoded before training.

    Returns:
        The trained ``Sequential`` model.
    """
    # Take the initializer from the same ``keras`` package that provides
    # ``Sequential``/``Dense``. The ``from keras... import`` lines of this
    # script do not bind the bare name ``keras``.
    from keras import initializers as keras_initializers

    model = Sequential()
    # Kernel weights start from a truncated normal distribution
    kernel_init = keras_initializers.TruncatedNormal(mean=0.0, stddev=0.05, seed=None)
    # The input width is X.shape[1]
    model.add(Dense(input_dim=X.shape[1], units=128,
                    kernel_initializer=kernel_init, bias_initializer='zeros'))
    model.add(Activation("relu"))
    model.add(Dropout(0.2))
    model.add(Dense(32))
    model.add(Activation("relu"))
    # Two output units, one per one-hot class, each squashed by a sigmoid
    model.add(Dense(2))
    model.add(Activation("sigmoid"))
    # Binary cross-entropy loss with the adam optimizer
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    # One-hot encode the training labels to match the two output units
    y_train_categorical = np_utils.to_categorical(y)
    # 150 epochs; verbose=1 logs the training progress
    model.fit(X.values, y_train_categorical, epochs=150, verbose=1)
    return model


keras_model = createKerasModel(train_X, train_y)


# One-hot encode the test labels the same way as the training labels, then
# evaluate; element 0 of the result is printed as loss, element 1 as accuracy.
y_test_categorical = np_utils.to_categorical(test_y)
loss_and_accuracy = keras_model.evaluate(test_X.values, y_test_categorical)
print("Loss={}, Accuracy={}.".format(loss_and_accuracy[0], loss_and_accuracy[1]))

# Predicted class for every test row
predictions_classes = keras_model.predict_classes(test_X.values)

# Pair each test passenger id with its predicted class and show the first 15
submission = pd.DataFrame({
    "PassengerId": test_X["PassengerId"],
    "Survived": predictions_classes})
print(submission[0:15])
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81

股票预测

根据3000多条百度股票的历史数据，预测股票价格的走势曲线。
数据通过quandl开源库获取，使用Facebook开源的fbprophet库来进行股票价格预测。

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

!pip install quandl
import quandl 

!pip install fbprophet
import fbprophet



def init_api_key():
    """Store the quandl API key via ``quandl.save_key`` and print the active key."""
    # Replace the placeholder with a real key before running
    placeholder_key = "Your API Key"
    quandl.save_key(placeholder_key)
    print(quandl.ApiConfig.api_key)


init_api_key()

# Load the stored key back into the quandl configuration and show it again
quandl.read_key()
print(quandl.ApiConfig.api_key)

def init_stock(stock_name):
    """Download one ticker from quandl's ``WIKI`` database.

    Args:
        stock_name: Ticker symbol, e.g. ``"BIDU"``.

    Returns:
        The downloaded table with index level 0 (the Date, per the original
        comment) moved into the first regular column.
    """
    dataset_code = "WIKI/{}".format(stock_name)
    downloaded = quandl.get(dataset_code)
    return downloaded.reset_index(level=0)
  
# Fetch the complete Baidu (BIDU) history
stock_name = "BIDU"
baiduStock = init_stock(stock_name)
baiduStock.head()
print("baiduStock共计{}条。".format(len(baiduStock)))

# First and last date covered by the data
min_date = min(baiduStock['Date'])
max_date = max(baiduStock['Date'])
print("百度的股票数据从{}到{}。".format(min_date, max_date))

# Save the table to CSV and read it back to inspect the stored form
print(type(baiduStock))
baiduStock.to_csv("baiduStock.csv", index=False)
baidu_df = pd.read_csv("baiduStock.csv")
baidu_df.head()

#数据可视化
def plot_basic_stock_history(df, start_date, end_date, stock_name):
    """Print min / max / latest adjusted close and plot the price history.

    Args:
        df: Stock table with ``'Date'`` and ``'Adj. Close'`` columns.
        start_date: Accepted for symmetry with ``end_date``; not read here.
        end_date: Timestamp whose ``.date()`` is printed next to the latest price.
        stock_name: Name used in the plot title.
    """
    column = 'Adj. Close'
    prices = df[column]
    lowest = min(prices)
    highest = max(prices)
    # Computed as in the original code, although the value is not used below
    average = np.mean(prices)

    def first_day_at(price):
        # Date of the first row whose adjusted close equals ``price``
        matching_dates = df[prices == price]['Date']
        return matching_dates[matching_dates.index[0]].date()

    day_of_lowest = first_day_at(lowest)
    day_of_highest = first_day_at(highest)
    latest_price = df.loc[df.index[-1], 'Adj. Close']

    print("{}在{}最小，价格是：{}美元。".format(column, day_of_lowest, lowest))
    print("{}在{}最高，价格是：{}美元。".format(column, day_of_highest, highest))
    print("{}在{}当前价格是：{}美元。".format(column, end_date.date(), latest_price))

    plt.style.use("default")
    plt.plot(df["Date"], prices, color='r', linewidth=3, label=column)
    plt.xlabel("Date")
    plt.ylabel("US $")
    plt.title("{} Stock History".format(stock_name))
    plt.grid()
    plt.show()

# Plot the whole available history, from the first to the last recorded date
start_date = min_date
end_date = max_date
plot_basic_stock_history(baiduStock, start_date, end_date, stock_name)


#计算购买的股票收益
def plot_potential_profit(df,
                          start_date,
                          end_date,
                          stock_name,
                          line_color,
                          text_color,
                          myshares=1):
    """Plot the running profit of shares bought at the open of ``start_date``.

    Side effect: a ``"profits"`` column is written into ``df``.

    Args:
        df: Stock table with ``'Date'``, ``'Adj. Open'`` and ``'Adj. Close'``.
        start_date: Purchase date (Timestamp); must be present in ``df``.
        end_date: Selling date (Timestamp); must be present in ``df``.
        stock_name: Name used in the plot title.
        line_color: Colour of the profit line.
        text_color: Colour of the total-profit annotation.
        myshares: Number of shares held.

    Raises:
        ValueError: If ``df`` has no row for ``start_date`` or ``end_date``.
    """
    opening_rows = df.loc[df["Date"] == start_date, "Adj. Open"]
    closing_rows = df.loc[df["Date"] == end_date, "Adj. Close"]
    if opening_rows.empty or closing_rows.empty:
        raise ValueError(
            "start_date and end_date must both be dates present in df['Date']")
    # Pick the scalar explicitly: float() on a whole Series is deprecated in
    # recent pandas releases.
    start_price = float(opening_rows.iloc[0])
    end_price = float(closing_rows.iloc[0])

    # Profit of every day relative to the purchase price
    df["profits"] = (df["Adj. Close"] - start_price) * myshares
    total_hold_profit = (end_price - start_price) * myshares
    print("从{}到{}，购买{}股，总收益是：{}美元。".format(start_date.date(), 
                                                  end_date.date(), 
                                                  myshares, 
                                                  total_hold_profit))
    plt.style.use("default")
    plt.plot(df["Date"], df["profits"], color=line_color, linewidth=3)
    plt.xlabel("Date")
    plt.ylabel("Profit $")
    plt.title("My Shares From {} to {} on {}.".format(start_date.date(), end_date.date(), stock_name))
    # Put the total-profit label one month before the end date, offset from
    # the final profit value by 1/40 of it.
    text_location_x = (end_date - pd.DateOffset(months=1)).date()
    text_location_y = total_hold_profit + (total_hold_profit / 40)
    plt.text(text_location_x,
             text_location_y,
             "${}".format(int(total_hold_profit)),
             color=text_color,
             size=15)
    plt.grid()
    plt.show()

# Profit of holding 100 shares over the whole recorded history
start_date = min_date
end_date = max_date
plot_potential_profit(baiduStock, start_date, end_date, stock_name, 'm', 'g', 100)



# Holding the stock between 2012 and 2013 would have lost roughly half of the
# investment — yet nobody could know then how much it would rise afterwards.
start_date = pd.to_datetime("2012-08-07")
end_date = pd.to_datetime("2013-03-05")
# Compare the datetime column with the Timestamps themselves; comparing it
# with datetime.date objects is rejected by recent pandas releases.
# .copy() gives the phase its own table, because plot_potential_profit()
# writes a "profits" column into the frame it receives.
baiduStockLowerPricePhase = baiduStock[
                            (baiduStock['Date'] >= start_date) &
                            (baiduStock['Date'] <= end_date)
                            ].copy()
plot_potential_profit(baiduStockLowerPricePhase, start_date, end_date, stock_name, 'c', 'r', 100)


#训练和评估模型
def train_model(stock_history, days=0, weekly_seasonality=False, monthly_seasonality=False):
    """Fit a Prophet model and predict ``days`` periods past the history.

    Args:
        stock_history: Table passed to ``Prophet.fit``.
        days: Number of future periods appended by ``make_future_dataframe``.
        weekly_seasonality: Whether the model includes weekly seasonality.
        monthly_seasonality: Whether to add a custom 30.5-day seasonality.

    Returns:
        Tuple ``(model, forecast)``: the fitted model and the output of
        ``model.predict`` on the extended date range.
    """
    # weekly_seasonality is forwarded to Prophet; it used to be hard-coded to
    # False, which silently ignored the caller's argument.
    model = fbprophet.Prophet(daily_seasonality=False,
                              weekly_seasonality=weekly_seasonality,
                              yearly_seasonality=True,
                              changepoint_prior_scale=0.05)
    if monthly_seasonality:
        model.add_seasonality(name='monthly', period=30.5, fourier_order=5)
    model.fit(stock_history)
    future_dates = model.make_future_dataframe(periods=days)
    forecast = model.predict(future_dates)
    return model, forecast
  
  
def create_prophet_model(df, 
                         stock_name, 
                         days=0,
                         weekly_seasonality=False, 
                         monthly_seasonality=False):
    """Train a Prophet model on the last three years of ``df`` and plot it.

    Args:
        df: Stock table with ``'Date'``, ``'ds'`` and ``'y'`` columns.
        stock_name: Name used in the plot title.
        days: Number of future periods to predict.
        weekly_seasonality: Forwarded to ``train_model``.
        monthly_seasonality: Forwarded to ``train_model``.

    Returns:
        Tuple ``(model, future)`` as returned by ``train_model``.
    """
    # The three-year window is measured from the latest date in ``df`` itself
    # instead of the module-level ``max_date``, so the function works for any
    # table. The cutoff stays a Timestamp, normalised to midnight, because
    # recent pandas releases reject comparisons of a datetime column with
    # datetime.date objects.
    cutoff = (df["Date"].max() - pd.DateOffset(years=3)).normalize()
    stock_history = df[df["Date"] > cutoff]
    model, future = train_model(stock_history, days, weekly_seasonality, monthly_seasonality)

    plt.style.use("default")
    fig, ax = plt.subplots(1, 1)
    fig.set_size_inches(10, 5)
    # Observed values
    ax.plot(stock_history['ds'],
            stock_history['y'],
            'v-',
            linewidth=1.0,
            alpha=0.8,
            ms=1.8,
            label='Observations')
    # Values produced by the model
    ax.plot(future['ds'],
            future['yhat'],
            'o-',
            linewidth=1.,
            label='Modeled')
    # Band between the upper and lower bounds of the prediction
    ax.fill_between(future['ds'].dt.to_pydatetime(),
                    future['yhat_upper'],
                    future['yhat_lower'],
                    alpha=0.3,
                    facecolor='g',
                    edgecolor='k',
                    linewidth=1.0,
                    label='Confidence Interval')
    plt.legend(loc=2, prop={'size': 10})
    plt.title("{} Historical and Modeled Stock Price".format(stock_name))
    plt.xlabel('Date')
    plt.ylabel('Price $')
    plt.grid(linewidth=0.6, alpha=0.6)
    plt.show()
    return model, future
  

# Copy the date and adjusted-close columns into 'ds' and 'y', the column names
# read by create_prophet_model() when plotting the observations.
baiduStock["ds"] = baiduStock['Date']
baiduStock["y"] = baiduStock['Adj. Close']
# Model requested with monthly seasonality
model, future_data = create_prophet_model(baiduStock, stock_name, monthly_seasonality=True)

# Plot the individual components of the prediction
model.plot_components(future_data)
plt.show()

# Model requested with weekly and monthly seasonality
model, future_data = create_prophet_model(baiduStock, stock_name, weekly_seasonality=True, monthly_seasonality=True)

model.plot_components(future_data)
plt.show()


# Stock forecast: predict the Baidu price for the next 180 days
model, future = create_prophet_model(baiduStock, stock_name, days=180)

# Buy-in strategy evaluation.
# prophet_evaluator is not part of this file — presumably a local helper
# module that has to be importable; verify before running.
import prophet_evaluator
baiduStock["ds"] = baiduStock['Date']
baiduStock["y"] = baiduStock['Adj. Close']
prophet_evaluator.evaluator(baiduStock, min_date, max_date, train_model, stock_name, 1000)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198

影评的情感分析

情感分析是自然语言处理（NLP）领域中一项很复杂的任务：文本既有主观的，也有客观的；在不同的环境下，针对不同的人或事物，我们会做出不同的情感反应。下面讲解如何通过分析带有情感的文本数据，预测说话者在当时的情况下的情绪状态是积极的，还是消极的。
生活中就有很多例子，比如在京东、淘宝等电商平台购物后，用户都会被请求对收到的货物进行拍照、点赞、评论和评价星级等。平台收集这些数据后去做情感分析，从而通过了解买家对于产品的喜好和满意度来改善产品和服务。这为平台提供了一些潜在的用户会购买哪些产品的数据。
下面使用循环神经网络（RNN）来编写该神经网络模型的代码，创建此网络模型会使用到长短期记忆网络（LSTM）和嵌入层（Embedding Layers），最后的输出层会使用sigmoid激活函数，因为我们预测的结果要么是积极的，要么是消极的。

数据文件在这：
链接:https://pan.baidu.com/s/1DQdAROwzOT6nXdWBYeT2bw 密码:1rn7

基于TensorFlow

import numpy as np
import tensorflow as tf

# 定义加载数据的函数
def loadData():
    """Read the raw review text and the raw label text from the working directory.

    Returns:
        Tuple ``(reviews, labels)``: the full contents of ``reviews.txt`` and
        ``labels.txt`` as strings.
    """
    def _slurp(path):
        # Whole file as one string; the handle is closed on leaving the block
        with open(path, 'r') as handle:
            return handle.read()

    # Reviews are read first, then the matching labels
    return _slurp('reviews.txt'), _slurp('labels.txt')

# Load the raw review text and the raw label text
reviews, labels = loadData()

# First 150 characters of the reviews (shown only in an interactive session)
reviews[:150]
# First 150 characters of the matching labels
labels[:150]

from string import punctuation 

# 定义数据预处理函数
def dataPreprocess(reviews_str):
    """Strip punctuation from the raw review text and tokenise it.

    Args:
        reviews_str: All reviews as one string, one review per line.

    Returns:
        Tuple ``(review_list, all_text, words)``: the punctuation-free reviews
        split on newlines, the same reviews joined by single spaces, and the
        whitespace-separated words of that joined text.
    """
    # Delete every character listed in string.punctuation in one pass
    without_punctuation = reviews_str.translate(str.maketrans('', '', punctuation))
    # One entry per line of the input
    review_list = without_punctuation.split('\n')
    # Join the lines with single spaces, then split on any whitespace
    all_text = ' '.join(review_list)
    return review_list, all_text, all_text.split()
    
# Run the preprocessing; from here on `reviews` is a list of cleaned reviews
reviews, all_text, words = dataPreprocess(reviews)
reviews[:2]

# First 20 elements (words)
words[:20]
# First 150 characters of the joined text
all_text[:150]

# Word encoding
from collections import Counter
# Count how often each word occurs
word_counter = Counter(words)
# Words ordered by their count, most frequent first (reverse=True)
sorted_vocab = sorted(word_counter, key=word_counter.get, reverse=True)

# 定义显示前10个单词以及它的重复个数的函数
def showTop10Item(dict_obj):
    """Print the first ten ``key:value`` pairs of a mapping in iteration order.

    Args:
        dict_obj: Mapping whose ``items()`` are printed.
    """
    for position, (key, value) in enumerate(dict_obj.items()):
        # Stop as soon as ten pairs have been printed
        if position >= 10:
            return
        print("{}:{}".format(key, value))

# Show the first ten words of word_counter with their counts
showTop10Item(word_counter)
# The 15 most frequent words together with their counts
word_counter.most_common(15)
# The first 15 words of the sorted vocabulary
sorted_vocab[:15]
# Map every word to an integer index; indices start at 1
vocab_to_int = {word: i for i, word in enumerate(sorted_vocab, 1)}
# Show the first ten words with their indices
showTop10Item(vocab_to_int)

# Replace every word of every review by its index and collect the results in
# reviews_ints; each review becomes a list of integers.
reviews_ints = []
for review in reviews:
    reviews_ints.append([vocab_to_int[word] for word in review.split()])
print(reviews_ints[:1])
len(reviews_ints)

# Label encoding: 'positive' becomes 1, every other line becomes 0
labels = labels.split('\n')
labels = np.array([1 if label == 'positive' else 0 for label in labels])
# First ten encoded labels
labels[:10]

from collections import Counter

# review_lens maps a review length to the number of reviews of that length
review_lens = Counter([len(x) for x in reviews_ints])
# review_lens[0] is therefore the number of empty reviews, not the minimum
# length, so the message says exactly that.
print("长度为0的评论个数是: {}".format(review_lens[0]))
print("评论的最大长度是: {}".format(max(review_lens)))
# Indices of the reviews that contain at least one word
non_zero_idx = [i for i, review in enumerate(reviews_ints) if len(review) != 0]
# Number of reviews that remain
print(len(non_zero_idx))
# Keep only the non-empty reviews ...
reviews_ints = [reviews_ints[i] for i in non_zero_idx]
# ... and the labels at the same positions, so both stay aligned
labels = np.array([labels[i] for i in non_zero_idx])

# Build `features`, the integer matrix that is fed into the network, from
# reviews_ints. Every row has exactly seq_len entries: reviews shorter than
# seq_len are padded with zeros on the left, longer reviews keep only their
# first seq_len word indices.

# Fixed length of every encoded review
seq_len = 200
# Matrix of zeros, one row per review
features = np.zeros((len(reviews_ints), seq_len), dtype=int)
for i, row in enumerate(reviews_ints):
    # Write the (truncated) review into the rightmost len(row) positions; for
    # rows longer than seq_len the negative slice covers the whole row.
    features[i, -len(row):] = np.array(row)[:seq_len]

# Look at the first row and at the overall shape
features[0:1]
features.shape

# Split the data into training, validation and test sets
# 80% of the rows are used for training
split_train_ratio = 0.8
# Number of feature rows
features_len = len(features)
# Number of training rows
train_len = int(features_len * split_train_ratio)
# First part is the training set, the remainder is split further below
train_x, val_x = features[:train_len], features[train_len:]
train_y, val_y = labels[:train_len], labels[train_len:] 
# Half of the remainder
val_x_half_len = int(len(val_x) / 2)
# First half stays the validation set, second half becomes the test set
val_x, test_x = val_x[:val_x_half_len], val_x[val_x_half_len:]
val_y, test_y = val_y[:val_x_half_len], val_y[val_x_half_len:]

# Report the resulting shapes
print("\t\t\tFeature Shapes:")
print("Train set: \t\t{}".format(train_x.shape), 
      "\nValidation set: \t{}".format(val_x.shape),
      "\nTest set: \t\t{}".format(test_x.shape))

# Hyper-parameters
lstm_size = 256
lstm_layers = 2
batch_size = 512
learning_rate = 0.01


# Vocabulary size; +1 because word indices start at 1 and 0 is the padding value
n_words = len(vocab_to_int) + 1
# Reset the default graph before building the model
tf.reset_default_graph()
# Group the input placeholders under the name scope 'inputs'
with tf.name_scope('inputs'):
    # Placeholder for the integer-encoded reviews
    inputs_ = tf.placeholder(tf.int32, [None, None], name="inputs")
    # Placeholder for the labels
    labels_ = tf.placeholder(tf.int32, [None, None], name="labels")
    # Placeholder for the dropout keep probability
    keep_prob = tf.placeholder(tf.float32, name="keep_prob")
    
  
# Size of each embedding vector
embed_size = 300 
# Group the embedding variable and lookup under the name scope 'Embeddings'
with tf.name_scope("Embeddings"):
    # Embedding matrix initialised uniformly between -1 and 1
    embedding = tf.Variable(tf.random_uniform((n_words, embed_size), -1, 1))
    # Look up the embedding vector of every input index
    embed = tf.nn.embedding_lookup(embedding, inputs_)
    
def lstm_cell():
    """Create one LSTM cell of ``lstm_size`` units wrapped with output dropout.

    Returns:
        A ``DropoutWrapper`` around a ``BasicLSTMCell``; the keep probability
        comes from the module-level ``keep_prob`` placeholder.
    """
    # Follow the reuse setting of the variable scope that is active right now
    reuse_flag = tf.get_variable_scope().reuse
    base_cell = tf.contrib.rnn.BasicLSTMCell(lstm_size, reuse=reuse_flag)
    return tf.contrib.rnn.DropoutWrapper(base_cell, output_keep_prob=keep_prob)

# Group the recurrent layers under the name scope 'RNN_layers'
with tf.name_scope("RNN_layers"):
    # Stack lstm_layers LSTM cells
    cell = tf.contrib.rnn.MultiRNNCell([lstm_cell() for _ in range(lstm_layers)])
    
    # Zero initial state for a batch of batch_size sequences
    initial_state = cell.zero_state(batch_size, tf.float32)

with tf.name_scope("RNN_forward"):
    # dynamic_rnn returns the output of every step and the final state
    outputs, final_state = tf.nn.dynamic_rnn(cell, embed, initial_state=initial_state)
    
with tf.name_scope('predictions'):
    # Output layer on the last time step only; the sigmoid keeps the single
    # output between 0 and 1, matching the 0/1 labels.
    predictions = tf.contrib.layers.fully_connected(outputs[:, -1], 1, activation_fn=tf.sigmoid)
    
with tf.name_scope('cost'):
    # Mean squared error between labels and predictions as training loss
    cost = tf.losses.mean_squared_error(labels_, predictions)

with tf.name_scope('train'):
    # Adam optimizer minimising the loss
    optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)
    
with tf.name_scope('validation'):
    # Accuracy: round the prediction to 0/1 and compare it with the label
    correct_pred = tf.equal(tf.cast(tf.round(predictions), tf.int32), labels_)
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

# 定义获取数据批次的生成器函数
def get_batches(x, y, batch_size=100):
    """Yield aligned ``(x_batch, y_batch)`` slices of ``batch_size`` items.

    Only complete batches are produced: when ``len(x)`` is not a multiple of
    ``batch_size`` the trailing remainder is skipped.

    Args:
        x: Sliceable sequence of samples.
        y: Sliceable sequence of labels, aligned with ``x``.
        batch_size: Number of items per batch.

    Yields:
        Tuples ``(x[lo:hi], y[lo:hi])`` covering consecutive windows.
    """
    # Floor division keeps only the part of the data that fills whole batches.
    full_batches = len(x) // batch_size
    end = full_batches * batch_size
    for lo in range(0, end, batch_size):
        hi = lo + batch_size
        yield x[lo:hi], y[lo:hi]

import os

# Number of full passes over the training set
epochs = 8
# Checkpoint prefix shared by training (save) and testing (restore)
checkpoint_path = "checkpoints/sentiment.ckpt"
# Create the checkpoint directory up front; depending on the TensorFlow
# release, Saver.save may refuse to write into a missing parent directory.
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)
# Saver used to write and restore checkpoints
saver = tf.train.Saver()

# ---- Training (with periodic validation) ----
with tf.Session() as sess:
    # Initialise all global variables
    sess.run(tf.global_variables_initializer())

    iteration = 1
    for e in range(epochs):
        # Every epoch starts from the all-zero LSTM state
        state = sess.run(initial_state)

        # One optimisation step per full batch of training data; the LSTM
        # state returned by a step is fed into the next one
        for x, y in get_batches(train_x, train_y, batch_size):
            feed = {inputs_: x,
                    labels_: y[:, None],
                    keep_prob: 0.5,
                    initial_state: state}
            loss, state, _ = sess.run([cost, final_state, optimizer], feed_dict=feed)

            # Log the training loss every 5 iterations (epoch shown 1-based
            # so that it matches the "/epochs" total)
            if iteration % 5 == 0:
                print("Epoch: {}/{}".format(e + 1, epochs),
                      "Iteration: {}".format(iteration),
                      "Train loss: {:.3f}".format(loss))

            # Run a full validation pass every 25 iterations
            if iteration % 25 == 0:
                val_acc = []
                # Reuse the existing zero-state tensor instead of calling
                # cell.zero_state() again, which would add new ops to the
                # graph on every validation pass
                val_state = sess.run(initial_state)
                for val_batch_x, val_batch_y in get_batches(val_x, val_y, batch_size):
                    val_feed = {inputs_: val_batch_x,
                                labels_: val_batch_y[:, None],
                                keep_prob: 1,
                                initial_state: val_state}
                    batch_acc, val_state = \
                        sess.run([accuracy, final_state], feed_dict=val_feed)
                    val_acc.append(batch_acc)
                # Mean accuracy over all validation batches
                print("Val acc: {:.3f}".format(np.mean(val_acc)))
            iteration += 1

        # Checkpoint once per epoch; writing the whole model after every
        # single batch only adds disk I/O, and the checkpoint written after
        # the last epoch is the one the test phase restores
        saver.save(sess, checkpoint_path)

# ---- Evaluation on the test set ----
test_acc = []
with tf.Session() as sess:
    # Restore the trained weights from the checkpoint
    saver.restore(sess, checkpoint_path)
    # Start from the all-zero LSTM state (same tensor as used for training)
    test_state = sess.run(initial_state)
    for x, y in get_batches(test_x, test_y, batch_size):
        feed = {inputs_: x,
                labels_: y[:, None],
                keep_prob: 1,
                initial_state: test_state}
        # Accuracy of this batch; the state is carried into the next batch
        batch_acc, test_state = sess.run([accuracy, final_state], feed_dict=feed)
        test_acc.append(batch_acc)
    # Mean accuracy over all test batches
    print("Test accuracy: {:.3f}".format(np.mean(test_acc)))

基于Keras

# Keras version: IMDB review sentiment classification with an LSTM

import numpy
from keras.datasets import imdb
from keras.models import Sequential
# Embedding is imported from the public keras.layers namespace; the private
# keras.layers.embeddings module path is not available in newer Keras releases.
from keras.layers import Dense, Embedding, LSTM
from keras.preprocessing import sequence

# Seed NumPy's random generator to reduce run-to-run variation
# (NOTE(review): backend-level randomness may need its own seed -- confirm)
numpy.random.seed(7)

# Keep only the 5000 most frequent words; rarer words are replaced by the
# dataset's out-of-vocabulary index (load_data's oov_char, 2 by default)
top_words = 5000

# Load the dataset
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=top_words)

# Every review is represented by exactly 500 word indices
review_max_length = 500

# Reviews shorter than 500 are padded with 0, longer ones are truncated
X_train = sequence.pad_sequences(X_train, maxlen=review_max_length)
X_test = sequence.pad_sequences(X_test, maxlen=review_max_length)

# Build the model
embedding_vector_length = 32
model = Sequential()
# Input embedding layer: maps each word index to a 32-dimensional vector
model.add(Embedding(top_words, embedding_vector_length, input_length=review_max_length))
# LSTM hidden layer with 100 units
model.add(LSTM(100))
# Output layer (fully connected): binary classification, so a single sigmoid unit
model.add(Dense(1, activation='sigmoid'))
# Compile with binary cross-entropy loss, as suits a two-class problem
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# Print the model architecture
model.summary()

# Train for 3 passes over the training set with mini-batches of 64 reviews
model.fit(X_train, y_train, epochs=3, batch_size=64)

# Finally evaluate on the test set; scores[1] is the accuracy metric
scores = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: {}".format((scores[1]*100)))
声明：本文内容由网友自发贡献，不代表【wpsshop博客】立场，版权归原作者所有，本站不承担相应法律责任。如您发现有侵权的内容，请联系我们。转载请注明出处：https://www.wpsshop.cn/w/运维做开发/article/detail/993715

一文了解深度学习实战——预测篇_深度学习预测

目录

房价预测

基于决策树回归器（DecisionTreeRegressor）

基于Keras

泰坦尼克号生还预测

基于Sklearn

基于Keras

股票预测

影评的情感分析

基于TensorFlow

基于Keras