1.数据指标 No:序列号 year:年份 month:月份 day:日 hour:小时 pm2.5:PM2.5浓度 DEWP:露点温度 TEMP:温度 PRES:压力 cbwd:风向 Iws:风速 Is:积雪的时间 Ir:累积的下雨时数
2.导入库
import pandas as pd
import numpy as np
from sklearn import metrics
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Dropout
import matplotlib.gridspec as gridspec
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
3.导入数据
filepath = r"data_model/PRSA_data_2010.1.1-2014.12.31.csv"
data = pd.read_csv(filepath, index_col=0)
data.shape
data.head()
year | month | day | hour | pm2.5 | DEWP | TEMP | PRES | cbwd | Iws | Is | Ir | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
No | ||||||||||||
1 | 2010 | 1 | 1 | 0 | NaN | -21 | -11.0 | 1021.0 | NW | 1.79 | 0 | 0 |
2 | 2010 | 1 | 1 | 1 | NaN | -21 | -12.0 | 1020.0 | NW | 4.92 | 0 | 0 |
3 | 2010 | 1 | 1 | 2 | NaN | -21 | -11.0 | 1019.0 | NW | 6.71 | 0 | 0 |
4 | 2010 | 1 | 1 | 3 | NaN | -21 | -14.0 | 1019.0 | NW | 9.84 | 0 | 0 |
5 | 2010 | 1 | 1 | 4 | NaN | -20 | -12.0 | 1018.0 | NW | 12.97 | 0 | 0 |
4.数据清洗
4.1 缺失值处理
data.isnull().sum()
year 0
month 0
day 0
hour 0
pm2.5 2067
DEWP 0
TEMP 0
PRES 0
cbwd 0
Iws 0
Is 0
Ir 0
dtype: int64
data["pm2.5"] = data.groupby(['year', 'month'])['pm2.5'].transform(lambda x: x.fillna(x.mean()))
# 查看填充后数据缺失情况
data.isnull().sum() # 可以发现不存在缺失值了
year 0
month 0
day 0
hour 0
pm2.5 0
DEWP 0
TEMP 0
PRES 0
cbwd 0
Iws 0
Is 0
Ir 0
dtype: int64
4.2 分类数据处理
# 将cbwd指标数据类型转换成category
cbwd_category = data['cbwd'].astype('category')
# 使用标签的编码作为真正的数据
data['cbwd'] = cbwd_category.cat.codes
data.head(10)
year | month | day | hour | pm2.5 | DEWP | TEMP | PRES | cbwd | Iws | Is | Ir | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
No | ||||||||||||
1 | 2010 | 1 | 1 | 0 | 90.442573 | -21 | -11.0 | 1021.0 | 1 | 1.79 | 0 | 0 |
2 | 2010 | 1 | 1 | 1 | 90.442573 | -21 | -12.0 | 1020.0 | 1 | 4.92 | 0 | 0 |
3 | 2010 | 1 | 1 | 2 | 90.442573 | -21 | -11.0 | 1019.0 | 1 | 6.71 | 0 | 0 |
4 | 2010 | 1 | 1 | 3 | 90.442573 | -21 | -14.0 | 1019.0 | 1 | 9.84 | 0 | 0 |
5 | 2010 | 1 | 1 | 4 | 90.442573 | -20 | -12.0 | 1018.0 | 1 | 12.97 | 0 | 0 |
6 | 2010 | 1 | 1 | 5 | 90.442573 | -19 | -10.0 | 1017.0 | 1 | 16.10 | 0 | 0 |
7 | 2010 | 1 | 1 | 6 | 90.442573 | -19 | -9.0 | 1017.0 | 1 | 19.23 | 0 | 0 |
8 | 2010 | 1 | 1 | 7 | 90.442573 | -19 | -9.0 | 1017.0 | 1 | 21.02 | 0 | 0 |
9 | 2010 | 1 | 1 | 8 | 90.442573 | -19 | -9.0 | 1017.0 | 1 | 24.15 | 0 | 0 |
10 | 2010 | 1 | 1 | 9 | 90.442573 | -20 | -8.0 | 1017.0 | 1 | 27.28 | 0 | 0 |
4.3 构造数据集
Goal: 使用前一天24小时内PM2.5的相关数据,对第二天两小时内PM2.5进行预测分析
Factor: X:pm2.5、DEWP、TEMP、PRES、cbwd、Iws、Is和Ir共8个指标;y:第二天2小时内的pm2.5
X_data = data[['pm2.5', 'DEWP', 'TEMP', 'PRES', 'cbwd', 'Iws', 'Is', 'Ir']] # 提取特征数据
Y_data = data[['pm2.5']] # 提取pm2.5数据
X = np.zeros((X_data.shape[0] // 24 - 1,
24,
X_data.shape[-1]))
Y = np.zeros((Y_data.shape[0] // 24 - 1,
2))
rows = range(0, X_data.shape[0] - 24, 24)
for i, row in enumerate(rows):
X[i, :, :] = X_data.iloc[row : row + 24]
Y[i, :] = [Y_data.iloc[row + 24], Y_data.iloc[row + 24 + 1]]
X.shape
Y.shape
C:\Users\ppppp\AppData\Local\Temp/ipykernel_17484/2297659606.py:12: DeprecationWarning: setting an array element with a sequence. This was supported in some cases where the elements are arrays with a single element. For example `np.array([1, np.array([2])], dtype=int)`. In the future this will raise the same ValueError as `np.array([1, [2]], dtype=int)`.
Y[i, :] = [Y_data.iloc[row + 24], Y_data.iloc[row + 24 + 1]]
(1825, 2)
4.4 拆分数据集
# 这里将80%数据作为训练集,20%数据作为测试集,则训练集有1460个样本,测试集有365个样本
X_train = X[:int(X.shape[0] * 0.8)]
Y_train = Y[:int(X.shape[0] * 0.8)]
X_val = X[int(X.shape[0] * 0.8)::]
Y_val = Y[int(X.shape[0] * 0.8)::]
4.5 数据标准化
X_mean = X_train.mean(axis=0)
X_std = X_train.std(axis=0)
Y_mean = Y_train.mean(axis=0)
Y_std = Y_train.std(axis=0)
X_train_norm = (X_train - X_mean) / X_std
Y_train_norm = (Y_train - Y_mean) / Y_std
X_val_norm = (X_val - X_mean) / X_std
Y_val_norm = (Y_val - Y_mean) / Y_std
5 建模
5.1 构造模型
# 使用3层LSTM,输出层为2输出的Dense层
model = Sequential()
model.add(LSTM(32,
input_shape=(X_train_norm.shape[1], X_train_norm.shape[-1]),
return_sequences=True))
model.add(Dropout(0.2)) # 防止过拟合
model.add(LSTM(32, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(32))
model.add(Dropout(0.2))
model.add(Dense(2))
model.compile(loss='mse',
optimizer='rmsprop')
model.summary()
Model: "sequential"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
lstm (LSTM) (None, 24, 32) 5248
dropout (Dropout) (None, 24, 32) 0
lstm_1 (LSTM) (None, 24, 32) 8320
dropout_1 (Dropout) (None, 24, 32) 0
lstm_2 (LSTM) (None, 32) 8320
dropout_2 (Dropout) (None, 32) 0
dense (Dense) (None, 2) 66
=================================================================
Total params: 21,954
Trainable params: 21,954
Non-trainable params: 0
_________________________________________________________________
history = model.fit(X_train_norm, Y_train_norm,
epochs=60,
batch_size=128,
validation_data=(X_val_norm, Y_val_norm))
loss = history.history['loss']
val_loss = history.history['val_loss']
plt.plot(range(len(loss)), loss, 'b-', label='训练集损失')
plt.plot(range(len(loss)), val_loss, 'r-', label='测试集损失')
plt.legend(loc='best')
plt.show();

```python
model_pred = model.predict(X_val_norm)
val_pred = model_pred * Y_std + Y_mean # 别忘了,数据进行了标准化处理,因此预测值需要处理,再计算R方
# 计算R2
R_2_0 = metrics.r2_score(Y_val[:,0], val_pred[:, 0]) # 计算0时预测的R方
R_2_1 = metrics.r2_score(Y_val[:,1], val_pred[:, 1]) # 计算1时预测的R方
# 实际值与预测值对比图
fig = plt.subplots(figsize=(12, 6))
gs = gridspec.GridSpec(2, 1, height_ratios=[1, 1], hspace=0)
ax1 = plt.subplot(gs[0])
plt.plot(range(Y_val.shape[0]), Y_val[:, 0], 'b-', label='0时PM2.5实际图')
plt.plot(range(Y_val.shape[0]), val_pred[:, 0], 'r-', label='0时PM2.5预测图')
plt.legend(loc='best')
plt.text(150, 400, '拟合R2:{0}%'.format(round(R_2_0 * 100, 2)))
ax2 = plt.subplot(gs[1], sharex=ax1)
plt.plot(range(Y_val.shape[0]), Y_val[:, 1], 'b-', label='1时PM2.5实际图')
plt.plot(range(Y_val.shape[0]), val_pred[:, 1], 'r-', label='1时PM2.5预测图')
ax2.set_xlabel(' ')
plt.legend(loc='best')
plt.text(150, 400, '拟合R2:{0}%'.format(round(R_2_1 * 100, 2)))
plt.show();
12/12 [==============================] - 1s 6ms/step
本文章使用limfx的vscode插件快速发布