I am dealing with pattern prediction from a formatted CSV dataset with three columns (time_stamp, X and Y - where Y is the actual value). I wanted to predict the value of X from Y based on time index from past values and here is how I approached the problem with LSTM Recurrent Neural Networks in Python with Keras.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from keras.models import Sequential
from keras.layers import LSTM, Dense
from keras.preprocessing.sequence import TimeseriesGenerator
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
np.random.seed(7)
df = pd.read_csv('test32_C_data.csv')
n_features=100
values = df.values
for i in range(0,n_features):
df['X_t'+str(i)] = df['X'].shift(i)
df['X_tp'+str(i)] = (df['X'].shift(i) - df['X'].shift(i+1))/(df['X'].shift(i))
print(df)
pd.set_option('use_inf_as_null', True)
#df.replace([np.inf, -np.inf], np.nan).dropna(axis=1)
df.dropna(inplace=True)
X = df.drop('Y', axis=1)
y = df['Y']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.40)
X_train = X_train.drop('time', axis=1)
X_train = X_train.drop('X_t1', axis=1)
X_train = X_train.drop('X_t2', axis=1)
X_test = X_test.drop('time', axis=1)
X_test = X_test.drop('X_t1', axis=1)
X_test = X_test.drop('X_t2', axis=1)
sc = MinMaxScaler()
X_train = np.array(df['X'])
X_train = X_train.reshape(-1, 1)
X_train = sc.fit_transform(X_train)
y_train = np.array(df['Y'])
y_train=y_train.reshape(-1, 1)
y_train = sc.fit_transform(y_train)
model_data = TimeseriesGenerator(X_train, y_train, 100, batch_size = 10)
# Initialising the RNN
model = Sequential()
# Adding the input layerand the LSTM layer
model.add(LSTM(4, input_shape=(None, 1)))
# Adding the output layer
model.add(Dense(1))
# Compiling the RNN
model.compile(loss='mse', optimizer='rmsprop')
# Fitting the RNN to the Training set
model.fit_generator(model_data)
# evaluate the model
#scores = model.evaluate(X_train, y_train)
#print("\n%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))
# Getting the predicted values
predicted = X_test
predicted = sc.transform(predicted)
predicted = predicted.reshape((-1, 1, 1))
y_pred = model.predict(predicted)
y_pred = sc.inverse_transform(y_pred)
When I plot the prediction as this
plt.figure
plt.plot(y_test, color = 'red', label = 'Actual')
plt.plot(y_pred, color = 'blue', label = 'Predicted')
plt.title('Prediction')
plt.xlabel('Time [INdex]')
plt.ylabel('Values')
plt.legend()
plt.show()
The following plot is what I get.
However, if we plot each column separately,
groups = [1, 2]
i = 1
# plot each column
plt.figure()
for group in groups:
plt.subplot(len(groups), 1, i)
plt.plot(values[:, group])
plt.title(df.columns[group], y=0.5, loc='right')
i += 1
plt.show()
The following plots are what we get.
How can we improve the prediction accuracy?