import pandas as pd
import numpy as np
from numpy import array
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from sklearn.preprocessing import MinMaxScaler
# split a multivariate sequence into samples
def split_sequences(sequences, n_steps, n_test):
X, y = list(), list()
for i in range(0,len(sequences),100):
# find the end of this pattern
end_ix = i + n_steps
# check if we are beyond the dataset
if i!=0 and end_ix > len(sequences):
break
sequences[i:end_ix,0]=np.insert(np.diff(sequences[i:end_ix,0]),0,0)
# gather input and output parts of the pattern
seq_x, seq_y = sequences[i:end_ix-n_test], sequences[end_ix-n_test:end_ix]
X.append(seq_x)
y.append(seq_y)
return array(X), array(y)
df = pd.read_csv('time-series-19-covid-combined.csv')
df = df.drop(['Lat','Long'], axis = 1)
df.columns = ['day','country', 'territory','confirmed','recovered','deaths']
data=df[df.country.isin(['Australia','Costa Rica','Greece','Hungary','Israel'])][['confirmed','recovered','deaths']]
is_brazil = (df['country']=='Brazil')
data2=df[(is_brazil)][['confirmed','recovered','deaths']]
date=df[(is_brazil)][['day','confirmed']]
date.day = pd.to_datetime(date.day,format='%Y%m%d', errors='ignore')
date.set_index('day', inplace=True)
n_features = data.shape[1] # this is number of parallel inputs
n_timesteps = date.shape[0] # this is number of timesteps
n_test = int(n_timesteps*0.25)
X, Y = split_sequences(data.values, n_timesteps, n_test)
#normalization#####################################################
alld=np.concatenate((X,Y),1)
alld=alld.reshape(alld.shape[0]*alld.shape[1],alld.shape[2])
scaler = MinMaxScaler()
scaler.fit(alld)
X=[scaler.transform(x) for x in X]
y=[scaler.transform(y) for y in Y]
X=np.array(X)
y=np.array(y)[:,:,0]
# define model
model = Sequential()
model.add(LSTM(50, activation='relu', input_shape=(n_timesteps - n_test, n_features)))
model.add(Dense(y.shape[1]))
model.compile(optimizer='adam', loss='mse')
# fit model
model.fit(X, y, epochs=200, verbose=1)
# evaluation
data2x=data2
truth = data2
data2x.values[0:len(data2x),0]=np.insert(np.diff(data2x.values[0:len(data2x),0]),0,0)
data2x=scaler.transform(data2x)
X_test = np.expand_dims(data2x, axis=0)
yhat = model.predict(X_test[:,-n_timesteps + n_test:,:], verbose=0)
print (data2x[-n_timesteps + n_test:,0], yhat)
actual_predictions = scaler.inverse_transform(np.tile(yhat, (1, 1, 3))[0])[:,0]
Sizes and values:
X: array of float-64 (16,108,3)
X_test: array of float-64 (1,144,3)
Y: array of float-64 (16,36,3)
alld: array of float-64 (2304,3)
data: Dataframe (1728,3)
data2: Dataframe (144,3)
data2x: array of float-64 (144,3)
date: Dataframe (144,1)
df: Dataframe (38448,6)
is_brazil: Series (38448,)
n_features: 3 (int)
n_test: 36 (int)
n_timesteps: 144 (int)
truth: Dataframe (144,3)
y: Array of float-64 (16,36)
yhat: Array of float-32 (1,36)
What I intend to do on my project is to train an LSTM with data from confirmed cases, recovered patients and deaths from a certain set of countries and try to predict the number of cases in another country. For example: training the LSTM with data from Australia, Costa Rica, Greece, Hungary and Israel and trying to predict the number of cases in Brazil.
Found original code here and tried to code it using Keras, but in the last line of code above, when I try to reverse normalization, I'm having the error: ValueError: operands could not be broadcast together with shapes (1,108) (3,) (1,108)
I have no clue of what can be done to solve this. Searched in other threads, but with no success. Any solution will be greatly appreciated.
Best regards,
Higo.