I'm about to predict IMDB score (film rate) using Support Vector Regression in Scikit-Learn. The problem is it always gives the same prediction result for every input.
When i predict using data training, it gives various result. But when using data testing, it always gives the same value.
Data training prediction:
Data testing prediction:
Here is the link for dataset: IMDB 5000 Movie Dataset
My codes:
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
import numpy as np
import seaborn as sb
from sklearn import metrics as met
df = pd.read_csv("movie_metadata.csv")
df.head()
original = df.shape[0]
df = df.drop_duplicates(["movie_title"])
notDuplicated = df.shape[0]
df.reset_index(drop = True, inplace = True)
print(original, notDuplicated)
df["num_critic_for_reviews"].fillna(0, inplace = True)
df["num_critic_for_reviews"] = df["num_critic_for_reviews"].astype("int")
df["director_facebook_likes"].fillna(0, inplace = True)
df["director_facebook_likes"] = df["director_facebook_likes"].astype("int")
df["actor_3_facebook_likes"].fillna(0, inplace = True)
df["actor_3_facebook_likes"] = df["actor_3_facebook_likes"].astype(np.int64)
df["actor_2_facebook_likes"].fillna(0, inplace = True)
df["actor_2_facebook_likes"] = df["actor_2_facebook_likes"].astype(np.int64)
df["actor_1_facebook_likes"].fillna(0, inplace = True)
df["actor_1_facebook_likes"] = df["actor_1_facebook_likes"].astype(np.int64)
df["movie_facebook_likes"].fillna(0, inplace = True)
df["movie_facebook_likes"] = df["movie_facebook_likes"].astype(np.int64)
df["content_rating"].fillna("Not Rated", inplace = True)
df["content_rating"].replace('-', "Not Rated", inplace = True)
df["content_rating"] = df["content_rating"].astype("str")
df["imdb_score"].fillna(0.0, inplace = True)
df["title_year"].fillna(0, inplace = True)
df["title_year"].replace("NA", 0, inplace = True)
df["title_year"] = df["title_year"].astype("int")
df["genres"].fillna("", inplace = True)
df["genres"] = df["genres"].astype("str")
df2 = df[df["title_year"] >= 1980]
df2.reset_index(drop = True, inplace = True)
nRow = len(df2)
print("Number of data:", nRow)
nTrain = np.int64(np.floor(0.7 * nRow))
nTest = nRow - nTrain
print("Number of data training (70%):", nTrain, "\nNumber of data testing (30%):", nTest)
dataTraining = df2[0:nTrain]
dataTesting = df2[nTrain:nRow]
dataTraining.reset_index(drop = True, inplace = True)
dataTesting.reset_index(drop = True, inplace = True)
xTrain = dataTraining[["num_critic_for_reviews", "director_facebook_likes", "actor_3_facebook_likes", "actor_2_facebook_likes", "actor_1_facebook_likes", "movie_facebook_likes"]]
yTrain = dataTraining["imdb_score"]
xTest = dataTesting[["num_critic_for_reviews", "director_facebook_likes", "actor_3_facebook_likes", "actor_2_facebook_likes", "actor_1_facebook_likes", "movie_facebook_likes"]]
yTest = dataTesting["imdb_score"]
movieTitle = dataTesting["movie_title"].reset_index(drop = True)
from sklearn.svm import SVR
svrModel = SVR(kernel = "rbf", C = 1e3, gamma = 0.1, epsilon = 0.1)
svrModel.fit(xTrain,yTrain)
predicted = svrModel.predict(xTest)
[print(movieTitle[i], ":", predicted[i]) for i in range(10)]