So I'm trying to train a model to read the greetings from the sample dataset collected from Tripadvisor and I've been getting the following error when I am trying to train the model sets.
Here's the link to the dataset - https://nextit-public.s3-us-west-2.amazonaws.com/rsics.html?fbclid=IwAR0CktLQtuPBaZNk03odCKdrjN3LjYl_ouuFBbWvyj-yQ-BvzJ0v_n9w9xo
Here's my code;
import streamlit as st
import numpy as np
import pandas as pd
# NLP Pkgs
import matplotlib.pyplot as plt
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import os
# Main Stuff
st.title("Greetings NLP - Presence")
st.subheader("Created using Streamlit - Harshil Parikh ")
# Loading the data into streamlit
@st.cache
def load_data(nrows):
#data = pd.read_csv('/Users/harshilparikh/Desktop/INT/data/selections.csv', nrows=nrows)
dataset = st.cache(pd.read_csv)('/Users/harshilparikh/Desktop/INT/data/selections.csv')
return dataset
data_load_state = st.text('Loading data...')
dataset = load_data(1000)
data_load_state.text('Data loaded.')
#Displaying all data first
if st.checkbox('Show Raw data'):
st.subheader('Raw Data')
st.write(dataset)
# GREETING TAB
st.subheader('Greetings')
greet = st.sidebar.multiselect("Select Greeting", dataset['Greeting'].unique())
select = dataset[(dataset['Greeting'].isin(greet))]
# SEPARATING ONLY TWO COLUMNS FROM THE DATA
greet_select = select[['Greeting','Selected']]
select_check= st.checkbox("Display records with greeting")
if select_check:
st.write(greet_select)
#Text- Preprocessing - Range from 0 to 6758 total feedback
nltk.download('stopwords')
corpus = []
for i in range(0, 6758):
review = re.sub('[^a-zA-Z]', '', str(dataset['Selected'][i]))
review = review.lower()
review = review.split()
ps = PorterStemmer()
review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
review = ''.join(review)
corpus.append(review)
#BAG OF WORDS
cv = CountVectorizer(max_features = 6758)
X = cv.fit_transform(corpus).toarray()
y = dataset.iloc[:, 1].values
st.write(X)
st.write(y)
st.write(cv)
#Training sets (800 values)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
#X_train[0, 0:10] #First 10 rows of the first column of X_train.
# NLP - Naive Bayes algorithm
classifier = GaussianNB()
classifier.fit(X_train, y_train)
I'm trying to learn simple NPL. Any helps will be appreciated.
Error I'm getting
ValueError: Found input variables with inconsistent numbers of samples: [1, 6759] Traceback: File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/streamlit/script_runner.py", line 332, in _run_script exec(code, module.dict) File "/Users/harshilparikh/Desktop/INT/first.py", line 90, in X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0) File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/sklearn/model_selection/_split.py", line 2127, in train_test_split arrays = indexable(*arrays) File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/sklearn/utils/validation.py", line 292, in indexable check_consistent_length(*result) File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/sklearn/utils/validation.py", line 255, in check_consistent_length raise ValueError("Found input variables with inconsistent numbers of"