I am working with Gensim FASTText modeling and have the following questions.
- The output of "ft_model.save(BASE_PATH + MODEL_PATH + fname)" saves the following 3 files. Is this correct? is there a way to combine all three files?
ft_gensim-v3 ft_gensim-v3.trainables.vectors_ngrams_lockf.npy ft_gensim-v3.wv.vectors_ngrams.npy
When I attempt to load the training file and then use it, I get the following error from if model.wv.similarity(real_data, labelled['QueryText'][i]) > maxSimilaity:
'function' object has no attribute 'wv'
Finally, both models, is there a way not to have to store the output of def read_train(path,label_path)
and def lemmetize(df_col)
so I do not have to run this part of the code every time I want to train the model or compare?
Thanks for the assistance.
Here is my FastText Train Model
import os
import logging
from config import BASE_PATH, DATA_PATH, MODEL_PATH
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
from pprint import pprint as print
from gensim.models.fasttext import FastText as FT_gensim
from gensim.test.utils import datapath
#Read Training data
import pandas as pd
def read_train(path,label_path):
d = []
#e = []
df = pd.read_excel(path)
labelled = pd.read_csv(label_path)
updated_col1 = lemmetize(df['query_text'])
updated_col2 = lemmetize(labelled['QueryText'])
for i in range(len(updated_col1)):
d.append(updated_col1[i])
#print(d)
for i in range(len(updated_col2)):
d.append(updated_col2[i])
return d
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
import string
from nltk.stem import PorterStemmer
def lemmetize(df_col):
df_updated_col = pd.Series(0, index = df_col.index)
stop_words = set(stopwords.words('english'))
lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()
ps = PorterStemmer()
for i, j in zip(df_col, range(len(df_col))):
lem = []
t = str(i).lower()
t = t.replace("'s","")
t = t.replace("'","")
translator = str.maketrans(string.punctuation, ' '*len(string.punctuation))
t = t.translate(translator)
word_tokens = word_tokenize(t)
for i in range(len(word_tokens)):
l1 = lemmatizer.lemmatize(word_tokens[i])
s1 = ps.stem(word_tokens[i])
if list(l1) != [''] and list(l1) != [' '] and l1 != '' and l1 != ' ':
lem.append(l1)
filtered_sentence = [w for w in lem if not w in stop_words]
df_updated_col[j] = filtered_sentence
return df_updated_col
#read test data
def read_test(path):
return pd.read_excel(path)
#Read labelled data
def read_labelled(path):
return pd.read_csv(path)
word_tokenized_corpus = read_train('Train Data.xlsx','SMEQueryText.csv')
#Train fasttext model
import tempfile
import os
from gensim.models import FastText
from gensim.test.utils import get_tmpfile
fname = get_tmpfile("ft_gensime-v3")
def train_fastText(data, embedding_size = 60, window_size = 40, min_word = 5, down_sampling = 1e-2, iter=100):
ft_model = FastText(word_tokenized_corpus,
size=embedding_size,
window=window_size,
min_count=min_word,
sample=down_sampling,
sg=1,
iter=100)
#with tempfile.NamedTemporaryFile(prefix=BASE_PATH + MODEL_PATH + 'ft_gensim_v2-', delete=False) as tmp:
# ft_model.save(tmp.name, separately=[])
ft_model.save(BASE_PATH + MODEL_PATH + fname)
return ft_model
# main function to output
def main(test_path, train_path, labelled):
test_data = read_test(test_path)
train_data = read_train(train_path,labelled)
labelled = read_labelled(labelled)
output_df = pd.DataFrame(index = range(len(test_data)))
output_df['test_query'] = str()
output_df['Similar word'] = str()
output_df['category'] = str()
output_df['similarity'] = float()
model = train_fastText(train_data)
# run main
if __name__ == "__main__":
output = main('Test Data.xlsx','Train Data.xlsx','QueryText.csv')
Here is my Usage Model
import pandas as pd
from gensim.models import FastText
import gensim
from config import BASE_PATH, DATA_PATH, MODEL_PATH
#Read Training data
def read_train(path,label_path):
d = []
#e = []
df = pd.read_excel(path)
labelled = pd.read_csv(label_path)
updated_col1 = lemmetize(df['query_text'])
updated_col2 = lemmetize(labelled['QueryText'])
for i in range(len(updated_col1)):
d.append(updated_col1[i])
for i in range(len(updated_col2)):
d.append(updated_col2[i])
return d
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
import string
from nltk.stem import PorterStemmer
def lemmetize(df_col):
df_updated_col = pd.Series(0, index = df_col.index)
stop_words = set(stopwords.words('english'))
lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()
ps = PorterStemmer()
for i, j in zip(df_col, range(len(df_col))):
lem = []
t = str(i).lower()
t = t.replace("'s","")
t = t.replace("'","")
translator = str.maketrans(string.punctuation, ' '*len(string.punctuation))
t = t.translate(translator)
word_tokens = word_tokenize(t)
for i in range(len(word_tokens)):
l1 = lemmatizer.lemmatize(word_tokens[i])
s1 = ps.stem(word_tokens[i])
if list(l1) != [''] and list(l1) != [' '] and l1 != '' and l1 != ' ':
lem.append(l1)
filtered_sentence = [w for w in lem if not w in stop_words]
df_updated_col[j] = filtered_sentence
return df_updated_col
#read test data
def read_test(path):
return pd.read_excel(path)
#Read labelled data
def read_labelled(path):
return pd.read_csv(path)
def load_training():
return FT_gensim.load(BASE_PATH + MODEL_PATH +'ft_gensim-v3')
#compare similarity
def compare_similarity(model, real_data, labelled):
maxWord = ''
category = ''
maxSimilaity = 0
#print("train data",labelled[1])
for i in range(len(labelled)):
if model.similarity(real_data, labelled['QueryText'][i]) > maxSimilaity:
#print('labelled',labelled['QueryText'][i], 'i', i)
maxWord = labelled['QueryText'][i]
category = labelled['Subjectmatter'][i]
maxSimilaity = model.similarity(real_data, labelled['QueryText'][i])
return maxWord, category, maxSimilaity
# Output from Main to excel
from pandas import ExcelWriter
def export_Excel(data, aFile = 'FASTTEXTOutput.xlsx'):
df = pd.DataFrame(data)
writer = ExcelWriter(aFile)
df.to_excel(writer,'Sheet1')
writer.save()
# main function to output
def main(test_path, train_path, labelled):
test_data = read_test(test_path)
train_data = read_train(train_path,labelled)
labelled = read_labelled(labelled)
output_df = pd.DataFrame(index = range(len(test_data)))
output_df['test_query'] = str()
output_df['Similar word'] = str()
output_df['category'] = str()
output_df['similarity'] = float()
model = load_training
for i in range(len(test_data)):
output_df['test_query'][i] = test_data['query_text'][i]
#<first change>
maxWord, category, maxSimilaity = compare_similarity(model, str(test_data['query_text'][i]), labelled)
output_df['Similar word'][i] = maxWord
output_df['category'][i] = category
output_df['similarity'][i] = maxSimilaity
#<second change>
return output_df
# run main
if __name__ == "__main__":
output = main('Test Data.xlsx','Train Data.xlsx','SMEQueryText.csv')
export_Excel(output)
Here is the full tracible error message
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-22-57803b59c0b9> in <module>
1 # run main
2 if __name__ == "__main__":
----> 3 output = main('Test Data.xlsx','Train Data.xlsx','SMEQueryText.csv')
4 export_Excel(output)
<ipython-input-21-17cb88ee0f79> in main(test_path, train_path, labelled)
13 output_df['test_query'][i] = test_data['query_text'][i]
14 #<first change>
---> 15 maxWord, category, maxSimilaity = compare_similarity(model, str(test_data['query_text'][i]), labelled)
16 output_df['Similar word'][i] = maxWord
17 output_df['category'][i] = category
<ipython-input-19-84d7f268d669> in compare_similarity(model, real_data, labelled)
6 #print("train data",labelled[1])
7 for i in range(len(labelled)):
----> 8 if model.wv.similarity(real_data, labelled['QueryText'][i]) > maxSimilaity:
9 #print('labelled',labelled['QueryText'][i], 'i', i)
10 maxWord = labelled['QueryText'][i]
AttributeError: 'function' object has no attribute 'wv'