I have a machine learning problem in which I calculate the bigram Jaccard similarity between a text column of a pandas DataFrame and each value of a dictionary. Currently I store the similarities for each row as a list and then convert that list into columns. This is proving to be very slow in production. Is there a more efficient way to do it?
These are the steps I currently follow. For each key in the dictionary: 1. Get the bigrams of the pandas text column and of dict[key]. 2. Calculate the Jaccard similarity. 3. Append the result to an initially empty list. 4. Store the list in the dataframe. 5. Convert the list to columns.
from itertools import tee, islice
def count_ngrams(lst, n):
    """Yield every run of ``n`` consecutive items of ``lst`` as a tuple.

    The n-grams are produced in order of appearance. If ``lst`` holds fewer
    than ``n`` items, nothing is yielded.

    Args:
        lst: Any iterable of items (for example a list of tokens).
        n: Length of each n-gram; must be at least 1.

    Yields:
        Tuples of length ``n``.

    Raises:
        ValueError: If ``n`` is smaller than 1 (raised when iteration starts).
    """
    if n < 1:
        raise ValueError("n must be a positive integer, got %r" % (n,))
    # Make n independent iterators over the input and advance the i-th one
    # by i items; zipping them then lines up each item with its n - 1
    # successors and stops as soon as the last iterator runs out.
    staggered = (
        islice(iterator, offset, None)
        for offset, iterator in enumerate(tee(lst, n))
    )
    for gram in zip(*staggered):
        yield gram
def n_gram_jaccard_similarity(str1, str2, n):
    """Return the Jaccard similarity of the word n-gram sets of two strings.

    Both strings are split on whitespace, turned into sets of n-grams, and
    compared as ``|A & B| / |A | B|``.

    Args:
        str1: First text.
        str2: Second text.
        n: Number of words per n-gram.

    Returns:
        The similarity as a float, or NaN when neither string yields any
        n-gram (the ratio would be 0 / 0).
    """
    grams1 = set(count_ngrams(str1.split(), n))
    grams2 = set(count_ngrams(str2.split(), n))
    union = grams1 | grams2
    if not union:
        # Test the empty case explicitly instead of catching every exception
        # around the division; float("nan") is an ordinary float NaN and
        # needs no extra import.
        return float("nan")
    return len(grams1 & grams2) / float(len(union))
def jc_list(sample_dict, row, n):
    """Return the similarities of one row against every dictionary value.

    For each key of ``sample_dict``, in iteration order, the n-gram Jaccard
    similarity between that key's value and ``row["text"]`` is computed.

    Args:
        sample_dict: Mapping whose values are the reference texts.
        row: Mapping-like row exposing a ``"text"`` entry.
        n: Number of words per n-gram.

    Returns:
        The ``str()`` rendering of the list of similarities.
    """
    scores = [
        n_gram_jaccard_similarity(sample_dict[name], row["text"], n)
        for name in sample_dict
    ]
    return str(scores)
Using the above functions to build the bigram Jaccard similarity features as follows:
# Build the bigram Jaccard features without the string round trip: the
# similarities are kept as floats from the start instead of being rendered
# with str() and parsed back, and the text column is iterated directly rather
# than through a row-wise DataFrame.apply.
reference_keys = list(sample_dict)
similarities = [
    [n_gram_jaccard_similarity(sample_dict[key], text, 2) for key in reference_keys]
    for text in df["text"]
]
# One list of floats per row, in the iteration order of sample_dict.
df["bigram_jaccard_similarity"] = pd.Series(similarities, index=df.index)
# One column per dictionary key, aligned on the original index.
df[reference_keys] = pd.DataFrame(similarities, index=df.index)
Sample input:
# Sample input: a one-row frame built in a single constructor call rather
# than by creating an empty frame and enlarging it through .loc.
df = pd.DataFrame([["1", "this is a sample text"]], columns=["id", "text"])
import collections
# Reference texts keyed by name. The defaultdict has no default_factory, so
# looking up a missing key still raises KeyError, as with a plain dict.
sample_dict = collections.defaultdict(
    None,
    r1="this is sample 1",
    r2="is sample",
    r3="sample text 2",
)
Expected output:
