I tried DBSCAN clustering on word2vec-weighted tf-idf vectors, using different epsilon and minPts thresholds for DBSCAN. I also tried the OPTICS clustering method with different minPts values, but it didn't yield any output at all.
#Import libraries
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import WordNetLemmatizer
from unidecode import unidecode # $ pip install unidecode
import gensim
import csv
import nltk
from sklearn.feature_extraction import text
import pandas as pd
import numpy as np
from collections import defaultdict
#read data
dat = pd.read_csv('D:\\data_800k.csv',encoding='latin',nrows=500000).Certi.tolist()
wnl = WordNetLemmatizer()
#nltk.download('punkt')
my_stop_words = text.ENGLISH_STOP_WORDS.union(['education','certification','certificate','certified'])
def tokenize_stop(row):
    # tokenise, lowercase, transliterate to ASCII and drop stop words
    az = []
    for j in nltk.word_tokenize(unidecode(row).lower()):
        if j not in my_stop_words:
            az.append(j)
    return az

def preprocess(dat):
    return [tokenize_stop(row) for row in dat]
X = preprocess(dat)
#word2vec
model = gensim.models.Word2Vec(X, size=100)
w2v = dict(zip(model.wv.index2word, model.wv.syn0))
#train model
def fit(X):
    tfidf = TfidfVectorizer(analyzer=lambda x: x)
    tfidf.fit(X)
    # if a word was never seen - it must be at least as infrequent
    # as any of the known words - so the default idf is the max of
    # known idf's
    max_idf = max(tfidf.idf_)
    return defaultdict(
        lambda: max_idf,
        [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])
#actual training
word2weight = fit(X)
#multiply word2vec vectors by tf-idf weights and average them per document
def transform_word2vec_tfidf(X, word2vec, word2weight):
    dim = len(next(iter(word2vec.values())))  # dimensionality of the word vectors
    return np.array([
        np.mean([word2vec[w] * word2weight[w]
                 for w in words if w in word2vec] or
                [np.zeros(dim)], axis=0)
        for words in X
    ])
export_data_w2v_Tfidf = transform_word2vec_tfidf(X,w2v,word2weight)
np.savetxt('D:\\Azim\\data_500k_w2v_tfidf.csv', export_data_w2v_Tfidf, delimiter=',', fmt='%1.15e')
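For reference, the clustering step itself looked roughly like this. It is only a sketch using scikit-learn's DBSCAN and OPTICS on the exported vectors; the eps and min_samples values are placeholders rather than the exact thresholds I tried, and on 500k points both runs are slow and memory-hungry.
#cluster the exported document vectors (sketch; parameter values are examples only)
from sklearn.cluster import DBSCAN, OPTICS
vectors = np.loadtxt('D:\\Azim\\data_500k_w2v_tfidf.csv', delimiter=',')
#DBSCAN: points with at least min_samples neighbours within eps form clusters, the rest is labelled noise (-1)
db_labels = DBSCAN(eps=0.5, min_samples=10, metric='euclidean').fit_predict(vectors)
n_db_clusters = len(set(db_labels)) - (1 if -1 in db_labels else 0)
#OPTICS: orders points by reachability instead of fixing a single eps
optics_labels = OPTICS(min_samples=10).fit_predict(vectors)
n_optics_clusters = len(set(optics_labels)) - (1 if -1 in optics_labels else 0)
With word2vec-style vectors, metric='cosine' in DBSCAN may also be worth trying.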
Below is an ELKI screenshot. Can anyone share insights if they were able to do meaningful clustering of text data using DBSCAN or any other algorithm? Thanks