0
votes

I am using Iris dataset and DBSCAN clustering in sklearn to cluster the different data points in the dataset and then finally color the clustered data points according to the DBSCAN trained on the dataset using matplotlib in Python 3.

The code that I have is as follows-

import pandas as pd
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns


# Load CSV dataset-
iris_data = pd.read_csv("iris.csv")

# Get dimension of dataset-
iris_data.shape
# (150, 5)

# Get data types of all attributes in dataset-
iris_data.dtypes
'''
sepallength    float64
sepalwidth     float64
petallength    float64
petalwidth     float64
class           object
dtype: object
'''

# Do label encoding for 'class' attribute-
le = LabelEncoder()
encoded_class = le.fit_transform(iris_data['class'])

# Delete 'class' attribute-
iris_data.drop('class', axis = 1, inplace=True)

# Add 'encoded_class' attribute-
iris_data['encoded_class'] = encoded_class


# Create an instance of DBSCAN with default values for
# 'eps' and 'min_samples' parameters-
dbscan = DBSCAN()

# Check default parameters being used-
dbscan
'''
DBSCAN(algorithm='auto', eps=0.5, leaf_size=30, metric='euclidean',
       metric_params=None, min_samples=5, n_jobs=None, p=None)
'''

# Train the dateset using DBSCAN clustering algorithm-
dbscan.fit(iris_data)

# To check outcome of DBSCAN clustering algorithm-
dbscan.labels_
# Noisy samples are given the label -1

# To get count of unique labels assigned to the different data points in dataset-
np.unique(dbscan.labels_)
array([-1, 0, 1, 2])

# A dict to count number of data points assigned to different labels by DBSCAN algorithm-
# label : number of data points assigned to label
count_elements = {}

count_elements[-1] = 0
count_elements[0] = 0
count_elements[1] = 0
count_elements[2] = 0

for i in dbscan.labels_:
    if i == -1:
        count_elements[-1] += 1
    elif i == 0:
        count_elements[0] += 1
    elif i == 1:
        count_elements[1] += 1
    elif i == 2:
        count_elements[2] += 1


count_elements
# {-1: 18, 0: 49, 1: 44, 2: 39}



print("\nNumber of data points and their computed labels are:\n{0}\n".format(count_elements))
'''
Number of data points and their computed labels are:
{-1: 18, 0: 49, 1: 44, 2: 39}
'''


# Visualize iris dataset using 'petallength' and 'petalwidth' attributes-
plt.scatter(iris_data['petallength'], iris_data['petalwidth'], c=iris_data['encoded_class'])

plt.xlabel('petal length')
plt.ylabel('petal width')
plt.show()


plt.scatter(iris_data['petallength'], iris_data['petalwidth'], c = dbscan.labels_)

plt.xlabel("Petal Length")
plt.ylabel("Petal Width")
plt.legend(list(dbscan.labels_))
plt.show()


for i in range(iris_data.shape[0]):
    if dbscan.labels_[i] == 0:
        c1 = plt.scatter(iris_data['petallength'], iris_data['petalwidth'], c = 'green', marker = '+')
    elif dbscan.labels_[i] == 1:
        c2 = plt.scatter(iris_data['petallength'], iris_data['petalwidth'], c = 'red', marker = 'o')
    elif dbscan.labels_[i] == 2:
        c3 = plt.scatter(iris_data['petallength'], iris_data['petalwidth'], c = 'blue', marker = '*')
    elif dbscan.labels_[i] == -1:
        c4 = plt.scatter(iris_data['petallength'], iris_data['petalwidth'], c = 'black', marker = '.') 

plt.legend([c1, c2, c3, c4], ['Cluster 1', 'Cluster 2', 'Cluster 3', 'Noise'])
plt.title("DBSCAN clustering finds 3 clusters and noise")
plt.show()

The last code for visualizing assigned clusters to each data point does not work. What's going wrong?

Thanks!

1

1 Answers

0
votes
data = np.load('clusterable_data.npy')
clusterer = hdbscan.HDBSCAN(min_cluster_size=15, prediction_data=True).fit(data)
pal = sns.color_palette('deep', 8)
colors = [sns.desaturate(pal[col], sat) for col, sat in zip(clusterer.labels_,
                                                            clusterer.probabilities_)]
plt.scatter(data.T[0], data.T[1], c=colors, **plot_kwds);