I came across the exact same problem some time ago. The way I managed to plot the dendrogram was by using the software package ete3, which can flexibly plot trees with various options. The only difficulty was converting sklearn's children_ output to the Newick tree format that ete3 can read and understand.
Furthermore, I needed to manually compute the dendrite's span, because that information is not provided with children_.
Here is a snippet of the code I used. It computes the Newick tree and then shows the ete3 Tree data structure. For more details on how to plot, take a look at the ete3 documentation.
import numpy as np
from sklearn.cluster import AgglomerativeClustering
import ete3
def build_Newick_tree(children,n_leaves,X,leaf_labels,spanner):
    """
    build_Newick_tree(children,n_leaves,X,leaf_labels,spanner)

    Build the Newick string for the whole tree described by the
    sklearn AgglomerativeClustering.fit output.

    Input:
        children: AgglomerativeClustering.children_
        n_leaves: AgglomerativeClustering.n_leaves_
        X: parameters supplied to AgglomerativeClustering.fit
        leaf_labels: The label of each parameter array in X
        spanner: Callable that computes the dendrite's span

    Output:
        ntree: A str with the Newick tree representation, terminated
               by ';'
    """
    # The traversal starts at the node with the highest id, i.e. the one
    # whose children are stored in the last row of `children`.
    root_id = len(children) + n_leaves - 1
    subtree = go_down_tree(children, n_leaves, X, leaf_labels, root_id, spanner)
    # go_down_tree returns a pair; only its first element (the Newick text)
    # is needed here.
    newick_body = subtree[0]
    return newick_body + ';'
def go_down_tree(children,n_leaves,X,leaf_labels,nodename,spanner):
    """
    go_down_tree(children,n_leaves,X,leaf_labels,nodename,spanner)

    Recursive function that traverses the subtree that descends from
    nodename and returns the Newick representation of the subtree.

    Input:
        children: AgglomerativeClustering.children_
        n_leaves: AgglomerativeClustering.n_leaves_
        X: parameters supplied to AgglomerativeClustering.fit
        leaf_labels: The label of each parameter array in X
        nodename: An int. Values below n_leaves name a leaf (an index
                  into X and leaf_labels); any other value is an
                  intermediate node whose children are located in
                  children[nodename-n_leaves].
        spanner: Callable that computes the dendrite's span

    Output:
        ntree: For an intermediate node, a str with the Newick
               representation of the subtree (without the trailing ';');
               for a leaf, the entry of leaf_labels as is.
        samples: 2D numpy array stacking the rows of X that belong to
                 the subtree, one row per leaf.
    """
    if nodename < n_leaves:
        # Leaf: index directly with the leaf id instead of the negative
        # offset nodename-n_leaves, which only works when the sequences
        # hold exactly n_leaves entries.
        return leaf_labels[nodename], np.array([X[nodename]])

    child0, child1 = children[nodename - n_leaves]
    # The spanner has to be forwarded to the recursive calls.
    branch0, branch0samples = go_down_tree(children, n_leaves, X, leaf_labels, child0, spanner)
    branch1, branch1samples = go_down_tree(children, n_leaves, X, leaf_labels, child1, spanner)

    node = np.vstack((branch0samples, branch1samples))
    nodespan = spanner(node)
    # Branch length = span of the merged cluster minus span of the child.
    branch0distance = nodespan - spanner(branch0samples)
    branch1distance = nodespan - spanner(branch1samples)

    newick = '({branch0}:{branch0distance},{branch1}:{branch1distance})'.format(
        branch0=branch0,
        branch0distance=branch0distance,
        branch1=branch1,
        branch1distance=branch1distance,
    )
    return newick, node
def get_cluster_spanner(aggClusterer):
    """
    spanner = get_cluster_spanner(aggClusterer)

    Get a callable that computes a given cluster's span. To compute
    a cluster's span, call spanner(cluster)

    The cluster must be a 2D numpy array, where the axis=0 holds
    separate cluster members and the axis=1 holds the different
    variables.

    Input:
        aggClusterer: sklearn.cluster.AgglomerativeClustering instance.
                      Its `linkage` attribute is read, together with
                      `metric` and/or `affinity` (whichever is present
                      and set) and, for ward linkage, `pooling_func`
                      when it is available and callable.

    Output:
        spanner: Callable mapping a 2D cluster array to a scalar span.
            ward:     sum of squared deviations from the pooled
                      (by default mean) cluster member.
            complete: maximum pairwise distance inside the cluster.
            average:  mean of the full pairwise distance matrix
                      (the zero diagonal is included).
            For complete/average linkage, 'euclidean' uses the squared
            pairwise distance, 'l2' the un-squared one, 'l1'/'manhattan'
            the sum of absolute differences and 'cosine' the cosine
            distance (1 - cosine similarity).

    Raises:
        AttributeError: if the linkage or the affinity/metric value is
                        not one of the supported combinations.
    """
    linkage = aggClusterer.linkage

    # Depending on the scikit-learn release, the distance is stored in
    # `affinity`, in `metric`, or in both (with the unused one set to None or
    # to the placeholder string 'deprecated'). Take the first one that holds
    # a real value and fall back to the library default otherwise.
    affinity = None
    for attr in ('metric', 'affinity'):
        candidate = getattr(aggClusterer, attr, None)
        if candidate is None:
            continue
        if isinstance(candidate, str) and candidate == 'deprecated':
            continue
        affinity = candidate
        break
    if affinity is None:
        affinity = 'euclidean'

    def _squared_euclidean(x):
        return np.sum((x[:, None, :] - x[None, :, :]) ** 2, axis=2)

    def _manhattan(x):
        return np.sum(np.abs(x[:, None, :] - x[None, :, :]), axis=2)

    def _l2(x):
        return np.sqrt(_squared_euclidean(x))

    def _cosine(x):
        # (n, n) matrix of cosine distances: 1 - <xi, xj> / (|xi| |xj|).
        norms = np.sqrt(np.sum(x * x, axis=1))
        similarity = np.dot(x, x.T) / (norms[:, None] * norms[None, :])
        return 1.0 - similarity

    pairwise_distances = {
        'euclidean': _squared_euclidean,
        'l1': _manhattan,
        'manhattan': _manhattan,
        'l2': _l2,
        'cosine': _cosine,
    }

    if linkage == 'ward':
        if not (isinstance(affinity, str) and affinity == 'euclidean'):
            # Without this check no spanner would be defined for this case.
            raise AttributeError('Unknown affinity attribute value {0}.'.format(affinity))
        # `pooling_func` is not available (or not callable) on every
        # scikit-learn release; use the mean in that case.
        pool = getattr(aggClusterer, 'pooling_func', None)
        if not callable(pool):
            pool = np.mean
        return lambda x: np.sum((x - pool(x, axis=0)) ** 2)

    if linkage == 'complete':
        reducer = np.max
    elif linkage == 'average':
        reducer = np.mean
    else:
        raise AttributeError('Unknown linkage attribute value {0}.'.format(linkage))

    pairwise = pairwise_distances.get(affinity) if isinstance(affinity, str) else None
    if pairwise is None:
        raise AttributeError('Unknown affinity attribute value {0}.'.format(affinity))

    return lambda x: reducer(pairwise(x))
# compute_full_tree could also be set to 'auto'; it is left as True here so
# that the entire tree gets plotted.
clusterer = AgglomerativeClustering(n_clusters=2, compute_full_tree=True)
# X stands for whatever data you want to fit.
clusterer.fit(X)
spanner = get_cluster_spanner(clusterer)
# leaf_labels is a list of labels, one for each entry in X.
newick_tree = build_Newick_tree(
    clusterer.children_,
    clusterer.n_leaves_,
    X,
    leaf_labels,
    spanner,
)
tree = ete3.Tree(newick_tree)
tree.show()