def get_dendrogram_edges(df: pd.DataFrame, as_graph: bool = True) -> Union[pd.DataFrame, Any]:
    """Build the edges of a Ward-linkage dendrogram over the rows of ``df``.

    The rows of ``df`` are clustered with ``scipy.cluster.hierarchy.linkage``
    (method ``'ward'``). Every merge in the resulting linkage matrix becomes a
    new parent node connected to the two clusters it joins, so ``n`` input
    rows yield ``n - 1`` merges and ``2 * (n - 1)`` edges.

    Node ids: the input rows are nodes ``0 .. n-1``; the parent created by the
    i-th merge is node ``n + i``.

    Args:
        :df: numeric dataframe of feature embeddings, one row per sample
            (e.g. the matrix produced by umap or featurize); needs >= 2 rows
        :as_graph: whether to return a graphistry graph or a dataframe of edges

    Returns:
        If ``as_graph`` is True, the result of
        ``graphistry.edges(edges, 'node1', 'node2')``; otherwise the edges
        dataframe itself, with columns ``node1`` (parent id), ``node2``
        (child id) and ``dist`` (linkage distance of that merge), and a
        unique ``0 .. 2*(n-1)-1`` index.

    Raises:
        ValueError: if ``df`` has fewer than two rows, since there is nothing
            to merge.

    Usage:
        ::
            g = graphistry.edges(edf, 'src', 'dst').nodes(ndf, 'node')
            g2 = g.umap().dbscan()  # or g2 = g.featurize()
            g3 = get_dendrogram_edges(g2.get_matrix(), as_graph=True)
    """
    from scipy.cluster.hierarchy import linkage

    num_samples = len(df)
    if num_samples < 2:
        # Fail early with an actionable message instead of an opaque error
        # from inside scipy; the exception type stays ValueError.
        raise ValueError(
            f'get_dendrogram_edges needs at least 2 rows to build a dendrogram, got {num_samples}'
        )

    # df is the numeric dataframe from umap, or featurize
    Z = linkage(df, 'ward')
    merges = pd.DataFrame(Z, columns=['src', 'dst', 'dist', 'size'])

    # linkage returns floats; the first two columns are cluster ids
    merges['src'] = merges['src'].astype(int)
    merges['dst'] = merges['dst'].astype(int)

    # The node created by a merge is its row position + the number of samples
    merges['new_node'] = merges.index + num_samples

    # One edge per (parent, child) pair: all 'src' children first, then all
    # 'dst' children. ignore_index gives the result a unique index rather
    # than repeating each merge's label twice.
    edges = pd.concat(
        [
            pd.DataFrame({
                'node1': merges['new_node'],
                'node2': merges[child],
                'dist': merges['dist'],
            })
            for child in ('src', 'dst')
        ],
        ignore_index=True,
    )

    # Handle data type
    edges['node1'] = edges['node1'].astype(int)
    edges['node2'] = edges['node2'].astype(int)

    if as_graph:
        # Imported lazily so the plain-dataframe path does not need to import
        # the graphistry package.
        import graphistry

        return graphistry.edges(edges, 'node1', 'node2')
    return edges
# The skip condition is declared once on the class so it covers every test in
# it; a pytest mark placed on ``setUp`` is not attached to any test item.
@pytest.mark.skipif(not has_dbscan, reason="requires ai dependencies")
class TestDendrogram(unittest.TestCase):
    """Checks that ``get_dendrogram_edges`` returns a graph with the expected edge columns."""

    def setUp(self) -> None:
        # One umap + dbscan graph per embedding kind, in the same order the
        # test iterates over them.
        g = graphistry.nodes(ndf).edges(edf, 'src', 'dst')
        self.gs = [
            g.umap(kind=kind, n_topics=2, dbscan=False).dbscan(kind=kind, verbose=True)
            for kind in ['nodes', 'edges']
        ]

    def testDendrogramToGraph(self):
        for kind, g2 in zip(['nodes', 'edges'], self.gs):
            g3 = get_dendrogram_edges(g2.get_matrix(kind=kind))
            self.assertTrue('node1' in g3._edges, 'dendrogram graph has no `node1` column')
            self.assertTrue('node2' in g3._edges, 'dendrogram graph has no `node2` column')