Robustness of the Leiden clustering

Robustness of the Leiden clustering#

Assessing variability using clustroids#

We now have a clustering of our data points; however, we wanted to assess the robustness of the clustering. For this, we used medoids, which are clustroids generated from the mean of each cluster.

We will perform cross validation of the previous clustering by :

Splitting the data into 5 folds
For each fold, we compute the Leiden clustering from scratch and assign a cluster to each training point.
We compute the clustroids in the original UMAP embedding space (using all data points, not just the ones in the fold). This is to make clusterings comparable across folds : we show the clustering assignments of the training points in a consistent latent space.
We use k-nearest neighbors to assign each test point to a cluster, again in the original UMAP embedding space.
We obtain test clustroids by computing the mean of each cluster in the test set.

For now, we simply show the original clustroids in the entire dataset.

import pandas as pd
import numpy as np

from pathlib import Path
import utils

# dimensionality reduction
from sklearn.decomposition import PCA
from umap import umap_ as UMAP

# plotting
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.offline as pyo
pyo.init_notebook_mode()

# clustering
from leiden_clustering import LeidenClustering

import utils

DATA_PATH = Path.cwd() / "../data"
FIG_PATH = Path.cwd() / "../src"
FILTER_TIMEOUT = False
if FILTER_TIMEOUT:
    FIG_PATH = FIG_PATH / "figures/leiden/"
else:
    FIG_PATH = FIG_PATH / "figures/leiden_with_timeout/"
FIG_PATH = FIG_PATH.resolve()
print(f"Figure path: {FIG_PATH}")
data = {
    Path(f).stem: pd.read_csv(f, index_col=0) for f in DATA_PATH.glob("combined_*.csv")
}
print(list(data.keys()))
if not FIG_PATH.exists():
    FIG_PATH.mkdir(parents=True)

EXPORT = False

Figure path: C:\Users\Cyril\Desktop\Code\ada-2023-project-adamants\src\figures\leiden_with_timeout
['combined_metrics_finished_edges', 'combined_metrics_finished_paths', 'combined_metrics_unfinished_edges', 'combined_metrics_unfinished_paths']

Reproducibility#

First, we will reproduce the results of the previous clustering and find the medoids using all the data.

features_finished_paths = data["combined_metrics_finished_paths"].reset_index(drop=True)
features_unfinished_paths = data["combined_metrics_unfinished_paths"].reset_index(
    drop=True
)

features_unfinished_paths = features_unfinished_paths[
    features_unfinished_paths["type"] != "timeout"
]


# extra step used for stratified sampling
features_unfinished_paths["origin"] = "unfinished"
features_finished_paths["origin"] = "finished"

combined_df = pd.concat([features_finished_paths, features_unfinished_paths], axis=0)
combined_df["finished"] = [1] * len(features_finished_paths) + [0] * len(
    features_unfinished_paths
)
combined_df[utils.FEATURES_COLS_USED_FOR_CLUSTERING] = utils.normalize_features(
    combined_df[utils.FEATURES_COLS_USED_FOR_CLUSTERING]
)

X = combined_df[utils.FEATURES_COLS_USED_FOR_CLUSTERING].copy()
print(X.columns)
X = X.values
# find number of NaNs in each column
nans = np.isnan(X).sum(axis=0)
print(nans)
# drop rows with NaNs
X = X[~np.isnan(X).any(axis=1)]

Index(['durationInSec', 'backtrack', 'numberOfPath', 'position_mean',
       'position_std', 'path_length', 'coarse_mean_time',
       'semantic_similarity', 'ratio', 'path_closeness_abs_sum'],
      dtype='object')
[0 0 3 0 0 0 0 0 0 0]

# fix seed for reproducibility
np.random.seed(1)
clustering = LeidenClustering(
    leiden_kws={"n_iterations": -1, "seed": 0, "resolution_parameter": 0.2},
    pca_kws={"n_components": 7},
)
clustering.fit(X)
clustering.labels_

c:\Users\Cyril\anaconda3\envs\DLbiomed\lib\site-packages\pynndescent\pynndescent_.py:346: NumbaPendingDeprecationWarning:

Code using Numba extension API maybe depending on 'old_style' error-capturing, which is deprecated and will be replaced by 'new_style' in a future release. See details at https://numba.readthedocs.io/en/latest/reference/deprecation.html#deprecation-of-old-style-numba-captured-errors
Exception origin:
  File "c:\Users\Cyril\anaconda3\envs\DLbiomed\lib\site-packages\numba\core\types\functions.py", line 486, in __getnewargs__
    raise ReferenceError("underlying object has vanished")


c:\Users\Cyril\anaconda3\envs\DLbiomed\lib\site-packages\pynndescent\pynndescent_.py:348: NumbaPendingDeprecationWarning:

Code using Numba extension API maybe depending on 'old_style' error-capturing, which is deprecated and will be replaced by 'new_style' in a future release. See details at https://numba.readthedocs.io/en/latest/reference/deprecation.html#deprecation-of-old-style-numba-captured-errors
Exception origin:
  File "c:\Users\Cyril\anaconda3\envs\DLbiomed\lib\site-packages\numba\core\types\functions.py", line 486, in __getnewargs__
    raise ReferenceError("underlying object has vanished")


c:\Users\Cyril\anaconda3\envs\DLbiomed\lib\site-packages\pynndescent\pynndescent_.py:358: NumbaPendingDeprecationWarning:

Code using Numba extension API maybe depending on 'old_style' error-capturing, which is deprecated and will be replaced by 'new_style' in a future release. See details at https://numba.readthedocs.io/en/latest/reference/deprecation.html#deprecation-of-old-style-numba-captured-errors
Exception origin:
  File "c:\Users\Cyril\anaconda3\envs\DLbiomed\lib\site-packages\numba\core\types\functions.py", line 486, in __getnewargs__
    raise ReferenceError("underlying object has vanished")

array([0, 1, 1, ..., 0, 3, 0])

# UMAP
umap = UMAP.UMAP(n_components=3, metric="euclidean")
result_umap_euc = umap.fit_transform(X)

c:\Users\Cyril\anaconda3\envs\DLbiomed\lib\site-packages\pynndescent\pynndescent_.py:346: NumbaPendingDeprecationWarning:

Code using Numba extension API maybe depending on 'old_style' error-capturing, which is deprecated and will be replaced by 'new_style' in a future release. See details at https://numba.readthedocs.io/en/latest/reference/deprecation.html#deprecation-of-old-style-numba-captured-errors
Exception origin:
  File "c:\Users\Cyril\anaconda3\envs\DLbiomed\lib\site-packages\numba\core\types\functions.py", line 486, in __getnewargs__
    raise ReferenceError("underlying object has vanished")


c:\Users\Cyril\anaconda3\envs\DLbiomed\lib\site-packages\pynndescent\pynndescent_.py:348: NumbaPendingDeprecationWarning:

Code using Numba extension API maybe depending on 'old_style' error-capturing, which is deprecated and will be replaced by 'new_style' in a future release. See details at https://numba.readthedocs.io/en/latest/reference/deprecation.html#deprecation-of-old-style-numba-captured-errors
Exception origin:
  File "c:\Users\Cyril\anaconda3\envs\DLbiomed\lib\site-packages\numba\core\types\functions.py", line 486, in __getnewargs__
    raise ReferenceError("underlying object has vanished")


c:\Users\Cyril\anaconda3\envs\DLbiomed\lib\site-packages\pynndescent\pynndescent_.py:358: NumbaPendingDeprecationWarning:

Code using Numba extension API maybe depending on 'old_style' error-capturing, which is deprecated and will be replaced by 'new_style' in a future release. See details at https://numba.readthedocs.io/en/latest/reference/deprecation.html#deprecation-of-old-style-numba-captured-errors
Exception origin:
  File "c:\Users\Cyril\anaconda3\envs\DLbiomed\lib\site-packages\numba\core\types\functions.py", line 486, in __getnewargs__
    raise ReferenceError("underlying object has vanished")

X_w_clusters = pd.concat(
    [
        combined_df[utils.FEATURES_COLS_USED_FOR_CLUSTERING]
        .dropna()
        .reset_index(drop=True),
        pd.DataFrame({"cluster": clustering.labels_}),
        pd.DataFrame(result_umap_euc, columns=["umap_1", "umap_2", "umap_3"]),
    ],
    axis=1,
)
# X_w_clusters

# find the clustroid of each cluster
clustroids = []
for cluster in np.unique(clustering.labels_):
    clustroid = X_w_clusters[X_w_clusters["cluster"] == cluster].mean()
    clustroids.append(clustroid)
clustroids = pd.DataFrame(clustroids)
clustroids

	durationInSec	backtrack	numberOfPath	position_mean	position_std	path_length	coarse_mean_time	semantic_similarity	ratio	path_closeness_abs_sum	cluster	umap_1	umap_2	umap_3
0	0.424659	-0.283879	-0.360384	0.399034	0.546873	0.257930	0.529426	-0.118988	0.432146	0.235192	0.0	1.089718	2.370224	3.910013
1	-0.437841	-0.250777	-0.314141	-0.574306	-0.352495	0.339709	-0.667307	0.414844	0.217711	-0.450524	1.0	0.638870	2.518832	3.230923
2	-0.717510	-0.214685	-0.250738	-0.119958	-0.742841	-1.142415	-0.174853	-0.282738	-1.251060	0.405756	2.0	9.072618	1.501182	6.086436
3	0.916725	1.533694	-0.035864	0.174894	0.323838	0.561424	0.209586	-0.036058	0.398022	-0.574947	3.0	0.927876	2.925985	5.234965
4	-0.236821	-0.196367	2.127195	-0.096609	0.021772	0.026851	-0.245901	0.142964	0.143711	0.107448	4.0	2.523256	1.447267	4.649573

Plot of the clustroids in UMAP space#

from plotly import graph_objects as go

fig_clusters = px.scatter_3d(
    result_umap_euc,
    x=0,
    y=1,
    z=2,
    color=clustering.labels_.astype(str),
    title="UMAP, clustering by leiden_clustering, with clustroids",
    # reduce size points
    size_max=0.1,
    # make scatter alpha = 0.1
    opacity=0.01,
)
fig_clusters.add_trace(
    go.Scatter3d(
        x=clustroids["umap_1"],
        y=clustroids["umap_2"],
        z=clustroids["umap_3"],
        mode="markers",
        marker=dict(
            size=10,
            symbol="cross",
            color="white"
            # color=clustering.labels_.astype(str),
        ),
        name="Clustroids",
    )
)
utils.format_3d_plot(fig_clusters)

if EXPORT:
    fig_clusters.write_html(FIG_PATH / "umap_w_clustroids.html")

display(fig_clusters)

N_FOLDS = 5
N_CLUSTERS = 5
features_used = utils.FEATURES_COLS_USED_FOR_CLUSTERING

5-fold cross validation#

Here we perform the 5-fold cross validation described above.

It is important to note that due to some randomness in the clustering algorithm, we perform a dichotomic search-like hyperparameter sweep to ensure that the number of clusters is the same across folds.

This is simply to ensure that the clusterings are comparable across folds; we have decided it was beyond the scope of this project to explore the differences in the number of clusters across folds, their reasons for appearing and their individual relevance.

from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier

kfold = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=1)

clustering_train_results = {}
clustering_test_results = {}
clustroids_train_results = {}
clustroids_test_results = {}

# run clustering on each fold, stratified by origin
cv_combined_df = combined_df.copy()[features_used + ["origin"]].dropna()
for fold, (train_idx, test_idx) in enumerate(
    kfold.split(cv_combined_df[features_used], cv_combined_df["origin"])
):
    print(f"Fold {fold}")
    X_train = cv_combined_df.iloc[train_idx][features_used].values
    X_test = cv_combined_df.iloc[test_idx][features_used].values

    print(f"Trains set : {X_train.shape}")
    print(f"Test set : {X_test.shape}")

    # find resolution parameter that leads to the previous number of clusters
    repeat = True
    resolution_parameter = 0.2
    resolution_parameters = [0.2]
    step=0.02
    while repeat:
        # fix seed for reproducibility
        print("Clustering train set")
        np.random.seed(1)
        train_clustering = LeidenClustering(
            leiden_kws={"n_iterations": -1, "seed": 0, "resolution_parameter": resolution_parameter},
            pca_kws={"n_components": 7},
        )
        train_clustering.fit(X_train)
        unique_labels=np.unique(train_clustering.labels_)
        print(f"Resolution parameter: {resolution_parameter}, n groups: {len(unique_labels)}")
        if len(unique_labels) == N_CLUSTERS:
            repeat = False
        # too many clusters, decrease resolution parameter
        elif len(unique_labels) > N_CLUSTERS:
            if resolution_parameter - step <= 0:
                #send error message
                raise ValueError("Could not find a resolution parameter that does not lead to more clusters than expected")
            if resolution_parameter - step in resolution_parameters:
                step=step/2.
            resolution_parameter -= step
        # too few clusters, increase resolution parameter
        else:
            if resolution_parameter + step >= 1:
                #send error message
                raise ValueError("Could not find a resolution parameter that does not lead to less clusters than expected")
            if resolution_parameter + step in resolution_parameters:
                step=step/2.
            resolution_parameter += step
        resolution_parameters.append(resolution_parameter)
            

        

    X_train_w_clusters = pd.concat(
        [
            combined_df[features_used].iloc[train_idx].dropna().reset_index(drop=True),
            pd.DataFrame({"cluster": train_clustering.labels_}),
            pd.DataFrame(
                result_umap_euc[train_idx], columns=["umap_1", "umap_2", "umap_3"]
            ),
        ],
        axis=1,
    )
    clustering_train_results[fold] = X_train_w_clusters
    # compute train clustroids
    print("Computing train clustroids")
    train_clustroids = []
    for cluster in np.unique(train_clustering.labels_):
        clustroid = X_train_w_clusters[X_train_w_clusters["cluster"] == cluster].mean()
        train_clustroids.append(clustroid)
    train_clustroids = pd.DataFrame(train_clustroids)
    clustroids_train_results[fold] = train_clustroids

    # test clustering using nearest neighbors
    print("Clustering test set using nearest neighbors")
    test_clustering = KNeighborsClassifier(n_neighbors=20)
    # fit on UMAP embeddings
    test_clustering.fit(result_umap_euc[train_idx], train_clustering.labels_)
    # predict on UMAP embeddings
    test_clustering.labels_ = test_clustering.predict(result_umap_euc[test_idx])

    X_test_w_clusters = pd.concat(
        [
            combined_df[features_used].iloc[test_idx].dropna().reset_index(drop=True),
            pd.DataFrame({"cluster": test_clustering.labels_}),
            pd.DataFrame(
                result_umap_euc[test_idx], columns=["umap_1", "umap_2", "umap_3"]
            ),
        ],
        axis=1,
    )
    clustering_test_results[fold] = X_test_w_clusters
    # find the clustroid of each cluster
    print("Computing test clustroids")
    test_clustroids = []
    for cluster in np.unique(train_clustering.labels_):
        clustroid = X_test_w_clusters[X_test_w_clusters["cluster"] == cluster].mean()
        test_clustroids.append(clustroid)
    test_clustroids = pd.DataFrame(test_clustroids)
    clustroids_test_results[fold] = test_clustroids
    colors = {
        "0": "red",
        "1": "blue",
        "2": "green",
        "3": "yellow",
        "4": "purple",
        "5": "orange",
    }
    # Train plot
    # fig = px.scatter_3d(
    #     result_umap_euc[train_idx],
    #     x=0,
    #     y=1,
    #     z=2,
    #     color=[colors[str(c)] for c in train_clustering.labels_],
    #     title=f"Train : UMAP, clustering by leiden_clustering\nFold {fold}",
    #     # reduce size points
    #     size_max=0.1,
    #     opacity=0.1,
    # )
    # fig
    # # Test plot
    # fig = px.scatter_3d(
    #     result_umap_euc[test_idx],
    #     x=0,
    #     y=1,
    #     z=2,
    #     color=[colors[str(c)] for c in test_clustering.labels_],
    #     title=f"Test : UMAP, clustering by leiden_clustering\nFold {fold}",
    #     # reduce size points
    #     size_max=0.1,
    #     opacity=0.1,
    # )
    # fig
    print("_" * 80)

Fold 0
Trains set : (50493, 10)
Test set : (12624, 10)
Clustering train set
Resolution parameter: 0.2, n groups: 5
Computing train clustroids
Clustering test set using nearest neighbors
Computing test clustroids
________________________________________________________________________________
Fold 1
Trains set : (50493, 10)
Test set : (12624, 10)
Clustering train set
Resolution parameter: 0.2, n groups: 5
Computing train clustroids
Clustering test set using nearest neighbors
Computing test clustroids
________________________________________________________________________________
Fold 2
Trains set : (50494, 10)
Test set : (12623, 10)
Clustering train set
Resolution parameter: 0.2, n groups: 5
Computing train clustroids
Clustering test set using nearest neighbors
Computing test clustroids
________________________________________________________________________________
Fold 3
Trains set : (50494, 10)
Test set : (12623, 10)
Clustering train set
Resolution parameter: 0.2, n groups: 5
Computing train clustroids
Clustering test set using nearest neighbors
Computing test clustroids
________________________________________________________________________________
Fold 4
Trains set : (50494, 10)
Test set : (12623, 10)
Clustering train set
Resolution parameter: 0.2, n groups: 4
Clustering train set
Resolution parameter: 0.22, n groups: 5
Computing train clustroids
Clustering test set using nearest neighbors
Computing test clustroids
________________________________________________________________________________

Assessing the variability of the clustering across folds#

We now face a problem linked to the implementation of Leiden clustering: the cluster labels are not consistent across folds.

Therefore, if we now try to compare distributions of cluster features across folds, we will not be able to compare properly the clusters.

Below we show the means of each feature across clusters, for each fold; where the issue is readily apparent.

Train set features means across folds#

# barplot of the mean of each feature for all the train sets
for fold, df in clustering_train_results.items():
    df["fold"] = fold
train_features_df = pd.concat(clustering_train_results.values(), axis=0)
train_features_df = train_features_df.groupby(["fold", "cluster"]).mean().reset_index()
with plt.style.context("dark_background"):
    fig, axs = plt.subplots(5, 2, figsize=(20, 40))
    fig.suptitle("Train set features means per cluster", fontsize=30)
    axs = axs.flatten()
    for i, feature in enumerate(features_used):
        sns.barplot(
            data=train_features_df,
            x="fold",
            y=feature,
            hue="cluster",
            errorbar="ci",
            palette="tab10",
            ax=axs[i],
        )
        axs[i].set_xlabel("Fold", fontsize=20)
        axs[i].set_ylabel("Feature value", fontsize=20)
        axs[i].set_title(utils.get_feature_names_labels()[i], fontsize=25)
        utils.set_axis_style(axs, i, add_xlabel=False, add_ylabel=False)

_images/d80b24b19ea226eed609a05951e4c463f2a41841a403f95710c889cee941b26c.png

Test set features means across folds#

for fold, df in clustering_test_results.items():
    df["fold"] = fold
test_features_df = pd.concat(clustering_test_results.values(), axis=0)
test_features_df = test_features_df.groupby(["fold", "cluster"]).mean().reset_index()
test_features_df
with plt.style.context("dark_background"):
    fig, axs = plt.subplots(5, 2, figsize=(20, 40))
    fig.suptitle("Test set features means per cluster", fontsize=30)
    axs = axs.flatten()
    for i, feature in enumerate(features_used):
        sns.barplot(
            data=test_features_df,
            x="fold",
            y=feature,
            hue="cluster",
            errorbar="ci",
            palette="tab10",
            ax=axs[i],
        )
        axs[i].set_xlabel("Fold", fontsize=20)
        axs[i].set_ylabel("Feature value", fontsize=20)
        axs[i].set_title(utils.get_feature_names_labels()[i], fontsize=25)
        utils.set_axis_style(axs, i, add_xlabel=False, add_ylabel=False)
plt.show()

_images/f5da71fb1b730802da4334b220e2f17b8daa3357d18d9e257ee4e61e3b2aa4dd.png

for fold in range(len(clustroids_test_results)):
    clustroids_test_results[fold]["fold"] = fold

clustroids_test_results_tmp = pd.concat(clustroids_test_results.values()).copy()
test_clustroids = clustroids_test_results_tmp.groupby(["fold", "cluster"]).mean()
clustroid_coordinates = test_clustroids[["umap_1", "umap_2", "umap_3"]]
display(clustroid_coordinates)

		umap_1	umap_2	umap_3
fold	cluster
0	0.0	-0.222507	2.921545	2.095573
	1.0	9.051306	1.097376	6.595613
	2.0	0.442238	2.191711	4.916422
	3.0	0.669249	3.365278	5.732391
	4.0	2.470189	1.275109	5.818295
1	0.0	0.530229	2.504511	4.375679
	1.0	9.091880	1.020848	6.567221
	2.0	-0.368833	2.894123	1.754738
	3.0	0.708486	3.349034	5.666735
	4.0	2.836004	1.284972	5.769408
2	0.0	-0.365075	2.969920	3.141656
	1.0	9.204134	1.119922	6.536933
	2.0	1.696871	1.903363	3.706254
	3.0	0.605741	3.364125	5.894669
	4.0	2.748121	1.368847	5.572644
3	0.0	0.513959	2.769251	4.130853
	1.0	9.431116	2.101022	6.301825
	2.0	2.482135	0.742329	2.370551
	3.0	0.700518	3.404507	5.635223
	4.0	2.801880	1.372310	5.619553
4	0.0	1.665734	1.934040	4.097106
	1.0	-0.250267	2.904638	2.904988
	2.0	9.432258	1.526280	6.356268
	3.0	0.516477	3.354327	5.775280
	4.0	2.868383	1.415569	5.682860

Matching to the rescue#

However, there is a way to prevent this issue : matching. To do this, we can assign the clustroids for a given cluster in a fold to the closest clustroids across other folds. This way, we can compare the clusters by assigning them to other clusters that are likely the same.

Note that this is done in the UMAP embedding space, not in the original space.

We can then verify the correctness of the matching by looking at the distribution of the features for the matched clusters, which should now be similar across folds.

nearest_clustroids = {}
# report which cluster to match with which in other folds
# in the format (fold_i, cluster_i) : {fold_j1, cluster_j1, distance_j1}, {fold_j2, cluster_j2, distance_j2}, ...
# of shape n_folds x n_clusters x n_clusters-1
for fold_i in range(N_FOLDS):
    for cluster_i in range(N_CLUSTERS):
        nearest_clustroids[(fold_i, cluster_i)] = []
        for fold_j in range(N_FOLDS):
            if fold_i == fold_j:
                continue
            distances = {}
            for cluster_j in range(N_CLUSTERS):
                distances[fold_j, cluster_j] = np.linalg.norm(
                    clustroid_coordinates.loc[fold_i, cluster_i]
                    - clustroid_coordinates.loc[fold_j, cluster_j]
                )
            # find minimum distance and append according to format
            # print(list(distances.values()))
            min_dist_id = np.argmin(list(distances.values()))
            nearest_clustroids[(fold_i, cluster_i)].append(
                (fold_j, min_dist_id, distances[fold_j, min_dist_id])
            )
display(nearest_clustroids)

{(0, 0): [(1, 2, 0.37193030254210946),
  (2, 0, 1.056860421247269),
  (3, 0, 2.1697784798021984),
  (4, 1, 0.8100669845921427)],
 (0, 1): [(1, 1, 0.09115345593678542),
  (2, 1, 0.16525172329922452),
  (3, 1, 1.112597436958438),
  (4, 2, 0.6215858651214851)],
 (0, 2): [(1, 0, 0.6308637105170848),
  (2, 3, 1.5356598760667617),
  (3, 0, 0.9776578754888323),
  (4, 3, 1.4473503508557508)],
 (0, 3): [(1, 3, 0.07819346147005983),
  (2, 3, 0.17426572509453697),
  (3, 3, 0.10935397515289522),
  (4, 3, 0.15905545956814562)],
 (0, 4): [(1, 4, 0.36919928640238986),
  (2, 4, 0.3825924393202428),
  (3, 4, 0.3987045063915007),
  (4, 4, 0.44342962152006804)],
 (1, 0): [(0, 2, 0.6308637105170848),
  (2, 2, 1.4732837075211924),
  (3, 0, 0.36095951301559276),
  (4, 0, 1.3009280165802424)],
 (1, 1): [(0, 1, 0.09115345593678542),
  (2, 1, 0.15275502724581208),
  (3, 1, 1.162881631275063),
  (4, 2, 0.6448412188120699)],
 (1, 2): [(0, 0, 0.37193030254210946),
  (2, 0, 1.3889926056258408),
  (3, 0, 2.537880652783642),
  (4, 1, 1.1563926211572984)],
 (1, 3): [(0, 3, 0.07819346147005983),
  (2, 3, 0.25047594629062114),
  (3, 3, 0.06429413654209792),
  (4, 3, 0.2206299625051465)],
 (1, 4): [(0, 4, 0.36919928640238986),
  (2, 4, 0.2312456765662018),
  (3, 4, 0.17677423896900898),
  (4, 4, 0.15998266602384933)],
 (2, 0): [(0, 0, 1.056860421247269),
  (1, 2, 1.3889926056258408),
  (3, 0, 1.3384618267464425),
  (4, 1, 0.2710243688665435)],
 (2, 1): [(0, 1, 0.16525172329922452),
  (1, 1, 0.15275502724581208),
  (3, 1, 1.034095955336564),
  (4, 2, 0.49980704440777013)],
 (2, 2): [(0, 2, 1.7668492993313973),
  (1, 0, 1.4732837075211924),
  (3, 0, 1.5262139743371972),
  (4, 0, 0.39328853131222913)],
 (2, 3): [(0, 3, 0.17426572509453697),
  (1, 3, 0.25047594629062114),
  (3, 3, 0.2791511363328339),
  (4, 3, 0.1493903542337602)],
 (2, 4): [(0, 4, 0.3825924393202428),
  (1, 4, 0.2312456765662018),
  (3, 4, 0.07143131883761729),
  (4, 4, 0.16968673346234558)],
 (3, 0): [(0, 2, 0.9776578754888323),
  (1, 0, 0.36095951301559276),
  (2, 0, 1.3384618267464425),
  (4, 0, 1.4231316160915775)],
 (3, 1): [(0, 1, 1.112597436958438),
  (1, 1, 1.162881631275063),
  (2, 1, 1.034095955336564),
  (4, 2, 0.5773162580174857)],
 (3, 2): [(0, 0, 3.4842047905330342),
  (1, 0, 3.3069272220131736),
  (2, 2, 1.9361671796201325),
  (4, 0, 2.251150787265667)],
 (3, 3): [(0, 3, 0.10935397515289522),
  (1, 3, 0.06429413654209792),
  (2, 3, 0.2791511363328339),
  (4, 3, 0.2366541397690336)],
 (3, 4): [(0, 4, 0.3987045063915007),
  (1, 4, 0.17677423896900898),
  (2, 4, 0.07143131883761729),
  (4, 4, 0.10149813468599767)],
 (4, 0): [(0, 2, 1.494863323358966),
  (1, 0, 1.3009280165802424),
  (2, 2, 0.39328853131222913),
  (3, 0, 1.4231316160915775)],
 (4, 1): [(0, 0, 0.8100669845921427),
  (1, 2, 1.1563926211572984),
  (2, 0, 0.2710243688665435),
  (3, 0, 1.4509018767628095)],
 (4, 2): [(0, 1, 0.6215858651214851),
  (1, 1, 0.6448412188120699),
  (2, 1, 0.49980704440777013),
  (3, 1, 0.5773162580174857)],
 (4, 3): [(0, 3, 0.15905545956814562),
  (1, 3, 0.2206299625051465),
  (2, 3, 0.1493903542337602),
  (3, 3, 0.2366541397690336)],
 (4, 4): [(0, 4, 0.44342962152006804),
  (1, 4, 0.15998266602384933),
  (2, 4, 0.16968673346234558),
  (3, 4, 0.10149813468599767)]}

# make a dataframe that shows for folds : cluster_from, cluster_to, fold, distance
matched_df = pd.DataFrame(
    columns=["fold_from", "cluster_from", "fold_to", "cluster_to", "distance"]
)
for fold_i in range(N_FOLDS):
    for cluster_i in range(N_CLUSTERS):
        for fold_j, cluster_j, distance in nearest_clustroids[(fold_i, cluster_i)]:
            matched_df = pd.concat(
                [
                    matched_df,
                    pd.DataFrame(
                        {
                            "fold_from": [fold_i],
                            "cluster_from": [cluster_i],
                            "fold_to": [fold_j],
                            "cluster_to": [cluster_j],
                            "distance": [distance],
                        }
                    ),
                ]
            )
display(matched_df)

	fold_from	cluster_from	fold_to	cluster_to	distance
0	0	0	1	2	0.371930
0	0	0	2	0	1.056860
0	0	0	3	0	2.169778
0	0	0	4	1	0.810067
0	0	1	1	1	0.091153
...	...	...	...	...	...
0	4	3	3	3	0.236654
0	4	4	0	4	0.443430
0	4	4	1	4	0.159983
0	4	4	2	4	0.169687
0	4	4	3	4	0.101498

100 rows × 5 columns

def extract_fold_coordinates(fold, matched_df=matched_df):
    matched_df = matched_df.loc[matched_df["fold_from"] == fold]
    coordinates_matched = []
    for cluster_from in matched_df["cluster_from"].unique():
        coordinates_matched.append([clustroid_coordinates.loc[fold, cluster_from]])
        other_coords = [
            clustroid_coordinates.loc[
                matched_df.loc[
                    (matched_df["cluster_from"] == cluster_from)
                    & (matched_df["cluster_to"] == cluster_to)
                ]["fold_to"].values[0],
                cluster_to,
            ]
            for cluster_to in matched_df.loc[
                (matched_df["cluster_from"] == cluster_from)
            ]["cluster_to"].values
        ]
        coordinates_matched[-1].extend(other_coords)
    coordinates_matched = np.array(
        coordinates_matched
    )  # shape is N_FOLDS x N_CLUSTERS x 3
    coordinates_matched = pd.DataFrame(
        coordinates_matched.reshape(N_CLUSTERS * N_FOLDS, 3),
        columns=["umap_1", "umap_2", "umap_3"],
    )
    coordinates_matched["cluster"] = np.repeat(np.arange(N_CLUSTERS), N_FOLDS)
    return coordinates_matched


coordinates_matched_fold0 = extract_fold_coordinates(0)

Plot of clustroids variation across folds#

Now that we have matched the clusters, we can start comparing them across folds. In the following plot, we show the clustroids of each fold, matched to the clustroids of the fourth fold. The clustroids are overall similar across folds, and match the ones in the original clustering in the data, demonstrating a degree of robustness of the clustering.

# we use the fourth fold as it contains the best matchings
coordinates_matched_fold4 = extract_fold_coordinates(4)
fig = px.scatter_3d(
    coordinates_matched_fold4,
    x="umap_1",
    y="umap_2",
    z="umap_3",
    color=coordinates_matched_fold4["cluster"].astype(str),
    title=f"Variability of clustroids across folds",
    opacity=0.7,
    symbol=list(range(5)) * 5,
    # color_continuous_scale=px.colors.sequential.haline,
)
fig.add_trace(
    go.Scatter3d(
        x=clustroids["umap_1"],
        y=clustroids["umap_2"],
        z=clustroids["umap_3"],
        mode="markers",
        marker=dict(
            size=10,
            symbol="cross",
            color="black"
            # color=clustering.labels_.astype(str),
        ),
        name="Original clustroids",
    )
)

fig.update_layout(
    {"plot_bgcolor": "#14181e", "paper_bgcolor": "#14181e"}
)
fig.update_layout(font_color="white")
fig.update_layout(
    scene=dict(
        xaxis=dict(showticklabels=False),
        yaxis=dict(showticklabels=False),
        zaxis=dict(showticklabels=False),
    )
)
fig.update_layout(legend_title_text="Cluster")
fig.update_layout(
    scene=dict(xaxis_title="UMAP 1", yaxis_title="UMAP 2", zaxis_title="UMAP 3")
)
fig.update_layout(
    legend=dict(
        yanchor="top",
        y=0.99,
        xanchor="left",
        x=0.01,
        font=dict(color="white"),
        bgcolor="rgba(0,0,0,0)",
        title=dict(text="Fold", font=dict(color="white")),
    )
)

if EXPORT:
    fig.write_html(FIG_PATH / "clustroids_variability.html")

display(fig)

Comparing the above figure with the original clustering, we can see that the variaibility of clustroids across folds is relatively low, as they remain roughly in the same region of the embedding space.

matched_df_fold4 = matched_df.loc[matched_df["fold_from"] == 4]
# display(matched_df_fold4)

# get the features using cluster matching from fold 4
features_matched = test_features_df
# reorder using the matched_df_fold4
# display(features_matched)
i = 0
matched_features_means = pd.DataFrame(columns=features_matched.columns)
for row in matched_df_fold4.iterrows():
    if i == row[1]["cluster_from"]:
        # append the row from fold 4
        new_row = (
            features_matched.loc[
                (features_matched["fold"] == 4)
                & (features_matched["cluster"] == row[1]["cluster_from"])
            ]
            .iloc[0]
            .to_dict()
        )
        # print(new_row)
        matched_features_means.loc[len(matched_features_means)] = new_row
        i += 1
    # find fold_to and cluster_to in fold and cluster columns of matched_df_fold4
    new_row = (
        features_matched.loc[
            (features_matched["fold"] == row[1]["fold_to"])
            & (features_matched["cluster"] == row[1]["cluster_to"])
        ]
        .iloc[0]
        .to_dict()
    )
    # print(new_row)
    matched_features_means.loc[len(matched_features_means)] = new_row
matched_features_means

	fold	cluster	durationInSec	backtrack	numberOfPath	position_mean	position_std	path_length	coarse_mean_time	semantic_similarity	ratio	path_closeness_abs_sum	umap_1	umap_2	umap_3
0	4.0	0.0	0.128477	-0.003097	-0.074372	0.126786	0.121787	0.053865	0.132193	-0.064552	0.073602	0.024466	1.665734	1.934040	4.097106
1	0.0	2.0	0.231577	0.004454	-0.073484	0.127134	0.233443	0.312482	0.114587	0.026263	0.161534	-0.396978	0.442238	2.191711	4.916422
2	1.0	0.0	0.217110	-0.028086	-0.103809	0.043844	0.116783	0.227241	0.154091	0.064086	0.171464	-0.221947	0.530229	2.504511	4.375679
3	2.0	2.0	0.249968	-0.020335	-0.145933	0.190363	0.188121	0.062080	0.291250	-0.135683	0.105230	0.072671	1.696871	1.903363	3.706254
4	3.0	0.0	0.097231	-0.032822	-0.058690	0.032180	0.081681	0.122511	0.066013	-0.008233	0.149466	-0.066274	0.513959	2.769251	4.130853
5	4.0	1.0	-0.063926	-0.055610	-0.015169	-0.080708	-0.010413	0.111005	-0.128595	0.071821	0.180073	-0.100466	-0.250267	2.904638	2.904988
6	0.0	0.0	-0.034107	-0.070394	-0.041902	0.013160	0.076710	0.091436	-0.066956	0.006308	0.290530	0.124542	-0.222507	2.921544	2.095573
7	1.0	2.0	-0.132791	-0.069731	-0.027119	-0.014553	0.065786	0.055099	-0.175973	0.012603	0.323879	0.182646	-0.368833	2.894122	1.754738
8	2.0	0.0	-0.035718	-0.085595	-0.036968	-0.060103	0.045504	0.172787	-0.115039	0.071317	0.281640	-0.091514	-0.365075	2.969920	3.141656
9	3.0	0.0	0.097231	-0.032822	-0.058690	0.032180	0.081681	0.122511	0.066013	-0.008233	0.149466	-0.066274	0.513959	2.769251	4.130853
10	4.0	2.0	-0.179948	-0.016012	-0.051763	-0.095219	-0.236951	-0.334659	-0.036912	-0.032880	-0.475565	0.101863	9.432258	1.526280	6.356268
11	0.0	1.0	-0.201638	-0.024594	-0.065738	-0.097648	-0.280816	-0.411736	-0.012442	-0.005666	-0.611842	0.108413	9.051306	1.097376	6.595613
12	1.0	1.0	-0.230623	-0.037027	-0.085927	-0.077564	-0.251181	-0.438571	-0.035920	-0.047505	-0.622894	0.122458	9.091880	1.020848	6.567221
13	2.0	1.0	-0.244096	-0.062702	-0.087708	-0.062091	-0.263388	-0.422544	-0.051974	-0.037946	-0.592933	0.153347	9.204134	1.119922	6.536932
14	3.0	1.0	-0.164569	-0.051389	-0.049980	-0.182646	-0.328923	-0.315587	-0.008140	0.012639	-0.510046	0.078346	9.431116	2.101022	6.301824
15	4.0	3.0	0.230110	0.371074	0.061384	0.026120	0.034558	0.156888	0.076354	-0.004570	0.127013	-0.135346	0.516477	3.354327	5.775280
16	0.0	3.0	0.415415	0.567852	-0.027587	0.041751	0.066346	0.247979	0.178877	-0.029607	0.172722	-0.228782	0.669249	3.365278	5.732392
17	1.0	3.0	0.376236	0.485671	0.011243	0.045355	0.084920	0.260586	0.155806	-0.021688	0.169559	-0.206591	0.708486	3.349034	5.666735
18	2.0	3.0	0.395049	0.627031	-0.017101	0.045529	0.063283	0.257393	0.152625	-0.025583	0.182607	-0.219050	0.605741	3.364125	5.894669
19	3.0	3.0	0.282822	0.374978	-0.025190	0.091279	0.053827	0.164162	0.154559	-0.015552	0.115309	-0.149541	0.700518	3.404507	5.635224
20	4.0	4.0	-0.099664	-0.096077	0.511264	0.078605	0.113303	0.039625	-0.113031	0.022589	0.181226	0.137006	2.868382	1.415570	5.682860
21	0.0	4.0	-0.015396	-0.068666	0.910530	0.029604	0.106260	0.086130	-0.048239	0.030499	0.221335	0.050367	2.470189	1.275109	5.818295
22	1.0	4.0	-0.038961	-0.022366	0.742955	-0.047049	0.054126	0.133663	-0.119726	0.095923	0.171580	0.024705	2.836004	1.284972	5.769408
23	2.0	4.0	-0.118191	-0.038595	0.871693	0.006262	0.078820	0.062640	-0.178330	0.064974	0.165133	0.102989	2.748121	1.368847	5.572645
24	3.0	4.0	-0.050993	-0.038264	0.494698	-0.115044	0.005541	0.118942	-0.105956	0.021012	0.157887	-0.007868	2.801880	1.372310	5.619553

Variability of features across folds in matched clusters#

Finally, we can compare the distributions of features across folds, in the matched clusters.

It is possible that due to how the matching was performed (using clustroid distances in the UMAP latent space generated from all data points), the clusters are not matched properly when we go back to feature space.

However, the distributions of features across folds are still very similar in most cases.

matched_features_means["matched_cluster"] = np.repeat(np.arange(5), 5)
feature_names_titles = utils.get_feature_names_labels()
with plt.style.context("dark_background"):
    # fig.tight_layout()
    # axs[-1].axis("off")
    fig, axs = plt.subplots(5, 2, figsize=(20, 40))
    axs = axs.flatten()
    for i, feature in enumerate(features_used):
        sns.barplot(
            data=matched_features_means,
            x="matched_cluster",
            y=feature,
            hue="fold",
            errorbar="ci",
            palette="tab10",
            ax=axs[i],
        )
        axs[i].axhline(y=0, color="white", linestyle="--", alpha=0.5)
        axs[i].set_xlabel("Matched clusters", fontsize=20)
        axs[i].set_ylabel("Feature value", fontsize=20)
        axs[i].set_title(feature_names_titles[i], fontsize=20)
        utils.set_axis_style(axs, i, add_xlabel=False, add_ylabel=False)
        leg = axs[i].get_legend()
        leg.set_title("Fold")

    # export with transparent background
    if EXPORT:
        fig.savefig(
            FIG_PATH / "CV_matched_clusters_feattures_means_results.png",
            transparent=True,
            bbox_inches="tight",
            pad_inches=0,
        )
    plt.suptitle("Features means per cluster", fontsize=30)
    # fig.tight_layout()
plt.show()

_images/b55bf0a6ea3e372b59d52461f1a75f1a5939b640b5d69b2febe06a743d58aeae.png

# same as above without subplots, FOR DATA STORY
################################################
# for i, feature in enumerate(features_used):
#     plt.figure(figsize=(20, 10))
#     sns.barplot(
#         data=matched_features_means,
#         x="matched_cluster",
#         y=feature,
#         hue="fold",
#         errorbar="ci",
#         palette="tab10",
#     )
#     plt.axhline(y=0, color="white", linestyle="--", alpha=0.5)
#     plt.xlabel("Matched clusters", fontsize=20)
#     plt.ylabel("Feature value", fontsize=20)
#     plt.title(feature_names_titles[i], fontsize=20)
#     # export with transparent background
#     if EXPORT:
#         plt.savefig(
#             FIG_PATH / f"CV_matched_clusters_features_means_results_{feature}.png",
#             transparent=True,
#             bbox_inches="tight",
#             pad_inches=0,
#         )
#     plt.show()