Clustering using DBSCAN and its extension OPTICS#

The goal here is to see whether the clusters visible in the UMAP are interesting — and, as we will see, they are not.

import pandas as pd
import numpy as np
from pathlib import Path
import utils

# dimensionality reduction
from sklearn.decomposition import PCA
from umap import umap_ as UMAP

# graphing
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import plotly.express as px
import plotly.offline as pyo
pyo.init_notebook_mode()
# clustering
from sklearn.cluster import OPTICS, cluster_optics_dbscan

Preprocessing of the data#

# Load every "combined_*" CSV from the data directory into a dict keyed by
# file stem; the first CSV column is used as the DataFrame index.
DATA_PATH = Path.cwd() / "../data"
data = {}
for csv_file in DATA_PATH.glob("combined_*.csv"):
    data[csv_file.stem] = pd.read_csv(csv_file, index_col=0)
print(list(data.keys()))
['combined_metrics_finished_edges', 'combined_metrics_finished_paths', 'combined_metrics_unfinished_edges', 'combined_metrics_unfinished_paths']
# Reset to positional indices for the two path tables.
features_finished_paths = data["combined_metrics_finished_paths"].reset_index(drop=True)
features_unfinished_paths = data["combined_metrics_unfinished_paths"].reset_index(drop=True)

# Drop unfinished paths whose type is "timeout", keeping only the
# deliberately abandoned ("restart") games.
not_timeout = features_unfinished_paths["type"] != "timeout"
features_unfinished_paths = features_unfinished_paths[not_timeout]
features_unfinished_paths
index hashedIpAddress timestamp durationInSec path target type backtrack numberOfPath path_length ... ratio average_time_on_page sucessive_pairs sucessive_pairs_encoded semantic_similarity path_degree_abs_sum path_clustering_abs_sum path_degree_centrality_abs_sum path_betweenness_abs_sum path_closeness_abs_sum
4 7 6d136e371e42474f 2011-02-07 18:07:50 175 ['4-2-0', 'United_States', 'Agriculture', 'Sug... Cane_Toad restart 0 170.0 5 ... 3 35.000000 [('4-2-0', 'United_States'), ('United_States',... [0.19640377163887024, 0.2371780127286911, 0.32... 0.245052 7.585732 -2.590255 -0.846121 -2.271594 -0.927471
7 12 781192ee1a708bec 2011-02-08 03:46:15 334 ['Saint_Kitts_and_Nevis', 'United_Kingdom', 'W... Sandy_Koufax restart 0 1.0 5 ... 3 66.800000 [('Saint_Kitts_and_Nevis', 'United_Kingdom'), ... [0.48519760370254517, 0.33719098567962646, 0.3... 0.331910 7.578032 -2.211302 -0.853821 -2.267709 -1.553485
9 14 781192ee1a708bec 2011-02-08 03:55:46 403 ['Symmetry', 'Science', 'Age_of_Enlightenment'... Scottish_Episcopal_Church restart 0 3.0 6 ... 1 67.166667 [('Symmetry', 'Science'), ('Science', 'Age_of_... [0.33117276430130005, 0.20039768517017365, 0.5... 0.336678 6.702728 -3.747010 -1.729125 -3.174033 -2.621027
10 16 5900aa2d71b99153 2011-02-08 04:23:24 115 ['Tasmanian_Devil', 'Dog', 'Postage_stamp', 'W... Love restart 0 1.0 4 ... 3 28.750000 [('Tasmanian_Devil', 'Dog'), ('Dog', 'Postage_... [0.28771334886550903, 0.27162858843803406, 0.1... 0.246961 4.382027 -1.092701 -4.049827 -6.726992 -2.428974
13 19 2d49dfd2673786bf 2011-02-08 06:47:04 245 ['Tim_Berners-Lee', 'England', 'Atlantic_Ocean... Volcanic_pipe restart 0 2.0 6 ... 3 40.833333 [('Tim_Berners-Lee', 'England'), ('England', '... [0.10544373840093613, 0.3065592646598816, 0.41... 0.339931 6.422029 -3.488079 -2.009824 -3.429649 -2.719443
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
18974 24867 109ed71f571d86e9 2014-01-15 11:01:43 152 ['Montenegro', 'World_War_II', 'United_States'... Hurricane_Georges restart 0 20.0 7 ... 4 21.714286 [('Montenegro', 'World_War_II'), ('World_War_I... [0.16399337351322174, 0.3004598617553711, 0.40... 0.463695 6.294228 -2.251847 -2.137626 -3.529575 -2.658404
18975 24868 109ed71f571d86e9 2014-01-15 11:36:08 72 ['Wine', 'Georgia_%28country%29', 'Russia'] History_of_post-Soviet_Russia restart 0 27.0 3 ... 2 24.000000 [('Wine', 'Georgia_%28country%29'), ('Georgia_... [0.12671996653079987, 0.37717318534851074] 0.251947 NaN NaN NaN NaN NaN
18976 24869 109ed71f571d86e9 2014-01-15 12:00:12 182 ['Turks_and_Caicos_Islands', 'United_States', ... Iraq_War restart 0 35.0 6 ... 2 30.333333 [('Turks_and_Caicos_Islands', 'United_States')... [0.4149491488933563, 0.40031811594963074, 0.39... 0.421867 7.555088 -1.505405 -0.876766 -2.304742 -1.574654
18977 24870 109ed71f571d86e9 2014-01-15 12:06:45 180 ['Franz_Kafka', 'Tuberculosis', 'World_Health_... Cholera restart 1 37.0 6 ... 3 30.000000 [('Franz_Kafka', 'Tuberculosis'), ('Tuberculos... [0.2173137068748474, 0.3479086458683014, 0.420... 0.368187 3.669951 -3.305998 -4.761902 -7.644179 -4.383847
18980 24874 1cf0cbb3281049ab 2014-01-15 21:54:01 352 ['Mark_Antony', 'Rome', 'Tennis', 'Hawk-Eye', ... Feather restart 0 49.0 5 ... 2 70.400000 [('Mark_Antony', 'Rome'), ('Rome', 'Tennis'), ... [0.27006441354751587, 0.28349533677101135, 0.2... 0.237981 5.595124 -1.730546 -2.836729 -5.768889 -2.107167

11811 rows × 24 columns

# Stack finished on top of unfinished paths and tag each row with a
# binary "finished" flag (1 = finished, 0 = unfinished).
combined_df = pd.concat([features_finished_paths, features_unfinished_paths], axis=0)
combined_df['finished'] = [1] * len(features_finished_paths) + [0] * len(features_unfinished_paths)

# Normalize the clustering features in place, then pull them out as a matrix.
feature_cols = utils.FEATURES_COLS_USED_FOR_CLUSTERING
combined_df[feature_cols] = utils.normalize_features(combined_df[feature_cols])
X = combined_df[feature_cols].copy().values

# Report per-column NaN counts, then drop every row that contains a NaN.
nan_mask = np.isnan(X)
print(nan_mask.sum(axis=0))
X = X[~nan_mask.any(axis=1)]
[0 0 3 0 0 0 0 0 0 0]

Let’s find the dimensionality of the data#

# Inspect the intrinsic dimensionality of the data: fit a full-rank PCA and
# plot the fraction of variance explained by each component.
with plt.style.context("dark_background"):
    n_dims = X.shape[1]
    pca = PCA(n_components=n_dims)
    pca.fit(X)
    # Scree plot of the explained-variance ratios, one point per component.
    component_ids = range(1, n_dims + 1)
    plt.scatter(component_ids, pca.explained_variance_ratio_, c="cyan")
    plt.xlabel("Number of components", fontsize=16)
    plt.ylabel("Explained variance", fontsize=16)
    plt.title(
        "Percentage of explained variance\nof each component of the PCA",
        fontsize=20,
    )
    plt.show()
_images/da7488cad345ddc23da00e65d1d4b83200cd3ae00c1587e9ec7f13a29b451991.png

We will keep only the first 7 dimensions.

Clustering#

We will run OPTICS in a lower-dimensional space to avoid the curse of dimensionality#

Let’s find the latent space using a UMAP#

# Embed the normalized features into a 7-D latent space with UMAP
# (euclidean metric); the clustering below runs in this space.
umap = UMAP.UMAP(n_components=7, metric="euclidean")
result_umap_euc = umap.fit_transform(X)

Run the clustering in this space using OPTICS or fixed cutoff values with DBSCAN#

# code taken from https://scikit-learn.org/stable/auto_examples/cluster/plot_optics.html
# Fit OPTICS on the 7-D UMAP embedding; min_cluster_size is a fraction
# of the number of samples.
clust = OPTICS(min_samples=50, xi=0.05, min_cluster_size=0.05)
clust.fit(result_umap_euc)


def _dbscan_cut(eps):
    # Extract DBSCAN-style labels from the fitted OPTICS ordering at a
    # fixed eps cutoff, without re-running the clustering.
    return cluster_optics_dbscan(
        reachability=clust.reachability_,
        core_distances=clust.core_distances_,
        ordering=clust.ordering_,
        eps=eps,
    )


labels_050 = _dbscan_cut(0.5)
labels_200 = _dbscan_cut(2)

Visualize the results in the plan defined by the 2 first features and draw a reachability plot#

# Order samples along the OPTICS traversal so the reachability plot reads
# left-to-right in cluster order.
space = np.arange(len(X))
reachability = clust.reachability_[clust.ordering_]
labels = clust.labels_[clust.ordering_]


def _scatter_clusters(ax, cluster_labels, colors):
    # Scatter the first len(colors) clusters in the plane of the first two
    # feature dimensions, then overlay noise (label -1) as black '+'.
    for klass, color in zip(range(len(colors)), colors):
        pts = X[cluster_labels == klass]
        ax.plot(pts[:, 0], pts[:, 1], color, alpha=0.3)
    noise = cluster_labels == -1
    ax.plot(X[noise, 0], X[noise, 1], "k+", alpha=0.1)


with plt.style.context("dark_background"):
    plt.figure(figsize=(10, 7))
    G = gridspec.GridSpec(2, 3)
    ax1 = plt.subplot(G[0, :])
    ax2 = plt.subplot(G[1, 0])
    ax3 = plt.subplot(G[1, 1])
    ax4 = plt.subplot(G[1, 2])

    # Reachability plot: one color per cluster, noise in black, plus
    # horizontal lines marking the eps = 2.0 and eps = 0.5 DBSCAN cuts.
    for klass, color in zip(range(5), ["g.", "r.", "b.", "y.", "c."]):
        in_cluster = labels == klass
        ax1.plot(space[in_cluster], reachability[in_cluster], color, alpha=0.3)
    is_noise = labels == -1
    ax1.plot(space[is_noise], reachability[is_noise], "k.", alpha=0.3)
    ax1.plot(space, np.full_like(space, 2.0, dtype=float), "k-", alpha=0.5)
    ax1.plot(space, np.full_like(space, 0.5, dtype=float), "k-.", alpha=0.5)
    ax1.set_ylabel("Reachability (epsilon distance)")
    ax1.set_title("Reachability Plot")

    # Automatic OPTICS clusters.
    _scatter_clusters(ax2, clust.labels_, ["g.", "r.", "b.", "y.", "c."])
    ax2.set_title("Automatic Clustering\nOPTICS")

    # DBSCAN labels extracted at eps = 0.5.
    _scatter_clusters(ax3, labels_050, ["g.", "r.", "b.", "c."])
    ax3.set_title("Clustering at 0.5 epsilon cut\nDBSCAN")

    # DBSCAN labels extracted at eps = 2.0.
    _scatter_clusters(ax4, labels_200, ["g.", "m.", "y.", "c."])
    ax4.set_title("Clustering at 2.0 epsilon cut\nDBSCAN")

    plt.tight_layout()
    plt.show()
_images/b472f0e7f8f9d67027c6d0f662114d6065423d555ed97682a9619027df74fc1f.png

Create a UMAP with 3 dimensions for visualization purposes#

# 3-D UMAP embedding, used purely for visualization; points are colored by
# the OPTICS labels obtained from the 7-D embedding above.
# (The redundant `from umap import umap_ as UMAP` re-import was removed —
# UMAP is already imported at the top of the file.)
umap = UMAP.UMAP(n_components=3, metric="euclidean")
result_umap_euc = umap.fit_transform(X)

fig = px.scatter_3d(
    result_umap_euc,
    x=0,
    y=1,
    z=2,
    color=clust.labels_.astype(str),
    title="UMAP, showing OPTICS clustering",
    # reduce size points
    size_max=0.1,
    category_orders={"color": [str(i) for i in range(0, len(np.unique(clust.labels_)))]},
)

# Shared style for the three scene axes: no tick labels, transparent
# background, white zero line.
axis_style = dict(
    showticklabels=False,
    backgroundcolor="rgba(0, 0, 0,0)",
    # gridcolor="rgba(0, 0, 0,0)", # gridcolor is for logo
    showbackground=True,
    zerolinecolor="white",
)

# Dark theme matching the matplotlib "dark_background" figures; all layout
# settings are applied in one call (plotly merges nested dicts recursively,
# so this is equivalent to the previous chain of update_layout calls).
fig.update_layout(
    plot_bgcolor="#14181e",
    paper_bgcolor="#14181e",
    font_color="white",
    legend_title_text="Cluster",
    legend=dict(bgcolor="rgba(0,0,0,0)"),
    scene=dict(
        xaxis=dict(title="UMAP 1", **axis_style),
        yaxis=dict(title="UMAP 2", **axis_style),
        zaxis=dict(title="UMAP 3", **axis_style),
    ),
)
display(fig)

By feeding the 7-dimensional UMAP embedding to OPTICS, we roughly recover the communities that are visible by eye in the 3D UMAP.

Visualization of the results#

# Per-feature density histograms, colored by OPTICS cluster.
with plt.style.context("dark_background"):
    plot_data = combined_df[utils.FEATURES_COLS_USED_FOR_CLUSTERING].copy().dropna()
    # NOTE(review): assumes dropna() removes exactly the rows that were
    # dropped from X, so the labels align row-for-row — verify if the
    # feature set changes.
    plot_data["cluster"] = clust.labels_
    plot_data["cluster"] = plot_data["cluster"].astype("category")

    n_features = len(plot_data.columns) - 1  # exclude the "cluster" column
    n_cols = 3
    n_rows = int(np.ceil(n_features / n_cols))
    fig, axs = plt.subplots(n_rows, n_cols, figsize=(20, 20))
    axs = axs.flatten()

    # "cluster" was appended last, so filtering it out keeps the same
    # axis index for every feature column.
    feature_cols = [c for c in plot_data.columns if c != "cluster"]
    for i, col in enumerate(feature_cols):
        sns.histplot(
            data=plot_data,
            x=col,
            hue="cluster",
            ax=axs[i],
            stat="density",
            common_norm=False,
            palette="tab10",
        )
        utils.set_axis_style(axs, i)

    plt.suptitle("Histograms of normalized features colored by cluster", fontsize=20)
    plt.subplots_adjust(top=0.95)
    plt.show()
_images/4f1df0539440af35109e4ea187e696bbdaa5f9dd90bd4f8c66fe0bba9b7ec6a8.png
# Per-cluster boxplots for every feature plus a normalized "finished" flag.
with plt.style.context("dark_background"):
    # NOTE: combined_df is deliberately overwritten here — rows with NaN
    # features are dropped for good from this point on.
    combined_df = combined_df[utils.FEATURES_COLS_USED_FOR_CLUSTERING + ["finished"]].dropna()

    # z-score the binary finished flag so it shares the scale of the other
    # (already normalized) features.
    finished = combined_df["finished"]
    combined_df["finished_normalized"] = (finished - finished.mean()) / finished.std()

    plot_data = combined_df[
        utils.FEATURES_COLS_USED_FOR_CLUSTERING + ["finished_normalized"]
    ].copy()
    plot_data["cluster"] = clust.labels_
    plot_data["cluster"] = plot_data["cluster"].astype("category")

    n_features = len(plot_data.columns) - 1  # exclude the "cluster" column
    n_cols = 4
    n_rows = int(np.ceil(n_features / n_cols))
    fig, axs = plt.subplots(n_rows, n_cols, figsize=(20, 20))
    fig.suptitle("Boxplots of normalized features colored by cluster", fontsize=20)
    plt.subplots_adjust(top=0.95)
    axs = axs.flatten()

    for i, col in enumerate(plot_data.columns):
        if col == "cluster":
            continue
        sns.boxplot(
            data=plot_data,
            x="cluster",
            y=col,
            ax=axs[i],
            hue="cluster",
            palette="tab10",
        )
        utils.set_axis_style(axs, i)
    plt.show()
_images/5375c48c2cb7b86189b172e0020f23e4770fa0534e64539ed30d21e1ea1b91c2.png
# heatmap of features means per cluster
feat_labels = utils.get_feature_names_labels()
with plt.style.context("dark_background"):
    # Mean of every column within each cluster, grouping by the categorical
    # "cluster" series. NOTE(review): grouping by the external series while
    # the "cluster" column is still in the frame relies on pandas dropping
    # the non-numeric column from .mean() — verify on pandas upgrades.
    means = plot_data.groupby(plot_data["cluster"]).mean()
    # Annotated heatmap clipped to [-1, 1] so the diverging colormap is
    # centered on the global feature mean (features are z-scored).
    sns.heatmap(means, cmap="coolwarm", annot=True, fmt=".1f", vmin=-1, vmax=1)
    plt.title("Heatmap of normalized features\nmeans per cluster", fontsize=25)
    plt.ylabel("Cluster", fontsize=20)
    plt.xlabel("Feature", fontsize=20)
    # One tick per feature column plus the appended "finished" indicator;
    # +0.5 centers each label on its heatmap cell.
    plt.xticks(labels=feat_labels+["Finished (norm)"], ticks=np.arange(len(feat_labels)+1)+0.5, rotation=90, fontsize=15)
    plt.show()
_images/0982bf6d25f36430c2cab55fb1eba2ed8678f7fe0ee562697036fb72687b6a4a.png

Discussion and conclusion#

Here we can see that the clustering is not very interesting: it essentially produces 4 clusters in which all the features used for the clustering are either very low, low, high, or very high.