Clustering using DBSCAN and its extension OPTICS

Clustering using DBSCAN and its extension OPTICS#

The goal here is to see if the clusters that we can see in the UMAP are interesting… And as we will see they are not.

import pandas as pd
import numpy as np
from pathlib import Path
import utils

# dimensionality reduction
from sklearn.decomposition import PCA
from umap import umap_ as UMAP

# graphing
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import plotly.express as px
import plotly.offline as pyo
pyo.init_notebook_mode()
# clustering
from sklearn.cluster import OPTICS, cluster_optics_dbscan

Preprocessing of the data#

DATA_PATH = Path.cwd() / "../data"
data = {
    Path(f).stem: pd.read_csv(f, index_col=0) for f in DATA_PATH.glob("combined_*.csv")
}
print(list(data.keys()))

['combined_metrics_finished_edges', 'combined_metrics_finished_paths', 'combined_metrics_unfinished_edges', 'combined_metrics_unfinished_paths']

features_finished_paths = data["combined_metrics_finished_paths"].reset_index(drop=True)
features_unfinished_paths = data["combined_metrics_unfinished_paths"].reset_index(
    drop=True
)
# drop timeout paths
features_unfinished_paths = features_unfinished_paths[features_unfinished_paths["type"] != "timeout"]
features_unfinished_paths

	index	hashedIpAddress	timestamp	durationInSec	path	target	type	backtrack	numberOfPath	path_length	...	ratio	average_time_on_page	sucessive_pairs	sucessive_pairs_encoded	semantic_similarity	path_degree_abs_sum	path_clustering_abs_sum	path_degree_centrality_abs_sum	path_betweenness_abs_sum	path_closeness_abs_sum
4	7	6d136e371e42474f	2011-02-07 18:07:50	175	['4-2-0', 'United_States', 'Agriculture', 'Sug...	Cane_Toad	restart	0	170.0	5	...	3	35.000000	[('4-2-0', 'United_States'), ('United_States',...	[0.19640377163887024, 0.2371780127286911, 0.32...	0.245052	7.585732	-2.590255	-0.846121	-2.271594	-0.927471
7	12	781192ee1a708bec	2011-02-08 03:46:15	334	['Saint_Kitts_and_Nevis', 'United_Kingdom', 'W...	Sandy_Koufax	restart	0	1.0	5	...	3	66.800000	[('Saint_Kitts_and_Nevis', 'United_Kingdom'), ...	[0.48519760370254517, 0.33719098567962646, 0.3...	0.331910	7.578032	-2.211302	-0.853821	-2.267709	-1.553485
9	14	781192ee1a708bec	2011-02-08 03:55:46	403	['Symmetry', 'Science', 'Age_of_Enlightenment'...	Scottish_Episcopal_Church	restart	0	3.0	6	...	1	67.166667	[('Symmetry', 'Science'), ('Science', 'Age_of_...	[0.33117276430130005, 0.20039768517017365, 0.5...	0.336678	6.702728	-3.747010	-1.729125	-3.174033	-2.621027
10	16	5900aa2d71b99153	2011-02-08 04:23:24	115	['Tasmanian_Devil', 'Dog', 'Postage_stamp', 'W...	Love	restart	0	1.0	4	...	3	28.750000	[('Tasmanian_Devil', 'Dog'), ('Dog', 'Postage_...	[0.28771334886550903, 0.27162858843803406, 0.1...	0.246961	4.382027	-1.092701	-4.049827	-6.726992	-2.428974
13	19	2d49dfd2673786bf	2011-02-08 06:47:04	245	['Tim_Berners-Lee', 'England', 'Atlantic_Ocean...	Volcanic_pipe	restart	0	2.0	6	...	3	40.833333	[('Tim_Berners-Lee', 'England'), ('England', '...	[0.10544373840093613, 0.3065592646598816, 0.41...	0.339931	6.422029	-3.488079	-2.009824	-3.429649	-2.719443
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
18974	24867	109ed71f571d86e9	2014-01-15 11:01:43	152	['Montenegro', 'World_War_II', 'United_States'...	Hurricane_Georges	restart	0	20.0	7	...	4	21.714286	[('Montenegro', 'World_War_II'), ('World_War_I...	[0.16399337351322174, 0.3004598617553711, 0.40...	0.463695	6.294228	-2.251847	-2.137626	-3.529575	-2.658404
18975	24868	109ed71f571d86e9	2014-01-15 11:36:08	72	['Wine', 'Georgia_%28country%29', 'Russia']	History_of_post-Soviet_Russia	restart	0	27.0	3	...	2	24.000000	[('Wine', 'Georgia_%28country%29'), ('Georgia_...	[0.12671996653079987, 0.37717318534851074]	0.251947	NaN	NaN	NaN	NaN	NaN
18976	24869	109ed71f571d86e9	2014-01-15 12:00:12	182	['Turks_and_Caicos_Islands', 'United_States', ...	Iraq_War	restart	0	35.0	6	...	2	30.333333	[('Turks_and_Caicos_Islands', 'United_States')...	[0.4149491488933563, 0.40031811594963074, 0.39...	0.421867	7.555088	-1.505405	-0.876766	-2.304742	-1.574654
18977	24870	109ed71f571d86e9	2014-01-15 12:06:45	180	['Franz_Kafka', 'Tuberculosis', 'World_Health_...	Cholera	restart	1	37.0	6	...	3	30.000000	[('Franz_Kafka', 'Tuberculosis'), ('Tuberculos...	[0.2173137068748474, 0.3479086458683014, 0.420...	0.368187	3.669951	-3.305998	-4.761902	-7.644179	-4.383847
18980	24874	1cf0cbb3281049ab	2014-01-15 21:54:01	352	['Mark_Antony', 'Rome', 'Tennis', 'Hawk-Eye', ...	Feather	restart	0	49.0	5	...	2	70.400000	[('Mark_Antony', 'Rome'), ('Rome', 'Tennis'), ...	[0.27006441354751587, 0.28349533677101135, 0.2...	0.237981	5.595124	-1.730546	-2.836729	-5.768889	-2.107167

11811 rows × 24 columns

combined_df = pd.concat([features_finished_paths, features_unfinished_paths], axis=0)
combined_df['finished'] = [1] * len(features_finished_paths) + [0] * len(features_unfinished_paths)
combined_df[utils.FEATURES_COLS_USED_FOR_CLUSTERING] = utils.normalize_features(combined_df[utils.FEATURES_COLS_USED_FOR_CLUSTERING])

X = combined_df[utils.FEATURES_COLS_USED_FOR_CLUSTERING].copy().values
#find number of NaNs in each column
nans = np.isnan(X).sum(axis=0)
print(nans)
#drop rows with NaNs
X = X[~np.isnan(X).any(axis=1)]

[0 0 3 0 0 0 0 0 0 0]

Let’s find the dimensionality of the data#

# find the dimensionality of the data
with plt.style.context("dark_background"):
    pca = PCA(n_components=X.shape[1])
    pca.fit(X)
    # plot the explained variance
    plt.scatter(range(1, X.shape[1] + 1), pca.explained_variance_ratio_, c="cyan")
    plt.xlabel("Number of components", fontsize=16)
    plt.ylabel("Explained variance", fontsize=16)
    plt.title("Percentage of explained variance\nof each component of the PCA", fontsize=20)
    plt.show()

_images/da7488cad345ddc23da00e65d1d4b83200cd3ae00c1587e9ec7f13a29b451991.png

We will keep only the 7 first dimensions

Clustering#

We will run optics on a lower dimension to avoid the curse of the dimensionality#

Let’s find the latent space using a UMAP#

# UMAP
umap = UMAP.UMAP(n_components=7, metric="euclidean")
result_umap_euc = umap.fit_transform(
    X
)

Run the clustering in this space using OPTICS or fixed cutoff values with DBSCAN#

#code taken from https://scikit-learn.org/stable/auto_examples/cluster/plot_optics.html
clust = OPTICS(min_samples=50, xi=0.05, min_cluster_size=0.05)

# Run the fit
clust.fit(result_umap_euc)

labels_050 = cluster_optics_dbscan(
    reachability=clust.reachability_,
    core_distances=clust.core_distances_,
    ordering=clust.ordering_,
    eps=0.5,
)
labels_200 = cluster_optics_dbscan(
    reachability=clust.reachability_,
    core_distances=clust.core_distances_,
    ordering=clust.ordering_,
    eps=2,
)

Visualize the results in the plan defined by the 2 first features and draw a reachability plot#

space = np.arange(len(X))
reachability = clust.reachability_[clust.ordering_]
labels = clust.labels_[clust.ordering_]

with plt.style.context("dark_background"):
    plt.figure(figsize=(10, 7))
    G = gridspec.GridSpec(2, 3)
    ax1 = plt.subplot(G[0, :])
    ax2 = plt.subplot(G[1, 0])
    ax3 = plt.subplot(G[1, 1])
    ax4 = plt.subplot(G[1, 2])

    # Reachability plot
    colors = ["g.", "r.", "b.", "y.", "c."]
    for klass, color in zip(range(0, 5), colors):
        Xk = space[labels == klass]
        Rk = reachability[labels == klass]
        ax1.plot(Xk, Rk, color, alpha=0.3)
    ax1.plot(space[labels == -1], reachability[labels == -1], "k.", alpha=0.3)
    ax1.plot(space, np.full_like(space, 2.0, dtype=float), "k-", alpha=0.5)
    ax1.plot(space, np.full_like(space, 0.5, dtype=float), "k-.", alpha=0.5)
    ax1.set_ylabel("Reachability (epsilon distance)")
    ax1.set_title("Reachability Plot")

    # OPTICS
    colors = ["g.", "r.", "b.", "y.", "c."]
    for klass, color in zip(range(0, 5), colors):
        Xk = X[clust.labels_ == klass]
        ax2.plot(Xk[:, 0], Xk[:, 1], color, alpha=0.3)
    ax2.plot(X[clust.labels_ == -1, 0], X[clust.labels_ == -1, 1], "k+", alpha=0.1)
    ax2.set_title("Automatic Clustering\nOPTICS")

    # DBSCAN at 0.5
    colors = ["g.", "r.", "b.", "c."]
    for klass, color in zip(range(0, 4), colors):
        Xk = X[labels_050 == klass]
        ax3.plot(Xk[:, 0], Xk[:, 1], color, alpha=0.3)
    ax3.plot(X[labels_050 == -1, 0], X[labels_050 == -1, 1], "k+", alpha=0.1)
    ax3.set_title("Clustering at 0.5 epsilon cut\nDBSCAN")

    # DBSCAN at 2.
    colors = ["g.", "m.", "y.", "c."]
    for klass, color in zip(range(0, 4), colors):
        Xk = X[labels_200 == klass]
        ax4.plot(Xk[:, 0], Xk[:, 1], color, alpha=0.3)
    ax4.plot(X[labels_200 == -1, 0], X[labels_200 == -1, 1], "k+", alpha=0.1)
    ax4.set_title("Clustering at 2.0 epsilon cut\nDBSCAN")

    plt.tight_layout()
    plt.show()

_images/b472f0e7f8f9d67027c6d0f662114d6065423d555ed97682a9619027df74fc1f.png

Create a UMAP with 3 dimensions for visualization purposes#

from umap import umap_ as UMAP
# UMAP
umap = UMAP.UMAP(n_components=3, metric="euclidean")
result_umap_euc = umap.fit_transform(
    X
)

fig = px.scatter_3d(
    result_umap_euc,
    x=0,
    y=1,
    z=2,
    color=clust.labels_.astype(str),
    title="UMAP, showing OPTICS clustering",
    # reduce size points
    size_max=0.1,
    category_orders={"color": [str(i) for i in range(0, len(np.unique(clust.labels_)))]},
)
fig.update_layout({"plot_bgcolor": "#14181e", "paper_bgcolor": "#14181e"})
fig.update_layout(font_color="white")
fig.update_layout(scene=dict(xaxis=dict(showticklabels=False), yaxis=dict(showticklabels=False), zaxis=dict(showticklabels=False)))
fig.update_layout(legend_title_text="Cluster")
fig.update_layout(legend = dict(bgcolor = 'rgba(0,0,0,0)'))
fig.update_layout(scene=dict(xaxis_title="UMAP 1", yaxis_title="UMAP 2", zaxis_title="UMAP 3"))
fig.update_layout(scene = dict(
    xaxis = dict(
            backgroundcolor="rgba(0, 0, 0,0)",
            # gridcolor="rgba(0, 0, 0,0)", # gridcolor is for logo
            showbackground=True,
            zerolinecolor="white",),
    yaxis = dict(
        backgroundcolor="rgba(0, 0, 0,0)",
        # gridcolor="rgba(0, 0, 0,0)",
        showbackground=True,
        zerolinecolor="white"),
    zaxis = dict(
        backgroundcolor="rgba(0, 0, 0,0)",
        # gridcolor="rgba(0, 0, 0,0)",
        showbackground=True,
        zerolinecolor="white",),),
)
display(fig)

By providing a UMAP in 7 dimension to OPTICS we roughly find the communities that we can see by eye in the 3D UMAP

Visualization of the results#

#plot histogram features colored by cluster
with plt.style.context("dark_background"):
    plot_data = combined_df[utils.FEATURES_COLS_USED_FOR_CLUSTERING].copy().dropna()
    plot_data["cluster"] = clust.labels_
    plot_data["cluster"] = plot_data["cluster"].astype("category")
    n_features = len(plot_data.columns) - 1
    n_cols = 3
    n_rows = int(np.ceil(n_features / n_cols))
    fig, axs = plt.subplots(n_rows, n_cols, figsize=(20, 20))
    axs = axs.flatten()
    for i, col in enumerate(plot_data.columns):
        if col == "cluster":
            continue
        sns.histplot(
            data=plot_data,
            x=col,
            hue="cluster",
            ax=axs[i],
            stat="density",
            common_norm=False,
            palette="tab10",
        )
        utils.set_axis_style(axs, i)

    plt.suptitle("Histograms of normalized features colored by cluster", fontsize=20)
    plt.subplots_adjust(top=0.95)
    plt.show()

_images/4f1df0539440af35109e4ea187e696bbdaa5f9dd90bd4f8c66fe0bba9b7ec6a8.png

with plt.style.context("dark_background"):
    combined_df = combined_df[utils.FEATURES_COLS_USED_FOR_CLUSTERING+["finished"]].dropna()
    combined_df["finished_normalized"] = (
        combined_df["finished"] - combined_df["finished"].mean()
    ) / combined_df["finished"].std()
    plot_data = combined_df[utils.FEATURES_COLS_USED_FOR_CLUSTERING + ["finished_normalized"]].copy()
    plot_data["cluster"] = clust.labels_
    plot_data["cluster"] = plot_data["cluster"].astype("category")
    n_features = len(plot_data.columns) - 1
    n_cols = 4
    n_rows = int(np.ceil(n_features / n_cols))
    fig, axs = plt.subplots(n_rows, n_cols, figsize=(20, 20))
    fig.suptitle("Boxplots of normalized features colored by cluster", fontsize=20)
    plt.subplots_adjust(top=0.95)
    axs = axs.flatten()
    for i, col in enumerate(plot_data.columns):
        if col == "cluster":
            continue
        sns.boxplot(data=plot_data, x="cluster", y=col, ax=axs[i], hue="cluster", palette="tab10")
        utils.set_axis_style(axs, i)
    plt.show()

_images/5375c48c2cb7b86189b172e0020f23e4770fa0534e64539ed30d21e1ea1b91c2.png

# heatmap of features means per cluster
feat_labels = utils.get_feature_names_labels()
with plt.style.context("dark_background"):
    means = plot_data.groupby(plot_data["cluster"]).mean()
    sns.heatmap(means, cmap="coolwarm", annot=True, fmt=".1f", vmin=-1, vmax=1)
    plt.title("Heatmap of normalized features\nmeans per cluster", fontsize=25)
    plt.ylabel("Cluster", fontsize=20)
    plt.xlabel("Feature", fontsize=20)
    plt.xticks(labels=feat_labels+["Finished (norm)"], ticks=np.arange(len(feat_labels)+1)+0.5, rotation=90, fontsize=15)
    plt.show()

_images/0982bf6d25f36430c2cab55fb1eba2ed8678f7fe0ee562697036fb72687b6a4a.png

Discussion and conclusion#

Here we can see that the clustering is less interesting. It really makes 4 clusters in which all the features used to do the clustering are either very low, low, high or very high.