Package sledge
Python package sledge: semantic evaluation of clustering results.
The package evaluates clustering results through the semantic relationship between the significant frequent patterns identified among the cluster items.
The method uses an internal validation technique to evaluate the clusters rather than distance-related metrics. However, the algorithm requires that the data be organized in CATEGORICAL FORM.
For questions and information, contact us: aquinordga@gmail.com
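Example: a minimal quick-start sketch. The toy data below is invented for illustration, and the imports assume the package exposes these functions at the top level.

import numpy as np
import pandas as pd
from sledge import sledge_score, sledge_score_clusters

# Ten samples, three binary features, three well-separated clusters:
# each cluster is characterized by exactly one exclusive feature.
X = pd.DataFrame({'red':   [1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
                  'round': [0, 0, 0, 1, 1, 1, 0, 0, 0, 0],
                  'heavy': [0, 0, 0, 0, 0, 0, 1, 1, 1, 1]})
labels = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 2])

print(sledge_score_clusters(X, labels))  # perfect separation: 1.0 per cluster
print(sledge_score(X, labels))           # average over clusters: 1.0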
"""
Python package `sledge`: semantic evaluation of clustering results.
The package performs an evaluation of clustering results through
the semantic relationship between the significant frequent patterns
identified among the cluster items.
The method uses an internal validation technique to evaluate
the clusters rather than distance-related metrics.
However, the algorithm requires that the data be organized in CATEGORICAL FORM.
For questions and information, contact us: <aquinordga@gmail.com>
"""
import pandas as pd
import numpy as np
import math
import statistics
def particularize_descriptors(descriptors, particular_threshold=1.0):
"""
Particularization of descriptors based on support.
    This function particularizes descriptors using a threshold applied
    to the range (maximum support - minimum support) of each feature
    across the clusters.
    Parameters
    ----------
    descriptors: array-like of shape (n_clusters, n_features)
        Matrix with the support of features in each cluster.
    particular_threshold: float
        Particularization threshold. Given the relative support,
        0.0 keeps the entire range of relative support, 0.5 keeps
        only the upper half, and 1.0 keeps only the maximum support.
Returns
-------
descriptors: array-like of shape (n_clusters, n_features)
Matrix with the computed particularized support of features in each
cluster.
"""
    for feature in descriptors.columns:
        column = np.array(descriptors[feature])
        minimum_support = np.min(column)
        maximum_support = np.max(column)
        # Zero every support below the particularization cut-off
        # min + threshold * (max - min).
        cutoff = minimum_support + \
            particular_threshold * (maximum_support - minimum_support)
        descriptors.loc[column < cutoff, feature] = 0.0
return descriptors
def semantic_descriptors(X, labels, particular_threshold=None):
"""
Semantic descriptors based on feature support.
    This function computes the support of each present feature (1-itemsets
    composed of features with value 1) over the samples in each cluster.
Features in a cluster that do not meet the *particularization criterion*
have their support zeroed.
Parameters
----------
X: array-like of shape (n_samples, n_features)
Feature array of each sample. All features must be binary.
labels: array-like of shape (n_samples,)
        Cluster labels for each sample, starting at 0.
particular_threshold: {None, float}
Particularization threshold. `None` means no particularization
strategy.
Returns
-------
descriptors: array-like of shape (n_clusters, n_features)
Matrix with the computed particularized support of features in each
cluster.
"""
    if isinstance(X, np.ndarray):
        X = pd.DataFrame(X, columns=[f'X{i}' for i in range(X.shape[1])])
    # Support of 1-itemsets; for greater k we would need a different algorithm.
    support = X.groupby(labels).mean()
if particular_threshold is not None:
support = particularize_descriptors(
support, particular_threshold=particular_threshold)
return support
def sledge_score_clusters(
X,
labels,
particular_threshold=None,
aggregation='harmonic'):
"""
SLEDge score for each cluster.
This function computes the SLEDge score of each cluster.
If `aggregation` is `None`, returns a matrix with values *S*, *L*, *E*, and
*D* for each cluster.
Parameters
----------
X: array-like of shape (n_samples, n_features)
Feature array of each sample. All features must be binary.
labels: array-like of shape (n_samples,)
        Cluster labels for each sample, starting at 0.
particular_threshold: {None, float}
Particularization threshold. `None` means no particularization
strategy.
aggregation: {'harmonic', 'geometric', 'median', None}
Strategy to aggregate values of *S*, *L*, *E*, and *D*.
Returns
-------
scores: array-like of shape (n_clusters,)
SLEDge score for each cluster.
score_matrix: array-like of shape (n_clusters, 4) if `aggregation` is None
        S, L, E, and D scores for each cluster.
"""
n_clusters = max(labels) + 1
descriptors = semantic_descriptors(
X, labels, particular_threshold=particular_threshold).transpose()
    # S: average support of the descriptors (features whose particularized
    # support is greater than zero)
    def mean_gt_zero(x):
        return 0 if np.count_nonzero(x) == 0 else np.mean(x[x > 0])

    support_score = [mean_gt_zero(descriptors[cluster])
                     for cluster in range(n_clusters)]
    # L: deviation of the descriptor-set size from the average non-empty size
    descriptor_set_size = np.array([np.count_nonzero(descriptors[cluster])
                                    for cluster in range(n_clusters)])
    average_set_size = np.mean(descriptor_set_size[descriptor_set_size > 0])
    length_score = [0 if set_size == 0
                    else 1.0 / (1.0 + abs(set_size - average_set_size))
                    for set_size in descriptor_set_size]
    # E: exclusivity, the fraction of a cluster's descriptors that appear
    # in no other cluster's descriptor set
    descriptor_sets = np.array([
        frozenset(descriptors.index[descriptors[cluster] > 0])
        for cluster in range(n_clusters)])
    exclusive_sets = [
        descriptor_sets[cluster].difference(
            frozenset.union(*np.delete(descriptor_sets, cluster)))
        for cluster in range(n_clusters)]
    exclusive_score = [
        0 if len(descriptor_sets[cluster]) == 0
        else len(exclusive_sets[cluster]) / len(descriptor_sets[cluster])
        for cluster in range(n_clusters)]
    # D: square root of the maximum difference between consecutive ordered supports
ordered_support = [np.sort(descriptors[cluster])
for cluster in range(n_clusters)]
diff_score = [math.sqrt(np.max(np.diff(ordered_support[cluster])))
for cluster in range(n_clusters)]
score = pd.DataFrame.from_dict({'S': support_score, 'L': length_score,
'E': exclusive_score, 'D': diff_score})
if aggregation == 'harmonic':
score = score.transpose().apply(statistics.harmonic_mean)
elif aggregation == 'geometric':
score = score.transpose().apply(statistics.geometric_mean)
elif aggregation == 'median':
score = score.transpose().apply(statistics.median)
else:
assert aggregation is None
return score
def sledge_score(X, labels, particular_threshold=None, aggregation='harmonic'):
"""
SLEDge score.
    This function computes the average SLEDge score over all clusters.
Parameters
----------
X: array-like of shape (n_samples, n_features)
Feature array of each sample. All features must be binary.
labels: array-like of shape (n_samples,)
        Cluster labels for each sample, starting at 0.
particular_threshold: {None, float}
Particularization threshold. `None` means no particularization
strategy.
aggregation: {'harmonic', 'geometric', 'median'}
Strategy to aggregate values of *S*, *L*, *E*, and *D* for each cluster.
Returns
-------
score: float
Average SLEDge score.
"""
assert aggregation is not None
return np.mean(
sledge_score_clusters(
X,
labels,
particular_threshold=particular_threshold,
aggregation=aggregation))
def sledge_curve(X, labels, particular_threshold=0.0, aggregation='harmonic'):
"""
SLEDge curve.
This function computes the SLEDge curve.
Parameters
----------
X: array-like of shape (n_samples, n_features)
Feature array of each sample. All features must be binary.
labels: array-like of shape (n_samples,)
        Cluster labels for each sample, starting at 0.
particular_threshold: {None, float}
Particularization threshold. `None` means no particularization
strategy.
aggregation: {'harmonic', 'geometric', 'median', None}
Strategy to aggregate values of *S*, *L*, *E*, and *D*.
Returns
-------
    fractions: array-like of shape (>2,)
        Decreasing fractions: element `i` is the fraction of clusters with
        SLEDge score >= `thresholds[i]`. `fractions[0]` is always `1`.
    thresholds: array-like of shape (>2,)
Increasing thresholds of the cluster SLEDge score used to compute
`fractions`. `thresholds[0]` is always `0` and `thresholds[-1]` is
always `1`.
"""
scores = sledge_score_clusters(X, labels,
particular_threshold=particular_threshold,
aggregation=aggregation)
n_clusters = len(scores)
thresholds = np.unique(scores)
    # Ensure the threshold grid always spans [0, 1]. (The original guards
    # tested the wrong ends of the sorted array, which could duplicate
    # endpoints.)
    if thresholds[-1] != 1:
        thresholds = np.concatenate((thresholds, [1]))
    if thresholds[0] != 0:
        thresholds = np.concatenate(([0], thresholds))
fractions = np.array(
[np.count_nonzero(scores >= thr) / n_clusters for thr in thresholds])
return fractions, thresholds
Functions
def particularize_descriptors(descriptors, particular_threshold=1.0)
Particularization of descriptors based on support.
This function particularizes descriptors using a threshold applied to the range (maximum support - minimum support) of each feature across the clusters.
Parameters
descriptors : array-like of shape (n_clusters, n_features)
    Matrix with the support of features in each cluster.
particular_threshold : float
    Particularization threshold. Given the relative support, 0.0 keeps the entire range of relative support, 0.5 keeps only the upper half, and 1.0 keeps only the maximum support.
Returns
descriptors : array-like of shape (n_clusters, n_features)
    Matrix with the computed particularized support of features in each cluster.
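Example: a small sketch of the particularization rule on an invented support matrix. Note that the function modifies the frame it receives in place, hence the .copy().

import pandas as pd
from sledge import particularize_descriptors

# Rows are clusters, columns are features; entries are relative supports.
support = pd.DataFrame({'red':   [0.9, 0.2, 0.1],
                        'round': [0.5, 0.5, 0.5]})

# With threshold 1.0 only each feature's maximum support survives:
# 'red' keeps 0.9 in cluster 0 and is zeroed elsewhere, while the tied
# 'round' column is kept everywhere (nothing falls below its maximum).
print(particularize_descriptors(support.copy(), particular_threshold=1.0))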
def semantic_descriptors(X, labels, particular_threshold=None)
Semantic descriptors based on feature support.
This function computes the support of each present feature (1-itemsets composed of features with value 1) over the samples in each cluster.
Features in a cluster that do not meet the particularization criterion have their support zeroed.
Parameters
X : array-like of shape (n_samples, n_features)
    Feature array of each sample. All features must be binary.
labels : array-like of shape (n_samples,)
    Cluster labels for each sample, starting at 0.
particular_threshold : {None, float}
    Particularization threshold. None means no particularization strategy.
Returns
descriptors : array-like of shape (n_clusters, n_features)
    Matrix with the computed particularized support of features in each cluster.
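Example: a sketch of the per-cluster support computation on invented binary data. A plain NumPy array is accepted and gets auto-named columns X0, X1, and so on.

import numpy as np
from sledge import semantic_descriptors

X = np.array([[1, 1, 0],
              [1, 0, 0],
              [0, 1, 1],
              [0, 1, 1]])
labels = np.array([0, 0, 1, 1])

# Without particularization: each entry is the fraction of samples in the
# cluster where the feature equals 1 (e.g. X1 has support 0.5 in cluster 0).
print(semantic_descriptors(X, labels))
# With particular_threshold=1.0, only each feature's maximum support is kept.
print(semantic_descriptors(X, labels, particular_threshold=1.0))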
def sledge_curve(X, labels, particular_threshold=0.0, aggregation='harmonic')
SLEDge curve.
This function computes the SLEDge curve.
Parameters
X : array-like of shape (n_samples, n_features)
    Feature array of each sample. All features must be binary.
labels : array-like of shape (n_samples,)
    Cluster labels for each sample, starting at 0.
particular_threshold : {None, float}
    Particularization threshold. None means no particularization strategy.
aggregation : {'harmonic', 'geometric', 'median', None}
    Strategy to aggregate values of S, L, E, and D.
Returns
fractions : array-like of shape (>2,)
    Decreasing fractions: element i is the fraction of clusters with
    SLEDge score >= thresholds[i]. fractions[0] is always 1.
thresholds : array-like of shape (>2,)
    Increasing thresholds of the cluster SLEDge score used to compute
    fractions. thresholds[0] is always 0 and thresholds[-1] is always 1.
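Example: a sketch of the curve on an invented labeling whose two clusters receive different SLEDge scores, so the fractions step down as the threshold rises.

import numpy as np
from sledge import sledge_curve

X = np.array([[1, 1, 0],
              [1, 0, 0],
              [0, 1, 1],
              [0, 1, 1]])
labels = np.array([0, 0, 1, 1])

fractions, thresholds = sledge_curve(X, labels)
for fraction, threshold in zip(fractions, thresholds):
    print(f'{fraction:.2f} of clusters score >= {threshold:.2f}')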
def sledge_score(X, labels, particular_threshold=None, aggregation='harmonic')
SLEDge score.
This function computes the average SLEDge score over all clusters.
Parameters
X : array-like of shape (n_samples, n_features)
    Feature array of each sample. All features must be binary.
labels : array-like of shape (n_samples,)
    Cluster labels for each sample, starting at 0.
particular_threshold : {None, float}
    Particularization threshold. None means no particularization strategy.
aggregation : {'harmonic', 'geometric', 'median'}
    Strategy to aggregate values of S, L, E, and D for each cluster.
Returns
score : float
    Average SLEDge score.
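Example: a sketch of the single summary score, here with geometric aggregation on invented data.

import numpy as np
from sledge import sledge_score

X = np.array([[1, 0],
              [1, 0],
              [0, 1],
              [0, 1]])
labels = np.array([0, 0, 1, 1])

# Two cleanly separated clusters, each described by one exclusive feature,
# give the maximum score of 1.0.
print(sledge_score(X, labels, aggregation='geometric'))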
def sledge_score_clusters(X, labels, particular_threshold=None, aggregation='harmonic')
SLEDge score for each cluster.
This function computes the SLEDge score of each cluster. If aggregation is None, it returns a matrix with the values S, L, E, and D for each cluster.
Parameters
X : array-like of shape (n_samples, n_features)
    Feature array of each sample. All features must be binary.
labels : array-like of shape (n_samples,)
    Cluster labels for each sample, starting at 0.
particular_threshold : {None, float}
    Particularization threshold. None means no particularization strategy.
aggregation : {'harmonic', 'geometric', 'median', None}
    Strategy to aggregate values of S, L, E, and D.
Returns
scores : array-like of shape (n_clusters,)
    SLEDge score for each cluster.
score_matrix : array-like of shape (n_clusters, 4), if aggregation is None
    S, L, E, and D scores for each cluster.
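Example: a sketch showing the raw component matrix when aggregation=None; the data is invented for illustration.

import numpy as np
from sledge import sledge_score_clusters

X = np.array([[1, 1, 0],
              [1, 0, 0],
              [0, 1, 1],
              [0, 1, 1]])
labels = np.array([0, 0, 1, 1])

# One row per cluster with the unaggregated S, L, E, and D components.
print(sledge_score_clusters(X, labels, aggregation=None))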