|
|
@@ -29,96 +29,106 @@ def feature_clustering(UMAP_neb, min_dist_UMAP, metric, data, visual=False):
|
|
|
|
|
|
@jit(nopython=True)
|
|
|
def canberra_modified(a, b):
    """Modified Canberra-style distance between two 1-D numeric arrays.

    With d = |a - b| elementwise, this is mathematically
    sqrt(sum(d / (2 + d))), computed here through x = d + 1 with the
    exact same elementwise operations as the original, so every
    summand lies in [0, 1).

    Parameters
    ----------
    a, b : np.ndarray
        Arrays of equal shape.

    Returns
    -------
    float
        The distance (0.0 when a == b elementwise).
    """
    # Vectorized fix: the original built a Python list comprehension and
    # wrapped it in np.array, which is slow and awkward under numba's
    # nopython mode; these whole-array ops produce bit-identical results.
    x = np.abs(a - b) + 1.0
    return np.sqrt(np.sum(np.abs(1.0 - x) / (1.0 + np.abs(x))))
|
|
|
|
|
|
|
|
|
|
|
|
class Clustering:
    """Produce a normalized, optionally UMAP-reduced embedding of a feature subset.

    If the input has more columns than ``max_components`` it is reduced
    with UMAP; otherwise it is used as-is. Every output column is then
    standardized to zero mean and unit variance.
    """

    def __init__(self, metric='euclidean', UMAP_neb=30, min_dist_UMAP=0.1, max_components=2):
        # Bug fix: the original default was the misspelled 'euclidian',
        # which umap-learn rejects as a metric name.
        self.metric = metric
        # Number of UMAP neighbors (UMAP's n_neighbors).
        self.UMAP_neb = UMAP_neb
        # UMAP's min_dist parameter.
        self.min_dist_UMAP = min_dist_UMAP
        # Reduce with UMAP only when the data has more columns than this.
        self.max_components = max_components

    def normalize(self, x):
        """Return x standardized to zero mean, unit variance.

        NOTE(review): divides by np.std(x); a constant column yields
        inf/nan exactly as the original did — confirm inputs vary.
        """
        return (x - np.mean(x)) / np.std(x)

    def fit(self, data):
        """Embed ``data`` into at most ``max_components`` standardized columns.

        Parameters
        ----------
        data : np.ndarray
            1-D or 2-D array of samples x features.

        Returns
        -------
        np.ndarray
            2-D float array with standardized columns. The input array
            is never modified.
        """
        np.random.seed(42)

        # Ensure the data is a 2-D array.
        if len(data.shape) < 2:
            data = data.reshape((data.shape[0], 1))

        # Reduce with UMAP only when there are more features than wanted
        # output components.
        if data.shape[1] > self.max_components:
            data_embedded = umap.UMAP(
                n_neighbors=self.UMAP_neb,
                min_dist=self.min_dist_UMAP,
                n_components=self.max_components,
                metric=self.metric,
                random_state=42,
            ).fit_transform(data)
        else:
            # Bug fix: work on a float copy instead of normalizing in place
            # into the caller's array (which mutated the input and silently
            # truncated values for integer dtypes).
            data_embedded = np.array(data, dtype=float)

        # Standardize every column of the embedding.
        for n in range(data_embedded.shape[1]):
            data_embedded[:, n] = self.normalize(data_embedded[:, n])

        return data_embedded
|
|
|
|
|
|
|
|
|
|
|
|
class FDC:
|
|
|
- def __init__(self, clustering_cont=None, clustering_ord=None, clustering_nom=None, drop_nominal=True, visual=False, with_2d_embedding=False, use_pandas_output=False):
|
|
|
+ def __init__(self,
|
|
|
+ clustering_cont=None, clustering_ord=None, clustering_nom=None,
|
|
|
+ visual=False,
|
|
|
+ with_2d_embedding=False,
|
|
|
+ use_pandas_output=False
|
|
|
+ ):
|
|
|
# used clusterings
|
|
|
- self.clustering_cont = clustering_cont or Clustering('euclidian', 30, 0.1)
|
|
|
- self.clustering_ord = clustering_ord or Clustering(canberra_modified, 30, 0.1)
|
|
|
- self.clustering_nom = clustering_nom or Clustering('hamming', 30, 0.1)
|
|
|
+ self.clustering_cont = value(clustering_cont, Clustering('euclidian', 30, 0.1))
|
|
|
+ self.clustering_ord = value(clustering_ord, Clustering(canberra_modified, 30, 0.1))
|
|
|
+ self.clustering_nom = value(clustering_nom, Clustering('hamming', 30, 0.1, max_components=1))
|
|
|
|
|
|
# Control of data output
|
|
|
self.use_pandas_output = use_pandas_output
|
|
|
self.with_2d_embedding = with_2d_embedding
|
|
|
- self.drop_nominal = drop_nominal
|
|
|
|
|
|
# Control if a graph is shown
|
|
|
self.visual = visual
|
|
|
|
|
|
- # Lists to select columns for continueous, nominal and ordinal data.
|
|
|
+ # Lists to select columns for continueous, nomial and ordinal data.
|
|
|
self.cont_list = None
|
|
|
self.nom_list = None
|
|
|
self.ord_list = None
|
|
|
|
|
|
-
|
|
|
- def calc_embedding(self, clustering, data, column_list):
|
|
|
- if column_list is not None:
|
|
|
- return clustering.fit(data[column_list])
|
|
|
- else:
|
|
|
- return None
|
|
|
-
|
|
|
-
|
|
|
- def normalize(self, data, cont_list=None, nom_list=None, ord_list=None, with_2d_embedding=False, visual=None):
|
|
|
- np.random.seed(42)
|
|
|
+ def selectFeatures(self, continueous=None, nomial=None, ordinal=None):
|
|
|
+ self.cont_list = continueous
|
|
|
+ self.nom_list = nomial
|
|
|
+ self.ord_list = ordinal
|
|
|
+
|
|
|
+ def normalize(self, data,
|
|
|
+ cont_list=None, nom_list=None, ord_list=None,
|
|
|
+ with_2d_embedding=None,
|
|
|
+ visual=None
|
|
|
+ ):
|
|
|
+
|
|
|
+ # Take instance value if parameter was not given.
|
|
|
visual = value(visual, self.visual)
|
|
|
+ with_2d_embedding = value(with_2d_embedding, self.with_2d_embedding)
|
|
|
+
|
|
|
+ # Initialize data.
|
|
|
+ np.random.seed(42)
|
|
|
concat_column_names = []
|
|
|
concat_lists = []
|
|
|
|
|
|
+ # Reducing features into 2dim or 1dim
|
|
|
+ actions = [
|
|
|
+ ("CONT", self.clustering_cont, value(cont_list, self.cont_list))
|
|
|
+ , ("ORD", self.clustering_ord, value(ord_list, self.ord_list))
|
|
|
+ , ("CONT", self.clustering_nom, value(nom_list, self.nom_list))
|
|
|
+ ]
|
|
|
|
|
|
- # Reducing continueous features into 2dim
|
|
|
- cont_emb = self.calc_embedding(self.clustering_cont, data, value(cont_list, self.cont_list))
|
|
|
- if cont_emb is not None:
|
|
|
- concat_lists.append(cont_emb)
|
|
|
- concat_column_names.extend(['CONT_UMAP_0', 'CONT_UMAP_1'])
|
|
|
-
|
|
|
-
|
|
|
- # Reducing ordinal features into 2dim
|
|
|
- ord_emb = self.calc_embedding(self.clustering_ord, data, value(ord_list, self.ord_list))
|
|
|
- if ord_emb is not None:
|
|
|
- concat_lists.append(ord_emb)
|
|
|
- concat_column_names.extend(['ORD_UMAP_0', 'ORD_UMAP_1'])
|
|
|
-
|
|
|
-
|
|
|
- # Reducing nominal features into 2dim
|
|
|
- nom_emb = self.calc_embedding(self.clustering_nom, data, value(nom_list, self.nom_list))
|
|
|
- if nom_emb is not None:
|
|
|
- concat_column_names.append('NOM_UMAP_0')
|
|
|
-
|
|
|
- if self.drop_nominal:
|
|
|
- nom_emb = nom_emb[:, 0].reshape((nom_emb.shape[0], 1))
|
|
|
- else:
|
|
|
- concat_column_names.append('NOM_UMAP_1')
|
|
|
-
|
|
|
- concat_lists.append(nom_emb)
|
|
|
+ for (name, clustering, column_list) in actions:
|
|
|
+ if column_list is not None:
|
|
|
+ emb = clustering.fit(data[column_list])
|
|
|
+ concat_lists.append(emb)
|
|
|
+ for n in range(emb.shape[1]):
|
|
|
+ concat_column_names.append(f"{name}_UMAP_{n}")
|
|
|
|
|
|
# Merge results
|
|
|
if concat_lists == []:
|
|
|
@@ -126,8 +136,7 @@ class FDC:
|
|
|
|
|
|
result_concat = np.concatenate(concat_lists, axis=1)
|
|
|
|
|
|
-
|
|
|
- # Create 2d embedding
|
|
|
+ # Create 2d embedding from 5d embedding
|
|
|
if with_2d_embedding or visual:
|
|
|
result_reduced = umap.UMAP(
|
|
|
n_neighbors=30
|
|
|
@@ -135,25 +144,28 @@ class FDC:
|
|
|
, n_components=2
|
|
|
, metric='euclidean'
|
|
|
, random_state=42
|
|
|
- ).fit_transform(result_concat) #reducing 5D embeddings to 2D using UMAP
|
|
|
+ ).fit_transform(result_concat)
|
|
|
|
|
|
if self.use_pandas_output:
|
|
|
- result_reduced = pd.DataFrame(data=result_reduced, columns=['UMAP_0', 'UMAP_1'])
|
|
|
+ result_reduced = pd.DataFrame(
|
|
|
+ data=result_reduced, columns=['UMAP_0', 'UMAP_1'])
|
|
|
|
|
|
# Show mapping if needed
|
|
|
if visual:
|
|
|
if self.use_pandas_output:
|
|
|
plotMapping(result_reduced)
|
|
|
else:
|
|
|
- plotMapping(pd.DataFrame(data=result_reduced, columns=['UMAP_0', 'UMAP_1']))
|
|
|
+ plotMapping(pd.DataFrame(
|
|
|
+ data=result_reduced, columns=['UMAP_0', 'UMAP_1']))
|
|
|
|
|
|
-
|
|
|
- # Return the results
|
|
|
+ # Transform to pandas DataFrame if needed.
|
|
|
if self.use_pandas_output:
|
|
|
- result_concat = pd.DataFrame(data=result_concat, columns=concat_column_names)
|
|
|
-
|
|
|
+ result_concat = pd.DataFrame(
|
|
|
+ data=result_concat, columns=concat_column_names)
|
|
|
|
|
|
if with_2d_embedding:
|
|
|
- return result_concat, result_reduced #returns both 5D and 2D embeddings
|
|
|
+ #returns both 5D and 2D embeddings
|
|
|
+ return result_concat, result_reduced
|
|
|
else:
|
|
|
- return result_concat #returns 5D embedding only
|
|
|
+ #returns 5D embedding only
|
|
|
+ return result_concat
|