# --[ Known to be used ]----
import numpy as np
from numba import jit
import umap.umap_ as umap

# --[ Known to be used but can we avoid it? ]----
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


def value(v, defaultValue):
    """Return ``v``, or ``defaultValue`` when ``v`` is ``None``."""
    if v is None:
        return defaultValue
    return v


def draw2dMapping(data):
    """Show a scatter plot of the ``UMAP_0`` / ``UMAP_1`` columns of ``data``.

    Side effects: installs the custom colour list as the seaborn default
    palette and calls ``plt.show()``.

    Args:
        data: Table with ``UMAP_0`` and ``UMAP_1`` columns (the callers in
            this module pass a ``pandas.DataFrame``).
    """
    colors_set1 = ["lightcoral", "lightseagreen", "mediumorchid", "orange",
                   "burlywood", "cornflowerblue", "plum", "yellowgreen"]

    # sns.set_palette() returns None, so the palette has to be built first
    # and kept in its own variable; otherwise ``palette=None`` is what ends
    # up being handed to lmplot.
    custom_palette = sns.color_palette(colors_set1)
    sns.set_palette(custom_palette)

    sns.lmplot(
        x="UMAP_0",
        y="UMAP_1",
        data=data,
        fit_reg=False,
        legend=False,
        scatter_kws={"s": 3},
        palette=custom_palette,
    )
    plt.show()


def feature_clustering(UMAP_neb, min_dist_UMAP, metric, data, visual=False):
    """Embed ``data`` into two dimensions and return it as a DataFrame.

    Args:
        UMAP_neb: Forwarded to ``Clustering`` (UMAP ``n_neighbors``).
        min_dist_UMAP: Forwarded to ``Clustering`` (UMAP ``min_dist``).
        metric: Forwarded to ``Clustering`` (UMAP ``metric``).
        data: Input handed to ``Clustering.fit``.
        visual: When true, the embedding is plotted with ``draw2dMapping``.

    Returns:
        ``pandas.DataFrame`` with the columns ``UMAP_0`` and ``UMAP_1``.
    """
    data_embedded = Clustering(metric, UMAP_neb, min_dist_UMAP).fit(data)
    result = pd.DataFrame(data=data_embedded, columns=['UMAP_0', 'UMAP_1'])
    if visual:
        draw2dMapping(result)
    return result


@jit(nopython=True)
def modified_can(a, b):
    """Distance between two vectors used as the default ordinal metric.

    With ``x = |a - b| + 1`` per component, this returns
    ``sqrt(sum(|1 - x| / (1 + |x|)))``, i.e. each component contributes
    ``|a - b| / (2 + |a - b|)``.
    """
    # Expression deliberately left in its original form so the compiled
    # numerics stay exactly as before.
    return np.sqrt(np.sum(np.array([np.abs(1.0 - x) / (1.0 + np.abs(x)) for x in (np.abs(a-b) + 1.0)])))


class Clustering:
    """Two-dimensional UMAP embedding with per-axis standardisation."""

    def __init__(self, metric='euclidean', UMAP_neb=30, min_dist_UMAP=0.1):
        """Store the UMAP settings.

        Args:
            metric: UMAP metric name or callable. The historical misspelling
                ``'euclidian'`` is still accepted and treated as
                ``'euclidean'`` when fitting.
            UMAP_neb: Passed to UMAP as ``n_neighbors``.
            min_dist_UMAP: Passed to UMAP as ``min_dist``.
        """
        self.metric = metric
        self.UMAP_neb = UMAP_neb
        self.min_dist_UMAP = min_dist_UMAP

    def fit(self, data):
        """Embed ``data`` into two dimensions and standardise both axes.

        Reseeds NumPy's global random generator with 42 and also passes
        ``random_state=42`` to UMAP.

        Args:
            data: Input handed to ``umap.UMAP(...).fit_transform``.

        Returns:
            The two-column embedding; each column has its mean subtracted
            and is divided by its standard deviation.
        """
        def normalize(x):
            centred = x - np.mean(x)
            spread = np.std(x)
            # A constant axis would otherwise produce 0/0 = NaN.
            if spread == 0:
                return centred
            return centred / spread

        # 'euclidian' is not a metric UMAP knows; earlier versions of this
        # module used that spelling, so map it to the real name instead of
        # failing inside UMAP.
        metric = self.metric
        if isinstance(metric, str) and metric == 'euclidian':
            metric = 'euclidean'

        np.random.seed(42)
        data_embedded = umap.UMAP(
            n_neighbors=self.UMAP_neb,
            min_dist=self.min_dist_UMAP,
            n_components=2,
            metric=metric,
            random_state=42,
        ).fit_transform(data)

        data_embedded[:, 0] = normalize(data_embedded[:, 0])
        data_embedded[:, 1] = normalize(data_embedded[:, 1])
        return data_embedded


class FDC:
    """Reduce continuous, ordinal and nominal columns to a joint embedding.

    Each column group is embedded separately by its own ``Clustering``
    object; the per-group embeddings are concatenated column-wise and can
    optionally be reduced once more to two dimensions.
    """

    def __init__(self, clustering_cont=None, clustering_ord=None, clustering_nom=None, drop_nominal=True, visual=False, with_2d_embedding=False, use_pandas_output=False):
        """Configure the clusterings and the output format.

        Args:
            clustering_cont: Clustering for continuous columns; defaults to
                ``Clustering('euclidean', 30, 0.1)``.
            clustering_ord: Clustering for ordinal columns; defaults to
                ``Clustering(modified_can, 30, 0.1)``.
            clustering_nom: Clustering for nominal columns; defaults to
                ``Clustering('hamming', 30, 0.1)``.
            drop_nominal: When true only the first nominal embedding axis
                is kept.
            visual: Default for whether ``normalize`` shows a plot.
            with_2d_embedding: Default for whether ``normalize`` also
                returns the final 2D embedding.
            use_pandas_output: Return DataFrames instead of NumPy arrays.
        """
        # used clusterings
        self.clustering_cont = clustering_cont or Clustering('euclidean', 30, 0.1)
        self.clustering_ord = clustering_ord or Clustering(modified_can, 30, 0.1)
        self.clustering_nom = clustering_nom or Clustering('hamming', 30, 0.1)

        # Control of data output
        self.use_pandas_output = use_pandas_output
        self.with_2d_embedding = with_2d_embedding
        self.drop_nominal = drop_nominal

        # Control if a graph is shown
        self.visual = visual

        # Lists to select columns for continuous, nominal and ordinal data.
        self.cont_list = None
        self.nom_list = None
        self.ord_list = None

    def calc_embedding(self, clustering, data, column_list):
        """Fit ``clustering`` on ``data[column_list]``.

        Args:
            clustering: Object with a ``fit(data)`` method.
            data: Indexable with ``column_list``.
            column_list: Column selection, or ``None`` / empty for "no
                columns of this kind".

        Returns:
            The result of ``clustering.fit``, or ``None`` when
            ``column_list`` is ``None`` or empty.
        """
        if column_list is None:
            return None

        # An empty selection means "nothing of this kind"; passing it on
        # would only fail inside UMAP. Selections without a length (e.g. a
        # slice) are passed through untouched.
        try:
            if len(column_list) == 0:
                return None
        except TypeError:
            pass

        return clustering.fit(data[column_list])

    def normalize(self, data, cont_list=None, nom_list=None, ord_list=None, with_2d_embedding=None, visual=None):
        """Embed every configured column group and concatenate the results.

        Reseeds NumPy's global random generator with 42.

        Args:
            data: Table indexable by the column lists.
            cont_list: Continuous columns; falls back to ``self.cont_list``.
            nom_list: Nominal columns; falls back to ``self.nom_list``.
            ord_list: Ordinal columns; falls back to ``self.ord_list``.
            with_2d_embedding: Also compute and return a 2D embedding of the
                concatenated result; ``None`` falls back to the value given
                to the constructor.
            visual: Plot the 2D embedding; ``None`` falls back to
                ``self.visual``.

        Returns:
            The concatenated embedding (columns ``CONT_UMAP_*``,
            ``ORD_UMAP_*``, ``NOM_UMAP_*`` for the groups that were
            supplied), or a tuple ``(concatenated, reduced_2d)`` when
            ``with_2d_embedding`` is true. Both are DataFrames when
            ``use_pandas_output`` is set, NumPy arrays otherwise.

        Raises:
            ValueError: If no column group yields an embedding.
        """
        np.random.seed(42)

        visual = value(visual, self.visual)
        # The constructor option was previously stored but never consulted.
        with_2d_embedding = value(with_2d_embedding, self.with_2d_embedding)

        concat_column_names = []
        concat_lists = []

        # Reducing continuous features into 2dim
        cont_emb = self.calc_embedding(self.clustering_cont, data, value(cont_list, self.cont_list))
        if cont_emb is not None:
            concat_lists.append(cont_emb)
            concat_column_names.extend(['CONT_UMAP_0', 'CONT_UMAP_1'])

        # Reducing ordinal features into 2dim
        ord_emb = self.calc_embedding(self.clustering_ord, data, value(ord_list, self.ord_list))
        if ord_emb is not None:
            concat_lists.append(ord_emb)
            concat_column_names.extend(['ORD_UMAP_0', 'ORD_UMAP_1'])

        # Reducing nominal features into 2dim
        nom_emb = self.calc_embedding(self.clustering_nom, data, value(nom_list, self.nom_list))
        if nom_emb is not None:
            concat_column_names.append('NOM_UMAP_0')
            if self.drop_nominal:
                # Keep only the first axis, as a column vector.
                nom_emb = nom_emb[:, 0].reshape((nom_emb.shape[0], 1))
            else:
                concat_column_names.append('NOM_UMAP_1')
            concat_lists.append(nom_emb)

        # Merge results
        if not concat_lists:
            raise ValueError("Expected at least one non empty column list.")

        result_concat = np.concatenate(concat_lists, axis=1)

        # Create 2d embedding
        if with_2d_embedding or visual:
            # Reduce the concatenated per-group embeddings to 2D using UMAP.
            result_reduced = umap.UMAP(
                n_neighbors=30,
                min_dist=0.001,
                n_components=2,
                metric='euclidean',
                random_state=42,
            ).fit_transform(result_concat)

            if self.use_pandas_output:
                result_reduced = pd.DataFrame(data=result_reduced, columns=['UMAP_0', 'UMAP_1'])

            # Show mapping if needed
            if visual:
                if self.use_pandas_output:
                    draw2dMapping(result_reduced)
                else:
                    draw2dMapping(pd.DataFrame(data=result_reduced, columns=['UMAP_0', 'UMAP_1']))

        # Return the results
        if self.use_pandas_output:
            result_concat = pd.DataFrame(data=result_concat, columns=concat_column_names)

        if with_2d_embedding:
            # Both the concatenated and the 2D embedding.
            return result_concat, result_reduced

        # Concatenated embedding only.
        return result_concat