Explorar o código

Generalized dimension selection in clustering.

Kristian Schultz hai 3 anos
pai
achega
62b3527923
Modificáronse 2 ficheiros con 10216 adicións e 10204 borrados
  1. 10137 10137
      agglo_5dim_2NN_v3.ipynb
  2. 79 67
      fdc/fdc.py

A diferenza do arquivo foi suprimida porque é demasiado grande
+ 10137 - 10137
agglo_5dim_2NN_v3.ipynb


+ 79 - 67
fdc/fdc.py

@@ -29,96 +29,106 @@ def feature_clustering(UMAP_neb, min_dist_UMAP, metric, data, visual=False):
 
 @jit(nopython=True)
 def canberra_modified(a,b):
-    return np.sqrt(np.sum(np.array([np.abs(1.0 - x) / (1.0 + np.abs(x)) for x in (np.abs(a-b) + 1.0)])))
+    return np.sqrt(np.sum(np.array(
+        [np.abs(1.0 - x) / (1.0 + np.abs(x)) for x in (np.abs(a-b) + 1.0)]
+        )))
 
 
 
 class Clustering:
-    def __init__(self, metric='euclidian', UMAP_neb=30, min_dist_UMAP=0.1):
+    def __init__(self, metric='euclidian', UMAP_neb=30, min_dist_UMAP=0.1, max_components=2):
         self.metric = metric
         self.UMAP_neb = UMAP_neb
         self.min_dist_UMAP = min_dist_UMAP
+        self.max_components = max_components
 
-    def fit(self, data):
-        def normalize(x):
-            return (x - np.mean(x)) / np.std(x)
+    def normalize(self, x):
+        return (x - np.mean(x)) / np.std(x)
 
+    def fit(self, data):
         np.random.seed(42)
-        data_embedded = umap.UMAP(
-            n_neighbors=self.UMAP_neb
-            , min_dist=self.min_dist_UMAP
-            , n_components=2
-            , metric=self.metric
-            , random_state=42
-            ).fit_transform(data)
-
-        data_embedded[:, 0] = normalize(data_embedded[:, 0])
-        data_embedded[:, 1] = normalize(data_embedded[:, 1])
+
+        # ensure that the data is a 2d array.
+        if len(data.shape) < 2:
+            data = data.reshape((data.shape[0], 1))
+
+        # do UMAP if needed (e.g. data has more than 2 features)
+        if data.shape[1] > self.max_components:
+            data_embedded = umap.UMAP(
+                n_neighbors=self.UMAP_neb
+                , min_dist=self.min_dist_UMAP
+                , n_components=self.max_components
+                , metric=self.metric
+                , random_state=42
+                ).fit_transform(data)
+        else:
+            data_embedded = data
+
+        # normalize the data
+        for n in range(data_embedded.shape[1]):
+            data_embedded[:, n] = self.normalize(data_embedded[:, n])
         
         return data_embedded
 
 
 
 class FDC:
-    def __init__(self, clustering_cont=None, clustering_ord=None, clustering_nom=None, drop_nominal=True, visual=False, with_2d_embedding=False, use_pandas_output=False):
+    def __init__(self,
+                 clustering_cont=None, clustering_ord=None, clustering_nom=None,
+                 visual=False,
+                 with_2d_embedding=False,
+                 use_pandas_output=False
+                 ):
         # used clusterings
-        self.clustering_cont = clustering_cont or Clustering('euclidian', 30, 0.1)
-        self.clustering_ord = clustering_ord or Clustering(canberra_modified, 30, 0.1)
-        self.clustering_nom = clustering_nom or Clustering('hamming', 30, 0.1)
+        self.clustering_cont = value(clustering_cont, Clustering('euclidian', 30, 0.1))
+        self.clustering_ord = value(clustering_ord, Clustering(canberra_modified, 30, 0.1))
+        self.clustering_nom = value(clustering_nom, Clustering('hamming', 30, 0.1, max_components=1))
 
         # Control of data output
         self.use_pandas_output = use_pandas_output
         self.with_2d_embedding = with_2d_embedding
-        self.drop_nominal = drop_nominal
 
         # Control if a graph is shown
         self.visual = visual
 
-        # Lists to select columns for continueous, nominal and ordinal data.
+        # Lists to select columns for continuous, nominal and ordinal data.
         self.cont_list = None
         self.nom_list = None
         self.ord_list = None
         
-
-    def calc_embedding(self, clustering, data, column_list):
-        if column_list is not None:
-            return clustering.fit(data[column_list])
-        else:
-            return None
-
-
-    def normalize(self, data, cont_list=None, nom_list=None, ord_list=None, with_2d_embedding=False, visual=None):
-        np.random.seed(42)
+    def selectFeatures(self, continueous=None, nomial=None, ordinal=None):
+        self.cont_list = continueous
+        self.nom_list = nomial
+        self.ord_list = ordinal
+
+    def normalize(self, data,
+                  cont_list=None, nom_list=None, ord_list=None,
+                  with_2d_embedding=None,
+                  visual=None
+                  ):
+
+        # Take instance value if parameter was not given.
         visual = value(visual, self.visual)
+        with_2d_embedding = value(with_2d_embedding, self.with_2d_embedding)
+        
+        # Initialize data. 
+        np.random.seed(42)
         concat_column_names = []
         concat_lists = []
 
+        # Reducing features into 2dim or 1dim
+        actions = [
+            ("CONT", self.clustering_cont, value(cont_list, self.cont_list))
+            , ("ORD", self.clustering_ord, value(ord_list, self.ord_list))
+            , ("CONT", self.clustering_nom, value(nom_list, self.nom_list))
+            ]
 
-        # Reducing continueous features into 2dim
-        cont_emb = self.calc_embedding(self.clustering_cont, data, value(cont_list, self.cont_list))
-        if cont_emb is not None:
-            concat_lists.append(cont_emb)
-            concat_column_names.extend(['CONT_UMAP_0', 'CONT_UMAP_1'])
-
-
-        # Reducing ordinal features into 2dim
-        ord_emb = self.calc_embedding(self.clustering_ord, data, value(ord_list, self.ord_list))
-        if ord_emb is not None:
-            concat_lists.append(ord_emb)
-            concat_column_names.extend(['ORD_UMAP_0', 'ORD_UMAP_1'])
-
-
-        # Reducing nominal features into 2dim
-        nom_emb = self.calc_embedding(self.clustering_nom, data, value(nom_list, self.nom_list))
-        if nom_emb is not None:
-            concat_column_names.append('NOM_UMAP_0')
-
-            if self.drop_nominal:
-                nom_emb = nom_emb[:, 0].reshape((nom_emb.shape[0], 1))
-            else:
-                concat_column_names.append('NOM_UMAP_1')
-
-            concat_lists.append(nom_emb)
+        for (name, clustering, column_list) in actions:
+            if column_list is not None:
+                emb = clustering.fit(data[column_list])
+                concat_lists.append(emb)
+                for n in range(emb.shape[1]):
+                    concat_column_names.append(f"{name}_UMAP_{n}")
 
         # Merge results
         if concat_lists == []:
@@ -126,8 +136,7 @@ class FDC:
 
         result_concat = np.concatenate(concat_lists, axis=1)
 
-
-        # Create 2d embedding
+        # Create 2d embedding from 5d embedding
         if with_2d_embedding or visual:
             result_reduced = umap.UMAP(
                 n_neighbors=30
@@ -135,25 +144,28 @@ class FDC:
                 , n_components=2
                 , metric='euclidean'
                 , random_state=42
-                ).fit_transform(result_concat) #reducing 5D embeddings to 2D using UMAP
+                ).fit_transform(result_concat)
         
             if self.use_pandas_output:
-                result_reduced = pd.DataFrame(data=result_reduced, columns=['UMAP_0', 'UMAP_1'])
+                result_reduced = pd.DataFrame(
+                    data=result_reduced, columns=['UMAP_0', 'UMAP_1'])
 
             # Show mapping if needed
             if visual:
                 if self.use_pandas_output:
                     plotMapping(result_reduced)
                 else:
-                    plotMapping(pd.DataFrame(data=result_reduced, columns=['UMAP_0', 'UMAP_1']))
+                    plotMapping(pd.DataFrame(
+                        data=result_reduced, columns=['UMAP_0', 'UMAP_1']))
 
-
-        # Return the results
+        # Transform to pandas DataFrame if needed.
         if self.use_pandas_output:
-            result_concat = pd.DataFrame(data=result_concat, columns=concat_column_names)
-
+            result_concat = pd.DataFrame(
+                data=result_concat, columns=concat_column_names)
 
         if with_2d_embedding:
-            return result_concat, result_reduced #returns both 5D and 2D embeddings
+            #returns both 5D and 2D embeddings
+            return result_concat, result_reduced
         else:
-            return result_concat #returns 5D embedding only
+            #returns 5D embedding only
+            return result_concat

Algúns arquivos non se mostraron porque demasiados arquivos cambiaron neste cambio