Browse Source

Added library for FDC.

Kristian Schultz 3 năm trước
mục cha
commit
a867e84a08
3 tập tin đã thay đổi với 10328 bổ sung và 10203 xóa
  1. 10154 10203
      agglo_5dim_2NN_v3.ipynb
  2. BIN
      fdc/__pycache__/fdc.cpython-39.pyc
  3. 174 0
      fdc/fdc.py

Những thay đổi đã bị ẩn vì tệp quá lớn
+ 10154 - 10203
agglo_5dim_2NN_v3.ipynb


BIN
fdc/__pycache__/fdc.cpython-39.pyc


+ 174 - 0
fdc/fdc.py

@@ -0,0 +1,174 @@
+# --[ Known to be used ]----
+import numpy as np
+from numba import jit
+import umap.umap_ as umap
+
+# --[ Known to be used but can we avoid it? ]----
+import pandas as pd
+import seaborn as sns
+import matplotlib.pyplot as plt
+
+
+def value(v, defaultValue):
+    """Return ``v``, or ``defaultValue`` when ``v`` is ``None``.
+
+    Only ``None`` triggers the fallback; other falsy values such as ``0``,
+    ``False`` or an empty list are handed back unchanged.
+    """
+    return defaultValue if v is None else v
+
+def draw2dMapping(data):
+    """Show a scatter plot of a two-dimensional embedding.
+
+    Parameters
+    ----------
+    data : pandas.DataFrame
+        Table providing the columns ``UMAP_0`` (x axis) and ``UMAP_1``
+        (y axis).
+
+    Notes
+    -----
+    The custom palette is installed through ``sns.set_palette`` and the
+    figure is displayed with ``plt.show()``; nothing is returned.
+    """
+    colors_set1 = ["lightcoral", "lightseagreen", "mediumorchid", "orange", "burlywood", "cornflowerblue", "plum", "yellowgreen"]
+
+    # sns.set_palette() has no return value: it installs the palette as the
+    # current default.  The previous code bound that (always None) result
+    # to a variable and passed it as ``palette=``, which was misleading.
+    sns.set_palette(sns.color_palette(colors_set1))
+
+    sns.lmplot(
+        x="UMAP_0",
+        y="UMAP_1",
+        data=data,
+        fit_reg=False,   # plain scatter plot, no regression line
+        legend=False,
+        scatter_kws={"s": 3},
+    )
+    plt.show()
+
+
+
+def feature_clustering(UMAP_neb, min_dist_UMAP, metric, data, visual=False):
+    """Embed ``data`` into two dimensions and return the result as a table.
+
+    Parameters
+    ----------
+    UMAP_neb
+        Forwarded to ``Clustering`` as its ``UMAP_neb`` argument.
+    min_dist_UMAP
+        Forwarded to ``Clustering`` as its ``min_dist_UMAP`` argument.
+    metric
+        Forwarded to ``Clustering`` as its ``metric`` argument.
+    data
+        The data handed to ``Clustering.fit``.
+    visual : bool
+        When true, the embedding is also plotted with ``draw2dMapping``.
+
+    Returns
+    -------
+    pandas.DataFrame
+        The embedding with the columns ``UMAP_0`` and ``UMAP_1``.
+    """
+    reducer = Clustering(metric, UMAP_neb, min_dist_UMAP)
+    embedding = pd.DataFrame(data=reducer.fit(data), columns=['UMAP_0', 'UMAP_1'])
+
+    if visual:
+        draw2dMapping(embedding)
+
+    return embedding
+
+
+
+@jit(nopython=True)
+def modified_can(a,b):
+    """Distance between the vectors ``a`` and ``b``, compiled with numba.
+
+    Every component contributes ``|1 - x| / (1 + |x|)`` with
+    ``x = |a_i - b_i| + 1``; the result is the square root of the sum of
+    these contributions.  ``FDC`` uses this function as the default metric
+    for ordinal columns.
+    """
+    return np.sqrt(np.sum(np.array([np.abs(1.0 - x) / (1.0 + np.abs(x)) for x in (np.abs(a-b) + 1.0)])))
+
+
+
+class Clustering:
+    """Two-dimensional UMAP embedding with standardized output axes.
+
+    Parameters
+    ----------
+    metric : str or callable
+        Metric handed to ``umap.UMAP``.  The historical misspelling
+        ``'euclidian'`` is accepted and treated as ``'euclidean'``.
+    UMAP_neb : int
+        Passed to ``umap.UMAP`` as ``n_neighbors``.
+    min_dist_UMAP : float
+        Passed to ``umap.UMAP`` as ``min_dist``.
+    """
+
+    def __init__(self, metric='euclidean', UMAP_neb=30, min_dist_UMAP=0.1):
+        # The default used to be the misspelled 'euclidian', which is not a
+        # metric name known to UMAP.
+        self.metric = metric
+        self.UMAP_neb = UMAP_neb
+        self.min_dist_UMAP = min_dist_UMAP
+
+    def fit(self, data):
+        """Embed ``data`` into two dimensions.
+
+        The NumPy global random generator is seeded with 42 and UMAP is run
+        with ``random_state=42``.  Each of the two output columns is then
+        shifted to zero mean and scaled to unit standard deviation.
+
+        Returns
+        -------
+        The array returned by ``fit_transform`` with both columns
+        standardized in place.
+        """
+        def normalize(x):
+            return (x - np.mean(x)) / np.std(x)
+
+        # Keep callers that still pass the old misspelling working.
+        metric = self.metric
+        if isinstance(metric, str) and metric == 'euclidian':
+            metric = 'euclidean'
+
+        np.random.seed(42)
+        data_embedded = umap.UMAP(
+            n_neighbors=self.UMAP_neb,
+            min_dist=self.min_dist_UMAP,
+            n_components=2,
+            metric=metric,
+            random_state=42,
+        ).fit_transform(data)
+
+        data_embedded[:, 0] = normalize(data_embedded[:, 0])
+        data_embedded[:, 1] = normalize(data_embedded[:, 1])
+
+        return data_embedded
+
+
+
+class FDC:
+    """Reduce continuous, ordinal and nominal columns separately and merge them.
+
+    Each column group is embedded into two dimensions by its own clustering
+    object (anything with a ``fit(data)`` method returning a two-column
+    array).  The per-group embeddings are concatenated column-wise.
+
+    Parameters
+    ----------
+    clustering_cont, clustering_ord, clustering_nom
+        Clustering objects for the continuous, ordinal and nominal columns.
+        When omitted, ``Clustering`` instances with the metrics
+        ``'euclidean'``, ``modified_can`` and ``'hamming'`` are used.
+    drop_nominal : bool
+        When true, only the first component of the nominal embedding is
+        kept.
+    visual : bool
+        Default for the ``visual`` argument of ``normalize``.
+    with_2d_embedding : bool
+        Default for the ``with_2d_embedding`` argument of ``normalize``.
+    use_pandas_output : bool
+        When true, ``normalize`` returns DataFrames instead of arrays.
+    """
+
+    def __init__(self, clustering_cont=None, clustering_ord=None, clustering_nom=None, drop_nominal=True, visual=False, with_2d_embedding=False, use_pandas_output=False):
+        # Used clusterings.  The continuous default used to be created with
+        # the misspelled metric name 'euclidian'.
+        self.clustering_cont = clustering_cont or Clustering('euclidean', 30, 0.1)
+        self.clustering_ord = clustering_ord or Clustering(modified_can, 30, 0.1)
+        self.clustering_nom = clustering_nom or Clustering('hamming', 30, 0.1)
+
+        # Control of data output
+        self.use_pandas_output = use_pandas_output
+        self.with_2d_embedding = with_2d_embedding
+        self.drop_nominal = drop_nominal
+
+        # Control if a graph is shown
+        self.visual = visual
+
+        # Lists to select columns for continuous, nominal and ordinal data.
+        # They serve as defaults when ``normalize`` gets no explicit list.
+        self.cont_list = None
+        self.nom_list = None
+        self.ord_list = None
+
+    def calc_embedding(self, clustering, data, column_list):
+        """Embed the selected columns, or return ``None`` if none are selected.
+
+        Parameters
+        ----------
+        clustering
+            Object whose ``fit`` method computes the embedding.
+        data
+            Container indexed as ``data[column_list]``.
+        column_list
+            Column selection; ``None`` or an empty selection means that this
+            feature group is skipped.
+        """
+        if column_list is None:
+            return None
+
+        # An empty selection has nothing to embed; treat it like None
+        # instead of handing zero columns to the clustering.
+        if hasattr(column_list, '__len__') and len(column_list) == 0:
+            return None
+
+        return clustering.fit(data[column_list])
+
+    def normalize(self, data, cont_list=None, nom_list=None, ord_list=None, with_2d_embedding=None, visual=None):
+        """Compute the concatenated per-group embedding of ``data``.
+
+        Parameters
+        ----------
+        data
+            Container indexed with the column lists.
+        cont_list, nom_list, ord_list
+            Column selections for continuous, nominal and ordinal features.
+            ``None`` falls back to ``self.cont_list``, ``self.nom_list`` and
+            ``self.ord_list``.
+        with_2d_embedding : bool, optional
+            Whether a further 2D reduction of the concatenated embedding is
+            returned as well.  ``None`` falls back to
+            ``self.with_2d_embedding``.
+        visual : bool, optional
+            Whether the 2D reduction is plotted.  ``None`` falls back to
+            ``self.visual``.
+
+        Returns
+        -------
+        The concatenated embedding with the columns ``CONT_UMAP_0``,
+        ``CONT_UMAP_1``, ``ORD_UMAP_0``, ``ORD_UMAP_1``, ``NOM_UMAP_0`` and,
+        unless ``drop_nominal`` is set, ``NOM_UMAP_1`` (only for the groups
+        that were selected).  With ``with_2d_embedding`` a tuple
+        ``(concatenated, reduced_2d)`` is returned.  Both are DataFrames
+        when ``use_pandas_output`` is set, arrays otherwise.
+
+        Raises
+        ------
+        ValueError
+            If no feature group has any column selected.
+        """
+        np.random.seed(42)
+        visual = value(visual, self.visual)
+        # Honour the constructor setting; previously it was stored but the
+        # parameter default (False) always took precedence.
+        with_2d_embedding = value(with_2d_embedding, self.with_2d_embedding)
+
+        concat_column_names = []
+        concat_lists = []
+
+        # Reducing continuous features into 2dim
+        cont_emb = self.calc_embedding(self.clustering_cont, data, value(cont_list, self.cont_list))
+        if cont_emb is not None:
+            concat_lists.append(cont_emb)
+            concat_column_names.extend(['CONT_UMAP_0', 'CONT_UMAP_1'])
+
+        # Reducing ordinal features into 2dim
+        ord_emb = self.calc_embedding(self.clustering_ord, data, value(ord_list, self.ord_list))
+        if ord_emb is not None:
+            concat_lists.append(ord_emb)
+            concat_column_names.extend(['ORD_UMAP_0', 'ORD_UMAP_1'])
+
+        # Reducing nominal features into 2dim
+        nom_emb = self.calc_embedding(self.clustering_nom, data, value(nom_list, self.nom_list))
+        if nom_emb is not None:
+            concat_column_names.append('NOM_UMAP_0')
+
+            if self.drop_nominal:
+                # Keep only the first component, as a single column.
+                nom_emb = nom_emb[:, 0].reshape((nom_emb.shape[0], 1))
+            else:
+                concat_column_names.append('NOM_UMAP_1')
+
+            concat_lists.append(nom_emb)
+
+        # Merge results
+        if not concat_lists:
+            raise ValueError("Expected at least one non empty column list.")
+
+        result_concat = np.concatenate(concat_lists, axis=1)
+
+        # Create 2d embedding (needed for the return value and for plotting)
+        result_reduced = None
+        if with_2d_embedding or visual:
+            result_reduced = umap.UMAP(
+                n_neighbors=30,
+                min_dist=0.001,
+                n_components=2,
+                metric='euclidean',
+                random_state=42,
+            ).fit_transform(result_concat)  # concatenated embedding -> 2D
+
+            if visual or self.use_pandas_output:
+                reduced_frame = pd.DataFrame(data=result_reduced, columns=['UMAP_0', 'UMAP_1'])
+
+                # Show mapping if needed
+                if visual:
+                    draw2dMapping(reduced_frame)
+
+                if self.use_pandas_output:
+                    result_reduced = reduced_frame
+
+        # Return the results
+        if self.use_pandas_output:
+            result_concat = pd.DataFrame(data=result_concat, columns=concat_column_names)
+
+        if with_2d_embedding:
+            return result_concat, result_reduced  # concatenated and 2D embeddings
+
+        return result_concat  # concatenated embedding only

Một số tệp đã không được hiển thị vì có quá nhiều tập tin thay đổi trong lần so sánh này