3 лет назад · 2e38ecba01
--- a/4.ipynb
+++ b/4.ipynb
--- a/fdc/fdc.py
+++ b/fdc/fdc.py
@@ -126,7 +126,7 @@ class FDC:
 
															         actions = [
														
 
															             ("CONT", self.clustering_cont, value(cont_list, self.cont_list))
														
 
															             , ("ORD", self.clustering_ord, value(ord_list, self.ord_list))
														
 
															-            , ("CONT", self.clustering_nom, value(nom_list, self.nom_list))
														
 
															+            , ("NOM", self.clustering_nom, value(nom_list, self.nom_list))
														
 
															             ]
														
 
															         for (name, clustering, column_list) in actions:
														
--- a/fdc/visualize.py
+++ b/fdc/visualize.py
@@ -58,9 +58,8 @@ def vizx(feature_list, cluster_df_list, main_data, umap_data, cont_features, rev
 
															     if len(main_data[feature].value_counts()) <= vizlimit:
														
 
															       for cluster_counter, cluster in enumerate(cluster_df_list):
														
 
															         print('Cluster '+ str(cluster_counter + 1) + ' frequency distribution')
														
 
															-        if feature in list(rev_dict.keys()):
														
 
															-          feat_keys=rev_dict[feature]
														
 
															-          r = dict(zip(feat_keys.values(), feat_keys.keys()))
														
 
															+        if feature in rev_dict:
														
 
															+          r = rev_dict[feature]
														
 
															           print(cluster.replace({feature:r})[feature].value_counts())
														
 
															         else:
														
 
															           print(cluster[feature].value_counts())
														
@@ -71,7 +70,7 @@ def vizx(feature_list, cluster_df_list, main_data, umap_data, cont_features, rev
 
															       cluster_bar = []
														
 
															       for cluster in cluster_df_list:
														
 
															-        if feature in list(rev_dict.keys()):
														
 
															+        if feature in rev_dict:
														
 
															           y = np.array(cluster.replace({feature:r})[feature].value_counts())
														
 
															           x = np.array(cluster.replace({feature:r})[feature].value_counts().index)
														
 
															           cluster_bar.append([x,y])
														
@@ -138,9 +137,8 @@ def vizx(feature_list, cluster_df_list, main_data, umap_data, cont_features, rev
 
															     if feature not in cont_features:
														
 
															       print('Feature distribution in UMAP embedding')
														
 
															-      if feature in list(rev_dict.keys()):
														
 
															-        feat_keys=rev_dict[feature]
														
 
															-        r = dict(zip(feat_keys.values(), feat_keys.keys()))
														
 
															+      if feature in rev_dict:
														
 
															+        r = rev_dict[feature]
														
 
															         umap_data[feature] = np.array(main_data.replace({feature:r})[feature])
														
 
															       else:
														
 
															         umap_data[feature] = np.array(main_data[feature])
														
--- a/fdcTool.py
+++ b/fdcTool.py
@@ -0,0 +1,161 @@
 
															+import warnings
														
 
															+warnings.filterwarnings('ignore')
														
 
															+
														
 
															+import json
														
 
															+
														
 
															+import pandas as pd
														
 
															+import numpy as np
														
 
															+
														
 
															+from fdc.visualize import plotCluster
														
 
															+from fdc.tools import Timing
														
 
															+from fdc.missingValues import fix_missing_values
														
 
															+from fdc.fdc import canberra_modified, FDC, Clustering
														
 
															+
														
 
															+
														
 
															+def indent(text, i="  "):
														
 
															+  result = ""
														
 
															+  for x in text.split("\n"):
														
 
															+    result += i + x + "\n"
														
 
															+  return result
														
 
															+
														
 
															+def indentPair(a, b, e="", i="  "):
														
 
															+  m = a + " "
														
 
															+  if len(m) < 32:
														
 
															+    m += "_" * (32 - len(m))
														
 
															+  if len(b) < 16:
														
 
															+    m += "_" * (10 - len(b))
														
 
															+  m += " "
														
 
															+  m += b
														
 
															+  if e == False:
														
 
															+    pass
														
 
															+  elif e == True:
														
 
															+    m += " *"
														
 
															+  else:
														
 
															+    m += e
														
 
															+  print("  " + m)
														
 
															+
														
 
															+
														
 
															+
														
 
															+class FdcToolbox:
														
 
															+
														
 
															+  def __init__(self, file_name, index_col=0):
														
 
															+    data = pd.read_csv(file_name, index_col=0)
														
 
															+    self.data = data.sample(frac=1)
														
 
															+
														
 
															+    self.value_dict = {}
														
 
															+    self.value_dict_rev = {}
														
 
															+    self.cols_cont = []
														
 
															+    self.cols_ord = []
														
 
															+    self.cols_nom = []
														
 
															+
														
 
															+    for k in self.data.dtypes.keys():
														
 
															+      t = str(self.data.dtypes[k])
														
 
															+      if t[:3] == "int":
														
 
															+        self.cols_ord.append(k)
														
 
															+      elif t == "object":
														
 
															+        self.cols_nom.append(k)
														
 
															+      else:
														
 
															+        self.cols_cont.append(k)
														
 
															+
														
 
															+    self.has_missing_values = False
														
 
															+    self.updateMissingValuesState()
														
 
															+
														
 
															+  def updateMissingValuesState(self):
														
 
															+    self.has_missing_values = False
														
 
															+    for k in self.data.isna().sum():
														
 
															+      if k > 0:
														
 
															+        self.has_missing_values = True
														
 
															+        break
														
 
															+
														
 
															+  def showStatistic(self):
														
 
															+    print(f"Fratures: {self.data.shape[1]}")
														
 
															+    print(f"Points:   {self.data.shape[0]}")
														
 
															+    print(f"Columns:")
														
 
															+
														
 
															+    for k in self.data.dtypes.keys():
														
 
															+      t = str(self.data.dtypes[k])
														
 
															+      e = " c"
														
 
															+      if k in self.cols_ord:
														
 
															+        e = " o"
														
 
															+      if k in self.cols_nom:
														
 
															+        e = " n"
														
 
															+      indentPair(k, t, e)
														
 
															+    print()
														
 
															+    print(f"Missing values:")
														
 
															+
														
 
															+    n = 0
														
 
															+    d = self.data.isna().sum()
														
 
															+    for k in d.keys():
														
 
															+      if d[k] > 0:
														
 
															+        indentPair(k, str(d[k]))
														
 
															+        n += 1
														
 
															+    if n == 0:
														
 
															+      print("  none")
														
 
															+
														
 
															+
														
 
															+  def fixDatatypes(self):
														
 
															+    columnsToFix = []
														
 
															+    for k in self.data.dtypes.keys():
														
 
															+      if str(self.data.dtypes[k]) == "object":
														
 
															+        columnsToFix.append(k)
														
 
															+
														
 
															+    self.value_dict = {}
														
 
															+    self.value_dict_rev = {}
														
 
															+    for c in columnsToFix:
														
 
															+      histogram = self.data[c].value_counts()
														
 
															+      self.value_dict[c] = { k : n for n, k in enumerate(histogram.keys()) }
														
 
															+      self.value_dict_rev[c] = { n : k for n, k in enumerate(histogram.keys()) }
														
 
															+    
														
 
															+    if len(self.value_dict.keys()) > 0:
														
 
															+      self.data.replace(self.value_dict, inplace=True)
														
 
															+
														
 
															+  def fix_missing_values(self):
														
 
															+    self.data = fix_missing_values(self.data, 4)
														
 
															+    self.updateMissingValuesState()
														
 
															+
														
 
															+
														
 
															+
														
 
															+filename='healthcare-dataset-stroke-data.csv'
														
 
															+np.random.seed(42)
														
 
															+tb = FdcToolbox(filename)
														
 
															+tb.showStatistic()
														
 
															+
														
 
															+hasChanged = False
														
 
															+if len(tb.cols_nom) > 0:
														
 
															+  print()
														
 
															+  print("Fixing object datatypes ...")
														
 
															+  tb.fixDatatypes()
														
 
															+  print("done")
														
 
															+  hasChanged = True
														
 
															+
														
 
															+if tb.has_missing_values:
														
 
															+  print()
														
 
															+  print("Fix missing values ...")
														
 
															+  tb.fix_missing_values()
														
 
															+  print("done")
														
 
															+  hasChanged = True
														
 
															+
														
 
															+if hasChanged:
														
 
															+  print()
														
 
															+  tb.showStatistic()
														
 
															+  tb.data.to_csv(filename + "_fixed_values.csv")
														
 
															+  with open(filename + "_value_mapping.json", "w") as f:
														
 
															+    json.dump(tb.value_dict_rev, f)
														
 
															+
														
 
															+
														
 
															+print("Doing FDC ...")
														
 
															+fdc = FDC(clustering_cont=Clustering('euclidean')
														
 
															+          , clustering_ord=Clustering(canberra_modified)
														
 
															+          , clustering_nom=Clustering('hamming', max_components=1)
														
 
															+          , visual=False
														
 
															+          , use_pandas_output=True
														
 
															+          , with_2d_embedding=False
														
 
															+          )
														
 
															+
														
 
															+fdc.selectFeatures(continueous=tb.cols_cont, nomial=tb.cols_nom, ordinal=tb.cols_ord)
														
 
															+
														
 
															+entire_data_FDC_emb_five = fdc.normalize(tb.data)
														
 
															+entire_data_FDC_emb_five.to_csv(filename + "_fdc.csv")
														
 
															+print("done")
														
 
															+
														
 
															+