Ver Fonte

Added commandline toolbox. Changed meaning of rev_dict for easier use.

Kristian Schultz há 3 anos
pai
commit
2e38ecba01

Diff do ficheiro suprimido por ser muito extenso
+ 521 - 525
Is your data fit for decision making using Machine Learning version 4.ipynb


+ 1 - 1
fdc/fdc.py

@@ -126,7 +126,7 @@ class FDC:
         actions = [
             ("CONT", self.clustering_cont, value(cont_list, self.cont_list))
             , ("ORD", self.clustering_ord, value(ord_list, self.ord_list))
-            , ("CONT", self.clustering_nom, value(nom_list, self.nom_list))
+            , ("NOM", self.clustering_nom, value(nom_list, self.nom_list))
             ]
 
         for (name, clustering, column_list) in actions:

+ 5 - 7
fdc/visualize.py

@@ -58,9 +58,8 @@ def vizx(feature_list, cluster_df_list, main_data, umap_data, cont_features, rev
     if len(main_data[feature].value_counts()) <= vizlimit:
       for cluster_counter, cluster in enumerate(cluster_df_list):
         print('Cluster '+ str(cluster_counter + 1) + ' frequency distribution')
-        if feature in list(rev_dict.keys()):
-          feat_keys=rev_dict[feature]
-          r = dict(zip(feat_keys.values(), feat_keys.keys()))
+        if feature in rev_dict:
+          r = rev_dict[feature]
           print(cluster.replace({feature:r})[feature].value_counts())
         else:
           print(cluster[feature].value_counts())
@@ -71,7 +70,7 @@ def vizx(feature_list, cluster_df_list, main_data, umap_data, cont_features, rev
     
       cluster_bar = []
       for cluster in cluster_df_list:
-        if feature in list(rev_dict.keys()):
+        if feature in rev_dict:
           y = np.array(cluster.replace({feature:r})[feature].value_counts())
           x = np.array(cluster.replace({feature:r})[feature].value_counts().index)
           cluster_bar.append([x,y])
@@ -138,9 +137,8 @@ def vizx(feature_list, cluster_df_list, main_data, umap_data, cont_features, rev
     
     if feature not in cont_features:
       print('Feature distribution in UMAP embedding')
-      if feature in list(rev_dict.keys()):
-        feat_keys=rev_dict[feature]
-        r = dict(zip(feat_keys.values(), feat_keys.keys()))
+      if feature in rev_dict:
+        r = rev_dict[feature]
         umap_data[feature] = np.array(main_data.replace({feature:r})[feature])
       else:
         umap_data[feature] = np.array(main_data[feature])

+ 161 - 0
fdcTool.py

@@ -0,0 +1,161 @@
+import warnings
+warnings.filterwarnings('ignore')
+
+import json
+
+import pandas as pd
+import numpy as np
+
+from fdc.visualize import plotCluster
+from fdc.tools import Timing
+from fdc.missingValues import fix_missing_values
+from fdc.fdc import canberra_modified, FDC, Clustering
+
+
def indent(text, i="  "):
  """Return *text* with every line prefixed by *i*.

  Every line of the result — including the last one — ends with a
  newline, matching the original accumulation behavior.
  """
  # str.join builds the result in one pass instead of quadratic +=.
  return "".join(i + line + "\n" for line in text.split("\n"))
+
def indentPair(a, b, e="", i="  "):
  """Print one aligned key/value row for the statistics listing.

  *a* is padded with underscores to 32 characters, *b* gets a 10-wide
  underscore gutter, then a marker: e=True appends " *", e=False appends
  nothing, any other value (default "") is appended literally.
  *i* is the leading indent (previously declared but ignored; now
  honoured — the default keeps the old "  " output).
  """
  m = a + " "
  if len(m) < 32:
    m += "_" * (32 - len(m))
  # FIX: threshold was `< 16`, but the pad is `10 - len(b)`, which is
  # empty for len(b) in [10, 16) — `< 10` is equivalent and states the
  # actual intent.
  if len(b) < 10:
    m += "_" * (10 - len(b))
  m += " " + b
  # Identity checks: only the actual booleans are flags; any other
  # (possibly falsy) value is treated as a literal suffix.
  if e is False:
    pass
  elif e is True:
    m += " *"
  else:
    m += e
  print(i + m)
+
+
+
class FdcToolbox:
  """Command-line helper around FDC.

  Loads a CSV, classifies each column as continuous / ordinal / nominal
  by dtype, integer-encodes nominal (string) columns, and imputes
  missing values, keeping forward and reverse value mappings.
  """

  def __init__(self, file_name, index_col=0):
    """Load *file_name* and classify its columns.

    index_col: column to use as the DataFrame index (passed through to
    pandas.read_csv).
    """
    # FIX: index_col was accepted but ignored (read_csv hard-coded 0).
    data = pd.read_csv(file_name, index_col=index_col)
    # Shuffle rows once so downstream clustering is order-independent.
    self.data = data.sample(frac=1)

    self.value_dict = {}      # nominal column -> {label: int code}
    self.value_dict_rev = {}  # nominal column -> {int code: label}
    self.cols_cont = []       # continuous columns (float, ...)
    self.cols_ord = []        # ordinal columns (integer dtypes)
    self.cols_nom = []        # nominal columns (object/string dtype)

    # Classify by pandas dtype: int* -> ordinal, object -> nominal,
    # everything else (float, ...) -> continuous.
    for col, dtype in self.data.dtypes.items():
      t = str(dtype)
      if t.startswith("int"):
        self.cols_ord.append(col)
      elif t == "object":
        self.cols_nom.append(col)
      else:
        self.cols_cont.append(col)

    self.has_missing_values = False
    self.updateMissingValuesState()

  def updateMissingValuesState(self):
    """Recompute has_missing_values from the current data."""
    self.has_missing_values = bool(self.data.isna().any().any())

  def showStatistic(self):
    """Print a summary: shape, per-column dtype/kind, missing counts."""
    # FIX: typo "Fratures" -> "Features" in the printed label.
    print(f"Features: {self.data.shape[1]}")
    print(f"Points:   {self.data.shape[0]}")
    print("Columns:")

    for col, dtype in self.data.dtypes.items():
      # Kind marker: c = continuous, o = ordinal, n = nominal.
      if col in self.cols_nom:
        marker = " n"
      elif col in self.cols_ord:
        marker = " o"
      else:
        marker = " c"
      indentPair(col, str(dtype), marker)
    print()
    print("Missing values:")

    missing = self.data.isna().sum()
    shown = 0
    for col, count in missing.items():
      if count > 0:
        indentPair(col, str(count))
        shown += 1
    if shown == 0:
      print("  none")

  def fixDatatypes(self):
    """Integer-encode object (string) columns in place.

    Codes are assigned by descending frequency (value_counts order);
    the mappings are stored in value_dict and value_dict_rev.
    """
    object_cols = [col for col, dtype in self.data.dtypes.items()
                   if str(dtype) == "object"]

    self.value_dict = {}
    self.value_dict_rev = {}
    for col in object_cols:
      labels = list(self.data[col].value_counts().keys())
      self.value_dict[col] = {label: n for n, label in enumerate(labels)}
      self.value_dict_rev[col] = {n: label for n, label in enumerate(labels)}

    if self.value_dict:
      self.data.replace(self.value_dict, inplace=True)

  def fix_missing_values(self):
    """Impute missing values (k = 4 neighbours) and refresh the flag."""
    # Calls the module-level fdc.missingValues.fix_missing_values helper.
    self.data = fix_missing_values(self.data, 4)
    self.updateMissingValuesState()
+
+
+
# --- Script entry point ----------------------------------------------------
# Load the stroke dataset, report its state, integer-encode nominal columns
# and impute missing values if needed, persist the cleaned data plus the
# value mapping, then run FDC and save the embedding.

filename='healthcare-dataset-stroke-data.csv'
np.random.seed(42)  # fixed seed: FdcToolbox shuffles rows on load
tb = FdcToolbox(filename)
tb.showStatistic()

hasChanged = False
if len(tb.cols_nom) > 0:
  # String (object) columns must be integer-encoded before clustering.
  print()
  print("Fixing object datatypes ...")
  tb.fixDatatypes()
  print("done")
  hasChanged = True

if tb.has_missing_values:
  print()
  print("Fix missing values ...")
  tb.fix_missing_values()
  print("done")
  hasChanged = True

if hasChanged:
  # Persist the cleaned dataset and the code->label mapping for later use.
  print()
  tb.showStatistic()
  tb.data.to_csv(filename + "_fixed_values.csv")
  with open(filename + "_value_mapping.json", "w") as f:
    json.dump(tb.value_dict_rev, f)


print("Doing FDC ...")
# Distance per feature kind: euclidean (continuous), modified canberra
# (ordinal), hamming (nominal).
fdc = FDC(clustering_cont=Clustering('euclidean')
          , clustering_ord=Clustering(canberra_modified)
          , clustering_nom=Clustering('hamming', max_components=1)
          , visual=False
          , use_pandas_output=True
          , with_2d_embedding=False
          )

# NOTE(review): keyword spellings "continueous"/"nomial" must match the
# selectFeatures signature in fdc/fdc.py — confirm there before renaming.
fdc.selectFeatures(continueous=tb.cols_cont, nomial=tb.cols_nom, ordinal=tb.cols_ord)

# NOTE(review): normalize appears to return the FDC embedding as a
# DataFrame (use_pandas_output=True) — confirm against fdc/fdc.py.
entire_data_FDC_emb_five = fdc.normalize(tb.data)
entire_data_FDC_emb_five.to_csv(filename + "_fdc.csv")
print("done")
+
+

Alguns ficheiros não foram mostrados porque muitos ficheiros mudaram neste diff