Просмотр исходного кода

Moved the class for processing the data to its own file.

Kristian Schultz 3 года назад
Родитель
Коммит
f9563b84f1
3 изменённых файла: 228 добавлений и 106 удалений
  1. 120 0
      fdc/dataSheet.py
  2. 24 0
      fdc/tools.py
  3. 84 106
      fdcTool.py

+ 120 - 0
fdc/dataSheet.py

@@ -0,0 +1,120 @@
+import json
+
+import pandas as pd
+import numpy as np
+
+from fdc.tools import *
+from fdc.visualize import plotCluster
+from fdc.missingValues import fix_missing_values
+
+
+class DataSheet:
+
+  # ---------------------------------------------------------------------------
+  # Data I/O
+  # ---------------------------------------------------------------------------
+  def __init__(self, file_name=None, index_col=0, dataFrame=None):
+    if file_name is not None:
+      self.data = pd.read_csv(file_name, index_col=0).sample(frac=1)
+    elif dataFrame is not None:
+      self.data = dataFrame
+    else:
+      self.data = pd.DataFrame(np.array([[0.0]]), "dummy")
+
+    self.value_dict = {}
+    self.cols_cont = []
+    self.cols_ord = []
+    self.cols_nom = []
+
+    for k in self.data.dtypes.keys():
+      t = str(self.data.dtypes[k])
+      if t[:3] == "int":
+        self.cols_ord.append(k)
+      elif t == "object":
+        self.cols_nom.append(k)
+      else:
+        self.cols_cont.append(k)
+
+    self.has_missing_values = False
+    self.updateMissingValuesState()
+    self.detectMapping()
+
+  def saveTable(self, file_name):
+    self.data.to_csv(file_name)
+
+  # ---------------------------------------------------------------------------
+  # Data mapping
+  # ---------------------------------------------------------------------------
+  def saveMapping(self, file_name):
+    with open(file_name, "w") as f:
+      json.dump(self.value_dict, f)
+
+  def loadMapping(self, file_name):
+    with open(file_name) as f:
+      pass #json.dump(tb.value_dict, f)
+
+  def detectMapping(self):
+    columnsToFix = []
+    for k in self.data.dtypes.keys():
+      if str(self.data.dtypes[k]) == "object":
+        columnsToFix.append(k)
+
+    self.value_dict = {}
+    for c in columnsToFix:
+      histogram = self.data[c].value_counts()
+      self.value_dict[c] = { n : k for n, k in enumerate(histogram.keys()) }
+
+  def useMapping(self, mapping=None):
+    if mapping is None:
+      mapping = { c: { k: v for v, k in self.value_dict[c].items() } for c in self.value_dict.keys() }
+
+    if len(mapping.keys()) > 0:
+      self.data.replace(mapping, inplace=True)
+
+  # ---------------------------------------------------------------------------
+  # Statistics
+  # ---------------------------------------------------------------------------
+  def showStatistic(self):
+    print(f"Fratures: {self.data.shape[1]}")
+    print(f"Points:   {self.data.shape[0]}")
+    print(f"Columns:")
+
+    for k in self.data.dtypes.keys():
+      t = str(self.data.dtypes[k])
+      e = " c"
+      if k in self.cols_ord:
+        e = " o"
+      if k in self.cols_nom:
+        e = " n"
+      indentPair(k, t, e)
+    print()
+    print(f"Missing values:")
+
+    n = 0
+    d = self.data.isna().sum()
+    for k in d.keys():
+      if d[k] > 0:
+        indentPair(k, str(d[k]))
+        n += 1
+    if n == 0:
+      print("  none")
+
+  def updateMissingValuesState(self):
+    self.has_missing_values = False
+    for k in self.data.isna().sum():
+      if k > 0:
+        self.has_missing_values = True
+        break
+
+
+  # ---------------------------------------------------------------------------
+  # Automatic fixing tools
+  # ---------------------------------------------------------------------------
+  def fixDatatypes(self):
+    self.detectMapping()
+    self.useMapping() 
+
+  def fix_missing_values(self):
+    self.data = fix_missing_values(self.data, 4)
+    self.updateMissingValuesState()
+

+ 24 - 0
fdc/tools.py

@@ -9,6 +9,30 @@ def count(testFn, items):
     return s
 
 
+def indent(text, i="  "):
+  result = ""
+  for x in text.split("\n"):
+    result += i + x + "\n"
+  return result
+
+
+
+def indentPair(a, b, e="", i="  "):
+  m = a + " "
+  if len(m) < 32:
+    m += "_" * (32 - len(m))
+  if len(b) < 16:
+    m += "_" * (10 - len(b))
+  m += " "
+  m += b
+  if e == False:
+    pass
+  elif e == True:
+    m += " *"
+  else:
+    m += e
+  print("  " + m)
+
 
 class Timing:
     def __init__(self, name="Duration"):

+ 84 - 106
fdcTool.py

@@ -11,113 +11,92 @@ from fdc.tools import Timing
 from fdc.missingValues import fix_missing_values
 from fdc.fdc import canberra_modified, FDC, Clustering
 
-
-def indent(text, i="  "):
-  result = ""
-  for x in text.split("\n"):
-    result += i + x + "\n"
-  return result
-
-def indentPair(a, b, e="", i="  "):
-  m = a + " "
-  if len(m) < 32:
-    m += "_" * (32 - len(m))
-  if len(b) < 16:
-    m += "_" * (10 - len(b))
-  m += " "
-  m += b
-  if e == False:
-    pass
-  elif e == True:
-    m += " *"
-  else:
-    m += e
-  print("  " + m)
-
-
-
-class FdcToolbox:
-
-  def __init__(self, file_name, index_col=0):
-    data = pd.read_csv(file_name, index_col=0)
-    self.data = data.sample(frac=1)
-
-    self.value_dict = {}
-    self.value_dict_rev = {}
-    self.cols_cont = []
-    self.cols_ord = []
-    self.cols_nom = []
-
-    for k in self.data.dtypes.keys():
-      t = str(self.data.dtypes[k])
-      if t[:3] == "int":
-        self.cols_ord.append(k)
-      elif t == "object":
-        self.cols_nom.append(k)
-      else:
-        self.cols_cont.append(k)
-
-    self.has_missing_values = False
-    self.updateMissingValuesState()
-
-  def updateMissingValuesState(self):
-    self.has_missing_values = False
-    for k in self.data.isna().sum():
-      if k > 0:
-        self.has_missing_values = True
-        break
-
-  def showStatistic(self):
-    print(f"Fratures: {self.data.shape[1]}")
-    print(f"Points:   {self.data.shape[0]}")
-    print(f"Columns:")
-
-    for k in self.data.dtypes.keys():
-      t = str(self.data.dtypes[k])
-      e = " c"
-      if k in self.cols_ord:
-        e = " o"
-      if k in self.cols_nom:
-        e = " n"
-      indentPair(k, t, e)
-    print()
-    print(f"Missing values:")
-
-    n = 0
-    d = self.data.isna().sum()
-    for k in d.keys():
-      if d[k] > 0:
-        indentPair(k, str(d[k]))
-        n += 1
-    if n == 0:
-      print("  none")
-
-
-  def fixDatatypes(self):
-    columnsToFix = []
-    for k in self.data.dtypes.keys():
-      if str(self.data.dtypes[k]) == "object":
-        columnsToFix.append(k)
-
-    self.value_dict = {}
-    self.value_dict_rev = {}
-    for c in columnsToFix:
-      histogram = self.data[c].value_counts()
-      self.value_dict[c] = { k : n for n, k in enumerate(histogram.keys()) }
-      self.value_dict_rev[c] = { n : k for n, k in enumerate(histogram.keys()) }
-    
-    if len(self.value_dict.keys()) > 0:
-      self.data.replace(self.value_dict, inplace=True)
-
-  def fix_missing_values(self):
-    self.data = fix_missing_values(self.data, 4)
-    self.updateMissingValuesState()
-
+from fdc.dataSheet import DataSheet
+
+
+
+# class FdcToolbox:
+# 
+#   def __init__(self, file_name, index_col=0):
+#     data = pd.read_csv(file_name, index_col=0)
+#     self.data = data.sample(frac=1)
+# 
+#     self.value_dict = {}
+#     self.value_dict_rev = {}
+#     self.cols_cont = []
+#     self.cols_ord = []
+#     self.cols_nom = []
+# 
+#     for k in self.data.dtypes.keys():
+#       t = str(self.data.dtypes[k])
+#       if t[:3] == "int":
+#         self.cols_ord.append(k)
+#       elif t == "object":
+#         self.cols_nom.append(k)
+#       else:
+#         self.cols_cont.append(k)
+# 
+#     self.has_missing_values = False
+#     self.updateMissingValuesState()
+# 
+#   def updateMissingValuesState(self):
+#     self.has_missing_values = False
+#     for k in self.data.isna().sum():
+#       if k > 0:
+#         self.has_missing_values = True
+#         break
+# 
+#   def showStatistic(self):
+#     print(f"Fratures: {self.data.shape[1]}")
+#     print(f"Points:   {self.data.shape[0]}")
+#     print(f"Columns:")
+# 
+#     for k in self.data.dtypes.keys():
+#       t = str(self.data.dtypes[k])
+#       e = " c"
+#       if k in self.cols_ord:
+#         e = " o"
+#       if k in self.cols_nom:
+#         e = " n"
+#       indentPair(k, t, e)
+#     print()
+#     print(f"Missing values:")
+# 
+#     n = 0
+#     d = self.data.isna().sum()
+#     for k in d.keys():
+#       if d[k] > 0:
+#         indentPair(k, str(d[k]))
+#         n += 1
+#     if n == 0:
+#       print("  none")
+# 
+# 
+#   def fixDatatypes(self):
+#     columnsToFix = []
+#     for k in self.data.dtypes.keys():
+#       if str(self.data.dtypes[k]) == "object":
+#         columnsToFix.append(k)
+# 
+#     self.value_dict = {}
+#     self.value_dict_rev = {}
+#     for c in columnsToFix:
+#       histogram = self.data[c].value_counts()
+#       self.value_dict[c] = { k : n for n, k in enumerate(histogram.keys()) }
+#       self.value_dict_rev[c] = { n : k for n, k in enumerate(histogram.keys()) }
+#     
+#     if len(self.value_dict.keys()) > 0:
+#       self.data.replace(self.value_dict, inplace=True)
+# 
+#   def fix_missing_values(self):
+#     self.data = fix_missing_values(self.data, 4)
+#     self.updateMissingValuesState()
+# 
 
 
 filename='healthcare-dataset-stroke-data.csv'
 np.random.seed(42)
-tb = FdcToolbox(filename)
+tb = DataSheet(filename)
 tb.showStatistic()
 
 hasChanged = False
@@ -138,9 +117,8 @@ if tb.has_missing_values:
 if hasChanged:
   print()
   tb.showStatistic()
-  tb.data.to_csv(filename + "_fixed_values.csv")
-  with open(filename + "_value_mapping.json", "w") as f:
-    json.dump(tb.value_dict_rev, f)
+  tb.saveTable(filename + "_fixed_values.csv")
+  tb.saveMapping(filename + "_value_mapping.json")
 
 
 print("Doing FDC ...")