|
|
@@ -11,113 +11,92 @@ from fdc.tools import Timing
|
|
|
from fdc.missingValues import fix_missing_values
|
|
|
from fdc.fdc import canberra_modified, FDC, Clustering
|
|
|
|
|
|
-
|
|
|
-def indent(text, i=" "):
|
|
|
- result = ""
|
|
|
- for x in text.split("\n"):
|
|
|
- result += i + x + "\n"
|
|
|
- return result
|
|
|
-
|
|
|
-def indentPair(a, b, e="", i=" "):
|
|
|
- m = a + " "
|
|
|
- if len(m) < 32:
|
|
|
- m += "_" * (32 - len(m))
|
|
|
- if len(b) < 16:
|
|
|
- m += "_" * (10 - len(b))
|
|
|
- m += " "
|
|
|
- m += b
|
|
|
- if e == False:
|
|
|
- pass
|
|
|
- elif e == True:
|
|
|
- m += " *"
|
|
|
- else:
|
|
|
- m += e
|
|
|
- print(" " + m)
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-class FdcToolbox:
|
|
|
-
|
|
|
- def __init__(self, file_name, index_col=0):
|
|
|
- data = pd.read_csv(file_name, index_col=0)
|
|
|
- self.data = data.sample(frac=1)
|
|
|
-
|
|
|
- self.value_dict = {}
|
|
|
- self.value_dict_rev = {}
|
|
|
- self.cols_cont = []
|
|
|
- self.cols_ord = []
|
|
|
- self.cols_nom = []
|
|
|
-
|
|
|
- for k in self.data.dtypes.keys():
|
|
|
- t = str(self.data.dtypes[k])
|
|
|
- if t[:3] == "int":
|
|
|
- self.cols_ord.append(k)
|
|
|
- elif t == "object":
|
|
|
- self.cols_nom.append(k)
|
|
|
- else:
|
|
|
- self.cols_cont.append(k)
|
|
|
-
|
|
|
- self.has_missing_values = False
|
|
|
- self.updateMissingValuesState()
|
|
|
-
|
|
|
- def updateMissingValuesState(self):
|
|
|
- self.has_missing_values = False
|
|
|
- for k in self.data.isna().sum():
|
|
|
- if k > 0:
|
|
|
- self.has_missing_values = True
|
|
|
- break
|
|
|
-
|
|
|
- def showStatistic(self):
|
|
|
- print(f"Fratures: {self.data.shape[1]}")
|
|
|
- print(f"Points: {self.data.shape[0]}")
|
|
|
- print(f"Columns:")
|
|
|
-
|
|
|
- for k in self.data.dtypes.keys():
|
|
|
- t = str(self.data.dtypes[k])
|
|
|
- e = " c"
|
|
|
- if k in self.cols_ord:
|
|
|
- e = " o"
|
|
|
- if k in self.cols_nom:
|
|
|
- e = " n"
|
|
|
- indentPair(k, t, e)
|
|
|
- print()
|
|
|
- print(f"Missing values:")
|
|
|
-
|
|
|
- n = 0
|
|
|
- d = self.data.isna().sum()
|
|
|
- for k in d.keys():
|
|
|
- if d[k] > 0:
|
|
|
- indentPair(k, str(d[k]))
|
|
|
- n += 1
|
|
|
- if n == 0:
|
|
|
- print(" none")
|
|
|
-
|
|
|
-
|
|
|
- def fixDatatypes(self):
|
|
|
- columnsToFix = []
|
|
|
- for k in self.data.dtypes.keys():
|
|
|
- if str(self.data.dtypes[k]) == "object":
|
|
|
- columnsToFix.append(k)
|
|
|
-
|
|
|
- self.value_dict = {}
|
|
|
- self.value_dict_rev = {}
|
|
|
- for c in columnsToFix:
|
|
|
- histogram = self.data[c].value_counts()
|
|
|
- self.value_dict[c] = { k : n for n, k in enumerate(histogram.keys()) }
|
|
|
- self.value_dict_rev[c] = { n : k for n, k in enumerate(histogram.keys()) }
|
|
|
-
|
|
|
- if len(self.value_dict.keys()) > 0:
|
|
|
- self.data.replace(self.value_dict, inplace=True)
|
|
|
-
|
|
|
- def fix_missing_values(self):
|
|
|
- self.data = fix_missing_values(self.data, 4)
|
|
|
- self.updateMissingValuesState()
|
|
|
-
|
|
|
+from fdc.dataSheet import DataSheet
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+# class FdcToolbox:
|
|
|
+#
|
|
|
+# def __init__(self, file_name, index_col=0):
|
|
|
+# data = pd.read_csv(file_name, index_col=0)
|
|
|
+# self.data = data.sample(frac=1)
|
|
|
+#
|
|
|
+# self.value_dict = {}
|
|
|
+# self.value_dict_rev = {}
|
|
|
+# self.cols_cont = []
|
|
|
+# self.cols_ord = []
|
|
|
+# self.cols_nom = []
|
|
|
+#
|
|
|
+# for k in self.data.dtypes.keys():
|
|
|
+# t = str(self.data.dtypes[k])
|
|
|
+# if t[:3] == "int":
|
|
|
+# self.cols_ord.append(k)
|
|
|
+# elif t == "object":
|
|
|
+# self.cols_nom.append(k)
|
|
|
+# else:
|
|
|
+# self.cols_cont.append(k)
|
|
|
+#
|
|
|
+# self.has_missing_values = False
|
|
|
+# self.updateMissingValuesState()
|
|
|
+#
|
|
|
+# def updateMissingValuesState(self):
|
|
|
+# self.has_missing_values = False
|
|
|
+# for k in self.data.isna().sum():
|
|
|
+# if k > 0:
|
|
|
+# self.has_missing_values = True
|
|
|
+# break
|
|
|
+#
|
|
|
+# def showStatistic(self):
|
|
|
+# print(f"Fratures: {self.data.shape[1]}")
|
|
|
+# print(f"Points: {self.data.shape[0]}")
|
|
|
+# print(f"Columns:")
|
|
|
+#
|
|
|
+# for k in self.data.dtypes.keys():
|
|
|
+# t = str(self.data.dtypes[k])
|
|
|
+# e = " c"
|
|
|
+# if k in self.cols_ord:
|
|
|
+# e = " o"
|
|
|
+# if k in self.cols_nom:
|
|
|
+# e = " n"
|
|
|
+# indentPair(k, t, e)
|
|
|
+# print()
|
|
|
+# print(f"Missing values:")
|
|
|
+#
|
|
|
+# n = 0
|
|
|
+# d = self.data.isna().sum()
|
|
|
+# for k in d.keys():
|
|
|
+# if d[k] > 0:
|
|
|
+# indentPair(k, str(d[k]))
|
|
|
+# n += 1
|
|
|
+# if n == 0:
|
|
|
+# print(" none")
|
|
|
+#
|
|
|
+#
|
|
|
+# def fixDatatypes(self):
|
|
|
+# columnsToFix = []
|
|
|
+# for k in self.data.dtypes.keys():
|
|
|
+# if str(self.data.dtypes[k]) == "object":
|
|
|
+# columnsToFix.append(k)
|
|
|
+#
|
|
|
+# self.value_dict = {}
|
|
|
+# self.value_dict_rev = {}
|
|
|
+# for c in columnsToFix:
|
|
|
+# histogram = self.data[c].value_counts()
|
|
|
+# self.value_dict[c] = { k : n for n, k in enumerate(histogram.keys()) }
|
|
|
+# self.value_dict_rev[c] = { n : k for n, k in enumerate(histogram.keys()) }
|
|
|
+#
|
|
|
+# if len(self.value_dict.keys()) > 0:
|
|
|
+# self.data.replace(self.value_dict, inplace=True)
|
|
|
+#
|
|
|
+# def fix_missing_values(self):
|
|
|
+# self.data = fix_missing_values(self.data, 4)
|
|
|
+# self.updateMissingValuesState()
|
|
|
+#
|
|
|
|
|
|
|
|
|
filename='healthcare-dataset-stroke-data.csv'
|
|
|
np.random.seed(42)
|
|
|
-tb = FdcToolbox(filename)
|
|
|
+tb = DataSheet(filename)
|
|
|
tb.showStatistic()
|
|
|
|
|
|
hasChanged = False
|
|
|
@@ -138,9 +117,8 @@ if tb.has_missing_values:
|
|
|
if hasChanged:
|
|
|
print()
|
|
|
tb.showStatistic()
|
|
|
- tb.data.to_csv(filename + "_fixed_values.csv")
|
|
|
- with open(filename + "_value_mapping.json", "w") as f:
|
|
|
- json.dump(tb.value_dict_rev, f)
|
|
|
+ tb.saveTable(filename + "_fixed_values.csv")
|
|
|
+ tb.saveMapping(filename + "_value_mapping.json")
|
|
|
|
|
|
|
|
|
print("Doing FDC ...")
|