|
@@ -0,0 +1,161 @@
|
|
|
|
|
+import warnings
|
|
|
|
|
+warnings.filterwarnings('ignore')
|
|
|
|
|
+
|
|
|
|
|
+import json
|
|
|
|
|
+
|
|
|
|
|
+import pandas as pd
|
|
|
|
|
+import numpy as np
|
|
|
|
|
+
|
|
|
|
|
+from fdc.visualize import plotCluster
|
|
|
|
|
+from fdc.tools import Timing
|
|
|
|
|
+from fdc.missingValues import fix_missing_values
|
|
|
|
|
+from fdc.fdc import canberra_modified, FDC, Clustering
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def indent(text, i=" "):
    """Return *text* with the prefix *i* put in front of every line.

    The text is split on ``"\n"`` and each piece is emitted as
    ``i + piece + "\n"``.  The result therefore always ends with a newline,
    and a trailing newline in *text* yields a final line consisting of the
    prefix only.
    """
    return "".join(i + line + "\n" for line in text.split("\n"))
|
|
|
|
|
+
|
|
|
|
|
def indentPair(a, b, e="", i=" "):
    """Print one ``label ____ value`` row, padded with underscores.

    The label *a* (followed by one space) is right-padded with ``_`` to a
    width of 32 characters; the value *b* is left-padded with ``_`` to a
    width of 10 characters and separated from the padding by one space.
    Labels or values longer than their field are printed unpadded.

    Args:
        a: Row label.  Converted with ``str()`` so non-string column labels
            do not raise.
        b: Row value.  Converted with ``str()``.
        e: Optional marker appended after the value.  A string is appended
            verbatim; any other truthy value appends ``" *"``; any other
            falsy value (``False``, ``0``, ``None``) appends nothing.
        i: Prefix printed in front of the row.

    Returns:
        None.  The row is written to standard output.
    """
    label = str(a) + " "
    value = str(b)

    # max(0, ...) makes the "value wider than its field" case explicit
    # instead of relying on a negative repeat count producing "".
    line = label.ljust(32, "_") + "_" * max(0, 10 - len(value)) + " " + value

    # Strings are literal suffixes; everything else is treated as a flag.
    # This keeps the outcome for True/False/1/0 and no longer raises
    # TypeError for markers such as None.
    if isinstance(e, str):
        line += e
    elif e:
        line += " *"

    # Honour the caller-supplied prefix (it used to be accepted but ignored).
    print(i + line)
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
class FdcToolbox:
    """Load a CSV table and prepare it for further processing.

    On construction the table is read, its rows are shuffled, and every
    column is sorted into one of three lists according to its dtype name:

    * ``cols_ord``  - dtype name starts with ``"int"``
    * ``cols_nom``  - object / string dtype
    * ``cols_cont`` - everything else

    Attributes:
        data: The (shuffled) ``pandas.DataFrame``.
        value_dict: ``{column: {original value: integer code}}`` filled by
            :meth:`fixDatatypes`.
        value_dict_rev: ``{column: {integer code: original value}}``, the
            inverse of ``value_dict``.
        cols_cont, cols_ord, cols_nom: Column-name lists described above.
        has_missing_values: ``True`` when ``data`` contains at least one NaN.
    """

    # Names (as given by ``str(dtype)``) of dtypes treated as nominal.
    # "object" is what the original check recognised; "str" / "string" are
    # the names of pandas' dedicated string dtypes.
    _NOMINAL_DTYPES = ("object", "str", "string")

    def __init__(self, file_name, index_col=0):
        """Read *file_name* and classify its columns.

        Args:
            file_name: Path (or anything ``pandas.read_csv`` accepts) of the
                CSV file to load.
            index_col: Forwarded to ``pandas.read_csv`` as ``index_col``.
        """
        # Pass the argument through; it used to be ignored in favour of a
        # hard-coded 0.
        data = pd.read_csv(file_name, index_col=index_col)
        # frac=1 returns every row, in random order.
        self.data = data.sample(frac=1)

        self.value_dict = {}
        self.value_dict_rev = {}
        self.cols_cont = []
        self.cols_ord = []
        self.cols_nom = []

        for column, dtype in self.data.dtypes.items():
            dtype_name = str(dtype)
            if dtype_name.startswith("int"):
                self.cols_ord.append(column)
            elif dtype_name in self._NOMINAL_DTYPES:
                self.cols_nom.append(column)
            else:
                self.cols_cont.append(column)

        self.has_missing_values = False
        self.updateMissingValuesState()

    def updateMissingValuesState(self):
        """Recompute ``has_missing_values`` from the current ``data``."""
        self.has_missing_values = bool(self.data.isna().any().any())

    def showStatistic(self):
        """Print the table size, each column's dtype/kind and NaN counts.

        Every column row ends with `` c``, `` o`` or `` n`` depending on
        whether the column is listed in ``cols_cont`` (the fallback),
        ``cols_ord`` or ``cols_nom``.
        """
        print(f"Features: {self.data.shape[1]}")
        print(f"Points: {self.data.shape[0]}")
        print("Columns:")

        for column, dtype in self.data.dtypes.items():
            # Nominal wins over ordinal, matching the original override order.
            if column in self.cols_nom:
                kind = " n"
            elif column in self.cols_ord:
                kind = " o"
            else:
                kind = " c"
            indentPair(column, str(dtype), kind)
        print()
        print("Missing values:")

        missing = self.data.isna().sum()
        missing = missing[missing > 0]
        for column, count in missing.items():
            indentPair(column, str(count))
        if missing.empty:
            print(" none")

    def fixDatatypes(self):
        """Replace the values of nominal columns by integer codes.

        For each object/string column the distinct values are numbered in
        the order returned by ``value_counts()`` (most frequent first).  The
        mapping and its inverse are stored in ``value_dict`` and
        ``value_dict_rev``; missing values stay missing.
        """
        columns_to_fix = [
            column
            for column, dtype in self.data.dtypes.items()
            if str(dtype) in self._NOMINAL_DTYPES
        ]

        self.value_dict = {}
        self.value_dict_rev = {}
        for column in columns_to_fix:
            categories = list(self.data[column].value_counts().index)
            forward = {value: code for code, value in enumerate(categories)}
            self.value_dict[column] = forward
            self.value_dict_rev[column] = {
                code: value for value, code in forward.items()
            }
            # Series.map yields a numeric column directly instead of relying
            # on DataFrame.replace downcasting an object column afterwards.
            self.data[column] = self.data[column].map(forward)

    def fix_missing_values(self):
        """Fill gaps via ``fdc.missingValues.fix_missing_values`` and refresh state.

        The second argument ``4`` is passed through unchanged.
        NOTE(review): its meaning is defined by the imported helper - confirm
        there before changing it.
        """
        # Inside a method the bare name resolves to the module-level import,
        # not to this method.
        self.data = fix_missing_values(self.data, 4)
        self.updateMissingValuesState()
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
# ---------------------------------------------------------------------------
# Driver script: load the CSV, repair it if necessary, then run FDC.
# All output files are named by appending a suffix to the input file name.
# ---------------------------------------------------------------------------
filename = 'healthcare-dataset-stroke-data.csv'

# Seed NumPy's global RNG before the toolbox shuffles the rows
# (presumably so the shuffle is reproducible - verify against pandas' sampling).
np.random.seed(42)

tb = FdcToolbox(filename)
tb.showStatistic()

# Tracks whether any repair step modified the table.
hasChanged = False

if tb.cols_nom:
    print()
    print("Fixing object datatypes ...")
    tb.fixDatatypes()
    print("done")
    hasChanged = True

if tb.has_missing_values:
    print()
    print("Fix missing values ...")
    tb.fix_missing_values()
    print("done")
    hasChanged = True

if hasChanged:
    # Show the repaired table and persist it together with the
    # code -> original value mapping.
    print()
    tb.showStatistic()
    tb.data.to_csv(filename + "_fixed_values.csv")
    with open(filename + "_value_mapping.json", "w") as f:
        json.dump(tb.value_dict_rev, f)


print("Doing FDC ...")
fdc = FDC(
    clustering_cont=Clustering('euclidean'),
    clustering_ord=Clustering(canberra_modified),
    clustering_nom=Clustering('hamming', max_components=1),
    visual=False,
    use_pandas_output=True,
    with_2d_embedding=False,
)

# Keyword spellings ("continueous", "nomial") are kept exactly as written;
# they are the names this call passes to the FDC object.
fdc.selectFeatures(
    continueous=tb.cols_cont,
    nomial=tb.cols_nom,
    ordinal=tb.cols_ord,
)

entire_data_FDC_emb_five = fdc.normalize(tb.data)
entire_data_FDC_emb_five.to_csv(filename + "_fdc.csv")
print("done")
|
|
|
|
|
+
|
|
|
|
|
+
|