import warnings
# Silence warnings before the heavy imports below so that import-time
# warnings are suppressed as well.
warnings.filterwarnings('ignore')

import json

import pandas as pd
import numpy as np

from fdc.visualize import plotCluster
from fdc.tools import Timing
from fdc.missingValues import fix_missing_values
from fdc.fdc import canberra_modified, FDC, Clustering


# Column widths used by indentPair() when laying out "label ____ value" rows.
_LABEL_WIDTH = 32
_VALUE_WIDTH = 10


def indent(text: str, i: str = " ") -> str:
    """Prefix every line of *text* with *i*.

    The text is split on "\\n"; each piece is emitted as ``i + piece + "\\n"``,
    so the result always ends with a newline (an empty *text* yields
    ``i + "\\n"``).
    """
    return "".join(i + line + "\n" for line in text.split("\n"))


def indentPair(a, b, e="", i=" "):
    """Print one ``label ____ value`` row, padded with underscores.

    Args:
        a: Label text. It is followed by a space and padded with "_" up to
            32 characters.
        b: Value text. It is left-padded with "_" up to 10 characters and
            separated from the padding by a single space.
        e: Suffix. A string is appended verbatim; any other truthy value
            (e.g. ``True``) appends " *"; a falsy value appends nothing.
        i: Unused; kept so existing callers passing it keep working.
    """
    line = (a + " ").ljust(_LABEL_WIDTH, "_")
    # A negative repeat count would produce "" anyway; max() makes that explicit.
    line += "_" * max(0, _VALUE_WIDTH - len(b))
    line += " " + b

    if isinstance(e, str):
        line += e
    elif e:
        line += " *"

    print(" " + line)


class FdcToolbox:
    """Load a CSV file and prepare its columns for FDC.

    Attributes are plain and public:
      data               -- the row-shuffled DataFrame.
      cols_cont/ord/nom  -- column names classified by dtype at load time.
      value_dict         -- per column: original value -> integer code.
      value_dict_rev     -- per column: integer code -> original value.
      has_missing_values -- True if any cell of ``data`` is NA.
    """

    def __init__(self, file_name, index_col=0):
        """Read *file_name*, shuffle its rows and classify its columns.

        Args:
            file_name: Path (or anything ``pd.read_csv`` accepts) of the CSV.
            index_col: Forwarded to ``pd.read_csv`` as ``index_col``.
        """
        # Previously index_col was accepted but a literal 0 was passed on.
        data = pd.read_csv(file_name, index_col=index_col)
        # frac=1 returns all rows in random order.
        self.data = data.sample(frac=1)

        self.value_dict = {}
        self.value_dict_rev = {}

        self.cols_cont = []
        self.cols_ord = []
        self.cols_nom = []

        # dtype names starting with "int" -> ordinal, "object" -> nominal,
        # everything else -> continuous.
        for name, dtype in self.data.dtypes.items():
            dtype_name = str(dtype)
            if dtype_name.startswith("int"):
                self.cols_ord.append(name)
            elif dtype_name == "object":
                self.cols_nom.append(name)
            else:
                self.cols_cont.append(name)

        self.has_missing_values = False
        self.updateMissingValuesState()

    def updateMissingValuesState(self):
        """Recompute ``has_missing_values`` from the current ``data``."""
        self.has_missing_values = bool(self.data.isna().any().any())

    def showStatistic(self):
        """Print shape, per-column dtype/kind and per-column NA counts.

        Each column row ends with " c", " o" or " n" depending on whether the
        column is in none of the lists, in ``cols_ord`` or in ``cols_nom``
        (``cols_nom`` wins if a name is in both).
        """
        print(f"Features: {self.data.shape[1]}")
        print(f"Points: {self.data.shape[0]}")
        print("Columns:")
        for name, dtype in self.data.dtypes.items():
            if name in self.cols_nom:
                marker = " n"
            elif name in self.cols_ord:
                marker = " o"
            else:
                marker = " c"
            indentPair(name, str(dtype), marker)

        print()
        print("Missing values:")
        na_counts = self.data.isna().sum()
        na_counts = na_counts[na_counts > 0]
        for name, count in na_counts.items():
            indentPair(name, str(count))
        if na_counts.empty:
            print(" none")

    def fixDatatypes(self):
        """Replace the values of every object-dtype column by integer codes.

        Codes are assigned in ``value_counts()`` order of each column. The
        forward and reverse mappings are stored in ``value_dict`` and
        ``value_dict_rev`` (both are reset on every call). NA values are not
        part of ``value_counts()`` and therefore receive no code.
        """
        columns_to_fix = [
            name
            for name, dtype in self.data.dtypes.items()
            if str(dtype) == "object"
        ]

        self.value_dict = {}
        self.value_dict_rev = {}

        for column in columns_to_fix:
            categories = list(self.data[column].value_counts().keys())
            self.value_dict[column] = {
                value: code for code, value in enumerate(categories)
            }
            self.value_dict_rev[column] = dict(enumerate(categories))

        if self.value_dict:
            # Nested dict form: {column: {old_value: new_value}}.
            self.data.replace(self.value_dict, inplace=True)

    def fix_missing_values(self):
        """Fill in missing values and refresh ``has_missing_values``.

        Delegates to ``fdc.missingValues.fix_missing_values(self.data, 4)``.
        NOTE(review): the meaning of the literal 4 is defined by that helper
        and is not visible here -- confirm before changing it.
        """
        # Inside the method body this name resolves to the imported
        # module-level function, not to this method.
        self.data = fix_missing_values(self.data, 4)
        self.updateMissingValuesState()


filename = 'healthcare-dataset-stroke-data.csv'

# Seed NumPy's global RNG before the toolbox shuffles the rows.
np.random.seed(42)

tb = FdcToolbox(filename)
tb.showStatistic()

hasChanged = False

if tb.cols_nom:
    print()
    print("Fixing object datatypes ...")
    tb.fixDatatypes()
    print("done")
    hasChanged = True

if tb.has_missing_values:
    print()
    print("Fix missing values ...")
    tb.fix_missing_values()
    print("done")
    hasChanged = True

if hasChanged:
    print()
    tb.showStatistic()

    # Persist the cleaned table and the code -> original-value mapping.
    tb.data.to_csv(filename + "_fixed_values.csv")
    with open(filename + "_value_mapping.json", "w", encoding="utf-8") as f:
        json.dump(tb.value_dict_rev, f)

print("Doing FDC ...")
fdc = FDC(
    clustering_cont=Clustering('euclidean'),
    clustering_ord=Clustering(canberra_modified),
    clustering_nom=Clustering('hamming', max_components=1),
    visual=False,
    use_pandas_output=True,
    with_2d_embedding=False,
)

# Keyword names below are the FDC API's own spelling; do not "correct" them.
fdc.selectFeatures(continueous=tb.cols_cont, nomial=tb.cols_nom, ordinal=tb.cols_ord)

entire_data_FDC_emb_five = fdc.normalize(tb.data)
entire_data_FDC_emb_five.to_csv(filename + "_fdc.csv")
print("done")