"""Clean the stroke dataset and write its FDC-normalised form to disk.

Steps, in order:

1. Load the CSV named by ``filename`` into a ``DataSheet`` and print its
   statistics.
2. If the sheet reports nominal columns, call ``fixDatatypes()``; if it
   reports missing values, call ``fix_missing_values()``.
3. If either fix ran, print the statistics again and save the table and
   the value mapping next to the input file.
4. Configure ``FDC``, pass it the sheet's column groups, run
   ``normalize`` on the sheet's data and write the result to CSV.
"""
import warnings

# Installed before the remaining imports so the filter is already active
# while they are loaded.
warnings.filterwarnings('ignore')

import json
import pandas as pd
import numpy as np
from fdc.visualize import plotCluster
from fdc.tools import Timing
from fdc.missingValues import fix_missing_values
from fdc.fdc import canberra_modified, FDC, Clustering
from fdc.dataSheet import DataSheet


# ---------------------------------------------------------------------------
# Disabled in-script FdcToolbox class, kept as found.
# NOTE(review): its attributes and methods (cols_nom, has_missing_values,
# showStatistic, fixDatatypes, fix_missing_values) mirror the DataSheet
# calls used below, so it looks superseded by fdc.dataSheet.DataSheet --
# confirm, then delete.
# ---------------------------------------------------------------------------
# class FdcToolbox:
#
#     def __init__(self, file_name, index_col=0):
#         data = pd.read_csv(file_name, index_col=0)
#         self.data = data.sample(frac=1)
#
#         self.value_dict = {}
#         self.value_dict_rev = {}
#         self.cols_cont = []
#         self.cols_ord = []
#         self.cols_nom = []
#
#         for k in self.data.dtypes.keys():
#             t = str(self.data.dtypes[k])
#             if t[:3] == "int":
#                 self.cols_ord.append(k)
#             elif t == "object":
#                 self.cols_nom.append(k)
#             else:
#                 self.cols_cont.append(k)
#
#         self.has_missing_values = False
#         self.updateMissingValuesState()
#
#     def updateMissingValuesState(self):
#         self.has_missing_values = False
#         for k in self.data.isna().sum():
#             if k > 0:
#                 self.has_missing_values = True
#                 break
#
#     def showStatistic(self):
#         print(f"Fratures: {self.data.shape[1]}")
#         print(f"Points: {self.data.shape[0]}")
#         print(f"Columns:")
#
#         for k in self.data.dtypes.keys():
#             t = str(self.data.dtypes[k])
#             e = " c"
#             if k in self.cols_ord:
#                 e = " o"
#             if k in self.cols_nom:
#                 e = " n"
#             indentPair(k, t, e)
#         print()
#         print(f"Missing values:")
#
#         n = 0
#         d = self.data.isna().sum()
#         for k in d.keys():
#             if d[k] > 0:
#                 indentPair(k, str(d[k]))
#                 n += 1
#         if n == 0:
#             print(" none")
#
#
#     def fixDatatypes(self):
#         columnsToFix = []
#         for k in self.data.dtypes.keys():
#             if str(self.data.dtypes[k]) == "object":
#                 columnsToFix.append(k)
#
#         self.value_dict = {}
#         self.value_dict_rev = {}
#         for c in columnsToFix:
#             histogram = self.data[c].value_counts()
#             self.value_dict[c] = { k : n for n, k in enumerate(histogram.keys()) }
#             self.value_dict_rev[c] = { n : k for n, k in enumerate(histogram.keys()) }
#
#         if len(self.value_dict.keys()) > 0:
#             self.data.replace(self.value_dict, inplace=True)
#
#     def fix_missing_values(self):
#         self.data = fix_missing_values(self.data, 4)
#         self.updateMissingValuesState()
#


def _run_step(title, action):
    """Print a blank line and *title*, call *action*, then print "done"."""
    print()
    print(title)
    action()
    print("done")


filename = 'healthcare-dataset-stroke-data.csv'

# Seed NumPy's global generator before the sheet is created.
# NOTE(review): presumably this makes random steps inside DataSheet / FDC
# repeatable -- confirm against those modules.
np.random.seed(42)

tb = DataSheet(filename)
tb.showStatistic()

# --- Optional clean-up passes ----------------------------------------------
# hasChanged records whether any pass below modified the sheet.
hasChanged = False

has_nominal_columns = len(tb.cols_nom) > 0
if has_nominal_columns:
    _run_step("Fixing object datatypes ...", tb.fixDatatypes)
    hasChanged = True

# Read only after the datatype pass has had its chance to run.
if tb.has_missing_values:
    _run_step("Fix missing values ...", tb.fix_missing_values)
    hasChanged = True

# NOTE(review): the file's original line structure was lost; the two save
# calls are assumed to belong to this branch together with the second
# statistics printout -- confirm.
if hasChanged:
    print()
    tb.showStatistic()
    # The suffixes are appended to the complete input name, ".csv" included.
    tb.saveTable(f"{filename}_fixed_values.csv")
    tb.saveMapping(f"{filename}_value_mapping.json")

# --- FDC ---------------------------------------------------------------------
print("Doing FDC ...")

# One Clustering configuration per feature group, created in the same order
# in which they are handed to FDC.
continuous_clustering = Clustering('euclidean')
ordinal_clustering = Clustering(canberra_modified)
nominal_clustering = Clustering('hamming', max_components=1)

fdc = FDC(
    clustering_cont=continuous_clustering,
    clustering_ord=ordinal_clustering,
    clustering_nom=nominal_clustering,
    visual=False,
    use_pandas_output=True,
    with_2d_embedding=False,
)

# Keyword spellings ("continueous", "nomial") are kept exactly as the call
# was written; NOTE(review): presumably they match FDC.selectFeatures.
feature_groups = {
    "continueous": tb.cols_cont,
    "nomial": tb.cols_nom,
    "ordinal": tb.cols_ord,
}
fdc.selectFeatures(**feature_groups)

entire_data_FDC_emb_five = fdc.normalize(tb.data)
entire_data_FDC_emb_five.to_csv(f"{filename}_fdc.csv")
print("done")