# Script: clean up the stroke dataset with DataSheet, then run FDC on it and
# write the result next to the input file.

import warnings

# Installed before the remaining imports so that warnings raised while
# importing them are silenced as well.
warnings.filterwarnings('ignore')

import json

import numpy as np
import pandas as pd

from fdc.dataSheet import DataSheet
from fdc.fdc import FDC, Clustering, canberra_modified
from fdc.missingValues import fix_missing_values
from fdc.tools import Timing
from fdc.visualize import plotCluster

filename = 'healthcare-dataset-stroke-data.csv'

# Seed NumPy's global RNG so repeated runs are reproducible.
np.random.seed(42)

tb = DataSheet(filename)
tb.showStatistic()

# Tracks whether any clean-up step below modified the table.
hasChanged = False

# Step 1: fix datatypes when the sheet reports nominal columns.
if len(tb.cols_nom) > 0:
    print()
    print("Fixing object datatypes ...")
    tb.fixDatatypes()
    print("done")
    hasChanged = True

# Step 2: fix missing values. Checked only after step 1 has run.
if tb.has_missing_values:
    print()
    print("Fix missing values ...")
    tb.fix_missing_values()
    print("done")
    hasChanged = True

# Step 3: if anything was changed, show the statistic again and save the
# fixed table together with its value mapping.
# NOTE(review): the original line structure was lost; it is assumed that both
# save calls belong to this branch -- confirm against the upstream script.
if hasChanged:
    print()
    tb.showStatistic()
    tb.saveTable(f"{filename}_fixed_values.csv")
    tb.saveMapping(f"{filename}_value_mapping.json")

print("Doing FDC ...")

# One Clustering configuration per feature kind (continuous / ordinal /
# nominal); visual output and the 2D embedding are switched off.
fdc = FDC(
    clustering_cont=Clustering('euclidean'),
    clustering_ord=Clustering(canberra_modified),
    clustering_nom=Clustering('hamming', max_components=1),
    visual=False,
    use_pandas_output=True,
    with_2d_embedding=False,
)

# Keyword spellings ("continueous", "nomial") are those of the FDC API call
# in the original script and must stay as they are.
fdc.selectFeatures(
    continueous=tb.cols_cont,
    nomial=tb.cols_nom,
    ordinal=tb.cols_ord,
)

entire_data_FDC_emb_five = fdc.normalize(tb.data)
entire_data_FDC_emb_five.to_csv(f"{filename}_fdc.csv")
print("done")