| 12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758 |
# Preprocess the stroke dataset and run FDC on it.
#
# Steps, in order:
#   1. Build a DataSheet from the CSV file and print its statistics.
#   2. If needed, fix object datatypes and missing values, then write the
#      cleaned table and the value mapping next to the input file.
#   3. Configure FDC, run ``normalize`` on the table and write the result
#      to CSV.
#
# NOTE(review): this script was recovered from a rendering that had stripped
# all indentation. The nesting of the ``if`` blocks below is reconstructed
# from the statement order -- confirm against the original script.

import warnings

# Silence all warnings. This is done before the remaining imports, as in the
# original, so it also covers anything they emit while being imported.
warnings.filterwarnings('ignore')

import json

import pandas as pd
import numpy as np

from fdc.visualize import plotCluster
from fdc.tools import Timing
from fdc.missingValues import fix_missing_values
from fdc.fdc import canberra_modified, FDC, Clustering
from fdc.dataSheet import DataSheet

filename = 'healthcare-dataset-stroke-data.csv'

# Seed NumPy's global random number generator with a fixed value.
np.random.seed(42)

tb = DataSheet(filename)
tb.showStatistic()

# Tracks whether any fix-up step modified the table, so that the statistics
# are only re-printed and the outputs only written when something changed.
hasChanged = False

if len(tb.cols_nom) > 0:
    print()
    print("Fixing object datatypes ...")
    tb.fixDatatypes()
    print("done")
    hasChanged = True

if tb.has_missing_values:
    print()
    print("Fix missing values ...")
    tb.fix_missing_values()
    print("done")
    hasChanged = True

if hasChanged:
    print()
    tb.showStatistic()
    # Output names are built by appending to the full input name, including
    # its ".csv" extension; kept as-is so existing output paths do not change.
    tb.saveTable(filename + "_fixed_values.csv")
    tb.saveMapping(filename + "_value_mapping.json")

print("Doing FDC ...")

# One Clustering configuration per feature kind: continuous, ordinal, nominal.
fdc = FDC(
    clustering_cont=Clustering('euclidean'),
    clustering_ord=Clustering(canberra_modified),
    clustering_nom=Clustering('hamming', max_components=1),
    visual=False,
    use_pandas_output=True,
    with_2d_embedding=False,
)

# Keyword spellings ("continueous", "nomial") are kept exactly as in the
# original call; presumably they match the fdc API -- do not "correct" them
# without checking the library.
fdc.selectFeatures(
    continueous=tb.cols_cont,
    nomial=tb.cols_nom,
    ordinal=tb.cols_ord,
)

# The result exposes ``to_csv``; presumably a pandas DataFrame because
# use_pandas_output=True -- TODO confirm.
entire_data_FDC_emb_five = fdc.normalize(tb.data)
entire_data_FDC_emb_five.to_csv(filename + "_fdc.csv")
print("done")
|