# fdcTool.py
  1. import warnings
  2. warnings.filterwarnings('ignore')
  3. import json
  4. import pandas as pd
  5. import numpy as np
  6. from fdc.visualize import plotCluster
  7. from fdc.tools import Timing
  8. from fdc.missingValues import fix_missing_values
  9. from fdc.fdc import canberra_modified, FDC, Clustering
  10. from fdc.dataSheet import DataSheet
  11. filename='healthcare-dataset-stroke-data.csv'
  12. np.random.seed(42)
  13. tb = DataSheet(filename)
  14. tb.showStatistic()
  15. hasChanged = False
  16. if len(tb.cols_nom) > 0:
  17. print()
  18. print("Fixing object datatypes ...")
  19. tb.fixDatatypes()
  20. print("done")
  21. hasChanged = True
  22. if tb.has_missing_values:
  23. print()
  24. print("Fix missing values ...")
  25. tb.fix_missing_values()
  26. print("done")
  27. hasChanged = True
  28. if hasChanged:
  29. print()
  30. tb.showStatistic()
  31. tb.saveTable(filename + "_fixed_values.csv")
  32. tb.saveMapping(filename + "_value_mapping.json")
  33. print("Doing FDC ...")
  34. fdc = FDC(clustering_cont=Clustering('euclidean')
  35. , clustering_ord=Clustering(canberra_modified)
  36. , clustering_nom=Clustering('hamming', max_components=1)
  37. , visual=False
  38. , use_pandas_output=True
  39. , with_2d_embedding=False
  40. )
  41. fdc.selectFeatures(continueous=tb.cols_cont, nomial=tb.cols_nom, ordinal=tb.cols_ord)
  42. entire_data_FDC_emb_five = fdc.normalize(tb.data)
  43. entire_data_FDC_emb_five.to_csv(filename + "_fdc.csv")
  44. print("done")