# fdcTool.py

  1. import warnings
  2. warnings.filterwarnings('ignore')
  3. import json
  4. import pandas as pd
  5. import numpy as np
  6. from fdc.visualize import plotCluster
  7. from fdc.tools import Timing
  8. from fdc.missingValues import fix_missing_values
  9. from fdc.fdc import canberra_modified, FDC, Clustering
  10. from fdc.dataSheet import DataSheet
  11. # class FdcToolbox:
  12. #
  13. # def __init__(self, file_name, index_col=0):
  14. # data = pd.read_csv(file_name, index_col=0)
  15. # self.data = data.sample(frac=1)
  16. #
  17. # self.value_dict = {}
  18. # self.value_dict_rev = {}
  19. # self.cols_cont = []
  20. # self.cols_ord = []
  21. # self.cols_nom = []
  22. #
  23. # for k in self.data.dtypes.keys():
  24. # t = str(self.data.dtypes[k])
  25. # if t[:3] == "int":
  26. # self.cols_ord.append(k)
  27. # elif t == "object":
  28. # self.cols_nom.append(k)
  29. # else:
  30. # self.cols_cont.append(k)
  31. #
  32. # self.has_missing_values = False
  33. # self.updateMissingValuesState()
  34. #
  35. # def updateMissingValuesState(self):
  36. # self.has_missing_values = False
  37. # for k in self.data.isna().sum():
  38. # if k > 0:
  39. # self.has_missing_values = True
  40. # break
  41. #
  42. # def showStatistic(self):
  43. # print(f"Features: {self.data.shape[1]}")
  44. # print(f"Points: {self.data.shape[0]}")
  45. # print(f"Columns:")
  46. #
  47. # for k in self.data.dtypes.keys():
  48. # t = str(self.data.dtypes[k])
  49. # e = " c"
  50. # if k in self.cols_ord:
  51. # e = " o"
  52. # if k in self.cols_nom:
  53. # e = " n"
  54. # indentPair(k, t, e)
  55. # print()
  56. # print(f"Missing values:")
  57. #
  58. # n = 0
  59. # d = self.data.isna().sum()
  60. # for k in d.keys():
  61. # if d[k] > 0:
  62. # indentPair(k, str(d[k]))
  63. # n += 1
  64. # if n == 0:
  65. # print(" none")
  66. #
  67. #
  68. # def fixDatatypes(self):
  69. # columnsToFix = []
  70. # for k in self.data.dtypes.keys():
  71. # if str(self.data.dtypes[k]) == "object":
  72. # columnsToFix.append(k)
  73. #
  74. # self.value_dict = {}
  75. # self.value_dict_rev = {}
  76. # for c in columnsToFix:
  77. # histogram = self.data[c].value_counts()
  78. # self.value_dict[c] = { k : n for n, k in enumerate(histogram.keys()) }
  79. # self.value_dict_rev[c] = { n : k for n, k in enumerate(histogram.keys()) }
  80. #
  81. # if len(self.value_dict.keys()) > 0:
  82. # self.data.replace(self.value_dict, inplace=True)
  83. #
  84. # def fix_missing_values(self):
  85. # self.data = fix_missing_values(self.data, 4)
  86. # self.updateMissingValuesState()
  87. #
  88. filename='healthcare-dataset-stroke-data.csv'
  89. np.random.seed(42)
  90. tb = DataSheet(filename)
  91. tb.showStatistic()
  92. hasChanged = False
  93. if len(tb.cols_nom) > 0:
  94. print()
  95. print("Fixing object datatypes ...")
  96. tb.fixDatatypes()
  97. print("done")
  98. hasChanged = True
  99. if tb.has_missing_values:
  100. print()
  101. print("Fix missing values ...")
  102. tb.fix_missing_values()
  103. print("done")
  104. hasChanged = True
  105. if hasChanged:
  106. print()
  107. tb.showStatistic()
  108. tb.saveTable(filename + "_fixed_values.csv")
  109. tb.saveMapping(filename + "_value_mapping.json")
  110. print("Doing FDC ...")
  111. fdc = FDC(clustering_cont=Clustering('euclidean')
  112. , clustering_ord=Clustering(canberra_modified)
  113. , clustering_nom=Clustering('hamming', max_components=1)
  114. , visual=False
  115. , use_pandas_output=True
  116. , with_2d_embedding=False
  117. )
  118. fdc.selectFeatures(continueous=tb.cols_cont, nomial=tb.cols_nom, ordinal=tb.cols_ord)
  119. entire_data_FDC_emb_five = fdc.normalize(tb.data)
  120. entire_data_FDC_emb_five.to_csv(filename + "_fdc.csv")
  121. print("done")