fdcTool.py 3.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161
  1. import warnings
  2. warnings.filterwarnings('ignore')
  3. import json
  4. import pandas as pd
  5. import numpy as np
  6. from fdc.visualize import plotCluster
  7. from fdc.tools import Timing
  8. from fdc.missingValues import fix_missing_values
  9. from fdc.fdc import canberra_modified, FDC, Clustering
  def indent(text, i=" "):
      """Return *text* with every line prefixed by *i*.

      The text is split on newline characters and each piece is emitted as
      ``i + piece + newline``. The result therefore always ends with a
      newline, and an empty *text* yields ``i`` followed by one newline.

      Args:
          text: String to indent.
          i: Prefix placed in front of every line.

      Returns:
          The indented string.
      """
      # Build the result in one pass with join() instead of repeated `+=`.
      return "".join(i + line + "\n" for line in text.split("\n"))
  def indentPair(a, b, e="", i=" "):
      """Print one ``label ____ value`` row of a two-column listing.

      The label (followed by a space) is padded on the right with underscores
      to 32 characters; the value is preceded by underscores so that it is
      right-aligned in a 10 character field, separated by one space.

      Args:
          a: Label shown on the left. Converted with ``str`` so that
              non-string labels (e.g. integer column names) are accepted.
          b: Value shown on the right. Converted with ``str``.
          e: Suffix. A string is appended verbatim; any other truthy value
              appends ``" *"``; any other falsy value appends nothing.
          i: Prefix printed in front of the row.

      Returns:
          None. The row is written to standard output.
      """
      label = (str(a) + " ").ljust(32, "_")
      # " " + value right-aligned in 11 columns == 10-wide value field plus
      # the separating space; values longer than 10 characters are not padded.
      value = (" " + str(b)).rjust(11, "_")
      row = label + value

      if isinstance(e, str):
          row += e
      elif e:
          row += " *"

      # Honour the caller-supplied prefix (previously a hard-coded " ",
      # which is also the default of `i`).
      print(i + row)
  class FdcToolbox:
      """Load a CSV table and prepare it for FDC.

      Attributes:
          data: The loaded table with its rows in shuffled order.
          cols_ord: Names of integer-typed columns.
          cols_nom: Names of text-typed (object/string) columns.
          cols_cont: Names of all remaining columns.
          value_dict: Per text column, mapping ``original value -> code``
              (filled by ``fixDatatypes``).
          value_dict_rev: Per text column, mapping ``code -> original value``.
          has_missing_values: True while ``data`` contains at least one NaN.
      """

      def __init__(self, file_name, index_col=0):
          """Read *file_name* and classify its columns by dtype.

          Args:
              file_name: Path or file-like object accepted by ``pd.read_csv``.
              index_col: Column used as row index (forwarded to
                  ``pd.read_csv``; previously ignored in favour of 0).
          """
          data = pd.read_csv(file_name, index_col=index_col)
          # sample(frac=1) returns every row once, in random order.
          self.data = data.sample(frac=1)

          self.value_dict = {}
          self.value_dict_rev = {}

          self.cols_cont = []
          self.cols_ord = []
          self.cols_nom = []
          for column, dtype in self.data.dtypes.items():
              if pd.api.types.is_integer_dtype(dtype):
                  self.cols_ord.append(column)
              elif self._is_text_dtype(dtype):
                  self.cols_nom.append(column)
              else:
                  self.cols_cont.append(column)

          self.has_missing_values = False
          self.updateMissingValuesState()

      @staticmethod
      def _is_text_dtype(dtype):
          """Return True for object columns and for pandas string dtypes."""
          return bool(
              pd.api.types.is_object_dtype(dtype)
              or pd.api.types.is_string_dtype(dtype)
          )

      def updateMissingValuesState(self):
          """Recompute ``has_missing_values`` from the current ``data``."""
          self.has_missing_values = bool(self.data.isna().to_numpy().any())

      def showStatistic(self):
          """Print shape, per-column dtype/kind and missing-value counts.

          Each column is tagged ``c`` (continuous), ``o`` (ordinal) or ``n``
          (nominal) according to the ``cols_*`` lists; nominal wins over
          ordinal if a column appears in both.
          """
          print(f"Features: {self.data.shape[1]}")
          print(f"Points: {self.data.shape[0]}")
          print("Columns:")
          for column, dtype in self.data.dtypes.items():
              if column in self.cols_nom:
                  tag = " n"
              elif column in self.cols_ord:
                  tag = " o"
              else:
                  tag = " c"
              indentPair(column, str(dtype), tag)

          print()
          print("Missing values:")
          missing = self.data.isna().sum()
          missing = missing[missing > 0]
          for column, count in missing.items():
              indentPair(column, str(count))
          if missing.empty:
              print(" none")

      def fixDatatypes(self):
          """Replace the values of text columns by integer codes.

          Codes follow the order of ``value_counts()`` (most frequent value
          gets 0). The forward and reverse mappings are stored in
          ``value_dict`` and ``value_dict_rev``. Missing values stay missing.
          """
          text_columns = [
              column
              for column, dtype in self.data.dtypes.items()
              if self._is_text_dtype(dtype)
          ]

          self.value_dict = {}
          self.value_dict_rev = {}
          for column in text_columns:
              values = list(self.data[column].value_counts().keys())
              self.value_dict[column] = {
                  value: code for code, value in enumerate(values)
              }
              self.value_dict_rev[column] = {
                  code: value for code, value in enumerate(values)
              }
              # Series.map yields a numeric column directly; NaN is not a key
              # of the mapping and therefore remains NaN.
              self.data[column] = self.data[column].map(self.value_dict[column])

      def fix_missing_values(self):
          """Fill missing values in ``data`` and refresh the missing flag.

          The call inside the body resolves to the module-level
          ``fix_missing_values`` imported from ``fdc.missingValues``, not to
          this method. The second argument (4) is passed through unchanged;
          its meaning is defined by that helper.
          """
          self.data = fix_missing_values(self.data, 4)
          self.updateMissingValuesState()
  # --- Load the data set and report its initial state -----------------------
  filename = 'healthcare-dataset-stroke-data.csv'

  # Seed NumPy's global RNG before loading: FdcToolbox shuffles the rows with
  # DataFrame.sample() and no explicit random_state.
  np.random.seed(42)

  tb = FdcToolbox(filename)
  tb.showStatistic()

  # --- Clean-up: encode text columns, then fill missing values --------------
  hasChanged = False

  if tb.cols_nom:
      print()
      print("Fixing object datatypes ...")
      tb.fixDatatypes()
      print("done")
      hasChanged = True

  if tb.has_missing_values:
      print()
      print("Fix missing values ...")
      tb.fix_missing_values()
      print("done")
      hasChanged = True

  # Show the statistics again only if one of the steps above modified the data.
  if hasChanged:
      print()
      tb.showStatistic()

  # --- Persist the cleaned table and the code -> original value mapping -----
  tb.data.to_csv(f"{filename}_fixed_values.csv")
  with open(f"{filename}_value_mapping.json", "w") as f:
      json.dump(tb.value_dict_rev, f)

  # --- Run FDC on the cleaned table and store its output --------------------
  print("Doing FDC ...")
  fdc = FDC(
      clustering_cont=Clustering('euclidean'),
      clustering_ord=Clustering(canberra_modified),
      clustering_nom=Clustering('hamming', max_components=1),
      visual=False,
      use_pandas_output=True,
      with_2d_embedding=False,
  )
  # Keyword spellings below are those of the FDC API as used by this script.
  fdc.selectFeatures(
      continueous=tb.cols_cont,
      nomial=tb.cols_nom,
      ordinal=tb.cols_ord,
  )
  entire_data_FDC_emb_five = fdc.normalize(tb.data)
  entire_data_FDC_emb_five.to_csv(f"{filename}_fdc.csv")
  print("done")