dataSheet.py 3.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120
  1. import json
  2. import pandas as pd
  3. import numpy as np
  4. from fdc.tools import *
  5. from fdc.visualize import plotCluster
  6. from fdc.missingValues import fix_missing_values
  7. class DataSheet:
  8. # ---------------------------------------------------------------------------
  9. # Data I/O
  10. # ---------------------------------------------------------------------------
  11. def __init__(self, file_name=None, index_col=0, dataFrame=None):
  12. if file_name is not None:
  13. self.data = pd.read_csv(file_name, index_col=0).sample(frac=1)
  14. elif dataFrame is not None:
  15. self.data = dataFrame
  16. else:
  17. self.data = pd.DataFrame(np.array([[0.0]]), "dummy")
  18. self.value_dict = {}
  19. self.cols_cont = []
  20. self.cols_ord = []
  21. self.cols_nom = []
  22. for k in self.data.dtypes.keys():
  23. t = str(self.data.dtypes[k])
  24. if t[:3] == "int":
  25. self.cols_ord.append(k)
  26. elif t == "object":
  27. self.cols_nom.append(k)
  28. else:
  29. self.cols_cont.append(k)
  30. self.has_missing_values = False
  31. self.updateMissingValuesState()
  32. self.detectMapping()
  33. def saveTable(self, file_name):
  34. self.data.to_csv(file_name)
  35. # ---------------------------------------------------------------------------
  36. # Data mapping
  37. # ---------------------------------------------------------------------------
  38. def saveMapping(self, file_name):
  39. with open(file_name, "w") as f:
  40. json.dump(self.value_dict, f)
  41. def loadMapping(self, file_name):
  42. with open(file_name) as f:
  43. pass #json.dump(tb.value_dict, f)
  44. def detectMapping(self):
  45. columnsToFix = []
  46. for k in self.data.dtypes.keys():
  47. if str(self.data.dtypes[k]) == "object":
  48. columnsToFix.append(k)
  49. self.value_dict = {}
  50. for c in columnsToFix:
  51. histogram = self.data[c].value_counts()
  52. self.value_dict[c] = { n : k for n, k in enumerate(histogram.keys()) }
  53. def useMapping(self, mapping=None):
  54. if mapping is None:
  55. mapping = { c: { k: v for v, k in self.value_dict[c].items() } for c in self.value_dict.keys() }
  56. if len(mapping.keys()) > 0:
  57. self.data.replace(mapping, inplace=True)
  58. # ---------------------------------------------------------------------------
  59. # Statistics
  60. # ---------------------------------------------------------------------------
  61. def showStatistic(self):
  62. print(f"Fratures: {self.data.shape[1]}")
  63. print(f"Points: {self.data.shape[0]}")
  64. print(f"Columns:")
  65. for k in self.data.dtypes.keys():
  66. t = str(self.data.dtypes[k])
  67. e = " c"
  68. if k in self.cols_ord:
  69. e = " o"
  70. if k in self.cols_nom:
  71. e = " n"
  72. indentPair(k, t, e)
  73. print()
  74. print(f"Missing values:")
  75. n = 0
  76. d = self.data.isna().sum()
  77. for k in d.keys():
  78. if d[k] > 0:
  79. indentPair(k, str(d[k]))
  80. n += 1
  81. if n == 0:
  82. print(" none")
  83. def updateMissingValuesState(self):
  84. self.has_missing_values = False
  85. for k in self.data.isna().sum():
  86. if k > 0:
  87. self.has_missing_values = True
  88. break
  89. # ---------------------------------------------------------------------------
  90. # Automatic fixing tools
  91. # ---------------------------------------------------------------------------
  92. def fixDatatypes(self):
  93. self.detectMapping()
  94. self.useMapping()
  95. def fix_missing_values(self):
  96. self.data = fix_missing_values(self.data, 4)
  97. self.updateMissingValuesState()