|
@@ -13,87 +13,6 @@ from fdc.fdc import canberra_modified, FDC, Clustering
|
|
|
|
|
|
|
|
from fdc.dataSheet import DataSheet
|
|
from fdc.dataSheet import DataSheet
|
|
|
|
|
|
|
|
-
|
|
|
|
|
-
|
|
|
|
|
-# class FdcToolbox:
|
|
|
|
|
-#
|
|
|
|
|
-# def __init__(self, file_name, index_col=0):
|
|
|
|
|
-# data = pd.read_csv(file_name, index_col=0)
|
|
|
|
|
-# self.data = data.sample(frac=1)
|
|
|
|
|
-#
|
|
|
|
|
-# self.value_dict = {}
|
|
|
|
|
-# self.value_dict_rev = {}
|
|
|
|
|
-# self.cols_cont = []
|
|
|
|
|
-# self.cols_ord = []
|
|
|
|
|
-# self.cols_nom = []
|
|
|
|
|
-#
|
|
|
|
|
-# for k in self.data.dtypes.keys():
|
|
|
|
|
-# t = str(self.data.dtypes[k])
|
|
|
|
|
-# if t[:3] == "int":
|
|
|
|
|
-# self.cols_ord.append(k)
|
|
|
|
|
-# elif t == "object":
|
|
|
|
|
-# self.cols_nom.append(k)
|
|
|
|
|
-# else:
|
|
|
|
|
-# self.cols_cont.append(k)
|
|
|
|
|
-#
|
|
|
|
|
-# self.has_missing_values = False
|
|
|
|
|
-# self.updateMissingValuesState()
|
|
|
|
|
-#
|
|
|
|
|
-# def updateMissingValuesState(self):
|
|
|
|
|
-# self.has_missing_values = False
|
|
|
|
|
-# for k in self.data.isna().sum():
|
|
|
|
|
-# if k > 0:
|
|
|
|
|
-# self.has_missing_values = True
|
|
|
|
|
-# break
|
|
|
|
|
-#
|
|
|
|
|
-# def showStatistic(self):
|
|
|
|
|
-# print(f"Fratures: {self.data.shape[1]}")
|
|
|
|
|
-# print(f"Points: {self.data.shape[0]}")
|
|
|
|
|
-# print(f"Columns:")
|
|
|
|
|
-#
|
|
|
|
|
-# for k in self.data.dtypes.keys():
|
|
|
|
|
-# t = str(self.data.dtypes[k])
|
|
|
|
|
-# e = " c"
|
|
|
|
|
-# if k in self.cols_ord:
|
|
|
|
|
-# e = " o"
|
|
|
|
|
-# if k in self.cols_nom:
|
|
|
|
|
-# e = " n"
|
|
|
|
|
-# indentPair(k, t, e)
|
|
|
|
|
-# print()
|
|
|
|
|
-# print(f"Missing values:")
|
|
|
|
|
-#
|
|
|
|
|
-# n = 0
|
|
|
|
|
-# d = self.data.isna().sum()
|
|
|
|
|
-# for k in d.keys():
|
|
|
|
|
-# if d[k] > 0:
|
|
|
|
|
-# indentPair(k, str(d[k]))
|
|
|
|
|
-# n += 1
|
|
|
|
|
-# if n == 0:
|
|
|
|
|
-# print(" none")
|
|
|
|
|
-#
|
|
|
|
|
-#
|
|
|
|
|
-# def fixDatatypes(self):
|
|
|
|
|
-# columnsToFix = []
|
|
|
|
|
-# for k in self.data.dtypes.keys():
|
|
|
|
|
-# if str(self.data.dtypes[k]) == "object":
|
|
|
|
|
-# columnsToFix.append(k)
|
|
|
|
|
-#
|
|
|
|
|
-# self.value_dict = {}
|
|
|
|
|
-# self.value_dict_rev = {}
|
|
|
|
|
-# for c in columnsToFix:
|
|
|
|
|
-# histogram = self.data[c].value_counts()
|
|
|
|
|
-# self.value_dict[c] = { k : n for n, k in enumerate(histogram.keys()) }
|
|
|
|
|
-# self.value_dict_rev[c] = { n : k for n, k in enumerate(histogram.keys()) }
|
|
|
|
|
-#
|
|
|
|
|
-# if len(self.value_dict.keys()) > 0:
|
|
|
|
|
-# self.data.replace(self.value_dict, inplace=True)
|
|
|
|
|
-#
|
|
|
|
|
-# def fix_missing_values(self):
|
|
|
|
|
-# self.data = fix_missing_values(self.data, 4)
|
|
|
|
|
-# self.updateMissingValuesState()
|
|
|
|
|
-#
|
|
|
|
|
-
|
|
|
|
|
-
|
|
|
|
|
filename='healthcare-dataset-stroke-data.csv'
|
|
filename='healthcare-dataset-stroke-data.csv'
|
|
|
np.random.seed(42)
|
|
np.random.seed(42)
|
|
|
tb = DataSheet(filename)
|
|
tb = DataSheet(filename)
|