- import warnings
- warnings.filterwarnings('ignore')
- import json
- import pandas as pd
- import numpy as np
- from fdc.visualize import plotCluster
- from fdc.tools import Timing
- from fdc.missingValues import fix_missing_values
- from fdc.fdc import canberra_modified, FDC, Clustering
- from fdc.dataSheet import DataSheet
- # class FdcToolbox:
- #
- # def __init__(self, file_name, index_col=0):
- # data = pd.read_csv(file_name, index_col=0)
- # self.data = data.sample(frac=1)
- #
- # self.value_dict = {}
- # self.value_dict_rev = {}
- # self.cols_cont = []
- # self.cols_ord = []
- # self.cols_nom = []
- #
- # for k in self.data.dtypes.keys():
- # t = str(self.data.dtypes[k])
- # if t[:3] == "int":
- # self.cols_ord.append(k)
- # elif t == "object":
- # self.cols_nom.append(k)
- # else:
- # self.cols_cont.append(k)
- #
- # self.has_missing_values = False
- # self.updateMissingValuesState()
- #
- # def updateMissingValuesState(self):
- # self.has_missing_values = False
- # for k in self.data.isna().sum():
- # if k > 0:
- # self.has_missing_values = True
- # break
- #
- # def showStatistic(self):
- # print(f"Fratures: {self.data.shape[1]}")
- # print(f"Points: {self.data.shape[0]}")
- # print(f"Columns:")
- #
- # for k in self.data.dtypes.keys():
- # t = str(self.data.dtypes[k])
- # e = " c"
- # if k in self.cols_ord:
- # e = " o"
- # if k in self.cols_nom:
- # e = " n"
- # indentPair(k, t, e)
- # print()
- # print(f"Missing values:")
- #
- # n = 0
- # d = self.data.isna().sum()
- # for k in d.keys():
- # if d[k] > 0:
- # indentPair(k, str(d[k]))
- # n += 1
- # if n == 0:
- # print(" none")
- #
- #
- # def fixDatatypes(self):
- # columnsToFix = []
- # for k in self.data.dtypes.keys():
- # if str(self.data.dtypes[k]) == "object":
- # columnsToFix.append(k)
- #
- # self.value_dict = {}
- # self.value_dict_rev = {}
- # for c in columnsToFix:
- # histogram = self.data[c].value_counts()
- # self.value_dict[c] = { k : n for n, k in enumerate(histogram.keys()) }
- # self.value_dict_rev[c] = { n : k for n, k in enumerate(histogram.keys()) }
- #
- # if len(self.value_dict.keys()) > 0:
- # self.data.replace(self.value_dict, inplace=True)
- #
- # def fix_missing_values(self):
- # self.data = fix_missing_values(self.data, 4)
- # self.updateMissingValuesState()
- #
def main() -> None:
    """Preprocess the stroke dataset and write an FDC-normalized copy.

    Pipeline: load the CSV through ``DataSheet``, encode object-typed
    (nominal) columns as integer codes, impute missing values, persist the
    fixed table plus the category->code mapping, then run FDC normalization
    and save the result next to the input file.
    """
    filename = 'healthcare-dataset-stroke-data.csv'
    np.random.seed(42)  # fixed seed so the row shuffle / FDC run is reproducible

    tb = DataSheet(filename)
    tb.showStatistic()

    has_changed = False

    if tb.cols_nom:  # object-typed columns must be integer-encoded before FDC
        print()
        print("Fixing object datatypes ...")
        tb.fixDatatypes()
        print("done")
        has_changed = True

    if tb.has_missing_values:
        print()
        print("Fix missing values ...")
        tb.fix_missing_values()
        print("done")
        has_changed = True

    if has_changed:
        # Persist the cleaned table and the value mapping so the integer
        # codes can be translated back to their original categories later.
        print()
        tb.showStatistic()
        tb.saveTable(filename + "_fixed_values.csv")
        tb.saveMapping(filename + "_value_mapping.json")

    print("Doing FDC ...")
    fdc = FDC(
        clustering_cont=Clustering('euclidean'),
        clustering_ord=Clustering(canberra_modified),
        clustering_nom=Clustering('hamming', max_components=1),
        visual=False,
        use_pandas_output=True,
        with_2d_embedding=False,
    )
    # NOTE(review): the keyword spellings 'continueous' / 'nomial' match the
    # fdc.selectFeatures API as-is — do not "correct" them here.
    fdc.selectFeatures(continueous=tb.cols_cont, nomial=tb.cols_nom, ordinal=tb.cols_ord)
    embedding = fdc.normalize(tb.data)
    embedding.to_csv(filename + "_fdc.csv")
    print("done")


if __name__ == "__main__":
    main()