import warnings

warnings.filterwarnings('ignore')

import json

import numpy as np
import pandas as pd

from fdc.fdc import FDC, Clustering, canberra_modified
from fdc.missingValues import fix_missing_values
from fdc.tools import Timing
from fdc.visualize import plotCluster
def indent(text, i=" "):
    """Return *text* with every line prefixed by *i*.

    Each line of the result (including the last) ends with a newline.
    """
    return "".join(i + line + "\n" for line in text.split("\n"))
def indentPair(a, b, e="", i=" "):
    """Print one aligned key/value row: key, underscore padding, value.

    *e* controls a trailing marker: True appends " *", False appends
    nothing, any other value is appended verbatim (e.g. " c"/" o"/" n").
    NOTE(review): *i* is accepted but never used — kept for interface
    compatibility.
    """
    row = a + " "
    pad = 32 - len(row)
    if pad > 0:
        row += "_" * pad
    # NOTE(review): threshold 16 vs. pad width 10 look inconsistent —
    # values with 10 <= len(b) < 16 get zero padding; behavior preserved
    # as-is, confirm intended alignment.
    if len(b) < 16:
        row += "_" * (10 - len(b))
    row += " " + b
    if e == True:
        row += " *"
    elif e != False:
        row += e
    print(" " + row)
class FdcToolbox:
    """Load a CSV dataset and prepare it for FDC clustering.

    Classifies columns into continuous / ordinal / nominal by dtype,
    integer-encodes nominal (object) columns, and imputes missing values.
    """

    def __init__(self, file_name, index_col=0):
        """Read *file_name* as CSV and shuffle the rows.

        Columns are classified by dtype: int* -> ordinal,
        object -> nominal, everything else -> continuous.
        """
        # BUG FIX: index_col was hard-coded to 0; the parameter was ignored.
        data = pd.read_csv(file_name, index_col=index_col)
        # Shuffle rows once up front.
        self.data = data.sample(frac=1)
        self.value_dict = {}      # column -> {category value: int code}
        self.value_dict_rev = {}  # column -> {int code: category value}
        self.cols_cont = []
        self.cols_ord = []
        self.cols_nom = []
        for k in self.data.dtypes.keys():
            t = str(self.data.dtypes[k])
            if t[:3] == "int":
                self.cols_ord.append(k)
            elif t == "object":
                self.cols_nom.append(k)
            else:
                self.cols_cont.append(k)
        self.has_missing_values = False
        self.updateMissingValuesState()

    def updateMissingValuesState(self):
        """Recompute self.has_missing_values from the current data."""
        # any().any(): per-column NaN flags reduced to a single bool.
        self.has_missing_values = bool(self.data.isna().any().any())

    def showStatistic(self):
        """Print feature/point counts, per-column dtypes (tagged c/o/n),
        and per-column missing-value counts."""
        # Typo fix: was "Fratures".
        print(f"Features: {self.data.shape[1]}")
        print(f"Points: {self.data.shape[0]}")
        print("Columns:")
        for k in self.data.dtypes.keys():
            t = str(self.data.dtypes[k])
            e = " c"
            if k in self.cols_ord:
                e = " o"
            if k in self.cols_nom:
                e = " n"
            indentPair(k, t, e)
        print()
        print("Missing values:")
        n = 0
        d = self.data.isna().sum()
        for k in d.keys():
            if d[k] > 0:
                indentPair(k, str(d[k]))
                n += 1
        if n == 0:
            print(" none")

    def fixDatatypes(self):
        """Replace each object column with integer codes.

        The most frequent category gets code 0; both directions of the
        mapping are kept in value_dict / value_dict_rev.
        """
        columnsToFix = [k for k in self.data.dtypes.keys()
                        if str(self.data.dtypes[k]) == "object"]
        self.value_dict = {}
        self.value_dict_rev = {}
        for c in columnsToFix:
            histogram = self.data[c].value_counts()
            self.value_dict[c] = {k: n for n, k in enumerate(histogram.keys())}
            self.value_dict_rev[c] = {n: k for n, k in enumerate(histogram.keys())}

        if self.value_dict:
            self.data.replace(self.value_dict, inplace=True)

    def fix_missing_values(self):
        """Impute missing values via fdc.missingValues.fix_missing_values
        (second arg 4 is presumably a neighbour count — TODO confirm) and
        refresh has_missing_values."""
        self.data = fix_missing_values(self.data, 4)
        self.updateMissingValuesState()
filename = 'healthcare-dataset-stroke-data.csv'
np.random.seed(42)

tb = FdcToolbox(filename)
tb.showStatistic()

hasChanged = False

# Integer-encode object (nominal) columns if any are present.
if tb.cols_nom:
    print()
    print("Fixing object datatypes ...")
    tb.fixDatatypes()
    print("done")
    hasChanged = True

# Impute missing values if any column contains NaNs.
if tb.has_missing_values:
    print()
    print("Fix missing values ...")
    tb.fix_missing_values()
    print("done")
    hasChanged = True

# Persist the cleaned data plus the code->category mapping.
if hasChanged:
    print()
    tb.showStatistic()
    tb.data.to_csv(filename + "_fixed_values.csv")
    with open(filename + "_value_mapping.json", "w") as f:
        json.dump(tb.value_dict_rev, f)
print("Doing FDC ...")

# One distance metric per feature family: euclidean for continuous,
# a modified Canberra for ordinal, hamming (one component) for nominal.
fdc = FDC(
    clustering_cont=Clustering('euclidean'),
    clustering_ord=Clustering(canberra_modified),
    clustering_nom=Clustering('hamming', max_components=1),
    visual=False,
    use_pandas_output=True,
    with_2d_embedding=False,
)
# NOTE: 'continueous'/'nomial' are the FDC API's own keyword spellings.
fdc.selectFeatures(continueous=tb.cols_cont, nomial=tb.cols_nom, ordinal=tb.cols_ord)

entire_data_FDC_emb_five = fdc.normalize(tb.data)
entire_data_FDC_emb_five.to_csv(filename + "_fdc.csv")
print("done")