|
|
@@ -0,0 +1,82 @@
|
|
|
+import math
|
|
|
+import numpy as np
|
|
|
+from scipy.spatial import distance
|
|
|
+import fdc.tools as tools
|
|
|
+
|
|
|
+
|
|
|
+def create_total_impute(data, distance_matrix, missing_value_list):
|
|
|
+ def create_value_list(f, index):
|
|
|
+ index_in_dist_mat = np.where(distance_matrix[:,0] == index)[0][0]
|
|
|
+ value_list = []
|
|
|
+ for neb_index in distance_matrix[index_in_dist_mat][1:]:
|
|
|
+ impute_value = data.loc[[neb_index]][f]
|
|
|
+ if float(impute_value) == float(impute_value):
|
|
|
+ value_list.append(float(impute_value))
|
|
|
+
|
|
|
+ if len(value_list) >= 6:
|
|
|
+ break
|
|
|
+
|
|
|
+ return np.array(value_list)
|
|
|
+
|
|
|
+ def feature_impute_master(f):
|
|
|
+ missing_value_indices = data[data[f].isnull()].index.tolist()
|
|
|
+ return np.array([
|
|
|
+ create_value_list(f, index)
|
|
|
+ for index in missing_value_indices
|
|
|
+ ])
|
|
|
+
|
|
|
+ def imputed_value(row):
|
|
|
+ intcounter = tools.count(lambda x: (not math.isnan(x)) and 0 == (x - int(x)), row)
|
|
|
+
|
|
|
+ if intcounter == len(row):
|
|
|
+ return np.array(np.bincount(row.astype(int)).argmax())
|
|
|
+ else:
|
|
|
+ return np.array(np.mean(row))
|
|
|
+
|
|
|
+ total_impute_master = [
|
|
|
+ feature_impute_master(f)
|
|
|
+ for f in missing_value_list
|
|
|
+ ]
|
|
|
+
|
|
|
+ return [
|
|
|
+ [imputed_value(row) for row in plane]
|
|
|
+ for plane in total_impute_master
|
|
|
+ ]
|
|
|
+
|
|
|
+
|
|
|
+def create_distance_matrix(dense_data):
|
|
|
+ dense_data_index = np.array(dense_data.index)
|
|
|
+ dense_data = np.array(dense_data)
|
|
|
+
|
|
|
+ return np.array([
|
|
|
+ dense_data_index[
|
|
|
+ np.argsort([ distance.euclidean(x, y) for y in dense_data ])
|
|
|
+ ]
|
|
|
+ for x in dense_data
|
|
|
+ ])
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+def fix_missing_values(data, limit=4):
|
|
|
+ dense_data_pool = list(data.isna().sum().index[data.isna().sum() < limit])
|
|
|
+ dense_data = data[dense_data_pool].dropna()
|
|
|
+ data = data.loc[np.array(dense_data.index)]
|
|
|
+
|
|
|
+ distance_matrix = create_distance_matrix(dense_data)
|
|
|
+
|
|
|
+ missing_value_list = [ x
|
|
|
+ for x in list(data.columns)
|
|
|
+ if x not in dense_data_pool
|
|
|
+ ]
|
|
|
+
|
|
|
+ total_impute = create_total_impute(
|
|
|
+ data, distance_matrix, missing_value_list)
|
|
|
+
|
|
|
+ for f, value in enumerate(missing_value_list):
|
|
|
+ missing_value_indices = data[data[value].isnull()].index.tolist()
|
|
|
+ for i, value_index in enumerate(missing_value_indices):
|
|
|
+ data.at[value_index, value] = total_impute[f][i]
|
|
|
+
|
|
|
+ return data
|
|
|
+
|
|
|
+
|