|
|
@@ -79,37 +79,27 @@ def create_distance_matrix(dense_data):
|
|
|
|
|
|
|
|
|
def fix_missing_values(data, limit=4):
    """Fill missing cells of the sparse columns of *data*.

    Columns with fewer than ``limit`` missing values form the "dense" pool.
    Rows that have any missing value in a dense column are dropped; every
    remaining column is treated as sparse and its missing cells are
    overwritten with the values returned by ``create_total_impute``.

    Args:
        data: DataFrame that may contain missing values.
        limit: A column is dense when its count of missing values is
            strictly below this number.

    Returns:
        A new DataFrame restricted to the rows that are complete across the
        dense columns, with the sparse columns' missing cells filled in.
    """
    # Count missing values per column once and reuse the result.
    na_counts = data.isna().sum()
    dense_data_pool = list(na_counts.index[na_counts < limit])
    dense_columns = set(dense_data_pool)  # O(1) membership tests below

    dense_data = data[dense_data_pool].dropna()

    # Keep only the rows that survived the dropna above. The explicit copy
    # makes the cell writes below operate on an independent frame.
    data = data.loc[np.array(dense_data.index)].copy()

    # NOTE(review): presumably pairwise row distances over the dense
    # columns -- confirm against create_distance_matrix.
    distance_matrix = create_distance_matrix(dense_data)

    missing_value_list = [
        column for column in data.columns if column not in dense_columns
    ]

    # NOTE(review): assumed to return one sequence per entry of
    # missing_value_list, ordered like that column's missing rows -- confirm
    # against create_total_impute.
    total_impute = create_total_impute(
        data, distance_matrix, missing_value_list)

    for f, value in enumerate(missing_value_list):
        # Row labels of the cells still missing in this column, in row order.
        missing_value_indices = data.index[data[value].isnull()].tolist()
        for i, value_index in enumerate(missing_value_indices):
            data.at[value_index, value] = total_impute[f][i]

    return data
|
|
|
|
|
|
|