| 12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182 |
- import math
- import numpy as np
- from scipy.spatial import distance
- import fdc.tools as tools
- def create_total_impute(data, distance_matrix, missing_value_list):
- def create_value_list(f, index):
- index_in_dist_mat = np.where(distance_matrix[:,0] == index)[0][0]
- value_list = []
- for neb_index in distance_matrix[index_in_dist_mat][1:]:
- impute_value = data.loc[[neb_index]][f]
- if float(impute_value) == float(impute_value):
- value_list.append(float(impute_value))
-
- if len(value_list) >= 6:
- break
- return np.array(value_list)
- def feature_impute_master(f):
- missing_value_indices = data[data[f].isnull()].index.tolist()
- return np.array([
- create_value_list(f, index)
- for index in missing_value_indices
- ])
- def imputed_value(row):
- intcounter = tools.count(lambda x: (not math.isnan(x)) and 0 == (x - int(x)), row)
- if intcounter == len(row):
- return np.array(np.bincount(row.astype(int)).argmax())
- else:
- return np.array(np.mean(row))
- total_impute_master = [
- feature_impute_master(f)
- for f in missing_value_list
- ]
- return [
- [imputed_value(row) for row in plane]
- for plane in total_impute_master
- ]
- def create_distance_matrix(dense_data):
- dense_data_index = np.array(dense_data.index)
- dense_data = np.array(dense_data)
- return np.array([
- dense_data_index[
- np.argsort([ distance.euclidean(x, y) for y in dense_data ])
- ]
- for x in dense_data
- ])
- def fix_missing_values(data, limit=4):
- dense_data_pool = list(data.isna().sum().index[data.isna().sum() < limit])
- dense_data = data[dense_data_pool].dropna()
- data = data.loc[np.array(dense_data.index)]
-
- distance_matrix = create_distance_matrix(dense_data)
- missing_value_list = [ x
- for x in list(data.columns)
- if x not in dense_data_pool
- ]
- total_impute = create_total_impute(
- data, distance_matrix, missing_value_list)
- for f, value in enumerate(missing_value_list):
- missing_value_indices = data[data[value].isnull()].index.tolist()
- for i, value_index in enumerate(missing_value_indices):
- data.at[value_index, value] = total_impute[f][i]
- return data
|