missingValues.py 3.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105
  1. import math
  2. import numpy as np
  3. from scipy.spatial import distance
  4. import fdc.tools as tools
  5. def create_total_impute(data, distance_matrix, missing_value_list):
  6. def create_value_list(f, index):
  7. index_in_dist_mat = np.where(distance_matrix[:,0] == index)[0][0]
  8. value_list = []
  9. for neb_index in distance_matrix[index_in_dist_mat][1:]:
  10. impute_value = data.loc[[neb_index]][f]
  11. if float(impute_value) == float(impute_value):
  12. value_list.append(float(impute_value))
  13. if len(value_list) >= 6:
  14. break
  15. return np.array(value_list)
  16. def feature_impute_master(f):
  17. missing_value_indices = data[data[f].isnull()].index.tolist()
  18. return np.array([
  19. create_value_list(f, index)
  20. for index in missing_value_indices
  21. ])
  22. def imputed_value(row):
  23. intcounter = tools.count(lambda x: (not math.isnan(x)) and 0 == (x - int(x)), row)
  24. if intcounter == len(row):
  25. return np.array(np.bincount(row.astype(int)).argmax())
  26. else:
  27. return np.array(np.mean(row))
  28. total_impute_master = [
  29. feature_impute_master(f)
  30. for f in missing_value_list
  31. ]
  32. return [
  33. [imputed_value(row) for row in plane]
  34. for plane in total_impute_master
  35. ]
  36. def create_distance_matrix_old(dense_data):
  37. dense_data_index = np.array(dense_data.index)
  38. dense_data = np.array(dense_data)
  39. return np.array([
  40. dense_data_index[
  41. np.argsort([ distance.euclidean(x, y) for y in dense_data ])
  42. ]
  43. for x in dense_data
  44. ])
  45. def create_distance_matrix(dense_data):
  46. dense_data_index = np.array(dense_data.index)
  47. dense_data = np.array(dense_data)
  48. size = len(dense_data)
  49. matrix = [[ None for i in range(size)] for j in range(size)]
  50. # Calculate the squared euclidian distances.
  51. for nx, x in enumerate(dense_data):
  52. b = dense_data - x
  53. matrix[nx] = np.sum(b*b, axis=1)
  54. # Calculate the indices and replace the distance rows.
  55. # So we create our result matrix and do cleanup at the same time.
  56. for n in range(size):
  57. matrix[n] = dense_data_index[ np.argsort(matrix[n]) ]
  58. return np.array(matrix)
  59. def fix_missing_values(data, limit=4):
  60. dense_data_pool = list(data.isna().sum().index[data.isna().sum() < limit])
  61. dense_data = data[dense_data_pool].dropna()
  62. data = data.loc[np.array(dense_data.index)]
  63. distance_matrix = create_distance_matrix(dense_data)
  64. missing_value_list = [ x
  65. for x in list(data.columns)
  66. if x not in dense_data_pool
  67. ]
  68. total_impute = create_total_impute(
  69. data, distance_matrix, missing_value_list)
  70. for f, value in enumerate(missing_value_list):
  71. missing_value_indices = data[data[value].isnull()].index.tolist()
  72. for i, value_index in enumerate(missing_value_indices):
  73. data.at[value_index, value] = total_impute[f][i]
  74. return data