# missingValues.py — nearest-neighbor imputation of missing values.
  1. import math
  2. import numpy as np
  3. from scipy.spatial import distance
  4. import fdc.tools as tools
  5. def create_total_impute(data, distance_matrix, missing_value_list):
  6. def create_value_list(f, index):
  7. index_in_dist_mat = np.where(distance_matrix[:,0] == index)[0][0]
  8. value_list = []
  9. for neb_index in distance_matrix[index_in_dist_mat][1:]:
  10. impute_value = data.loc[[neb_index]][f]
  11. if float(impute_value) == float(impute_value):
  12. value_list.append(float(impute_value))
  13. if len(value_list) >= 6:
  14. break
  15. return np.array(value_list)
  16. def feature_impute_master(f):
  17. missing_value_indices = data[data[f].isnull()].index.tolist()
  18. return np.array([
  19. create_value_list(f, index)
  20. for index in missing_value_indices
  21. ])
  22. def imputed_value(row):
  23. intcounter = tools.count(lambda x: (not math.isnan(x)) and 0 == (x - int(x)), row)
  24. if intcounter == len(row):
  25. return np.array(np.bincount(row.astype(int)).argmax())
  26. else:
  27. return np.array(np.mean(row))
  28. total_impute_master = [
  29. feature_impute_master(f)
  30. for f in missing_value_list
  31. ]
  32. return [
  33. [imputed_value(row) for row in plane]
  34. for plane in total_impute_master
  35. ]
  36. def create_distance_matrix_old(dense_data):
  37. dense_data_index = np.array(dense_data.index)
  38. dense_data = np.array(dense_data)
  39. return np.array([
  40. dense_data_index[
  41. np.argsort([ distance.euclidean(x, y) for y in dense_data ])
  42. ]
  43. for x in dense_data
  44. ])
  45. def create_distance_matrix(dense_data):
  46. dense_data_index = np.array(dense_data.index)
  47. dense_data = np.array(dense_data)
  48. size = len(dense_data)
  49. matrix = [[ None for i in range(size)] for j in range(size)]
  50. # Calculate the distances. As the distance matrix is symmetric we can do
  51. # that with at most n*n/2 distance evaluations.
  52. for nx, x in enumerate(dense_data):
  53. for ny, y in enumerate(dense_data):
  54. # Same index so distance is 0.0
  55. if nx == ny:
  56. matrix[nx][ny] = 0.0
  57. # As the matrix is symetric we can copy already calculated values.
  58. # As distance computation is expencive we should do that.
  59. elif nx > ny:
  60. matrix[nx][ny] = matrix[ny][nx]
  61. # Unseen pair so calculate the distance.
  62. else:
  63. matrix[nx][ny] = distance.euclidean(x, y)
  64. # Calculate the indices and replace the distance rows.
  65. # So we create our result matrix and do cleanup at the same time.
  66. for n in range(size):
  67. matrix[n] = dense_data_index[ np.argsort(matrix[n]) ]
  68. return np.array(matrix)
  69. def fix_missing_values(data, limit=4):
  70. timing = tools.Timing("fix_missing_values")
  71. dense_data_pool = list(data.isna().sum().index[data.isna().sum() < limit])
  72. timing.step("dense_data_pool")
  73. dense_data = data[dense_data_pool].dropna()
  74. timing.step("dense_data")
  75. data = data.loc[np.array(dense_data.index)]
  76. timing.step("data.loc")
  77. distance_matrix = create_distance_matrix(dense_data)
  78. timing.step("distance_matrix")
  79. missing_value_list = [ x
  80. for x in list(data.columns)
  81. if x not in dense_data_pool
  82. ]
  83. timing.step("missing_value_list")
  84. total_impute = create_total_impute(
  85. data, distance_matrix, missing_value_list)
  86. timing.step("total_impute")
  87. for f, value in enumerate(missing_value_list):
  88. missing_value_indices = data[data[value].isnull()].index.tolist()
  89. for i, value_index in enumerate(missing_value_indices):
  90. data.at[value_index, value] = total_impute[f][i]
  91. timing.step("update data")
  92. return data