# missingValues.py (2.6 KB)
import math

import numpy as np
from scipy.spatial import distance

import fdc.tools as tools
  5. def create_total_impute(data, distance_matrix, missing_value_list):
  6. def create_value_list(f, index):
  7. index_in_dist_mat = np.where(distance_matrix[:,0] == index)[0][0]
  8. value_list = []
  9. for neb_index in distance_matrix[index_in_dist_mat][1:]:
  10. impute_value = data.loc[[neb_index]][f]
  11. if float(impute_value) == float(impute_value):
  12. value_list.append(float(impute_value))
  13. if len(value_list) >= 6:
  14. break
  15. return np.array(value_list)
  16. def feature_impute_master(f):
  17. missing_value_indices = data[data[f].isnull()].index.tolist()
  18. return np.array([
  19. create_value_list(f, index)
  20. for index in missing_value_indices
  21. ])
  22. def imputed_value(row):
  23. intcounter = tools.count(lambda x: (not math.isnan(x)) and 0 == (x - int(x)), row)
  24. if intcounter == len(row):
  25. return np.array(np.bincount(row.astype(int)).argmax())
  26. else:
  27. return np.array(np.mean(row))
  28. total_impute_master = [
  29. feature_impute_master(f)
  30. for f in missing_value_list
  31. ]
  32. return [
  33. [imputed_value(row) for row in plane]
  34. for plane in total_impute_master
  35. ]
  36. def create_distance_matrix(dense_data):
  37. dense_data_index = np.array(dense_data.index)
  38. dense_data = np.array(dense_data)
  39. return np.array([
  40. dense_data_index[
  41. np.argsort([ distance.euclidean(x, y) for y in dense_data ])
  42. ]
  43. for x in dense_data
  44. ])
  45. def fix_missing_values(data, limit=4):
  46. timing = tools.Timing("fix_missing_values")
  47. dense_data_pool = list(data.isna().sum().index[data.isna().sum() < limit])
  48. timing.step("dense_data_pool")
  49. dense_data = data[dense_data_pool].dropna()
  50. timing.step("dense_data")
  51. data = data.loc[np.array(dense_data.index)]
  52. timing.step("data.loc")
  53. distance_matrix = create_distance_matrix(dense_data)
  54. timing.step("distance_matrix")
  55. missing_value_list = [ x
  56. for x in list(data.columns)
  57. if x not in dense_data_pool
  58. ]
  59. timing.step("missing_value_list")
  60. total_impute = create_total_impute(
  61. data, distance_matrix, missing_value_list)
  62. timing.step("total_impute")
  63. for f, value in enumerate(missing_value_list):
  64. missing_value_indices = data[data[value].isnull()].index.tolist()
  65. for i, value_index in enumerate(missing_value_indices):
  66. data.at[value_index, value] = total_impute[f][i]
  67. timing.step("update data")
  68. return data