fdc.py 5.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174
  1. # --[ Known to be used ]----
  2. import numpy as np
  3. from numba import jit
  4. import umap.umap_ as umap
  5. # --[ Known to be used but can we avoid it? ]----
  6. import pandas as pd
  7. import seaborn as sns
  8. import matplotlib.pyplot as plt
  def value(v, defaultValue):
      """Return ``v`` unless it is None, in which case return ``defaultValue``.

      Only None triggers the fallback; other falsy values (0, "", False)
      are returned unchanged.
      """
      return defaultValue if v is None else v
  def draw2dMapping(data):
      """Show a scatter plot of a 2D embedding.

      Args:
          data: table handed to ``sns.lmplot``; it must provide the columns
              ``UMAP_0`` (x axis) and ``UMAP_1`` (y axis).

      Side effects:
          Installs the custom colour list as seaborn's default palette and
          displays the figure with ``plt.show()``.
      """
      colors_set1 = ["lightcoral", "lightseagreen", "mediumorchid", "orange", "burlywood", "cornflowerblue", "plum", "yellowgreen"]
      # sns.set_palette() returns None, so the palette object has to be built
      # first and kept; otherwise ``palette=None`` is what reaches lmplot.
      customPalette_set1 = sns.color_palette(colors_set1)
      sns.set_palette(customPalette_set1)
      # NOTE(review): seaborn documents ``palette`` as the colours for the
      # levels of ``hue``; no hue is given here, so the visible colours come
      # from the default palette installed above.
      sns.lmplot(
          x="UMAP_0",
          y="UMAP_1",
          data=data,
          fit_reg=False,
          legend=False,
          scatter_kws={"s": 3},
          palette=customPalette_set1,
      )
      plt.show()
  def feature_clustering(UMAP_neb, min_dist_UMAP, metric, data, visual=False):
      """Embed ``data`` into two dimensions and return the result as a table.

      Args:
          UMAP_neb: neighbour count forwarded to ``Clustering``.
          min_dist_UMAP: minimum distance forwarded to ``Clustering``.
          metric: distance metric forwarded to ``Clustering``.
          data: input handed unchanged to ``Clustering.fit``.
          visual: when true, the embedding is also plotted with
              ``draw2dMapping``.

      Returns:
          A ``pandas.DataFrame`` with the columns ``UMAP_0`` and ``UMAP_1``.
      """
      reducer = Clustering(metric, UMAP_neb, min_dist_UMAP)
      result = pd.DataFrame(data=reducer.fit(data), columns=['UMAP_0', 'UMAP_1'])
      if visual:
          draw2dMapping(result)
      return result
  @jit(nopython=True)
  def modified_can(a, b):
      """Distance between two 1D vectors, compiled with numba.

      Every coordinate difference ``d = |a - b|`` is shifted to ``x = d + 1``
      and contributes ``|1 - x| / (1 + |x|)``, which equals ``d / (d + 2)``.
      The result is the square root of the sum of these terms.

      NOTE(review): the name suggests a modified Canberra distance; confirm
      against the method description before relying on that.
      """
      shifted = np.abs(a - b) + 1.0
      terms = np.abs(1.0 - shifted) / (1.0 + np.abs(shifted))
      return np.sqrt(np.sum(terms))
  class Clustering:
      """Reduce a data set to a standardized two-dimensional UMAP embedding.

      The object only stores the UMAP settings; all work happens in ``fit``.
      """

      def __init__(self, metric='euclidean', UMAP_neb=30, min_dist_UMAP=0.1):
          """Store the UMAP configuration.

          Args:
              metric: metric name or callable forwarded to ``umap.UMAP``.
              UMAP_neb: value used for UMAP's ``n_neighbors``.
              min_dist_UMAP: value used for UMAP's ``min_dist``.
          """
          # 'euclidian' was the historic, misspelled default; UMAP's metric
          # name is 'euclidean'. Accept the old spelling as an alias so that
          # existing callers keep working.
          if isinstance(metric, str) and metric == 'euclidian':
              metric = 'euclidean'
          self.metric = metric
          self.UMAP_neb = UMAP_neb
          self.min_dist_UMAP = min_dist_UMAP

      @staticmethod
      def _standardize(x):
          """Return ``x`` shifted to zero mean and scaled to unit standard deviation."""
          centered = x - np.mean(x)
          spread = np.std(x)
          if spread == 0:
              # A constant column would otherwise become NaN (0 / 0).
              return centered
          return centered / spread

      def fit(self, data):
          """Embed ``data`` with UMAP and standardize both output columns.

          Args:
              data: input passed unchanged to ``umap.UMAP(...).fit_transform``.

          Returns:
              The two-column embedding returned by UMAP, with each column
              standardized in place.

          Side effects:
              Reseeds NumPy's global random generator with 42.
          """
          np.random.seed(42)
          data_embedded = umap.UMAP(
              n_neighbors=self.UMAP_neb,
              min_dist=self.min_dist_UMAP,
              n_components=2,
              metric=self.metric,
              random_state=42,
          ).fit_transform(data)
          data_embedded[:, 0] = self._standardize(data_embedded[:, 0])
          data_embedded[:, 1] = self._standardize(data_embedded[:, 1])
          return data_embedded
  class FDC:
      """Embed continuous, ordinal and nominal column groups separately.

      Each group of columns is reduced by its own clustering object (anything
      with a ``fit(data)`` method returning a two-column array). The group
      embeddings are concatenated column-wise and can optionally be reduced
      once more to a single 2D embedding.
      """

      def __init__(self, clustering_cont=None, clustering_ord=None, clustering_nom=None, drop_nominal=True, visual=False, with_2d_embedding=False, use_pandas_output=False):
          """Configure the clusterings and the output format.

          Args:
              clustering_cont: clustering for continuous columns; defaults to
                  ``Clustering('euclidean', 30, 0.1)``.
              clustering_ord: clustering for ordinal columns; defaults to
                  ``Clustering(modified_can, 30, 0.1)``.
              clustering_nom: clustering for nominal columns; defaults to
                  ``Clustering('hamming', 30, 0.1)``.
              drop_nominal: keep only the first nominal embedding column.
              visual: default for the ``visual`` argument of ``normalize``.
              with_2d_embedding: default for the ``with_2d_embedding``
                  argument of ``normalize``.
              use_pandas_output: return ``pandas.DataFrame`` objects instead
                  of NumPy arrays.
          """
          # used clusterings ('euclidean' is the metric name UMAP knows; the
          # former spelling 'euclidian' is not a UMAP metric)
          self.clustering_cont = clustering_cont if clustering_cont is not None else Clustering('euclidean', 30, 0.1)
          self.clustering_ord = clustering_ord if clustering_ord is not None else Clustering(modified_can, 30, 0.1)
          self.clustering_nom = clustering_nom if clustering_nom is not None else Clustering('hamming', 30, 0.1)

          # Control of data output
          self.use_pandas_output = use_pandas_output
          self.with_2d_embedding = with_2d_embedding
          self.drop_nominal = drop_nominal

          # Control if a graph is shown
          self.visual = visual

          # Lists to select columns for continuous, nominal and ordinal data.
          self.cont_list = None
          self.nom_list = None
          self.ord_list = None

      def calc_embedding(self, clustering, data, column_list):
          """Embed the selected columns, or return None when none are selected.

          Args:
              clustering: object whose ``fit`` method produces the embedding.
              data: table indexed as ``data[column_list]``.
                  NOTE(review): presumably a pandas DataFrame; confirm with callers.
              column_list: column selection; None or an empty selection
                  means this group is skipped.

          Returns:
              The result of ``clustering.fit(data[column_list])`` or None.
          """
          if column_list is None or len(column_list) == 0:
              return None
          return clustering.fit(data[column_list])

      def normalize(self, data, cont_list=None, nom_list=None, ord_list=None, with_2d_embedding=None, visual=None):
          """Build the concatenated embedding of all configured column groups.

          Args:
              data: input table, indexed by the column lists.
              cont_list: continuous columns; None falls back to ``self.cont_list``.
              nom_list: nominal columns; None falls back to ``self.nom_list``.
              ord_list: ordinal columns; None falls back to ``self.ord_list``.
              with_2d_embedding: also return a 2D reduction of the result;
                  None falls back to the value given to the constructor.
              visual: plot the 2D reduction; None falls back to the value
                  given to the constructor.

          Returns:
              The concatenated embedding (two columns per continuous/ordinal
              group, one or two for the nominal group depending on
              ``drop_nominal``). With ``with_2d_embedding`` a tuple
              ``(concatenated, reduced_2d)`` is returned. Arrays are wrapped
              in DataFrames when ``use_pandas_output`` is set.

          Raises:
              ValueError: if no column group is selected.

          Side effects:
              Reseeds NumPy's global random generator with 42.
          """
          np.random.seed(42)

          # Explicit arguments win; None means "use the instance setting".
          if visual is None:
              visual = self.visual
          if with_2d_embedding is None:
              with_2d_embedding = self.with_2d_embedding
          if cont_list is None:
              cont_list = self.cont_list
          if ord_list is None:
              ord_list = self.ord_list
          if nom_list is None:
              nom_list = self.nom_list

          concat_column_names = []
          concat_lists = []

          # Reducing continuous features into 2dim
          cont_emb = self.calc_embedding(self.clustering_cont, data, cont_list)
          if cont_emb is not None:
              concat_lists.append(cont_emb)
              concat_column_names.extend(['CONT_UMAP_0', 'CONT_UMAP_1'])

          # Reducing ordinal features into 2dim
          ord_emb = self.calc_embedding(self.clustering_ord, data, ord_list)
          if ord_emb is not None:
              concat_lists.append(ord_emb)
              concat_column_names.extend(['ORD_UMAP_0', 'ORD_UMAP_1'])

          # Reducing nominal features into 2dim
          nom_emb = self.calc_embedding(self.clustering_nom, data, nom_list)
          if nom_emb is not None:
              if self.drop_nominal:
                  # Keep only the first component, but as a column (n, 1)
                  # so it can be concatenated with the other embeddings.
                  nom_emb = nom_emb[:, :1]
                  concat_column_names.append('NOM_UMAP_0')
              else:
                  concat_column_names.extend(['NOM_UMAP_0', 'NOM_UMAP_1'])
              concat_lists.append(nom_emb)

          # Merge results
          if not concat_lists:
              raise ValueError("Expected at least one non empty column list.")
          result_concat = np.concatenate(concat_lists, axis=1)

          # Create 2d embedding (needed for the return value and for plotting)
          if with_2d_embedding or visual:
              result_reduced = umap.UMAP(
                  n_neighbors=30,
                  min_dist=0.001,
                  n_components=2,
                  metric='euclidean',
                  random_state=42,
              ).fit_transform(result_concat)

              if visual or self.use_pandas_output:
                  reduced_frame = pd.DataFrame(data=result_reduced, columns=['UMAP_0', 'UMAP_1'])
                  # Show mapping if needed
                  if visual:
                      draw2dMapping(reduced_frame)
                  if self.use_pandas_output:
                      result_reduced = reduced_frame

          # Return the results
          if self.use_pandas_output:
              result_concat = pd.DataFrame(data=result_concat, columns=concat_column_names)

          if with_2d_embedding:
              return result_concat, result_reduced
          return result_concat