# fdc.py
  1. # --[ Known to be used ]----
  2. import numpy as np
  3. from numba import jit
  4. import umap.umap_ as umap
  5. from fdc.tools import Timing
  6. # --[ Known to be used but can we avoid it? ]----
  7. import pandas as pd
  8. from fdc.visualize import plotMapping
  9. def value(v, defaultValue):
  10. if v is None:
  11. return defaultValue
  12. else:
  13. return v
  14. def feature_clustering(UMAP_neb, min_dist_UMAP, metric, data, visual=False):
  15. data_embedded = Clustering(metric, UMAP_neb, min_dist_UMAP).fit(data)
  16. result = pd.DataFrame(data=data_embedded, columns=['UMAP_0', 'UMAP_1'])
  17. if visual:
  18. plotMapping(result)
  19. return result
  20. @jit(nopython=True)
  21. def canberra_modified(a,b):
  22. return np.sqrt(np.sum(np.array(
  23. [np.abs(1.0 - x) / (1.0 + np.abs(x)) for x in (np.abs(a-b) + 1.0)]
  24. )))
  25. class Clustering:
  26. def __init__(self, metric='euclidian', UMAP_neb=30, min_dist_UMAP=0.1, max_components=2):
  27. self.metric = metric
  28. self.UMAP_neb = UMAP_neb
  29. self.min_dist_UMAP = min_dist_UMAP
  30. self.max_components = max_components
  31. def normalize(self, x):
  32. return (x - np.mean(x)) / np.std(x)
  33. def fit(self, data):
  34. np.random.seed(42)
  35. # ensure that the data is a 2d array.
  36. if len(data.shape) < 2:
  37. data = data.reshape((data.shape[0], 1))
  38. # do UMAP if needed (e.g. data has more than 2 features)
  39. if data.shape[1] > self.max_components:
  40. data_embedded = umap.UMAP(
  41. n_neighbors=self.UMAP_neb
  42. , min_dist=self.min_dist_UMAP
  43. , n_components=self.max_components
  44. , metric=self.metric
  45. , random_state=42
  46. ).fit_transform(data)
  47. else:
  48. data_embedded = data
  49. # normalize the data
  50. for n in range(data_embedded.shape[1]):
  51. data_embedded[:, n] = self.normalize(data_embedded[:, n])
  52. return data_embedded
  53. class FDC:
  54. def __init__(self,
  55. clustering_cont=None, clustering_ord=None, clustering_nom=None,
  56. visual=False,
  57. with_2d_embedding=False,
  58. use_pandas_output=False
  59. ):
  60. # used clusterings
  61. self.clustering_cont = value(clustering_cont, Clustering('euclidian', 30, 0.1))
  62. self.clustering_ord = value(clustering_ord, Clustering(canberra_modified, 30, 0.1))
  63. self.clustering_nom = value(clustering_nom, Clustering('hamming', 30, 0.1, max_components=1))
  64. # Control of data output
  65. self.use_pandas_output = use_pandas_output
  66. self.with_2d_embedding = with_2d_embedding
  67. # Control if a graph is shown
  68. self.visual = visual
  69. # Lists to select columns for continueous, nomial and ordinal data.
  70. self.cont_list = None
  71. self.nom_list = None
  72. self.ord_list = None
  73. def selectFeatures(self, continueous=None, nomial=None, ordinal=None):
  74. self.cont_list = continueous
  75. self.nom_list = nomial
  76. self.ord_list = ordinal
  77. def normalize(self, data,
  78. cont_list=None, nom_list=None, ord_list=None,
  79. with_2d_embedding=None,
  80. visual=None
  81. ):
  82. timing = Timing("FDC.normalize")
  83. # Take instance value if parameter was not given.
  84. visual = value(visual, self.visual)
  85. with_2d_embedding = value(with_2d_embedding, self.with_2d_embedding)
  86. # Initialize data.
  87. np.random.seed(42)
  88. concat_column_names = []
  89. concat_lists = []
  90. timing.step("init")
  91. # Reducing features into 2dim or 1dim
  92. actions = [
  93. ("CONT", self.clustering_cont, value(cont_list, self.cont_list))
  94. , ("ORD", self.clustering_ord, value(ord_list, self.ord_list))
  95. , ("CONT", self.clustering_nom, value(nom_list, self.nom_list))
  96. ]
  97. for (name, clustering, column_list) in actions:
  98. if column_list is not None:
  99. emb = clustering.fit(data[column_list])
  100. concat_lists.append(emb)
  101. for n in range(emb.shape[1]):
  102. concat_column_names.append(f"{name}_UMAP_{n}")
  103. timing.step(f"clustering {name}")
  104. # Merge results
  105. if concat_lists == []:
  106. raise ValueError("Expected at least one non empty column list.")
  107. result_concat = np.concatenate(concat_lists, axis=1)
  108. timing.step("concat")
  109. # Create 2d embedding from 5d embedding
  110. if with_2d_embedding or visual:
  111. result_reduced = umap.UMAP(
  112. n_neighbors=30
  113. , min_dist=0.001
  114. , n_components=2
  115. , metric='euclidean'
  116. , random_state=42
  117. ).fit_transform(result_concat)
  118. timing.step("umap 5 -> 2")
  119. if self.use_pandas_output:
  120. result_reduced = pd.DataFrame(
  121. data=result_reduced, columns=['UMAP_0', 'UMAP_1'])
  122. timing.step("array -> DataFrame")
  123. # Show mapping if needed
  124. if visual:
  125. if self.use_pandas_output:
  126. plotMapping(result_reduced)
  127. else:
  128. plotMapping(pd.DataFrame(
  129. data=result_reduced, columns=['UMAP_0', 'UMAP_1']))
  130. timing.step("plotting")
  131. # Transform to pandas DataFrame if needed.
  132. if self.use_pandas_output:
  133. result_concat = pd.DataFrame(
  134. data=result_concat, columns=concat_column_names)
  135. timing.step("array -> DataFrame")
  136. timing.step("total")
  137. if with_2d_embedding:
  138. #returns both 5D and 2D embeddings
  139. return result_concat, result_reduced
  140. else:
  141. #returns 5D embedding only
  142. return result_concat