NNSearch_experimental.py 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502
  1. import math
  2. from ctypes import cdll, c_uint, c_double, c_void_p
  3. import tensorflow as tf
  4. import tensorflow.keras.layers as L
  5. import numpy as np
  6. import numpy.ctypeslib as npct
  7. array_2d_uint = npct.ndpointer(dtype=np.uint, ndim=2, flags='CONTIGUOUS')
  8. array_1d_double = npct.ndpointer(dtype=np.double, ndim=1, flags='CONTIGUOUS')
  9. array_2d_double = npct.ndpointer(dtype=np.double, ndim=2, flags='CONTIGUOUS')
  10. from sklearn.neighbors import NearestNeighbors
  11. from library.timing import timing
  12. from library.MaxHeap import MaxHeap
  13. nbhLib = cdll.LoadLibrary("./library/c/libNeighborhood.so")
  14. nbhLib.Neighborhood.rettype = None
  15. nbhLib.Neighborhood.argtypes = [c_uint, c_uint, c_uint, array_2d_double, array_2d_uint]
  16. nbhLib.NeighborhoodHeuristic.rettype = None
  17. nbhLib.NeighborhoodHeuristic.argtypes = [c_uint, c_uint, c_uint, array_2d_double, array_2d_uint]
  18. def scalarP(a, b):
  19. return sum(map(lambda c: c[0] * c[1], zip(a, b)))
  20. def norm2(v):
  21. return sum(map(lambda z: z*z, v))
  22. def dist(x,y):
  23. return norm2(x - y)
  24. def maxby(data, fn, startValue=0.0):
  25. m = startValue
  26. for v in data:
  27. m = max(m, fn(v))
  28. return m
  29. def distancesToPoint(p, points):
  30. w = np.array(np.repeat([p], len(points), axis=0))
  31. d = L.Subtract()([w, np.array(points)])
  32. t = L.Dot(axes=(1,1))([d,d])
  33. # As the concrete distance is not needed and sqrt(x) is strict monotone
  34. # we avoid here unneccessary calculating of expensive roots.
  35. return t.numpy()
  36. def calculateCenter(points):
  37. if points.shape[0] == 1:
  38. return points[0]
  39. return tf.keras.layers.Average()(np.array(points)).numpy()
  40. def centerPoints(points):
  41. points = np.array(points)
  42. center = L.Average()(list(points)).numpy()
  43. ctr = np.array(np.repeat([center], points.shape[0], axis=0))
  44. return L.Subtract()([ctr, points]).numpy()
  45. def maxNormPoints(points):
  46. points = np.array(points)
  47. a = L.Lambda(lambda x: np.abs(x))(points)
  48. a = L.Reshape((points.shape[1], 1))(a)
  49. m = L.GlobalMaxPooling1D()(a)
  50. m = L.Reshape((1,))
  51. return m.numpy()
  52. def twoNormSquaredPoints(points):
  53. points = np.array(points)
  54. return L.Dot(axes=(1,1))([points,points]).numpy()
  55. def twoNormPoints(points):
  56. points = np.array(points)
  57. nsq = L.Dot(axes=(1,1))([points,points])
  58. return L.Lambda(lambda x: np.sqrt(x))(points).numpy()
  59. def norms(points):
  60. points = np.array(points)
  61. a = L.Lambda(lambda x: np.abs(x))(points)
  62. a = L.Reshape((points.shape[1], 1))(a)
  63. m = L.GlobalMaxPooling1D()(a)
  64. m = L.Reshape((1,))(m)
  65. nsq = L.Dot(axes=(1,1))([points,points])
  66. return L.Concatenate()([m, nsq]).numpy()
  67. class Ball:
  68. def __init__(self, points=None, indices=None, parent=None, center=None):
  69. if center is not None:
  70. self.center = center
  71. elif points is not None:
  72. self.center = calculateCenter(np.array(points))
  73. else:
  74. raise ParameterError("Missing points or center")
  75. self.radius = 0
  76. self.points = []
  77. self.indices = set()
  78. self.childs = []
  79. self.parent = parent
  80. if points is not None and indices is not None:
  81. for (i, r) in zip(indices, distancesToPoint(self.center, points)):
  82. self.add(i, r)
  83. elif points is not None:
  84. raise ParameterError("Missing indices")
  85. def findIt(self, r):
  86. if r == self.points[0][1]:
  87. return 0
  88. upper = len(self.points) - 1
  89. lower = 0
  90. while upper > lower + 1:
  91. h = (upper + lower) // 2
  92. if self.points[h][1] >= r:
  93. upper = h
  94. else:
  95. lower = h
  96. return upper
  97. def add(self, xi, r):
  98. if xi in self.indices:
  99. return False
  100. # Here we know, that x is not in points.
  101. newEntry = (xi, r)
  102. self.indices.add(xi)
  103. # Special case: empty list or new element will extend the radius:
  104. # Here we can avoid the search
  105. if self.points == [] or r >= self.radius:
  106. self.points.append(newEntry)
  107. self.radius = r
  108. return True
  109. # Special case: r <= min radius
  110. # Here we can avoid the search
  111. if self.points[0][1] >= r:
  112. self.points = [newEntry] + self.points
  113. return True
  114. # Here we know that min radius < r < max radius.
  115. # So len(points) >= 2.
  116. pos = self.findIt(r)
  117. # here shoul be: r(pos) >= r > r(pos - 1)
  118. self.points = self.points[:pos] + [newEntry] + self.points[pos: ]
  119. return True
  120. def remove(self, xi, r):
  121. if xi not in self.indices:
  122. return False
  123. # special case: remove the element with the heighest radius:
  124. if self.points[-1][0] == xi:
  125. self.indices.remove(xi)
  126. self.points = self.points[: -1]
  127. if self.points == []:
  128. self.radius = 0.0
  129. else:
  130. self.radius = self.points[-1][1]
  131. return True
  132. pos = self.findIt(r)
  133. nPoints = len(self.points)
  134. # pos is the smalest position with:
  135. # r(pos - 1) < r <= r(pos)
  136. # Just in case: check that we remove the correct index.
  137. while pos < nPoints and r == self.points[pos]:
  138. if self.points[pos][0] == xi:
  139. self.indices.remove(xi)
  140. self.points = self.points[:pos] + self.points[(pos + 1): ]
  141. return True
  142. pos += 1
  143. return False
  144. def divideBall(self, X):
  145. indices = [i for (i, _r) in self.points]
  146. points = np.array([X[i] for i in indices])
  147. distances = tf.keras.layers.Dot(axes=(1,1))([points, points]).numpy()
  148. ball = Ball(points, indices, center=np.zeros(len(points[0])))
  149. h = len(ball) // 2
  150. indicesA = [i for (i, _) in ball.points[:h]]
  151. indicesB = [i for (i, _) in ball.points[h:]]
  152. pointsA = np.array([X[i] for i in indicesA])
  153. pointsB = np.array([X[i] for i in indicesB])
  154. self.childs = []
  155. print(f"{len(points)} -> <{len(pointsA)}|{len(pointsB)}>")
  156. self.childs.append(Ball(pointsA, indicesA, self))
  157. self.childs.append(Ball(pointsB, indicesB, self))
  158. return self.childs
  159. def __len__(self):
  160. return len(self.points)
  161. def smalestBallFor(self, i):
  162. if i not in self.indices:
  163. return None
  164. for c in self.childs:
  165. b = c.smalestBallFor(i)
  166. if b is not None:
  167. return b
  168. return self
  169. class NNSearch:
  170. def __init__(self, nebSize=5, timingDict=None):
  171. self.nebSize = nebSize
  172. self.neighbourhoods = []
  173. self.timingDict = timingDict
  174. def timerStart(self, name):
  175. if self.timingDict is not None:
  176. if name not in self.timingDict:
  177. self.timingDict[name] = timing(name)
  178. self.timingDict[name].start()
  179. def timerStop(self, name):
  180. if self.timingDict is not None:
  181. if name in self.timingDict:
  182. self.timingDict[name].stop()
  183. def neighbourhoodOfItem(self, i):
  184. return self.neighbourhoods[i]
  185. def fit(self, X, nebSize=None):
  186. self.timerStart("NN_fit_all")
  187. self.fit_chained(X, nebSize)
  188. #self.fit_bruteForce_np(X, nebSize)
  189. self.timerStop("NN_fit_all")
  190. def fit_optimized(self, X, nebSize=None):
  191. self.timerStart("NN_fit_all")
  192. if nebSize == None:
  193. nebSize = self.nebSize
  194. nPoints = len(X)
  195. nFeatures = len(X[0])
  196. if nFeatures > 15 or nebSize >= (nPoints // 2):
  197. print("Using brute force")
  198. self.fit_bruteForce_np(X, nebSize)
  199. else:
  200. print("Using chained")
  201. self.fit_chained(X, nebSize)
  202. self.timerStop("NN_fit_all")
  203. def fit_bruteForce(self, X, nebSize=None):
  204. if nebSize == None:
  205. nebSize = self.nebSize
  206. self.timerStart("NN_fit_bf_init")
  207. isGreaterThan = lambda x, y: x[1] > y[1]
  208. self.neighbourhoods = [MaxHeap(nebSize, isGreaterThan, (i, 0.0)) for i in range(len(X))]
  209. self.timerStop("NN_fit_bf_init")
  210. self.timerStart("NN_fit_bf_loop")
  211. for (i, x) in enumerate(X):
  212. nbh = self.neighbourhoods[i]
  213. for (j, y) in enumerate(X[i+1:]):
  214. j += i + 1
  215. self.timerStart("NN_fit_bf_dist")
  216. d = dist(x,y)
  217. self.timerStop("NN_fit_bf_dist")
  218. self.timerStart("NN_fit_bf_insert")
  219. nbh.insert((j,d))
  220. self.neighbourhoods[j].insert((i,d))
  221. self.timerStop("NN_fit_bf_insert")
  222. self.timerStart("NN_fit_bf_toList")
  223. self.neighbourhoods[i] = nbh.toArray(lambda v: v[0])
  224. self.timerStop("NN_fit_bf_toList")
  225. self.timerStop("NN_fit_bf_loop")
  226. def fit_bruteForce_np(self, X, nebSize=None):
  227. self.timerStart("NN_fit_bfnp_init")
  228. numOfPoints = len(X)
  229. nFeatures = len(X[0])
  230. tX = tf.convert_to_tensor(X)
  231. def distancesTo(x):
  232. w = np.repeat([x], numOfPoints, axis=0)
  233. d = tf.keras.layers.Subtract()([w,tX])
  234. t = tf.keras.layers.Dot(axes=(1,1))([d,d])
  235. return t.numpy()
  236. if nebSize == None:
  237. nebSize = self.nebSize
  238. isGreaterThan = lambda x, y: x[1] > y[1]
  239. self.neighbourhoods = [MaxHeap(nebSize, isGreaterThan, (i, 0.0)) for i in range(len(X))]
  240. self.timerStop("NN_fit_bfnp_init")
  241. self.timerStart("NN_fit_bfnp_loop")
  242. for (i, x) in enumerate(X):
  243. self.timerStart("NN_fit_bfnp_dist")
  244. distances = distancesTo(x)
  245. self.timerStop("NN_fit_bfnp_dist")
  246. nbh = self.neighbourhoods[i]
  247. for (j, y) in enumerate(X[i+1:]):
  248. j += i + 1
  249. d = distances[j]
  250. self.timerStart("NN_fit_bfnp_insert")
  251. nbh.insert((j,d))
  252. self.neighbourhoods[j].insert((i,d))
  253. self.timerStop("NN_fit_bfnp_insert")
  254. self.timerStart("NN_fit_bfnp_toList")
  255. self.neighbourhoods[i] = nbh.toArray(lambda v: v[0])
  256. self.timerStop("NN_fit_bfnp_toList")
  257. self.timerStop("NN_fit_bfnp_loop")
  258. def fit_chained(self, X, nebSize=None):
  259. self.timerStart("NN_fit_chained_init")
  260. if nebSize == None:
  261. nebSize = self.nebSize
  262. nPoints = len(X)
  263. nFeatures = len(X[0])
  264. neigh = NearestNeighbors(n_neighbors=nebSize)
  265. neigh.fit(X)
  266. self.timerStop("NN_fit_chained_init")
  267. self.timerStart("NN_fit_chained_toList")
  268. self.neighbourhoods = [
  269. (neigh.kneighbors([x], nebSize, return_distance=False))[0]
  270. for (i, x) in enumerate(X)
  271. ]
  272. self.timerStop("NN_fit_chained_toList")
  273. def fit_cLib(self, X, nebSize=None):
  274. self.timerStart("NN_fit_cLib_init")
  275. if nebSize == None:
  276. nebSize = self.nebSize
  277. nbh = np.array([np.zeros(nebSize, dtype=np.uint) for i in range(X.shape[0])])
  278. self.timerStop("NN_fit_cLib_init")
  279. self.timerStart("NN_fit_cLib_call")
  280. nbhLib.Neighborhood(nebSize, X.shape[0], X.shape[1], X, nbh)
  281. self.timerStop("NN_fit_cLib_call")
  282. self.timerStart("NN_fit_cLib_list")
  283. self.neighbourhoods = list(nbh)
  284. self.timerStop("NN_fit_cLib_list")
  285. def fit_cLibHeuristic(self, X, nebSize=None):
  286. self.timerStart("NN_fit_cLib_init")
  287. if nebSize == None:
  288. nebSize = self.nebSize
  289. nbh = np.array([np.zeros(nebSize, dtype=np.uint) for i in range(X.shape[0])])
  290. self.timerStop("NN_fit_cLib_init")
  291. self.timerStart("NN_fit_cLib_call")
  292. nbhLib.NeighborhoodHeuristic(nebSize, X.shape[0], X.shape[1], X, nbh)
  293. self.timerStop("NN_fit_cLib_call")
  294. self.timerStart("NN_fit_cLib_list")
  295. self.neighbourhoods = list(nbh)
  296. self.timerStop("NN_fit_cLib_list")
  297. # ===============================================================
  298. # Heuristic search
  299. # ===============================================================
  300. def fit_heuristic(self, X, nebSize=None, debugLayer=0, withDouble=True):
  301. if nebSize == None:
  302. nebSize = self.nebSize
  303. self.timerStart("NN_fit_heuristic_init")
  304. nPoints = len(X)
  305. nFeatures = len(X[0])
  306. nHeuristic = max(1, int(math.log(nFeatures)))
  307. isGreaterThan = lambda x, y: x[1] > y[1]
  308. self.neighbourhoods = [MaxHeap(maxSize=nebSize, isGreaterThan=isGreaterThan, smalestValue=(i, 0.0)) for i in range(len(X))]
  309. self.timerStart("NN_fit_heuristic_lineStart")
  310. z = X[0]
  311. farest = 0
  312. bestDist = 0.0
  313. for (i, x) in enumerate(X):
  314. d = dist(x, z)
  315. if d > bestDist:
  316. farest = i
  317. bestDist = d
  318. lineStart = farest
  319. z = X[lineStart]
  320. self.timerStop("NN_fit_heuristic_lineStart")
  321. # print(f"lineStart: {lineStart}@{z} ... {bestDist}")
  322. self.timerStart("NN_fit_heuristic_lineEnd")
  323. bestDist = 0.0
  324. for (i, x) in enumerate(X):
  325. d = dist(x, z)
  326. if d > bestDist:
  327. farest = i
  328. bestDist = d
  329. lineEnd = farest
  330. self.timerStop("NN_fit_heuristic_lineEnd")
  331. self.timerStart("NN_fit_heuristic_line")
  332. # print(f"lineEnd: {lineEnd}@{X[lineEnd]} ... {bestDist}")
  333. u = (X[lineEnd] - z)
  334. uFactor = (1 / math.sqrt(norm2(u)))
  335. u = uFactor * u
  336. # print(f"u: {u} ... {norm2(u)}")
  337. def heuristic(i,x):
  338. p = z + (scalarP(u, x - z) * u)
  339. dz = math.sqrt(dist(z, p))
  340. dx = math.sqrt(dist(x, p))
  341. return (i, dz, dx)
  342. line = [heuristic(i, x) for (i,x) in enumerate(X) ]
  343. line.sort(key= lambda a: a[1])
  344. self.timerStop("NN_fit_heuristic_line")
  345. self.timerStop("NN_fit_heuristic_init")
  346. self.timerStart("NN_fit_heuristic_loop")
  347. s = 0
  348. ff = False
  349. ptsDone = set()
  350. for (i,(xi, di, dix)) in enumerate(line):
  351. self.timerStart("NN_fit_heuristic_loop_init")
  352. h = self.neighbourhoods[xi]
  353. z = X[xi]
  354. self.timerStop("NN_fit_heuristic_loop_init")
  355. ptsDone.add(xi)
  356. self.timerStart("NN_fit_heuristic_loop_distance")
  357. ll = [(xj, norm2([dj - di, djx - dix])) for (xj, dj, djx) in line[i:]]
  358. # ll = [(xj, dist(
  359. # np.array([di, dix] + list(X[xi][0:nHeuristic])),
  360. # np.array([dj, djx] + list(X[xj][0:nHeuristic]))))
  361. # for (xj, dj, djx) in line[1:]
  362. # ]
  363. ll.sort(key = lambda a: a[1])
  364. kk = distancesToPoint(z, [X[j] for (j, _) in ll])
  365. self.timerStop("NN_fit_heuristic_loop_distance")
  366. for (d, (xj, djx)) in zip(kk, ll):
  367. ign = h.size >= nebSize and djx > h.getMax()[1]
  368. if ign:
  369. break
  370. else:
  371. #d = dist(X[xj], z)
  372. self.timerStart("NN_fit_heuristic_insert")
  373. s += 1
  374. h.insert((xj, d))
  375. k = self.neighbourhoods[xj]
  376. if not isinstance(k, list):
  377. k.insert((xi, d))
  378. self.timerStop("NN_fit_heuristic_insert")
  379. # if xi == debugLayer:
  380. # d = dist(X[xj], z)
  381. # hint = ""
  382. # if djx > d:
  383. # hint += "!!"
  384. # if ign:
  385. # hint += "*"
  386. # print(f"xj:{xj} dx:{h.getMax()[1]:0.1f} djx:{djx:0.1f} d:{d:0.1f}" + hint)
  387. self.timerStart("NN_fit_heuristic_toArray")
  388. self.neighbourhoods[xi] = h.toArray()
  389. self.timerStop("NN_fit_heuristic_toArray")
  390. self.timerStop("NN_fit_heuristic_loop")
  391. print(f"calculated distances: {s} / {nPoints * (nPoints - 1)}")