# XConvGeN.py
  1. import numpy as np
  2. import matplotlib.pyplot as plt
  3. from library.interfaces import GanBaseClass
  4. from library.dataset import DataSet
  5. from library.timing import timing
  6. from keras.layers import Dense, Input, Multiply, Flatten, Conv1D, Reshape, InputLayer, Add
  7. from keras.models import Model, Sequential
  8. from keras import backend as K
  9. #from tqdm import tqdm
  10. import tensorflow as tf
  11. from tensorflow.keras.optimizers import Adam
  12. from tensorflow.keras.layers import Lambda
  13. import tensorflow_probability as tfp
  14. from sklearn.utils import shuffle
  15. from library.NNSearch import NNSearch, randomIndices
  16. import warnings
  17. warnings.filterwarnings("ignore")
  18. def repeat(x, times):
  19. return [x for _i in range(times)]
  20. def create01Labels(totalSize, sizeFirstHalf):
  21. labels = repeat(np.array([1,0]), sizeFirstHalf)
  22. labels.extend(repeat(np.array([0,1]), totalSize - sizeFirstHalf))
  23. return np.array(labels)
  24. class XConvGeN(GanBaseClass):
  25. """
  26. This is the ConvGeN class. ConvGeN is a synthetic point generator for imbalanced datasets.
  27. """
  28. def __init__(self, n_feat, neb=5, gen=None, neb_epochs=10, fdc=None, maj_proximal=False, debug=False):
  29. self.isTrained = False
  30. self.n_feat = n_feat
  31. self.neb = neb
  32. self.nebInitial = neb
  33. self.genInitial = gen
  34. self.gen = gen if gen is not None else self.neb
  35. self.neb_epochs = neb_epochs
  36. self.loss_history = None
  37. self.debug = debug
  38. self.minSetSize = 0
  39. self.conv_sample_generator = None
  40. self.maj_min_discriminator = None
  41. self.maj_proximal = maj_proximal
  42. self.cg = None
  43. self.canPredict = True
  44. self.fdc = fdc
  45. self.lastProgress = (-1,-1,-1)
  46. self.timing = { n: timing(n) for n in [
  47. "Train", "BMB", "NbhSearch", "NBH", "GenSamples", "Fit", "FixType"
  48. ] }
  49. if self.neb is not None and self.gen is not None and self.neb > self.gen:
  50. raise ValueError(f"Expected neb <= gen but got neb={neb} and gen={gen}.")
  51. def reset(self, data):
  52. """
  53. Creates the network.
  54. *dataSet* is a instance of /library.dataset.DataSet/ or None.
  55. It contains the training dataset.
  56. It is used to determine the neighbourhood size if /neb/ in /__init__/ was None.
  57. """
  58. self.isTrained = False
  59. if data is not None:
  60. nMinoryPoints = data.shape[0]
  61. if self.nebInitial is None:
  62. self.neb = nMinoryPoints
  63. else:
  64. self.neb = min(self.nebInitial, nMinoryPoints)
  65. else:
  66. self.neb = self.nebInitial
  67. self.gen = self.genInitial if self.genInitial is not None else self.neb
  68. ## instanciate generator network and visualize architecture
  69. self.conv_sample_generator = self._conv_sample_gen()
  70. ## instanciate discriminator network and visualize architecture
  71. self.maj_min_discriminator = self._maj_min_disc()
  72. ## instanciate network and visualize architecture
  73. self.cg = self._convGeN(self.conv_sample_generator, self.maj_min_discriminator)
  74. self.lastProgress = (-1,-1,-1)
  75. if self.debug:
  76. print(f"neb={self.neb}, gen={self.gen}")
  77. print(self.conv_sample_generator.summary())
  78. print('\n')
  79. print(self.maj_min_discriminator.summary())
  80. print('\n')
  81. print(self.cg.summary())
  82. print('\n')
  83. def train(self, data, discTrainCount=5, batchSize=32):
  84. """
  85. Trains the Network.
  86. *dataSet* is a instance of /library.dataset.DataSet/. It contains the training dataset.
  87. *discTrainCount* gives the number of extra training for the discriminator for each epoch. (>= 0)
  88. """
  89. if data.shape[0] <= 0:
  90. raise AttributeError("Train: Expected data class 1 to contain at least one point.")
  91. self.timing["Train"].start()
  92. # Store size of minority class. This is needed during point generation.
  93. self.minSetSize = data.shape[0]
  94. normalizedData = data
  95. if self.fdc is not None:
  96. normalizedData = self.fdc.normalize(data)
  97. print(f"|N| = {normalizedData.shape}")
  98. print(f"|D| = {data.shape}")
  99. self.timing["NbhSearch"].start()
  100. # Precalculate neighborhoods
  101. self.nmbMin = NNSearch(self.neb).fit(haystack=normalizedData)
  102. self.nmbMin.basePoints = np.array([ [x.astype(np.float32) for x in p] for p in data])
  103. self.timing["NbhSearch"].stop()
  104. # Do the training.
  105. self._rough_learning(data, discTrainCount, batchSize=batchSize)
  106. # Neighborhood in majority class is no longer needed. So save memory.
  107. self.isTrained = True
  108. self.timing["Train"].stop()
  109. def generateDataPoint(self):
  110. """
  111. Returns one synthetic data point by repeating the stored list.
  112. """
  113. return (self.generateData(1))[0]
    def generateData(self, numOfSamples=1):
        """
        Generates a list of synthetic data-points.
        *numOfSamples* is a integer > 0. It gives the number of new generated samples.
        Raises ValueError when the network has not been trained.
        """
        if not self.isTrained:
            raise ValueError("Try to generate data with untrained network.")
        ## roughly calculate the upper bound of the synthetic samples to be generated from each neighbourhood
        synth_num = (numOfSamples // self.minSetSize) + 1
        runs = (synth_num // self.gen) + 1
        ## Get a random list of all indices
        indices = randomIndices(self.minSetSize)
        ## generate all neighborhoods
        def neighborhoodGenerator():
            # Yields each minority item's neighbourhood point matrix, in the
            # shuffled order given by *indices*.
            for index in indices:
                yield self.nmbMin.getNbhPointsOfItem(index)
        # Infinite stream of neighbourhoods (repeat() so take() below never starves).
        neighborhoods = (tf.data.Dataset
            .from_generator(neighborhoodGenerator, output_types=tf.float32)
            .repeat()
        )
        batch = neighborhoods.take(runs * self.minSetSize)
        # Run the generator over all neighbourhoods; each yields /gen/ synthetic points.
        synth_batch = self.conv_sample_generator.predict(batch.batch(32), verbose=0)
        # Pair each original neighbourhood with its synthetic output so categorical
        # features can be snapped back to valid values.
        pairs = tf.data.Dataset.zip(
            ( batch
            , tf.data.Dataset.from_tensor_slices(synth_batch)
            ))
        corrected = pairs.map(self.correct_feature_types())
        ## extract the exact number of synthetic samples needed to exactly balance the two classes
        # NOTE(review): only 1 + numOfSamples // gen corrected batches are taken here,
        # while *runs* batches were generated above — presumably intentional over-generation;
        # confirm the slice always yields numOfSamples rows.
        r = np.concatenate(np.array(list(corrected.take(1 + (numOfSamples // self.gen)))), axis=0)[:numOfSamples]
        return r
  144. def predictReal(self, data):
  145. """
  146. Uses the discriminator on data.
  147. *data* is a numpy array of shape (n, n_feat) where n is the number of datapoints and n_feat the number of features.
  148. """
  149. prediction = self.maj_min_discriminator.predict(data)
  150. return np.array([x[0] for x in prediction])
  151. # ###############################################################
  152. # Hidden internal functions
  153. # ###############################################################
  154. # Creating the Network: Generator
    def _conv_sample_gen(self, layerSize=None):
        """
        The generator network to generate synthetic samples from the convex space
        of arbitrary minority neighbourhoods.

        *layerSize* caps how many convex-combination columns are produced per
        dense "plane"; defaults to (gen // 2) + 1. Returns a compiled Keras Model
        mapping a (neb, n_feat) neighbourhood batch to (gen, n_feat) synthetic points.
        """
        if layerSize is None:
            layerSize = (self.gen // 2) + 1
        ## takes minority batch as input
        min_neb_batch = Input(shape=(self.neb, self.n_feat,))
        ## using 1-D convolution, feature dimension remains the same
        x = Conv1D(self.n_feat, 3, activation='relu', name="UnsharpenInput")(min_neb_batch)
        ## flatten after convolution
        x = Flatten(name="InputMatrixToVector")(x)
        synth = []
        n = 0
        # Build ceil(gen / layerSize) parallel "planes"; together they output
        # exactly /gen/ convex combinations of the input neighbourhood.
        while n < self.gen:
            w = min(layerSize, self.gen - n)
            if w <= 0:
                break
            n += w
            ## add dense layer to transform the vector to a convenient dimension
            y = Dense(self.neb * w, activation='relu', name=f"P{n}_dense")(x)
            ## again, switching to 2-D tensor once we have the convenient shape
            y = Reshape((self.neb, w), name=f"P{n}_reshape")(y)
            ## column wise sum
            s = K.sum(y, axis=1)
            ## adding a small constant to always ensure the column sums are non zero.
            ## if this is not done then during initialization the sum can be zero.
            s_non_zero = Lambda(lambda x: x + .000001, name=f"P{n}_make_non_zero")(s)
            ## reciprocals of the approximated column sum
            sinv = tf.math.reciprocal(s_non_zero, name=f"P{n}_invert")
            ## At this step we ensure that column sum is 1 for every row in x.
            ## That means, each column is set of convex co-efficient
            y = Multiply(name=f"P{n}_normalize")([sinv, y])
            ## Now we transpose the matrix. So each row is now a set of convex coefficients
            # NOTE(review): y[0] indexes the first element of the batch dimension,
            # so the same coefficient matrix appears to be applied to every batch
            # entry via broadcasting in the matmul below — confirm this is intended
            # rather than a per-sample transpose.
            aff = tf.transpose(y[0], name=f"P{n}_transpose")
            ## We now do matrix multiplication of the affine combinations with the original
            ## minority batch taken as input. This generates a convex transformation
            ## of the input minority batch
            y = tf.matmul(aff, min_neb_batch, name=f"P{n}_project")
            synth.append(y)
        synth = tf.concat(synth, axis=1, name="collect_planes")
        nOut = self.gen * self.n_feat
        # Learned Gaussian noise head (tensorflow-probability IndependentNormal):
        # samples a perturbation conditioned on the synthetic batch.
        noiseGenerator = Sequential([
            InputLayer(input_shape=(self.gen, self.n_feat)),
            Flatten(),
            Dense(tfp.layers.IndependentNormal.params_size(nOut)),
            tfp.layers.IndependentNormal(nOut)
        ], name="RandomNoise")
        noise = noiseGenerator(synth)
        noise = Reshape((self.gen, self.n_feat), name="ReshapeNoise")(noise)
        synth = Add(name="AddNoise")([synth, noise])
        ## finally we compile the generator with an arbitrary minority neighbourhood batch
        ## as input and a convex space transformation of the same number of samples as output
        model = Model(inputs=min_neb_batch, outputs=synth)
        opt = Adam(learning_rate=0.001)
        model.compile(loss='mean_squared_logarithmic_error', optimizer=opt)
        return model
  213. # Creating the Network: discriminator
  214. def _maj_min_disc(self):
  215. """
  216. the discriminator is trained in two phase:
  217. first phase: while training ConvGeN the discriminator learns to differentiate synthetic
  218. minority samples generated from convex minority data space against
  219. the borderline majority samples
  220. second phase: after the ConvGeN generator learns to create synthetic samples,
  221. it can be used to generate synthetic samples to balance the dataset
  222. and then rettrain the discriminator with the balanced dataset
  223. """
  224. ## takes as input synthetic sample generated as input stacked upon a batch of
  225. ## borderline majority samples
  226. samples = Input(shape=(self.n_feat,))
  227. ## passed through two dense layers
  228. y = Dense(250, activation='relu')(samples)
  229. y = Dense(125, activation='relu')(y)
  230. y = Dense(75, activation='relu')(y)
  231. ## two output nodes. outputs have to be one-hot coded (see labels variable before)
  232. output = Dense(2, activation='sigmoid')(y)
  233. ## compile model
  234. model = Model(inputs=samples, outputs=output)
  235. opt = Adam(learning_rate=0.0001)
  236. model.compile(loss='binary_crossentropy', optimizer=opt)
  237. return model
  238. # Creating the Network: ConvGeN
    def _convGeN(self, generator, discriminator):
        """
        for joining the generator and the discriminator
        conv_coeff_generator-> generator network instance
        maj_min_discriminator -> discriminator network instance

        Returns a compiled Keras Model that feeds the minority half of the input
        through the (trainable) generator and both halves through the (frozen)
        discriminator, concatenating the decisions.
        """
        ## by default the discriminator trainability is switched off.
        ## Thus training ConvGeN means training the generator network as per previously
        ## trained discriminator network.
        discriminator.trainable = False
        # Shape of data: (batchSize, 2, gen, n_feat)
        # Shape of labels: (batchSize, 2 * gen, 2)
        ## input receives a neighbourhood minority batch
        ## and a proximal majority batch concatenated
        batch_data = Input(shape=(2, self.gen, self.n_feat,))
        # batch_data: (batchSize, 2, gen, n_feat)
        ## extract minority batch
        min_batch = Lambda(lambda x: x[:, 0, : ,:], name="SplitForGen")(batch_data)
        # min_batch: (batchSize, gen, n_feat)
        ## extract majority batch
        maj_batch = Lambda(lambda x: x[:, 1, :, :], name="SplitForDisc")(batch_data)
        # maj_batch: (batchSize, gen, n_feat)
        # Flatten the batch/gen axes so the discriminator sees single samples.
        maj_batch = tf.reshape(maj_batch, (-1, self.n_feat), name="ReshapeForDisc")
        # maj_batch: (batchSize * gen, n_feat)
        ## pass minority batch into generator to obtain convex space transformation
        ## (synthetic samples) of the minority neighbourhood input batch
        conv_samples = generator(min_batch)
        # conv_samples: (batchSize, gen, n_feat)
        conv_samples = tf.reshape(conv_samples, (-1, self.n_feat), name="ReshapeGenOutput")
        # conv_samples: (batchSize * gen, n_feat)
        ## pass samples into the discriminator to know its decisions
        conv_samples = discriminator(conv_samples)
        conv_samples = tf.reshape(conv_samples, (-1, self.gen, 2), name="ReshapeGenDiscOutput")
        # conv_samples: (batchSize, gen, 2)
        maj_batch = discriminator(maj_batch)
        maj_batch = tf.reshape(maj_batch, (-1, self.gen, 2), name="ReshapeMajDiscOutput")
        # maj_batch: (batchSize, gen, 2)
        ## concatenate the decisions
        output = tf.concat([conv_samples, maj_batch],axis=1)
        # output: (batchSize, 2 * gen, 2)
        ## note that, the discriminator will not be trained but will make decisions based
        ## on its previous training while using this function
        model = Model(inputs=batch_data, outputs=output)
        opt = Adam(learning_rate=0.0001)
        model.compile(loss='mse', optimizer=opt)
        return model
  285. # Training
    def _rough_learning(self, data, discTrainCount, batchSize=32):
        """Alternating training loop: per epoch, first the discriminator (on
        synthetic-vs-borderline-majority pairs), then the generator through the
        combined model with the discriminator frozen.

        *data* is the minority class array, *discTrainCount* the number of extra
        discriminator passes per epoch, *batchSize* the training batch size.
        Stores the per-epoch generator loss in self.loss_history.
        """
        generator = self.conv_sample_generator
        discriminator = self.maj_min_discriminator
        convGeN = self.cg
        loss_history = [] ## this is for storing the loss for every run
        minSetSize = len(data)
        ## Create labels for one neighborhood training.
        # First gen rows are [1,0] (synthetic minority), next gen rows [0,1] (majority).
        nLabels = 2 * self.gen
        labels = np.array(create01Labels(nLabels, self.gen))
        labelsGeN = np.array([labels])
        def getNeighborhoods():
            # One (minority neighbourhood, majority batch) pair per minority point.
            for index in range(self.minSetSize):
                yield indexToBatches(index)
        def indexToBatches(min_idx):
            self.timing["NBH"].start()
            ## generate minority neighbourhood batch for every minority class sample by index
            min_batch_indices = self.nmbMin.neighbourhoodOfItem(min_idx)
            min_batch = self.nmbMin.getPointsFromIndices(min_batch_indices)
            ## generate random proximal majority batch
            maj_batch = self._BMB(min_batch_indices)
            self.timing["NBH"].stop()
            return (min_batch, maj_batch)
        def unbatch(rows):
            # Flattens a stream of (batchA, batchB) pairs into single sample vectors.
            def fn():
                for row in rows:
                    for part in row:
                        for x in part:
                            yield x
            return fn
        def genLabels():
            # NOTE(review): defined but not used below — kept for reference.
            for min_idx in range(minSetSize):
                for x in labels:
                    yield x
        # Zero-padding to grow a neb-sized neighbourhood to gen rows for the combined model.
        padd = np.zeros((self.gen - self.neb, self.n_feat))
        discTrainCount = 1 + max(0, discTrainCount)
        for neb_epoch_count in range(self.neb_epochs):
            self.progressBar([(neb_epoch_count + 1) / self.neb_epochs, 0.5, 0.5])
            ## Training of the discriminator.
            #
            # Get all neighborhoods and synthetic points as data stream.
            nbhPairs = tf.data.Dataset.from_generator(getNeighborhoods, output_types=tf.float32).repeat().take(discTrainCount * self.minSetSize)
            nbhMin = nbhPairs.map(lambda x: x[0])
            batchMaj = nbhPairs.map(lambda x: x[1])
            fnCt = self.correct_feature_types()
            synth_batch = self.conv_sample_generator.predict(nbhMin.batch(32), verbose=0)
            # Pair neighbourhood, synthetic output and majority batch; snap feature
            # types on the synthetic points, then keep (synthetic, majority) pairs.
            pairMinMaj = tf.data.Dataset.zip(
                ( nbhMin
                , tf.data.Dataset.from_tensor_slices(synth_batch)
                , batchMaj
                )).map(lambda x, y, z: [fnCt(x,y), z])
            a = tf.data.Dataset.from_generator(unbatch(pairMinMaj), output_types=tf.float32)
            # Get all labels as data stream.
            b = tf.data.Dataset.from_tensor_slices(labels).repeat()
            # Zip data and matching labels together for training.
            samples = tf.data.Dataset.zip((a, b)).batch(batchSize * 2 * self.gen)
            # train the discriminator with the concatenated samples and the one-hot encoded labels
            self.timing["Fit"].start()
            discriminator.trainable = True
            discriminator.fit(x=samples, verbose=0)
            discriminator.trainable = False
            self.timing["Fit"].stop()
            ## use the complete network to make the generator learn on the decisions
            ## made by the previous discriminator training
            #
            # Get all neighborhoods as data stream.
            a = (tf.data.Dataset
                .from_generator(getNeighborhoods, output_types=tf.float32)
                .map(lambda x: [[tf.concat([x[0], padd], axis=0), x[1]]]))
            # Get all labels as data stream.
            b = tf.data.Dataset.from_tensor_slices(labelsGeN).repeat()
            # Zip data and matching labels together for training.
            samples = tf.data.Dataset.zip((a, b)).batch(batchSize)
            # Train with the data stream. Store the loss for later usage.
            gen_loss_history = convGeN.fit(samples, verbose=0, batch_size=batchSize)
            loss_history.append(gen_loss_history.history['loss'])
        ## When done: print some statistics.
        if self.debug:
            run_range = range(1, len(loss_history) + 1)
            plt.rcParams["figure.figsize"] = (16,10)
            plt.xticks(fontsize=20)
            plt.yticks(fontsize=20)
            plt.xlabel('runs', fontsize=25)
            plt.ylabel('loss', fontsize=25)
            plt.title('Rough learning loss for discriminator', fontsize=25)
            plt.plot(run_range, loss_history)
            plt.show()
        ## Keep the loss curve for later inspection.
        self.loss_history = loss_history
  374. def _BMB(self, min_idxs):
  375. ## Generate a borderline majority batch
  376. ## data_maj -> majority class data
  377. ## min_idxs -> indices of points in minority class
  378. ## gen -> convex combinations generated from each neighbourhood
  379. self.timing["BMB"].start()
  380. indices = randomIndices(self.minSetSize, outputSize=self.gen, indicesToIgnore=min_idxs)
  381. r = self.nmbMin.basePoints[indices]
  382. self.timing["BMB"].stop()
  383. return r
  384. def retrainDiscriminitor(self, data, labels):
  385. self.maj_min_discriminator.trainable = True
  386. labels = np.array([ [x, 1 - x] for x in labels])
  387. self.maj_min_discriminator.fit(x=data, y=labels, batch_size=20, epochs=self.neb_epochs)
  388. self.maj_min_discriminator.trainable = False
  389. def progressBar(self, x):
  390. x = [int(v * 10) for v in x]
  391. if True not in [self.lastProgress[i] != x[i] for i in range(len(self.lastProgress))]:
  392. return
  393. def bar(v):
  394. r = ""
  395. for n in range(10):
  396. if n > v:
  397. r += " "
  398. else:
  399. r += "="
  400. return r
  401. s = [bar(v) for v in x]
  402. print(f"[{s[0]}] [{s[1]}] [{s[2]}]", end="\r")
    def correct_feature_types(self):
        """Return a tf function (reference, synth) -> corrected synth that snaps
        nominal/ordinal feature columns of synthetic points back to the nearest
        value occurring in the reference neighbourhood. Continuous columns pass
        through unchanged. Without an fdc (or without categorical columns) the
        identity-on-synth function is returned.
        """
        # batch[0] = original points (gen x n_feat)
        # batch[1] = synthetic points (gen x n_feat)
        @tf.function
        def voidFunction(reference, synth):
            # No correction needed: return the synthetic batch unchanged.
            return synth
        if self.fdc is None:
            return voidFunction
        # Collect all nominal and ordinal column indices.
        columns = set(self.fdc.nom_list or [])
        for y in (self.fdc.ord_list or []):
            columns.add(y)
        columns = list(columns)
        if len(columns) == 0:
            return voidFunction
        neb = self.neb
        n_feat = self.n_feat
        # Per-feature mask: 1.0 for categorical columns, 0.0 for continuous ones.
        nn = tf.constant([(1.0 if x in columns else 0.0) for x in range(n_feat)])
        if n_feat is None:
            print("ERRROR n_feat is None")
        if nn is None:
            print("ERRROR nn is None")
        @tf.function
        def bestMatchOf(vi):
            # vi stacks (repeated value, repeated mask, reference column values).
            value = vi[0]
            c = vi[1][0]
            r = vi[2]
            if c != 0.0:
                # Categorical column: pick the reference value closest to the
                # synthetic value.
                d = tf.abs(value - r)
                return r[tf.math.argmin(d)]
            else:
                # Continuous column: keep the synthetic value.
                return value[0]
        @tf.function
        def indexted(v, rt):
            # Broadcast the synthetic vector and the mask to (n_feat, neb) and
            # stack them with the transposed reference for per-feature mapping.
            vv = tf.reshape(tf.repeat([v], neb, axis=1), (n_feat, neb))
            vn = tf.reshape(tf.repeat([nn], neb, axis=1), (n_feat, neb))
            return tf.stack((vv, vn, rt), axis=1)
        @tf.function
        def correctVector(v, rt):
            # Correct one synthetic point feature by feature.
            return tf.map_fn(lambda x: bestMatchOf(x), indexted(v, rt))
        @tf.function
        def fn(reference, synth):
            # Correct every synthetic point against the reference neighbourhood.
            rt = tf.transpose(reference)
            return tf.map_fn(lambda x: correctVector(x, rt), synth)
        return fn