# XConvGeN.py
  1. import numpy as np
  2. import matplotlib.pyplot as plt
  3. from library.interfaces import GanBaseClass
  4. from library.dataset import DataSet
  5. from library.timing import timing
  6. from keras.layers import Dense, Input, Multiply, Flatten, Conv1D, Reshape, InputLayer, Add
  7. from keras.models import Model, Sequential
  8. from keras import backend as K
  9. #from tqdm import tqdm
  10. import tensorflow as tf
  11. from tensorflow.keras.optimizers import Adam
  12. from tensorflow.keras.layers import Lambda
  13. import tensorflow_probability as tfp
  14. from sklearn.utils import shuffle
  15. from library.NNSearch import NNSearch, randomIndices
  16. import warnings
  17. warnings.filterwarnings("ignore")
  18. def repeat(x, times):
  19. return [x for _i in range(times)]
  20. def create01Labels(totalSize, sizeFirstHalf):
  21. labels = repeat(np.array([1,0]), sizeFirstHalf)
  22. labels.extend(repeat(np.array([0,1]), totalSize - sizeFirstHalf))
  23. return np.array(labels)
class XConvGeN(GanBaseClass):
    """
    This is the ConvGeN class. ConvGeN is a synthetic point generator for imbalanced datasets.
    """

    def __init__(self, n_feat, neb=5, gen=None, neb_epochs=10, fdc=None, maj_proximal=False, debug=False):
        """
        *n_feat* is the number of features per data point.
        *neb* is the minority neighbourhood size (None means: derive it from the data in reset()).
        *gen* is the number of synthetic samples generated per neighbourhood (None means: same as neb).
        *neb_epochs* is the number of training epochs.
        *fdc* is an optional feature descriptor used for normalization and
        nominal/ordinal feature correction (None disables both).
        *maj_proximal* is stored but not read anywhere in this file — TODO confirm intended use.
        *debug* enables model summaries and loss plots.

        Raises ValueError when neb > gen (each neighbourhood must fit into the generated batch).
        """
        self.isTrained = False
        self.n_feat = n_feat
        self.neb = neb
        # Keep the raw constructor arguments; reset() recomputes neb/gen from the data.
        self.nebInitial = neb
        self.genInitial = gen
        self.gen = gen if gen is not None else self.neb
        self.neb_epochs = neb_epochs
        self.loss_history = None
        self.debug = debug
        self.minSetSize = 0  # minority class size, set in train()
        self.conv_sample_generator = None  # generator network, built in reset()
        self.maj_min_discriminator = None  # discriminator network, built in reset()
        self.maj_proximal = maj_proximal
        self.cg = None  # combined generator+discriminator model, built in reset()
        self.canPredict = True
        self.fdc = fdc
        self.lastProgress = (-1, -1, -1)  # last progressBar state (tenths per bar)
        # Named timers for coarse profiling of the training pipeline.
        self.timing = { n: timing(n) for n in [
            "Train", "BMB", "NbhSearch", "NBH", "GenSamples", "Fit", "FixType"
        ] }
        if self.neb is not None and self.gen is not None and self.neb > self.gen:
            raise ValueError(f"Expected neb <= gen but got neb={neb} and gen={gen}.")
  51. def reset(self, data):
  52. """
  53. Creates the network.
  54. *dataSet* is a instance of /library.dataset.DataSet/ or None.
  55. It contains the training dataset.
  56. It is used to determine the neighbourhood size if /neb/ in /__init__/ was None.
  57. """
  58. self.isTrained = False
  59. if data is not None:
  60. nMinoryPoints = data.shape[0]
  61. if self.nebInitial is None:
  62. self.neb = nMinoryPoints
  63. else:
  64. self.neb = min(self.nebInitial, nMinoryPoints)
  65. else:
  66. self.neb = self.nebInitial
  67. self.gen = self.genInitial if self.genInitial is not None else self.neb
  68. ## instanciate generator network and visualize architecture
  69. self.conv_sample_generator = self._conv_sample_gen()
  70. ## instanciate discriminator network and visualize architecture
  71. self.maj_min_discriminator = self._maj_min_disc()
  72. ## instanciate network and visualize architecture
  73. self.cg = self._convGeN(self.conv_sample_generator, self.maj_min_discriminator)
  74. self.lastProgress = (-1,-1,-1)
  75. if self.debug:
  76. print(f"neb={self.neb}, gen={self.gen}")
  77. print(self.conv_sample_generator.summary())
  78. print('\n')
  79. print(self.maj_min_discriminator.summary())
  80. print('\n')
  81. print(self.cg.summary())
  82. print('\n')
  83. def train(self, data, discTrainCount=5, batchSize=32):
  84. """
  85. Trains the Network.
  86. *dataSet* is a instance of /library.dataset.DataSet/. It contains the training dataset.
  87. *discTrainCount* gives the number of extra training for the discriminator for each epoch. (>= 0)
  88. """
  89. if data.shape[0] <= 0:
  90. raise AttributeError("Train: Expected data class 1 to contain at least one point.")
  91. self.timing["Train"].start()
  92. # Store size of minority class. This is needed during point generation.
  93. self.minSetSize = data.shape[0]
  94. normalizedData = data
  95. if self.fdc is not None:
  96. normalizedData = self.fdc.normalize(data)
  97. print(f"|N| = {normalizedData.shape}")
  98. print(f"|D| = {data.shape}")
  99. self.timing["NbhSearch"].start()
  100. # Precalculate neighborhoods
  101. self.nmbMin = NNSearch(self.neb).fit(haystack=normalizedData)
  102. self.nmbMin.basePoints = np.array([ [x.astype(np.float32) for x in p] for p in data])
  103. self.timing["NbhSearch"].stop()
  104. # Do the training.
  105. self._rough_learning(data, discTrainCount, batchSize=batchSize)
  106. # Neighborhood in majority class is no longer needed. So save memory.
  107. self.isTrained = True
  108. self.timing["Train"].stop()
  109. def generateDataPoint(self):
  110. """
  111. Returns one synthetic data point by repeating the stored list.
  112. """
  113. return (self.generateData(1))[0]
    def generateData(self, numOfSamples=1):
        """
        Generates a list of synthetic data-points.
        *numOfSamples* is a integer > 0. It gives the number of new generated samples.

        Raises ValueError when called before train().
        Returns a numpy array of exactly numOfSamples points.
        """
        if not self.isTrained:
            raise ValueError("Try to generate data with untrained network.")
        ## roughly calculate the upper bound of the synthetic samples to be generated from each neighbourhood
        synth_num = (numOfSamples // self.minSetSize) + 1
        runs = (synth_num // self.gen) + 1
        ## Get a random list of all indices
        indices = randomIndices(self.minSetSize)
        ## generate all neighborhoods
        def neighborhoodGenerator():
            for index in indices:
                yield self.nmbMin.getNbhPointsOfItem(index)
        neighborhoods = (tf.data.Dataset
            .from_generator(neighborhoodGenerator, output_types=tf.float32)
            .repeat()
        )
        # NOTE(review): 32 here is a fixed prediction batch size, independent of
        # the training batchSize — confirm this is intentional.
        batch = neighborhoods.take(runs * self.minSetSize).batch(32)
        synth_batch = self.conv_sample_generator.predict(batch)
        n = 0
        synth_set = []
        # Pair each neighbourhood with its generated samples and snap feature
        # types back to real values until enough points were collected.
        for (x,y) in zip(neighborhoods, synth_batch):
            synth_set.extend(self.correct_feature_types(x.numpy(), y))
            n += len(y)
            if n >= numOfSamples:
                break
        ## extract the exact number of synthetic samples needed to exactly balance the two classes
        return np.array(synth_set[:numOfSamples])
  145. def predictReal(self, data):
  146. """
  147. Uses the discriminator on data.
  148. *data* is a numpy array of shape (n, n_feat) where n is the number of datapoints and n_feat the number of features.
  149. """
  150. prediction = self.maj_min_discriminator.predict(data)
  151. return np.array([x[0] for x in prediction])
    # ###############################################################
    # Hidden internal functions
    # ###############################################################

    # Creating the Network: Generator
    def _conv_sample_gen(self, layerSize=None):
        """
        The generator network to generate synthetic samples from the convex space
        of arbitrary minority neighbourhoods.

        *layerSize* is the number of convex-combination "planes" produced per
        dense block (defaults to gen // 2 + 1).
        Returns a compiled keras Model mapping a (neb, n_feat) neighbourhood
        batch to (gen, n_feat) synthetic samples.
        """
        if layerSize is None:
            layerSize = (self.gen // 2) + 1
        ## takes minority batch as input
        min_neb_batch = Input(shape=(self.neb, self.n_feat,))
        ## using 1-D convolution, feature dimension remains the same
        x = Conv1D(self.n_feat, 3, activation='relu', name="UnsharpenInput")(min_neb_batch)
        ## flatten after convolution
        x = Flatten(name="InputMatrixToVector")(x)
        synth = []
        n = 0
        # Build the gen output samples in slabs of at most layerSize columns.
        while n < self.gen:
            w = min(layerSize, self.gen - n)
            if w <= 0:
                break
            n += w
            ## add dense layer to transform the vector to a convenient dimension
            y = Dense(self.neb * w, activation='relu', name=f"P{n}_dense")(x)
            ## again, switching to 2-D tensor once we have the convenient shape
            y = Reshape((self.neb, w), name=f"P{n}_reshape")(y)
            ## column wise sum
            s = K.sum(y, axis=1)
            ## adding a small constant to always ensure the column sums are non zero.
            ## if this is not done then during initialization the sum can be zero.
            s_non_zero = Lambda(lambda x: x + .000001, name=f"P{n}_make_non_zero")(s)
            ## reciprocals of the approximated column sum
            sinv = tf.math.reciprocal(s_non_zero, name=f"P{n}_invert")
            ## At this step we ensure that column sum is 1 for every row in x.
            ## That means, each column is set of convex co-efficient
            y = Multiply(name=f"P{n}_normalize")([sinv, y])
            ## Now we transpose the matrix. So each row is now a set of convex coefficients
            # NOTE(review): y[0] drops the batch dimension and transposes only the
            # first batch element — confirm the generator is only ever used with
            # an effective batch handling that makes this correct.
            aff = tf.transpose(y[0], name=f"P{n}_transpose")
            ## We now do matrix multiplication of the affine combinations with the original
            ## minority batch taken as input. This generates a convex transformation
            ## of the input minority batch
            y = tf.matmul(aff, min_neb_batch, name=f"P{n}_project")
            synth.append(y)
        synth = tf.concat(synth, axis=1, name="collect_planes")
        # Learnable additive noise sampled from an independent normal distribution.
        nOut = self.gen * self.n_feat
        noiseGenerator = Sequential([
            InputLayer(input_shape=(self.gen, self.n_feat)),
            Flatten(),
            Dense(tfp.layers.IndependentNormal.params_size(nOut)),
            tfp.layers.IndependentNormal(nOut)
        ], name="RandomNoise")
        noise = noiseGenerator(synth)
        noise = Reshape((self.gen, self.n_feat), name="ReshapeNoise")(noise)
        synth = Add(name="AddNoise")([synth, noise])
        ## finally we compile the generator with an arbitrary minority neighbourhood batch
        ## as input and a convex space transformation of the same number of samples as output
        model = Model(inputs=min_neb_batch, outputs=synth)
        opt = Adam(learning_rate=0.001)
        model.compile(loss='mean_squared_logarithmic_error', optimizer=opt)
        return model
  214. # Creating the Network: discriminator
  215. def _maj_min_disc(self):
  216. """
  217. the discriminator is trained in two phase:
  218. first phase: while training ConvGeN the discriminator learns to differentiate synthetic
  219. minority samples generated from convex minority data space against
  220. the borderline majority samples
  221. second phase: after the ConvGeN generator learns to create synthetic samples,
  222. it can be used to generate synthetic samples to balance the dataset
  223. and then rettrain the discriminator with the balanced dataset
  224. """
  225. ## takes as input synthetic sample generated as input stacked upon a batch of
  226. ## borderline majority samples
  227. samples = Input(shape=(self.n_feat,))
  228. ## passed through two dense layers
  229. y = Dense(250, activation='relu')(samples)
  230. y = Dense(125, activation='relu')(y)
  231. y = Dense(75, activation='relu')(y)
  232. ## two output nodes. outputs have to be one-hot coded (see labels variable before)
  233. output = Dense(2, activation='sigmoid')(y)
  234. ## compile model
  235. model = Model(inputs=samples, outputs=output)
  236. opt = Adam(learning_rate=0.0001)
  237. model.compile(loss='binary_crossentropy', optimizer=opt)
  238. return model
    # Creating the Network: ConvGeN
    def _convGeN(self, generator, discriminator):
        """
        Join the generator and the discriminator into one trainable model.

        *generator* is the generator network instance (_conv_sample_gen).
        *discriminator* is the discriminator network instance (_maj_min_disc).
        Returns a compiled keras Model whose input stacks a minority neighbourhood
        batch and a proximal majority batch, and whose output concatenates the
        discriminator decisions for both.
        """
        ## by default the discriminator trainability is switched off.
        ## Thus training ConvGeN means training the generator network as per previously
        ## trained discriminator network.
        discriminator.trainable = False
        # Shape of data: (batchSize, 2, gen, n_feat)
        # Shape of labels: (batchSize, 2 * gen, 2)
        ## input receives a neighbourhood minority batch
        ## and a proximal majority batch concatenated
        batch_data = Input(shape=(2, self.gen, self.n_feat,))
        # batch_data: (batchSize, 2, gen, n_feat)
        ## extract minority batch
        min_batch = Lambda(lambda x: x[:, 0, : ,:], name="SplitForGen")(batch_data)
        # min_batch: (batchSize, gen, n_feat)
        ## extract majority batch
        maj_batch = Lambda(lambda x: x[:, 1, :, :], name="SplitForDisc")(batch_data)
        # maj_batch: (batchSize, gen, n_feat)
        maj_batch = tf.reshape(maj_batch, (-1, self.n_feat), name="ReshapeForDisc")
        # maj_batch: (batchSize * gen, n_feat)
        ## pass minority batch into generator to obtain convex space transformation
        ## (synthetic samples) of the minority neighbourhood input batch
        conv_samples = generator(min_batch)
        # conv_samples: (batchSize, gen, n_feat)
        conv_samples = tf.reshape(conv_samples, (-1, self.n_feat), name="ReshapeGenOutput")
        # conv_samples: (batchSize * gen, n_feat)
        ## pass samples into the discriminator to know its decisions
        conv_samples = discriminator(conv_samples)
        conv_samples = tf.reshape(conv_samples, (-1, self.gen, 2), name="ReshapeGenDiscOutput")
        # conv_samples: (batchSize, gen, 2)
        maj_batch = discriminator(maj_batch)
        maj_batch = tf.reshape(maj_batch, (-1, self.gen, 2), name="ReshapeMajDiscOutput")
        # maj_batch: (batchSize, gen, 2)
        ## concatenate the decisions
        output = tf.concat([conv_samples, maj_batch],axis=1)
        # output: (batchSize, 2 * gen, 2)
        ## note that, the discriminator will not be trained but will make decisions based
        ## on its previous training while using this function
        model = Model(inputs=batch_data, outputs=output)
        opt = Adam(learning_rate=0.0001)
        model.compile(loss='mse', optimizer=opt)
        return model
    # Training
    def _rough_learning(self, data, discTrainCount, batchSize=32):
        """
        Alternating training loop: each epoch first fits the discriminator on
        synthetic-vs-majority samples, then fits the combined model (generator
        only, discriminator frozen) on the discriminator's decisions.

        *data* is the minority training data.
        *discTrainCount* extra discriminator passes per epoch (>= 0).
        *batchSize* mini-batch size for both fits.
        Stores the per-epoch generator loss in self.loss_history.
        """
        generator = self.conv_sample_generator
        discriminator = self.maj_min_discriminator
        convGeN = self.cg
        loss_history = [] ## this is for storing the loss for every run
        minSetSize = len(data)
        ## Create labels for one neighborhood training.
        nLabels = 2 * self.gen
        labels = np.array(create01Labels(nLabels, self.gen))
        labelsGeN = np.array([labels])

        def indexToBatches(min_idx):
            # Build (minority neighbourhood, proximal majority batch) for one minority index.
            self.timing["NBH"].start()
            ## generate minority neighbourhood batch for every minority class sample by index
            min_batch_indices = self.nmbMin.neighbourhoodOfItem(min_idx)
            min_batch = self.nmbMin.getPointsFromIndices(min_batch_indices)
            ## generate random proximal majority batch
            maj_batch = self._BMB(min_batch_indices)
            self.timing["NBH"].stop()
            return (min_batch, maj_batch)

        def createSamples(min_idx):
            # Generate synthetic samples for one neighbourhood, paired with its majority batch.
            min_batch, maj_batch = indexToBatches(min_idx)
            self.timing["GenSamples"].start()
            ## generate synthetic samples from convex space
            ## of minority neighbourhood batch using generator
            conv_samples = generator.predict(np.array([min_batch]), batch_size=self.neb, verbose=0)
            conv_samples = tf.reshape(conv_samples, shape=(self.gen, self.n_feat))
            self.timing["GenSamples"].stop()
            self.timing["FixType"].start()
            ## Fix feature types
            conv_samples = self.correct_feature_types(min_batch.numpy(), conv_samples)
            self.timing["FixType"].stop()
            ## concatenate them with the majority batch
            conv_samples = [conv_samples, maj_batch]
            return conv_samples

        def genSamplesForDisc():
            # Stream discriminator training pairs over all minority points.
            for min_idx in range(minSetSize):
                yield createSamples(min_idx)

        def genSamplesForGeN():
            # Stream generator training pairs over all minority points.
            for min_idx in range(minSetSize):
                yield indexToBatches(min_idx)

        def unbatch(rows):
            # Flatten (pair -> part -> point) streams into single points.
            def fn():
                for row in rows:
                    for part in row:
                        for x in part:
                            yield x
            return fn

        # NOTE(review): genLabels is never used below — candidate for removal.
        def genLabels():
            for min_idx in range(minSetSize):
                for x in labels:
                    yield x

        # Zero padding so a (neb, n_feat) neighbourhood fits the (gen, n_feat) input slot.
        padd = np.zeros((self.gen - self.neb, self.n_feat))
        discTrainCount = 1 + max(0, discTrainCount)
        for neb_epoch_count in range(self.neb_epochs):
            self.progressBar([(neb_epoch_count + 1) / self.neb_epochs, 0.5, 0.5])
            ## Training of the discriminator.
            #
            # Get all neighborhoods and synthetic points as data stream.
            a = tf.data.Dataset.from_generator(genSamplesForDisc, output_types=tf.float32).repeat().take(discTrainCount * self.minSetSize)
            a = tf.data.Dataset.from_generator(unbatch(a), output_types=tf.float32)
            # Get all labels as data stream.
            b = tf.data.Dataset.from_tensor_slices(labels).repeat()
            # Zip data and matching labels together for training.
            samples = tf.data.Dataset.zip((a, b)).batch(batchSize * 2 * self.gen)
            # train the discriminator with the concatenated samples and the one-hot encoded labels
            self.timing["Fit"].start()
            discriminator.trainable = True
            discriminator.fit(x=samples, verbose=0)
            discriminator.trainable = False
            self.timing["Fit"].stop()
            ## use the complete network to make the generator learn on the decisions
            ## made by the previous discriminator training
            #
            # Get all neighborhoods as data stream.
            a = (tf.data.Dataset
                .from_generator(genSamplesForGeN, output_types=tf.float32)
                .map(lambda x: [[tf.concat([x[0], padd], axis=0), x[1]]]))
            # Get all labels as data stream.
            b = tf.data.Dataset.from_tensor_slices(labelsGeN).repeat()
            # Zip data and matching labels together for training.
            samples = tf.data.Dataset.zip((a, b)).batch(batchSize)
            # Train with the data stream. Store the loss for later usage.
            gen_loss_history = convGeN.fit(samples, verbose=0, batch_size=batchSize)
            loss_history.append(gen_loss_history.history['loss'])
        ## When done: print some statistics.
        if self.debug:
            run_range = range(1, len(loss_history) + 1)
            plt.rcParams["figure.figsize"] = (16,10)
            plt.xticks(fontsize=20)
            plt.yticks(fontsize=20)
            plt.xlabel('runs', fontsize=25)
            plt.ylabel('loss', fontsize=25)
            plt.title('Rough learning loss for discriminator', fontsize=25)
            plt.plot(run_range, loss_history)
            plt.show()
        ## Keep the loss curve for callers.
        self.loss_history = loss_history
  384. def _BMB(self, min_idxs):
  385. ## Generate a borderline majority batch
  386. ## data_maj -> majority class data
  387. ## min_idxs -> indices of points in minority class
  388. ## gen -> convex combinations generated from each neighbourhood
  389. self.timing["BMB"].start()
  390. indices = randomIndices(self.minSetSize, outputSize=self.gen, indicesToIgnore=min_idxs)
  391. r = self.nmbMin.basePoints[indices]
  392. self.timing["BMB"].stop()
  393. return r
  394. def retrainDiscriminitor(self, data, labels):
  395. self.maj_min_discriminator.trainable = True
  396. labels = np.array([ [x, 1 - x] for x in labels])
  397. self.maj_min_discriminator.fit(x=data, y=labels, batch_size=20, epochs=self.neb_epochs)
  398. self.maj_min_discriminator.trainable = False
  399. def progressBar(self, x):
  400. x = [int(v * 10) for v in x]
  401. if True not in [self.lastProgress[i] != x[i] for i in range(len(self.lastProgress))]:
  402. return
  403. def bar(v):
  404. r = ""
  405. for n in range(10):
  406. if n > v:
  407. r += " "
  408. else:
  409. r += "="
  410. return r
  411. s = [bar(v) for v in x]
  412. print(f"[{s[0]}] [{s[1]}] [{s[2]}]", end="\r")
    def correct_feature_types(self, batch, synth_batch):
        """
        Snap nominal/ordinal features of generated samples back to real values.

        *batch* is the real neighbourhood (numpy array, shape (k, n_feat)) whose
        observed values serve as the allowed reference set per feature.
        *synth_batch* are the generated samples to correct.
        Returns synth_batch unchanged when no feature descriptor (fdc) is set.
        """
        if self.fdc is None:
            return synth_batch
        def bestMatchOf(referenceValues, value):
            # Nearest reference value by absolute distance; pass through when
            # the feature has no reference list (continuous feature).
            if referenceValues is not None:
                best = referenceValues[0]
                d = abs(best - value)
                for x in referenceValues:
                    dx = abs(x - value)
                    if dx < d:
                        best = x
                        d = dx
                return best
            else:
                return value
        def correctVector(referenceLists, v):
            # Correct one sample feature-by-feature.
            return np.array([bestMatchOf(referenceLists[i], v[i]) for i in range(len(v))])
        # One reference list per feature; only nominal/ordinal features get one.
        referenceLists = [None for _ in range(self.n_feat)]
        for i in (self.fdc.nom_list or []):
            referenceLists[i] = list(set(list(batch[:, i])))
        for i in (self.fdc.ord_list or []):
            referenceLists[i] = list(set(list(batch[:, i])))
        # print(batch.shape, synth_batch.shape)
        # NOTE(review): wrapping a numpy function in a keras Lambda layer and
        # calling it directly on data is unusual — it appears to rely on eager
        # execution; confirm this is intentional rather than a plain call.
        return Lambda(lambda x: np.array([correctVector(referenceLists, y) for y in x]))(synth_batch)