|
@@ -1,21 +1,9 @@
|
|
|
import numpy as np
|
|
import numpy as np
|
|
|
-from numpy.random import seed
|
|
|
|
|
-import pandas as pd
|
|
|
|
|
import matplotlib.pyplot as plt
|
|
import matplotlib.pyplot as plt
|
|
|
|
|
|
|
|
from library.interfaces import GanBaseClass
|
|
from library.interfaces import GanBaseClass
|
|
|
from library.dataset import DataSet
|
|
from library.dataset import DataSet
|
|
|
|
|
|
|
|
-from sklearn.decomposition import PCA
|
|
|
|
|
-from sklearn.metrics import confusion_matrix
|
|
|
|
|
-from sklearn.metrics import f1_score
|
|
|
|
|
-from sklearn.metrics import cohen_kappa_score
|
|
|
|
|
-from sklearn.metrics import precision_score
|
|
|
|
|
-from sklearn.metrics import recall_score
|
|
|
|
|
-from sklearn.neighbors import NearestNeighbors
|
|
|
|
|
-from sklearn.utils import shuffle
|
|
|
|
|
-from imblearn.datasets import fetch_datasets
|
|
|
|
|
-
|
|
|
|
|
from keras.layers import Dense, Input, Multiply, Flatten, Conv1D, Reshape
|
|
from keras.layers import Dense, Input, Multiply, Flatten, Conv1D, Reshape
|
|
|
from keras.models import Model
|
|
from keras.models import Model
|
|
|
from keras import backend as K
|
|
from keras import backend as K
|
|
@@ -45,7 +33,7 @@ class ConvGAN(GanBaseClass):
|
|
|
This is a toy example of a GAN.
|
|
This is a toy example of a GAN.
|
|
|
It repeats the first point of the training-data-set.
|
|
It repeats the first point of the training-data-set.
|
|
|
"""
|
|
"""
|
|
|
- def __init__(self, n_feat, neb=5, gen=5, neb_epochs=10, debug=True):
|
|
|
|
|
|
|
+ def __init__(self, n_feat, neb=5, gen=5, neb_epochs=10, withMajorhoodNbSearch=False, debug=False):
|
|
|
self.isTrained = False
|
|
self.isTrained = False
|
|
|
self.n_feat = n_feat
|
|
self.n_feat = n_feat
|
|
|
self.neb = neb
|
|
self.neb = neb
|
|
@@ -53,10 +41,12 @@ class ConvGAN(GanBaseClass):
|
|
|
self.neb_epochs = 10
|
|
self.neb_epochs = 10
|
|
|
self.loss_history = None
|
|
self.loss_history = None
|
|
|
self.debug = debug
|
|
self.debug = debug
|
|
|
- self.dataSet = None
|
|
|
|
|
|
|
+ self.minSetSize = 0
|
|
|
self.conv_sample_generator = None
|
|
self.conv_sample_generator = None
|
|
|
self.maj_min_discriminator = None
|
|
self.maj_min_discriminator = None
|
|
|
|
|
+ self.withMajorhoodNbSearch = withMajorhoodNbSearch
|
|
|
self.cg = None
|
|
self.cg = None
|
|
|
|
|
+ self.canPredict = True
|
|
|
|
|
|
|
|
if neb > gen:
|
|
if neb > gen:
|
|
|
raise ValueError(f"Expected neb <= gen but got neb={neb} and gen={gen}.")
|
|
raise ValueError(f"Expected neb <= gen but got neb={neb} and gen={gen}.")
|
|
@@ -85,7 +75,7 @@ class ConvGAN(GanBaseClass):
|
|
|
print(self.cg.summary())
|
|
print(self.cg.summary())
|
|
|
print('\n')
|
|
print('\n')
|
|
|
|
|
|
|
|
- def train(self, dataSet):
|
|
|
|
|
|
|
+ def train(self, dataSet, discTrainCount=5):
|
|
|
"""
|
|
"""
|
|
|
Trains the GAN.
|
|
Trains the GAN.
|
|
|
|
|
|
|
@@ -97,9 +87,21 @@ class ConvGAN(GanBaseClass):
|
|
|
if dataSet.data1.shape[0] <= 0:
|
|
if dataSet.data1.shape[0] <= 0:
|
|
|
raise AttributeError("Train: Expected data class 1 to contain at least one point.")
|
|
raise AttributeError("Train: Expected data class 1 to contain at least one point.")
|
|
|
|
|
|
|
|
- self.dataSet = dataSet
|
|
|
|
|
- self.nmb = self._NMB_prepare(dataSet.data1)
|
|
|
|
|
- self._rough_learning(dataSet.data1, dataSet.data0)
|
|
|
|
|
|
|
+ # Store size of minority class. This is needed during point generation.
|
|
|
|
|
+ self.minSetSize = dataSet.data1.shape[0]
|
|
|
|
|
+
|
|
|
|
|
+ # Precalculate neighborhoods
|
|
|
|
|
+ self.nmbMin = NNSearch(self.neb).fit(haystack=dataSet.data1)
|
|
|
|
|
+ if self.withMajorhoodNbSearch:
|
|
|
|
|
+ self.nmbMaj = NNSearch(self.neb).fit(haystack=dataSet.data0, needles=dataSet.data1)
|
|
|
|
|
+ else:
|
|
|
|
|
+ self.nmbMaj = None
|
|
|
|
|
+
|
|
|
|
|
+ # Do the training.
|
|
|
|
|
+ self._rough_learning(dataSet.data1, dataSet.data0, discTrainCount)
|
|
|
|
|
+
|
|
|
|
|
+ # Neighborhood in majority class is no longer needed. So save memory.
|
|
|
|
|
+ self.nmbMaj = None
|
|
|
self.isTrained = True
|
|
self.isTrained = True
|
|
|
|
|
|
|
|
def generateDataPoint(self):
|
|
def generateDataPoint(self):
|
|
@@ -118,14 +120,12 @@ class ConvGAN(GanBaseClass):
|
|
|
if not self.isTrained:
|
|
if not self.isTrained:
|
|
|
raise ValueError("Try to generate data with untrained Re.")
|
|
raise ValueError("Try to generate data with untrained Re.")
|
|
|
|
|
|
|
|
- data_min = self.dataSet.data1
|
|
|
|
|
-
|
|
|
|
|
## roughly calculate the upper bound of the synthetic samples to be generated from each neighbourhood
|
|
## roughly calculate the upper bound of the synthetic samples to be generated from each neighbourhood
|
|
|
- synth_num = (numOfSamples // len(data_min)) + 1
|
|
|
|
|
|
|
+ synth_num = (numOfSamples // self.minSetSize) + 1
|
|
|
|
|
|
|
|
## generate synth_num synthetic samples from each minority neighbourhood
|
|
## generate synth_num synthetic samples from each minority neighbourhood
|
|
|
synth_set=[]
|
|
synth_set=[]
|
|
|
- for i in range(len(data_min)):
|
|
|
|
|
|
|
+ for i in range(self.minSetSize):
|
|
|
synth_set.extend(self._generate_data_for_min_point(i, synth_num))
|
|
synth_set.extend(self._generate_data_for_min_point(i, synth_num))
|
|
|
|
|
|
|
|
## extract the exact number of synthetic samples needed to exactly balance the two classes
|
|
## extract the exact number of synthetic samples needed to exactly balance the two classes
|
|
@@ -133,6 +133,10 @@ class ConvGAN(GanBaseClass):
|
|
|
|
|
|
|
|
return synth_set
|
|
return synth_set
|
|
|
|
|
|
|
|
|
|
+ def predictReal(self, data):
|
|
|
|
|
+ prediction = self.maj_min_discriminator.predict(data)
|
|
|
|
|
+ return np.array([x[0] for x in prediction])
|
|
|
|
|
+
|
|
|
# ###############################################################
|
|
# ###############################################################
|
|
|
# Hidden internal functions
|
|
# Hidden internal functions
|
|
|
# ###############################################################
|
|
# ###############################################################
|
|
@@ -200,6 +204,7 @@ class ConvGAN(GanBaseClass):
|
|
|
## passed through two dense layers
|
|
## passed through two dense layers
|
|
|
y = Dense(250, activation='relu')(samples)
|
|
y = Dense(250, activation='relu')(samples)
|
|
|
y = Dense(125, activation='relu')(y)
|
|
y = Dense(125, activation='relu')(y)
|
|
|
|
|
+ y = Dense(75, activation='relu')(y)
|
|
|
|
|
|
|
|
## two output nodes. outputs have to be one-hot coded (see labels variable before)
|
|
## two output nodes. outputs have to be one-hot coded (see labels variable before)
|
|
|
output = Dense(2, activation='sigmoid')(y)
|
|
output = Dense(2, activation='sigmoid')(y)
|
|
@@ -265,7 +270,7 @@ class ConvGAN(GanBaseClass):
|
|
|
runs = int(synth_num / self.neb) + 1
|
|
runs = int(synth_num / self.neb) + 1
|
|
|
synth_set = []
|
|
synth_set = []
|
|
|
for _run in range(runs):
|
|
for _run in range(runs):
|
|
|
- batch = self._NMB_guided(index)
|
|
|
|
|
|
|
+ batch = self.nmbMin.getNbhPointsOfItem(index)
|
|
|
synth_batch = self.conv_sample_generator.predict(batch)
|
|
synth_batch = self.conv_sample_generator.predict(batch)
|
|
|
synth_set.extend(synth_batch)
|
|
synth_set.extend(synth_batch)
|
|
|
|
|
|
|
@@ -274,52 +279,73 @@ class ConvGAN(GanBaseClass):
|
|
|
|
|
|
|
|
|
|
|
|
|
# Training
|
|
# Training
|
|
|
- def _rough_learning(self, data_min, data_maj):
|
|
|
|
|
|
|
+ def _rough_learning(self, data_min, data_maj, discTrainCount):
|
|
|
generator = self.conv_sample_generator
|
|
generator = self.conv_sample_generator
|
|
|
discriminator = self.maj_min_discriminator
|
|
discriminator = self.maj_min_discriminator
|
|
|
GAN = self.cg
|
|
GAN = self.cg
|
|
|
loss_history = [] ## this is for storing the loss for every run
|
|
loss_history = [] ## this is for storing the loss for every run
|
|
|
- min_idx = 0
|
|
|
|
|
- neb_epoch_count = 1
|
|
|
|
|
|
|
+ step = 0
|
|
|
|
|
+ minSetSize = len(data_min)
|
|
|
|
|
|
|
|
labels = tf.convert_to_tensor(create01Labels(2 * self.gen, self.gen))
|
|
labels = tf.convert_to_tensor(create01Labels(2 * self.gen, self.gen))
|
|
|
|
|
|
|
|
- for step in range(self.neb_epochs * len(data_min)):
|
|
|
|
|
- ## generate minority neighbourhood batch for every minority class sampls by index
|
|
|
|
|
- min_batch = self._NMB_guided(min_idx)
|
|
|
|
|
- min_idx = min_idx + 1
|
|
|
|
|
- ## generate random proximal majority batch
|
|
|
|
|
- maj_batch = self._BMB(data_min, data_maj)
|
|
|
|
|
-
|
|
|
|
|
- ## generate synthetic samples from convex space
|
|
|
|
|
- ## of minority neighbourhood batch using generator
|
|
|
|
|
- conv_samples = generator.predict(min_batch)
|
|
|
|
|
- ## concatenate them with the majority batch
|
|
|
|
|
- concat_sample = tf.concat([conv_samples, maj_batch], axis=0)
|
|
|
|
|
-
|
|
|
|
|
- ## switch on discriminator training
|
|
|
|
|
- discriminator.trainable = True
|
|
|
|
|
- ## train the discriminator with the concatenated samples and the one-hot encoded labels
|
|
|
|
|
- discriminator.fit(x=concat_sample, y=labels, verbose=0)
|
|
|
|
|
- ## switch off the discriminator training again
|
|
|
|
|
- discriminator.trainable = False
|
|
|
|
|
-
|
|
|
|
|
- ## use the GAN to make the generator learn on the decisions
|
|
|
|
|
- ## made by the previous discriminator training
|
|
|
|
|
- ##- print(f"concat sample shape: {concat_sample.shape}/{labels.shape}")
|
|
|
|
|
- gan_loss_history = GAN.fit(concat_sample, y=labels, verbose=0)
|
|
|
|
|
-
|
|
|
|
|
- ## store the loss for the step
|
|
|
|
|
- loss_history.append(gan_loss_history.history['loss'])
|
|
|
|
|
-
|
|
|
|
|
- if self.debug and ((step + 1) % 10 == 0):
|
|
|
|
|
- print(f"{step + 1} neighbourhood batches trained; running neighbourhood epoch {neb_epoch_count}")
|
|
|
|
|
-
|
|
|
|
|
- if min_idx == len(data_min) - 1:
|
|
|
|
|
- if self.debug:
|
|
|
|
|
- print(f"Neighbourhood epoch {neb_epoch_count} complete")
|
|
|
|
|
- neb_epoch_count = neb_epoch_count + 1
|
|
|
|
|
- min_idx = 0
|
|
|
|
|
|
|
+ for neb_epoch_count in range(self.neb_epochs):
|
|
|
|
|
+ if discTrainCount > 0:
|
|
|
|
|
+ for n in range(discTrainCount):
|
|
|
|
|
+ for min_idx in range(minSetSize):
|
|
|
|
|
+ ## generate minority neighbourhood batch for every minority class samples by index
|
|
|
|
|
+ min_batch_indices = self.nmbMin.neighbourhoodOfItem(min_idx)
|
|
|
|
|
+ min_batch = self.nmbMin.getPointsFromIndices(min_batch_indices)
|
|
|
|
|
+ ## generate random proximal majority batch
|
|
|
|
|
+ maj_batch = self._BMB(data_maj, min_batch_indices)
|
|
|
|
|
+
|
|
|
|
|
+ ## generate synthetic samples from convex space
|
|
|
|
|
+ ## of minority neighbourhood batch using generator
|
|
|
|
|
+ conv_samples = generator.predict(min_batch)
|
|
|
|
|
+ ## concatenate them with the majority batch
|
|
|
|
|
+ concat_sample = tf.concat([conv_samples, maj_batch], axis=0)
|
|
|
|
|
+
|
|
|
|
|
+ ## switch on discriminator training
|
|
|
|
|
+ discriminator.trainable = True
|
|
|
|
|
+ ## train the discriminator with the concatenated samples and the one-hot encoded labels
|
|
|
|
|
+ discriminator.fit(x=concat_sample, y=labels, verbose=0)
|
|
|
|
|
+ ## switch off the discriminator training again
|
|
|
|
|
+ discriminator.trainable = False
|
|
|
|
|
+
|
|
|
|
|
+ for min_idx in range(minSetSize):
|
|
|
|
|
+ ## generate minority neighbourhood batch for every minority class samples by index
|
|
|
|
|
+ min_batch_indices = self.nmbMin.neighbourhoodOfItem(min_idx)
|
|
|
|
|
+ min_batch = self.nmbMin.getPointsFromIndices(min_batch_indices)
|
|
|
|
|
+ ## generate random proximal majority batch
|
|
|
|
|
+ maj_batch = self._BMB(data_maj, min_batch_indices)
|
|
|
|
|
+
|
|
|
|
|
+ ## generate synthetic samples from convex space
|
|
|
|
|
+ ## of minority neighbourhood batch using generator
|
|
|
|
|
+ conv_samples = generator.predict(min_batch)
|
|
|
|
|
+ ## concatenate them with the majority batch
|
|
|
|
|
+ concat_sample = tf.concat([conv_samples, maj_batch], axis=0)
|
|
|
|
|
+
|
|
|
|
|
+ ## switch on discriminator training
|
|
|
|
|
+ discriminator.trainable = True
|
|
|
|
|
+ ## train the discriminator with the concatenated samples and the one-hot encoded labels
|
|
|
|
|
+ discriminator.fit(x=concat_sample, y=labels, verbose=0)
|
|
|
|
|
+ ## switch off the discriminator training again
|
|
|
|
|
+ discriminator.trainable = False
|
|
|
|
|
+
|
|
|
|
|
+ ## use the GAN to make the generator learn on the decisions
|
|
|
|
|
+ ## made by the previous discriminator training
|
|
|
|
|
+ ##- print(f"concat sample shape: {concat_sample.shape}/{labels.shape}")
|
|
|
|
|
+ gan_loss_history = GAN.fit(concat_sample, y=labels, verbose=0)
|
|
|
|
|
+
|
|
|
|
|
+ ## store the loss for the step
|
|
|
|
|
+ loss_history.append(gan_loss_history.history['loss'])
|
|
|
|
|
+
|
|
|
|
|
+ step += 1
|
|
|
|
|
+ if self.debug and (step % 10 == 0):
|
|
|
|
|
+ print(f"{step} neighbourhood batches trained; running neighbourhood epoch {neb_epoch_count}")
|
|
|
|
|
+
|
|
|
|
|
+ if self.debug:
|
|
|
|
|
+ print(f"Neighbourhood epoch {neb_epoch_count + 1} complete")
|
|
|
|
|
|
|
|
if self.debug:
|
|
if self.debug:
|
|
|
run_range = range(1, len(loss_history) + 1)
|
|
run_range = range(1, len(loss_history) + 1)
|
|
@@ -340,37 +366,16 @@ class ConvGAN(GanBaseClass):
|
|
|
|
|
|
|
|
|
|
|
|
|
## convGAN
|
|
## convGAN
|
|
|
- def _BMB(self, data_min, data_maj):
|
|
|
|
|
|
|
+ def _BMB(self, data_maj, min_idxs):
|
|
|
|
|
|
|
|
## Generate a borderline majority batch
|
|
## Generate a borderline majority batch
|
|
|
- ## data_min -> minority class data
|
|
|
|
|
## data_maj -> majority class data
|
|
## data_maj -> majority class data
|
|
|
- ## neb -> oversampling neighbourhood
|
|
|
|
|
|
|
+ ## min_idxs -> indices of points in minority class
|
|
|
## gen -> convex combinations generated from each neighbourhood
|
|
## gen -> convex combinations generated from each neighbourhood
|
|
|
|
|
|
|
|
- return tf.convert_to_tensor(
|
|
|
|
|
- data_maj[np.random.randint(len(data_maj), size=self.gen)]
|
|
|
|
|
- )
|
|
|
|
|
-
|
|
|
|
|
- def _NMB_prepare(self, data_min):
|
|
|
|
|
- neigh = NNSearch(self.neb)
|
|
|
|
|
- neigh.fit(data_min)
|
|
|
|
|
- return (data_min, neigh)
|
|
|
|
|
-
|
|
|
|
|
-
|
|
|
|
|
- def _NMB_guided(self, index):
|
|
|
|
|
-
|
|
|
|
|
- ## generate a minority neighbourhood batch for a particular minority sample
|
|
|
|
|
- ## we need this for minority data generation
|
|
|
|
|
- ## we will generate synthetic samples for each training data neighbourhood
|
|
|
|
|
- ## index -> index of the minority sample in a training data whose neighbourhood we want to obtain
|
|
|
|
|
- ## data_min -> minority class data
|
|
|
|
|
- ## neb -> oversampling neighbourhood
|
|
|
|
|
- (data_min, neigh) = self.nmb
|
|
|
|
|
-
|
|
|
|
|
- nmbi = np.array([neigh.neighbourhoodOfItem(index)])
|
|
|
|
|
- nmbi = shuffle(nmbi)
|
|
|
|
|
- nmb = data_min[nmbi]
|
|
|
|
|
- nmb = tf.convert_to_tensor(nmb[0])
|
|
|
|
|
- return nmb
|
|
|
|
|
-
|
|
|
|
|
|
|
+ if self.nmbMaj is not None:
|
|
|
|
|
+ return self.nmbMaj.neighbourhoodOfItemList(min_idxs, maxCount=self.gen)
|
|
|
|
|
+ else:
|
|
|
|
|
+ return tf.convert_to_tensor(
|
|
|
|
|
+ data_maj[np.random.randint(len(data_maj), size=self.gen)]
|
|
|
|
|
+ )
|