Просмотр исходного кода

Cleaned pylint warnings, moved network generation to the class.

Kristian Schultz 4 лет назад
Родитель
Commit
78af42568d
1 измененный файл с 302 добавлениями и 244 удалениями
  1. 302 244
      library/convGAN.py

+ 302 - 244
library/convGAN.py

@@ -1,74 +1,49 @@
-import os
-import math
-import random
 import numpy as np
+from numpy.random import seed
 import pandas as pd
 import matplotlib.pyplot as plt
-import random
-from scipy import ndarray
-from sklearn.neighbors import NearestNeighbors
+
+from library.interfaces import GanBaseClass
+from library.dataset import DataSet
+
 from sklearn.decomposition import PCA
 from sklearn.metrics import confusion_matrix
 from sklearn.metrics import f1_score
 from sklearn.metrics import cohen_kappa_score
 from sklearn.metrics import precision_score
 from sklearn.metrics import recall_score
-from collections import Counter
+from sklearn.neighbors import NearestNeighbors
+from sklearn.utils import shuffle
 from imblearn.datasets import fetch_datasets
-from sklearn.preprocessing import StandardScaler
 
-import keras
-from keras.layers import Dense, Dropout, Input
-from keras.models import Model,Sequential
-from tqdm import tqdm
-from keras.layers.advanced_activations import LeakyReLU
-from tensorflow.keras.optimizers import Adam
-from keras import losses
+from keras.layers import Dense, Input, Multiply, Flatten, Conv1D, Reshape
+from keras.models import Model
 from keras import backend as K
+from tqdm import tqdm
+
 import tensorflow as tf
+from tensorflow.keras.optimizers import Adam
+from tensorflow.keras.layers import Lambda
 
 import warnings
 warnings.filterwarnings("ignore")
 
-from sklearn.neighbors import KNeighborsClassifier
-from sklearn.ensemble import RandomForestClassifier
-from sklearn.ensemble import GradientBoostingClassifier
-
-from numpy.random import seed
-seed_num=1
-seed(seed_num)
-tf.random.set_seed(seed_num) 
-
-from library.interfaces import GanBaseClass
-from library.dataset import DataSet
-
-from sklearn.utils import shuffle
-
-## Import dataset
-data = fetch_datasets()['yeast_me2']
-
-## Creating label and feature matrices
-labels_x=data.target ## labels of the data
-labels_x.shape
-
-features_x=data.data ## features of the data
-features_x.shape
-
-# Until now we have obtained the data. We divided it into training and test sets. we separated obtained seperate variables for the majority and miority classes and their labels for both sets.
-
-
 class ConvGAN(GanBaseClass):
     """
     This is a toy example of a GAN.
     It repeats the first point of the training-data-set.
     """
    def __init__(self, n_feat, neb, gen, debug=True):
        """
        Configure the GAN wrapper; the networks themselves are built by reset().

        n_feat -> number of features per sample
        neb    -> size of a minority neighbourhood batch
        gen    -> number of convex combinations generated per neighbourhood
        debug  -> print progress messages while training when True
        """
        self.isTrained = False
        self.n_feat = n_feat
        self.neb = neb
        self.gen = gen
        # per-step GAN losses — presumably filled during train(); None until then
        self.loss_history = None
        self.debug = debug
        # the DataSet the GAN was trained on; None until train() is called
        self.dataSet = None
        # the three networks; created lazily by reset()
        self.conv_sample_generator = None
        self.maj_min_discriminator = None
        self.cg = None
 
    def reset(self):
        """
        (Re)build the generator, the discriminator and the combined GAN from
        scratch, discarding any previous training state.
        """
        self.isTrained = False
        ## build a fresh generator network
        self.conv_sample_generator = self._conv_sample_gen()

        ## build a fresh discriminator network
        self.maj_min_discriminator = self._maj_min_disc()

        ## join generator and discriminator into the combined GAN
        self.cg = self._convGAN(self.conv_sample_generator, self.maj_min_discriminator)
 
     def train(self, dataSet, neb_epochs=5):
         """
@@ -117,10 +92,6 @@ class ConvGAN(GanBaseClass):
             raise ValueError("Try to generate data with untrained Re.")
 
         data_min = self.dataSet.data1
-        data_maj = self.dataSet.data0
-        neb = self.neb
-
-        # ---
 
         ## roughly claculate the upper bound of the synthetic samples to be generated from each neighbourhood
         synth_num = (numOfSamples // len(data_min)) + 1
@@ -129,7 +100,7 @@ class ConvGAN(GanBaseClass):
         synth_set=[]
         for i in range(len(data_min)):
             synth_set.extend(self.generate_data_for_min_point(data_min, i, synth_num))
-    
+
         synth_set = synth_set[:numOfSamples] ## extract the exact number of synthetic samples needed to exactly balance the two classes
 
         return np.array(synth_set)
@@ -138,9 +109,121 @@ class ConvGAN(GanBaseClass):
     # Hidden internal functions
     # ###############################################################
 
-    def _generate_data_for_min_point(self, data_min, index, synth_num, generator):
    # Creating the GAN
    def _conv_sample_gen(self):
        """
        Build the generator network.

        It maps an arbitrary minority neighbourhood batch of shape
        (neb, n_feat) to `gen` synthetic samples drawn from the convex hull
        of that neighbourhood.
        """

        ## takes a minority neighbourhood batch as input
        min_neb_batch = Input(shape=(self.n_feat,))

        ## reshape the 2-D tensor to 3-D so 1-D convolution can be applied;
        ## otherwise 1-D convolution won't work
        x = tf.reshape(min_neb_batch, (1, self.neb, self.n_feat), name=None)
        ## 1-D convolution; the feature dimension remains the same
        x = Conv1D(self.n_feat, 3, activation='relu')(x)
        ## flatten after convolution
        x = Flatten()(x)
        ## dense layer to transform the vector to a convenient dimension
        x = Dense(self.neb * self.gen, activation='relu')(x)

        ## switch back to a (neb, gen) tensor once we have the convenient size
        x = Reshape((self.neb, self.gen))(x)
        ## sum over the neighbourhood axis (axis 1): one sum per generated sample
        s = K.sum(x, axis=1)
        ## add a small constant so the sums are never zero; right after
        ## initialization a sum could otherwise be exactly zero
        s_non_zero = Lambda(lambda x: x + .000001)(s)
        ## reciprocals of the (approximated) sums
        sinv = tf.math.reciprocal(s_non_zero)
        ## normalise: each column of x (taken along the neighbourhood axis)
        ## now sums to 1, i.e. each column is a set of convex coefficients
        x = Multiply()([sinv, x])
        ## transpose so that each ROW is a set of convex coefficients
        aff=tf.transpose(x[0])
        ## multiply the coefficients with the original minority batch: every
        ## output row is a convex combination of the neighbourhood points
        synth=tf.matmul(aff, min_neb_batch)
        ## compile: input is a minority neighbourhood batch, output is its
        ## convex-space transformation (`gen` synthetic samples)
        model = Model(inputs=min_neb_batch, outputs=synth)
        opt = Adam(learning_rate=0.001)
        model.compile(loss='mean_squared_logarithmic_error', optimizer=opt)
        return model
+
+    def _maj_min_disc(self):
+        """
+        the discriminator is trained intwo phase:
+        first phase:  while training GAN the discriminator learns to differentiate synthetic
+                      minority samples generated from convex minority data space against
+                      the borderline majority samples
+        second phase: after the GAN generator learns to create synthetic samples,
+                      it can be used to generate synthetic samples to balance the dataset
+                      and then rettrain the discriminator with the balanced dataset
+        """
+
+        ## takes as input synthetic sample generated as input stacked upon a batch of
+        ## borderline majority samples
+        samples = Input(shape=(self.n_feat,))
+        
+        ## passed through two dense layers
+        y = Dense(250, activation='relu')(samples)
+        y = Dense(125, activation='relu')(y)
+        
+        ## two output nodes. outputs have to be one-hot coded (see labels variable before)
+        output = Dense(2, activation='sigmoid')(y)
+        
+        ## compile model
+        model = Model(inputs=samples, outputs=output)
+        opt = Adam(learning_rate=0.0001)
+        model.compile(loss='binary_crossentropy', optimizer=opt)
+        return model
+
+    def _convGAN(self, generator, discriminator):
+        """
+        for joining the generator and the discriminator
+        conv_coeff_generator-> generator network instance
+        maj_min_discriminator -> discriminator network instance
+        """
+        ## by default the discriminator trainability is switched off.
+        ## Thus training the GAN means training the generator network as per previously
+        ## trained discriminator network.
+        discriminator.trainable = False
+
+        ## input receives a neighbourhood minority batch
+        ## and a proximal majority batch concatenated
+        batch_data = Input(shape=(self.n_feat,))
+        
+        ## extract minority batch
+        min_batch = Lambda(lambda x: x[:self.neb])(batch_data)
+        
+        ## extract majority batch
+        maj_batch = Lambda(lambda x: x[self.neb:])(batch_data)
+        
+        ## pass minority batch into generator to obtain convex space transformation
+        ## (synthetic samples) of the minority neighbourhood input batch
+        conv_samples = generator(min_batch)
+        
+        ## concatenate the synthetic samples with the majority samples
+        new_samples = tf.concat([conv_samples, maj_batch],axis=0)
+        
+        ## pass the concatenated vector into the discriminator to know its decisions
+        output = discriminator(new_samples)
+        
+        ## note that, the discriminator will not be traied but will make decisions based
+        ## on its previous training while using this function
+        model = Model(inputs=batch_data, outputs=output)
+        opt = Adam(learning_rate=0.0001)
+        model.compile(loss='mse', optimizer=opt)
+        return model
+
+    # Create synthetic points
+    def _generate_data_for_min_point(self, data_min, index, synth_num):
         """
-        generate synth_num synthetic points for a particular minoity sample 
+        generate synth_num synthetic points for a particular minoity sample
         synth_num -> required number of data points that can be generated from a neighbourhood
         data_min -> minority class data
         neb -> oversampling neighbourhood
@@ -149,12 +232,12 @@ class ConvGAN(GanBaseClass):
 
         runs = int(synth_num / self.neb) + 1
         synth_set = []
-        for run in range(runs):
+        for _run in range(runs):
             batch = self._NMB_guided(data_min, index)
             synth_batch = self.conv_sample_generator.predict(batch)
             for x in synth_batch:
                 synth_set.append(x)
-        
+
         return synth_set[:synth_num]
 
 
@@ -167,32 +250,43 @@ class ConvGAN(GanBaseClass):
         loss_history=[] ## this is for stroring the loss for every run
         min_idx = 0
         neb_epoch_count = 1
-        
+
         labels = []
         for i in range(2 * self.gen):
-            if i < gen:
+            if i < self.gen:
                 labels.append(np.array([1,0]))
             else:
                 labels.append(np.array([0,1]))
         labels = np.array(labels)
         labels = tf.convert_to_tensor(labels)
-        
-        
-        for step in range(neb_epochs * len(data_min)):
-            min_batch = self._NMB_guided(data_min, min_idx) ## generate minority neighbourhood batch for every minority class sampls by index
-            min_idx = min_idx + 1 
-            maj_batch = self._BMB(data_min, data_maj) ## generate random proximal majority batch 
 
-            conv_samples = generator.predict(min_batch) ## generate synthetic samples from convex space of minority neighbourhood batch using generator
-            concat_sample = tf.concat([conv_samples, maj_batch], axis=0) ## concatenate them with the majority batch
 
-            discriminator.trainable = True ## switch on discriminator training
-            discriminator.fit(x=concat_sample, y=labels, verbose=0) ## train the discriminator with the concatenated samples and the one-hot encoded labels 
-            discriminator.trainable = False ## switch off the discriminator training again
-
-            gan_loss_history = GAN.fit(concat_sample, y=labels, verbose=0) ## use the GAN to make the generator learn on the decisions made by the previous discriminator training
-
-            loss_history.append(gan_loss_history.history['loss']) ## store the loss for the step
+        for step in range(neb_epochs * len(data_min)):
+            ## generate minority neighbourhood batch for every minority class sampls by index
+            min_batch = self._NMB_guided(data_min, min_idx)
+            min_idx = min_idx + 1
+            ## generate random proximal majority batch
+            maj_batch = self._BMB(data_min, data_maj)
+
+            ## generate synthetic samples from convex space
+            ## of minority neighbourhood batch using generator
+            conv_samples = generator.predict(min_batch)
+            ## concatenate them with the majority batch
+            concat_sample = tf.concat([conv_samples, maj_batch], axis=0)
+
+            ## switch on discriminator training
+            discriminator.trainable = True
+            ## train the discriminator with the concatenated samples and the one-hot encoded labels
+            discriminator.fit(x=concat_sample, y=labels, verbose=0)
+            ## switch off the discriminator training again
+            discriminator.trainable = False
+
+            ## use the GAN to make the generator learn on the decisions
+            ## made by the previous discriminator training
+            gan_loss_history = GAN.fit(concat_sample, y=labels, verbose=0)
+
+            ## store the loss for the step
+            loss_history.append(gan_loss_history.history['loss'])
 
             if self.debug and ((step + 1) % 10 == 0):
                 print(f"{step + 1} neighbourhood batches trained; running neighbourhood epoch {neb_epoch_count}")
@@ -223,15 +317,14 @@ class ConvGAN(GanBaseClass):
 
     ## convGAN
     def _BMB(self, data_min, data_maj):
-        
+
         ## Generate a borderline majority batch
         ## data_min -> minority class data
         ## data_maj -> majority class data
         ## neb -> oversampling neighbourhood
         ## gen -> convex combinations generated from each neighbourhood
-        
+
         neigh = NearestNeighbors(self.neb)
-        n_feat = data_min.shape[1]
         neigh.fit(data_maj)
         bmbi = [
             neigh.kneighbors([data_min[i]], self.neb, return_distance=False)
@@ -245,91 +338,35 @@ class ConvGAN(GanBaseClass):
 
 
     def _NMB_guided(self, data_min, index):
-        
+
         ## generate a minority neighbourhood batch for a particular minority sample
         ## we need this for minority data generation
         ## we will generate synthetic samples for each training data neighbourhood
         ## index -> index of the minority sample in a training data whose neighbourhood we want to obtain
         ## data_min -> minority class data
         ## neb -> oversampling neighbourhood
-        
+
         neigh = NearestNeighbors(self.neb)
         neigh.fit(data_min)
         nmbi = neigh.kneighbors([data_min[index]], self.neb, return_distance=False)
         nmbi = shuffle(nmbi)
         nmb = data_min[nmbi]
         nmb = tf.convert_to_tensor(nmb[0])
-        return (nmb)
-
-def conv_sample_gen():
-    
-    ## the generator network to generate synthetic samples from the convex space of arbitrary minority neighbourhoods
-    
-    min_neb_batch = keras.layers.Input(shape=(n_feat,)) ## takes minority batch as input
-    x=tf.reshape(min_neb_batch, (1,neb,n_feat), name=None) ## reshaping the 2D tensor to 3D for using 1-D convolution, otherwise 1-D convolution won't work.
-    x= keras.layers.Conv1D(n_feat, 3, activation='relu')(x) ## using 1-D convolution, feature dimension remains the same
-    x= keras.layers.Flatten()(x) ## flatten after convolution
-    x= keras.layers.Dense(neb*gen, activation='relu')(x) ## add dense layer to transform the vector to a convenient dimension
-    x= keras.layers.Reshape((neb,gen))(x)## again, witching to 2-D tensor once we have the convenient shape
-    s=K.sum(x,axis=1) ## row wise sum
-    s_non_zero=tf.keras.layers.Lambda(lambda x: x+.000001)(s) ## adding a small constant to always ensure the row sums are non zero. if this is not done then during initialization the sum can be zero
-    sinv=tf.math.reciprocal(s_non_zero) ## reprocals of the approximated row sum
-    x=keras.layers.Multiply()([sinv,x]) ## At this step we ensure that row sum is 1 for every row in x. That means, each row is set of convex co-efficient
-    aff=tf.transpose(x[0]) ## Now we transpose the matrix. So each column is now a set of convex coefficients
-    synth=tf.matmul(aff,min_neb_batch) ## We now do matrix multiplication of the affine combinations with the original minority batch taken as input. This generates a convex transformation of the input minority batch
-    model = Model(inputs=min_neb_batch, outputs=synth) ## finally we compile the generator with an arbitrary minortiy neighbourhood batch as input and a covex space transformation of the same number of samples as output
-    opt = Adam(learning_rate=0.001)
-    model.compile(loss='mean_squared_logarithmic_error', optimizer=opt)
-    return model
-
-def maj_min_disc():
-    
-    ## the discriminator is trained intwo phase:  
-    ## first phase: while training GAN the discriminator learns to differentiate synthetic minority samples generated from convex minority data space against the borderline majority samples
-    ## second phase: after the GAN generator learns to create synthetic samples, it can be used to generate synthetic samples to balance the dataset
-    ## and then rettrain the discriminator with the balanced dataset
-    
-    samples=keras.layers.Input(shape=(n_feat,)) ## takes as input synthetic sample generated as input stacked upon a batch of borderline majority samples 
-    y= keras.layers.Dense(250, activation='relu')(samples) ## passed through two dense layers 
-    y= keras.layers.Dense(125, activation='relu')(y)
-    output= keras.layers.Dense(2, activation='sigmoid')(y) ## two output nodes. outputs have to be one-hot coded (see labels variable before)
-    model = Model(inputs=samples, outputs=output) ## compile model
-    opt = Adam(learning_rate=0.0001)
-    model.compile(loss='binary_crossentropy', optimizer=opt)
-    return model
-
-
-def convGAN(generator,discriminator):
-    
-    ## for joining the generator and the discriminator
-    ## conv_coeff_generator-> generator network instance
-    ## maj_min_discriminator -> discriminator network instance
-    
-    maj_min_disc.trainable=False ## by default the discriminator trainability is switched off. 
-    ## Thus training the GAN means training the generator network as per previously trained discriminator network.
-    batch_data = keras.layers.Input(shape=(n_feat,)) ## input receives a neighbourhood minority batch and a proximal majority batch concatenated
-    min_batch = tf.keras.layers.Lambda(lambda x: x[:neb])(batch_data) ## extract minority batch
-    maj_batch = tf.keras.layers.Lambda(lambda x: x[neb:])(batch_data) ## extract majority batch 
-    conv_samples=generator(min_batch) ## pass minority batch into generator to obtain convex space transformation (synthetic samples) of the minority neighbourhood input batch
-    new_samples=tf.concat([conv_samples,maj_batch],axis=0) ## concatenate the synthetic samples with the majority samples  
-    output=discriminator(new_samples) ## pass the concatenated vector into the discriminator to know its decisions
-    ## note that, the discriminator will not be traied but will make decisions based on its previous training while using this function
-    model = Model(inputs=batch_data, outputs=output)
-    opt = Adam(learning_rate=0.0001)
-    model.compile(loss='mse', optimizer=opt)
-    return model
+        return nmb
+
+
 
 
 ## this is the main training process where the GAn learns to generate appropriate samples from the convex space
 ## this is the first training phase for the discriminator and the only training phase for the generator.
 
 
-    
+
 
 
 def rough_learning_predictions(discriminator,test_data_numpy,test_labels_numpy):
-    
-    ## after the first phase of training the discriminator can be used for classification 
+
+    ## after the first phase of training the discriminator can be used for classification
     ## it already learns to differentiate the convex minority points with majority points during the first training phase
     y_pred_2d=discriminator.predict(tf.convert_to_tensor(test_data_numpy))
     ## discretisation of the labels
@@ -360,7 +397,7 @@ def generate_synthetic_data(gan, data_min, data_maj):
     ovs_min_class=np.concatenate((data_min,synth_set),axis=0)
     ovs_training_dataset=np.concatenate((ovs_min_class,data_maj),axis=0)
     ovs_pca_labels=np.concatenate((np.zeros(len(data_min)),np.zeros(len(synth_set))+1,np.zeros(len(data_maj))+2))
-    ovs_training_labels=np.concatenate((np.zeros(len(ovs_min_class))+1,np.zeros(len(data_maj))+0))
+    # TODO ovs_training_labels=np.concatenate((np.zeros(len(ovs_min_class))+1,np.zeros(len(data_maj))+0))
     ovs_training_labels_oh=[]
     for i in range(len(ovs_training_dataset)):
         if i<len(ovs_min_class):
@@ -369,18 +406,18 @@ def generate_synthetic_data(gan, data_min, data_maj):
             ovs_training_labels_oh.append(np.array([0,1]))
     ovs_training_labels_oh=np.array(ovs_training_labels_oh)
     ovs_training_labels_oh=tf.convert_to_tensor(ovs_training_labels_oh)
-    
-    
+
+
     ## PCA visualization of the synthetic sata
     ## observe how the minority samples from convex space have optimal variance and avoids overlap with the majority
     pca = PCA(n_components=2)
     pca.fit(ovs_training_dataset)
     data_pca= pca.transform(ovs_training_dataset)
-    
+
     ## plot PCA
     plt.rcParams["figure.figsize"] = (12,12)
 
-    colors=['r', 'b', 'g']
+    # TODO colors=['r', 'b', 'g']
     plt.xticks(fontsize=20)
     plt.yticks(fontsize=20)
     plt.xlabel('PCA1',fontsize=25)
@@ -391,20 +428,20 @@ def generate_synthetic_data(gan, data_min, data_maj):
     scatter=plt.scatter(data_pca[:,0], data_pca[:,1], c=ovs_pca_labels, cmap='Set1')
     plt.legend(handles=scatter.legend_elements()[0], labels=classes, fontsize=20)
     plt.show()
-    
+
     return ovs_training_dataset, ovs_pca_labels, ovs_training_labels_oh
 
 
 def final_learning(discriminator, ovs_training_dataset, ovs_training_labels_oh, test_data_numpy, test_labels_numpy, num_epochs):
-    
+
     print('\n')
     print('Final round training of the discrminator as a majority-minority classifier')
     print('\n')
     ## second phase training of the discriminator with balanced data
-    
+
     history_second_learning=discriminator.fit(x=ovs_training_dataset,y=ovs_training_labels_oh, batch_size=20, epochs=num_epochs)
-    
-    ## loss of the second phase learning smoothly decreses 
+
+    ## loss of the second phase learning smoothly decreses
     ## this is because now the data is fixed and diverse convex combinations are no longer fed into the discriminator at every training step
     run_range=range(1,num_epochs+1)
     plt.rcParams["figure.figsize"] = (16,10)
@@ -415,7 +452,7 @@ def final_learning(discriminator, ovs_training_dataset, ovs_training_labels_oh,
     plt.title('Final learning loss for discriminator', fontsize=25)
     plt.plot(run_range, history_second_learning.history['loss'])
     plt.show()
-    
+
     ## finally after second phase training the discriminator classifier has a more balanced performance
     ## meaning better F1-Score
     ## the recall decreases but the precision improves
@@ -437,7 +474,7 @@ def final_learning(discriminator, ovs_training_dataset, ovs_training_labels_oh,
 
 
 def convGAN_train_end_to_end(training_data,training_labels,test_data,test_labels, neb, gen, neb_epochs,epochs_retrain_disc):
-    
+
     ##minority class
     data_min=training_data[np.where(training_labels == 1)[0]]
     ##majority class
@@ -445,16 +482,16 @@ def convGAN_train_end_to_end(training_data,training_labels,test_data,test_labels
 
     dataSet = DataSet(data0=data_maj, data1=data_min)
 
-    gan = ConvGAN(neb, gen)
+    gan = ConvGAN(data_min.shape[1], neb, gen)
     gan.reset()
-   
+
     ## instanciate generator network and visualize architecture
-    conv_sample_generator = gan.conv_sample_generator 
+    conv_sample_generator = gan.conv_sample_generator
     print(conv_sample_generator.summary())
     print('\n')
 
     ## instanciate discriminator network and visualize architecture
-    maj_min_discriminator = gan.maj_min_discriminator 
+    maj_min_discriminator = gan.maj_min_discriminator
     print(maj_min_discriminator.summary())
     print('\n')
 
@@ -462,25 +499,25 @@ def convGAN_train_end_to_end(training_data,training_labels,test_data,test_labels
     cg = gan.cg
     print(cg.summary())
     print('\n')
-    
+
     print('Training the GAN, first round training of the discrminator as a majority-minority classifier')
     print('\n')
 
     ## train gan generator ## rough_train_discriminator
     gan.train(dataSet, neb_epochs)
     print('\n')
-    
+
     ## rough learning results
     c_r,f_r,pr_r,rc_r,k_r = rough_learning_predictions(gan.maj_min_discriminator_r, test_data, test_labels)
     print('\n')
-    
+
     ## generate synthetic data
     ovs_training_dataset, ovs_pca_labels, ovs_training_labels_oh = generate_synthetic_data(gan, data_min, data_maj)
     print('\n')
-    
+
     ## final training results
     c,f,pr,rc,k=final_learning(gan.maj_min_discriminator, ovs_training_dataset, ovs_training_labels_oh, test_data, test_labels, epochs_retrain_disc)
-    
+
     return ((c_r,f_r,pr_r,rc_r,k_r),(c,f,pr,rc,k))
 
 
@@ -492,121 +529,142 @@ def unison_shuffled_copies(a, b,seed_perm):
     return a[p], b[p]
 
 
-## specify parameters
 
-neb=gen=5 ##neb=gen required
-neb_epochs=10
-epochs_retrain_disc=50
-n_feat=len(features_x[1]) ## number of features
def runTest():
    """
    Benchmark ConvGAN on the imbalanced 'yeast_me2' dataset.

    Runs 5 random shuffles (strata) x 5 cross-validation folds of the full
    end-to-end pipeline and prints the averaged rough and final metrics.
    """
    # Fix all random seeds for reproducibility.
    seed_num=1
    seed(seed_num)
    tf.random.set_seed(seed_num)


    ## import the imbalanced benchmark dataset
    data = fetch_datasets()['yeast_me2']

    ## create label and feature matrices
    labels_x = data.target ## labels of the data

    features_x = data.data ## features of the data

    # Up to here we have only obtained the data. Below it is divided into
    # training and test sets, with separate variables for the majority and
    # minority classes (and their labels) in both sets.

    ## specify parameters

    neb=gen=5 ##neb=gen required
    neb_epochs=10
    epochs_retrain_disc=50
    # TODO n_feat=len(features_x[1]) ## number of features


    ## Training: `strata` independent shuffles, 5 CV folds each
    np.random.seed(42)
    strata=5
    results=[]
    for seed_perm in range(strata):

        ## shuffle features and labels together with a per-stratum seed
        features_x,labels_x=unison_shuffled_copies(features_x,labels_x,seed_perm)

        ### Extracting all features and labels
        print('Extracting all features and labels for seed:'+ str(seed_perm)+'\n')

        ## split into train/test folds — NOTE(review): the message says
        ## 10-fold, but the code below clearly builds 5 folds
        print('Dividing data into training and testing datasets for 10-fold CV for seed:'+ str(seed_perm)+'\n')
        ## minority class (label == 1) features
        label_1=list(np.where(labels_x == 1)[0])
        features_1=features_x[label_1]

        ## majority class (label != 1) features
        label_0=list(np.where(labels_x != 1)[0])
        features_0=features_x[label_0]

        ## per-fold row counts: a minority rows and b majority rows per fold
        a=len(features_1)//5
        b=len(features_0)//5

        ## fold k test set = k-th minority slice stacked on k-th majority slice
        fold_1_min=features_1[0:a]
        fold_1_maj=features_0[0:b]
        fold_1_tst=np.concatenate((fold_1_min,fold_1_maj))
        lab_1_tst=np.concatenate((np.zeros(len(fold_1_min))+1, np.zeros(len(fold_1_maj))))

        fold_2_min=features_1[a:2*a]
        fold_2_maj=features_0[b:2*b]
        fold_2_tst=np.concatenate((fold_2_min,fold_2_maj))
        ## NOTE(review): labels for folds 2-4 reuse fold_1 lengths; this is
        ## harmless because folds 1-4 all contain exactly a / b rows
        lab_2_tst=np.concatenate((np.zeros(len(fold_1_min))+1, np.zeros(len(fold_1_maj))))

        fold_3_min=features_1[2*a:3*a]
        fold_3_maj=features_0[2*b:3*b]
        fold_3_tst=np.concatenate((fold_3_min,fold_3_maj))
        lab_3_tst=np.concatenate((np.zeros(len(fold_1_min))+1, np.zeros(len(fold_1_maj))))

        fold_4_min=features_1[3*a:4*a]
        fold_4_maj=features_0[3*b:4*b]
        fold_4_tst=np.concatenate((fold_4_min,fold_4_maj))
        lab_4_tst=np.concatenate((np.zeros(len(fold_1_min))+1, np.zeros(len(fold_1_maj))))


        ## fold 5 takes the remainder slices, so it may be slightly larger
        fold_5_min=features_1[4*a:]
        fold_5_maj=features_0[4*b:]
        fold_5_tst=np.concatenate((fold_5_min,fold_5_maj))
        lab_5_tst=np.concatenate((np.zeros(len(fold_5_min))+1, np.zeros(len(fold_5_maj))))

        ## fold k training set = the other four folds
        ## (all minority slices first, then all majority slices)
        fold_1_trn=np.concatenate((fold_2_min,fold_3_min,fold_4_min,fold_5_min, fold_2_maj,fold_3_maj,fold_4_maj,fold_5_maj))

        lab_1_trn=np.concatenate((np.zeros(3*a+len(fold_5_min))+1,np.zeros(3*b+len(fold_5_maj))))

        fold_2_trn=np.concatenate((fold_1_min,fold_3_min,fold_4_min,fold_5_min,fold_1_maj,fold_3_maj,fold_4_maj,fold_5_maj))

        lab_2_trn=np.concatenate((np.zeros(3*a+len(fold_5_min))+1,np.zeros(3*b+len(fold_5_maj))))

        fold_3_trn=np.concatenate((fold_2_min,fold_1_min,fold_4_min,fold_5_min,fold_2_maj,fold_1_maj,fold_4_maj,fold_5_maj))

        lab_3_trn=np.concatenate((np.zeros(3*a+len(fold_5_min))+1,np.zeros(3*b+len(fold_5_maj))))

        fold_4_trn=np.concatenate((fold_2_min,fold_3_min,fold_1_min,fold_5_min,fold_2_maj,fold_3_maj,fold_1_maj,fold_5_maj))

        lab_4_trn=np.concatenate((np.zeros(3*a+len(fold_5_min))+1,np.zeros(3*b+len(fold_5_maj))))

        fold_5_trn=np.concatenate((fold_2_min,fold_3_min,fold_4_min,fold_1_min,fold_2_maj,fold_3_maj,fold_4_maj,fold_1_maj))

        ## fold 5 trains on folds 1-4, which hold exactly 4*a / 4*b rows
        lab_5_trn=np.concatenate((np.zeros(4*a)+1,np.zeros(4*b)))


        training_folds_feats=[fold_1_trn,fold_2_trn,fold_3_trn,fold_4_trn,fold_5_trn]

        testing_folds_feats=[fold_1_tst,fold_2_tst,fold_3_tst,fold_4_tst,fold_5_tst]

        training_folds_labels=[lab_1_trn,lab_2_trn,lab_3_trn,lab_4_trn,lab_5_trn]

        testing_folds_labels=[lab_1_tst,lab_2_tst,lab_3_tst,lab_4_tst,lab_5_tst]


        ## run the end-to-end ConvGAN pipeline on every fold
        for i in range(5):

            print('\n')
            print('Executing fold: '+str(i+1))
            print('\n')

            r1,r2=convGAN_train_end_to_end(training_folds_feats[i],training_folds_labels[i],testing_folds_feats[i],testing_folds_labels[i], neb, gen, neb_epochs, epochs_retrain_disc)
            ## keep the four scalar metrics of the rough (r1) and the final
            ## (r2) phase; element 0 (presumably the confusion matrix — see
            ## rough_learning_predictions) is dropped
            results.append(np.array([list(r1[1:]),list(r2[1:])]))
    results=np.array(results)



    ## Benchmark: average the metrics over all strata and folds
    mean_rough=np.mean(results[:,0], axis=0)
    data_r={'F1-Score_r':[mean_rough[0]], 'Precision_r' : [mean_rough[1]], 'Recall_r' : [mean_rough[2]], 'Kappa_r': [mean_rough[3]]}
    df_r=pd.DataFrame(data=data_r)


    print('Rough training results:')
    print('\n')
    print(df_r)


    mean_final=np.mean(results[:,1], axis=0)
    data_f={'F1-Score_f':[mean_final[0]], 'Precision_f' : [mean_final[1]], 'Recall_f' : [mean_final[2]], 'Kappa_f': [mean_final[3]]}
    df_f=pd.DataFrame(data=data_f)


    print('Final training results:')
    print('\n')
    print(df_f)
 
 
-print('Final training results:')
-print('\n')
-print(df_f)
+if __name__ == "__main__":
+    runTest()