|
|
@@ -32,24 +32,89 @@ def create01Labels(totalSize, sizeFirstHalf):
|
|
|
labels.extend(repeat(np.array([0,1]), totalSize - sizeFirstHalf))
|
|
|
return np.array(labels)
|
|
|
|
|
|
+
|
|
|
+class GeneratorConfig:
|
|
|
+ def __init__(self, n_feat=None, neb=5, gen=None, neb_epochs=10, genLayerSizes=None, genAddNoise=True):
|
|
|
+ self.n_feat = n_feat
|
|
|
+ self.neb = neb
|
|
|
+ self.gen = gen
|
|
|
+ self.neb_epochs = neb_epochs
|
|
|
+ self.genAddNoise = genAddNoise
|
|
|
+ self.genLayerSizes = genLayerSizes
|
|
|
+
|
|
|
+ def isConfigMissing(self):
|
|
|
+ return any( x is None for x in
|
|
|
+ [ self.n_feat
|
|
|
+ , self.neb
|
|
|
+ , self.gen
|
|
|
+ , self.genAddNoise
|
|
|
+ , self.genLayerSizes
|
|
|
+ , self.neb_epochs
|
|
|
+ ])
|
|
|
+
|
|
|
+ def checkForValidConfig(self):
|
|
|
+ if self.isConfigMissing():
|
|
|
+            raise ValueError("Some configuration is missing.")
|
|
|
+
|
|
|
+ if self.neb > self.gen:
|
|
|
+ raise ValueError(f"Expected neb <= gen but got neb={self.neb} and gen={self.gen}.")
|
|
|
+
|
|
|
+ if sum(self.genLayerSizes) != self.gen:
|
|
|
+ raise ValueError(f"Expected the layer sizes to sum up to gen={self.gen}.")
|
|
|
+
|
|
|
+ return True
|
|
|
+
|
|
|
+ def fixMissingValuesByInputData(self, data):
|
|
|
+ config = GeneratorConfig()
|
|
|
+ config.neb = self.neb
|
|
|
+ config.gen = self.gen
|
|
|
+ config.genAddNoise = self.genAddNoise
|
|
|
+ config.genLayerSizes = self.genLayerSizes
|
|
|
+
|
|
|
+ if data is not None:
|
|
|
+ if config.n_feat is None:
|
|
|
+ config.n_feat = data.shape[1]
|
|
|
+
|
|
|
+ if config.neb is None:
|
|
|
+ config.neb = data.shape[0]
|
|
|
+ else:
|
|
|
+ config.neb = min(config.neb, data.shape[0])
|
|
|
+
|
|
|
+ if config.gen is None:
|
|
|
+ config.gen = config.neb
|
|
|
+
|
|
|
+ if config.genLayerSizes is None:
|
|
|
+ config.genLayerSizes = [config.gen]
|
|
|
+
|
|
|
+ return config
|
|
|
+
|
|
|
+ def nebShape(self, aboveSize=None):
|
|
|
+ if aboveSize is None:
|
|
|
+ return (self.neb, self.n_feat)
|
|
|
+ else:
|
|
|
+ return (aboveSize, self.neb, self.n_feat)
|
|
|
+
|
|
|
+ def genShape(self, aboveSize=None):
|
|
|
+ if aboveSize is None:
|
|
|
+ return (self.gen, self.n_feat)
|
|
|
+ else:
|
|
|
+ return (aboveSize, self.gen, self.n_feat)
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
class XConvGeN(GanBaseClass):
|
|
|
"""
|
|
|
This is the ConvGeN class. ConvGeN is a synthetic point generator for imbalanced datasets.
|
|
|
"""
|
|
|
- def __init__(self, n_feat, neb=5, gen=None, neb_epochs=10, fdc=None, maj_proximal=False, debug=False):
|
|
|
+ def __init__(self, config=None, fdc=None, debug=False):
|
|
|
self.isTrained = False
|
|
|
- self.n_feat = n_feat
|
|
|
- self.neb = neb
|
|
|
- self.nebInitial = neb
|
|
|
- self.genInitial = gen
|
|
|
- self.gen = gen if gen is not None else self.neb
|
|
|
- self.neb_epochs = neb_epochs
|
|
|
+ self.config = config
|
|
|
+ self.defaultConfig = config
|
|
|
self.loss_history = None
|
|
|
self.debug = debug
|
|
|
self.minSetSize = 0
|
|
|
self.conv_sample_generator = None
|
|
|
self.maj_min_discriminator = None
|
|
|
- self.maj_proximal = maj_proximal
|
|
|
self.cg = None
|
|
|
self.canPredict = True
|
|
|
self.fdc = fdc
|
|
|
@@ -59,8 +124,8 @@ class XConvGeN(GanBaseClass):
|
|
|
"Train", "BMB", "NbhSearch", "NBH", "GenSamples", "Fit", "FixType"
|
|
|
] }
|
|
|
|
|
|
- if self.neb is not None and self.gen is not None and self.neb > self.gen:
|
|
|
- raise ValueError(f"Expected neb <= gen but got neb={neb} and gen={gen}.")
|
|
|
+ if not self.config.isConfigMissing():
|
|
|
+ self.config.checkForValidConfig()
|
|
|
|
|
|
def reset(self, data):
|
|
|
"""
|
|
|
@@ -72,16 +137,8 @@ class XConvGeN(GanBaseClass):
|
|
|
"""
|
|
|
self.isTrained = False
|
|
|
|
|
|
- if data is not None:
|
|
|
- nMinoryPoints = data.shape[0]
|
|
|
- if self.nebInitial is None:
|
|
|
- self.neb = nMinoryPoints
|
|
|
- else:
|
|
|
- self.neb = min(self.nebInitial, nMinoryPoints)
|
|
|
- else:
|
|
|
- self.neb = self.nebInitial
|
|
|
-
|
|
|
- self.gen = self.genInitial if self.genInitial is not None else self.neb
|
|
|
+ self.config = self.defaultConfig.fixMissingValuesByInputData(data)
|
|
|
+ self.config.checkForValidConfig()
|
|
|
|
|
|
## instanciate generator network and visualize architecture
|
|
|
self.conv_sample_generator = self._conv_sample_gen()
|
|
|
@@ -94,7 +151,7 @@ class XConvGeN(GanBaseClass):
|
|
|
|
|
|
self.lastProgress = (-1,-1,-1)
|
|
|
if self.debug:
|
|
|
- print(f"neb={self.neb}, gen={self.gen}")
|
|
|
+ print(f"neb={self.config.neb}, gen={self.config.gen}")
|
|
|
|
|
|
print(self.conv_sample_generator.summary())
|
|
|
print('\n')
|
|
|
@@ -124,12 +181,9 @@ class XConvGeN(GanBaseClass):
|
|
|
if self.fdc is not None:
|
|
|
normalizedData = self.fdc.normalize(data)
|
|
|
|
|
|
- print(f"|N| = {normalizedData.shape}")
|
|
|
- print(f"|D| = {data.shape}")
|
|
|
-
|
|
|
self.timing["NbhSearch"].start()
|
|
|
# Precalculate neighborhoods
|
|
|
- self.nmbMin = NNSearch(self.neb).fit(haystack=normalizedData)
|
|
|
+ self.nmbMin = NNSearch(self.config.neb).fit(haystack=normalizedData)
|
|
|
self.nmbMin.basePoints = np.array([ [x.astype(np.float32) for x in p] for p in data])
|
|
|
self.timing["NbhSearch"].stop()
|
|
|
|
|
|
@@ -158,7 +212,7 @@ class XConvGeN(GanBaseClass):
|
|
|
|
|
|
## roughly claculate the upper bound of the synthetic samples to be generated from each neighbourhood
|
|
|
synth_num = (numOfSamples // self.minSetSize) + 1
|
|
|
- runs = (synth_num // self.gen) + 1
|
|
|
+ runs = (synth_num // self.config.gen) + 1
|
|
|
|
|
|
## Get a random list of all indices
|
|
|
indices = randomIndices(self.minSetSize)
|
|
|
@@ -184,7 +238,7 @@ class XConvGeN(GanBaseClass):
|
|
|
corrected = pairs.map(self.correct_feature_types())
|
|
|
|
|
|
## extract the exact number of synthetic samples needed to exactly balance the two classes
|
|
|
- r = np.concatenate(np.array(list(corrected.take(1 + (numOfSamples // self.gen)))), axis=0)[:numOfSamples]
|
|
|
+ r = np.concatenate(np.array(list(corrected.take(1 + (numOfSamples // self.config.gen)))), axis=0)[:numOfSamples]
|
|
|
|
|
|
return r
|
|
|
|
|
|
@@ -202,48 +256,59 @@ class XConvGeN(GanBaseClass):
|
|
|
# ###############################################################
|
|
|
|
|
|
# Creating the Network: Generator
|
|
|
- def _conv_sample_gen(self, layerSize=None):
|
|
|
+ def _conv_sample_gen(self):
|
|
|
"""
|
|
|
The generator network to generate synthetic samples from the convex space
|
|
|
of arbitrary minority neighbourhoods
|
|
|
"""
|
|
|
|
|
|
- if layerSize is None:
|
|
|
- layerSize = (self.gen // 2) + 1
|
|
|
+ n_feat = self.config.n_feat
|
|
|
+ neb = self.config.neb
|
|
|
+ gen = self.config.gen
|
|
|
+ genLayerSizes = self.config.genLayerSizes
|
|
|
|
|
|
## takes minority batch as input
|
|
|
- min_neb_batch = Input(shape=(self.neb, self.n_feat,))
|
|
|
+ min_neb_batch = Input(shape=(neb, n_feat))
|
|
|
|
|
|
## using 1-D convolution, feature dimension remains the same
|
|
|
- x = Conv1D(self.n_feat, 3, activation='relu', name="UnsharpenInput")(min_neb_batch)
|
|
|
+ x = Conv1D(n_feat, 3, activation='relu', name="UnsharpenInput")(min_neb_batch)
|
|
|
## flatten after convolution
|
|
|
x = Flatten(name="InputMatrixToVector")(x)
|
|
|
|
|
|
synth = []
|
|
|
n = 0
|
|
|
- while n < self.gen:
|
|
|
- w = min(layerSize, self.gen - n)
|
|
|
+ if sum(genLayerSizes) < gen:
|
|
|
+ genLayerSizes.append(gen)
|
|
|
+
|
|
|
+ for layerSize in genLayerSizes:
|
|
|
+ w = min(layerSize, gen - n)
|
|
|
if w <= 0:
|
|
|
break
|
|
|
n += w
|
|
|
|
|
|
## add dense layer to transform the vector to a convenient dimension
|
|
|
- y = Dense(self.neb * w, activation='relu', name=f"P{n}_dense")(x)
|
|
|
+ y = Dense(neb * w, activation='relu', name=f"P{n}_dense")(x)
|
|
|
|
|
|
## again, witching to 2-D tensor once we have the convenient shape
|
|
|
- y = Reshape((self.neb, w), name=f"P{n}_reshape")(y)
|
|
|
+ y = Reshape((neb, w), name=f"P{n}_reshape")(y)
|
|
|
+
|
|
|
## column wise sum
|
|
|
s = K.sum(y, axis=1)
|
|
|
+
|
|
|
## adding a small constant to always ensure the column sums are non zero.
|
|
|
## if this is not done then during initialization the sum can be zero.
|
|
|
s_non_zero = Lambda(lambda x: x + .000001, name=f"P{n}_make_non_zero")(s)
|
|
|
+
|
|
|
## reprocals of the approximated column sum
|
|
|
sinv = tf.math.reciprocal(s_non_zero, name=f"P{n}_invert")
|
|
|
+
|
|
|
## At this step we ensure that column sum is 1 for every row in x.
|
|
|
## That means, each column is set of convex co-efficient
|
|
|
y = Multiply(name=f"P{n}_normalize")([sinv, y])
|
|
|
+
|
|
|
## Now we transpose the matrix. So each row is now a set of convex coefficients
|
|
|
aff = tf.transpose(y[0], name=f"P{n}_transpose")
|
|
|
+
|
|
|
## We now do matrix multiplication of the affine combinations with the original
|
|
|
## minority batch taken as input. This generates a convex transformation
|
|
|
## of the input minority batch
|
|
|
@@ -252,18 +317,19 @@ class XConvGeN(GanBaseClass):
|
|
|
|
|
|
synth = tf.concat(synth, axis=1, name="collect_planes")
|
|
|
|
|
|
- nOut = self.gen * self.n_feat
|
|
|
+ nOut = gen * n_feat
|
|
|
|
|
|
- noiseGenerator = Sequential([
|
|
|
- InputLayer(input_shape=(self.gen, self.n_feat)),
|
|
|
- Flatten(),
|
|
|
- Dense(tfp.layers.IndependentNormal.params_size(nOut)),
|
|
|
- tfp.layers.IndependentNormal(nOut)
|
|
|
- ], name="RandomNoise")
|
|
|
+ if self.config.genAddNoise:
|
|
|
+ noiseGenerator = Sequential([
|
|
|
+ InputLayer(input_shape=(gen, n_feat)),
|
|
|
+ Flatten(),
|
|
|
+ Dense(tfp.layers.IndependentNormal.params_size(nOut)),
|
|
|
+ tfp.layers.IndependentNormal(nOut)
|
|
|
+ ], name="RandomNoise")
|
|
|
|
|
|
- noise = noiseGenerator(synth)
|
|
|
- noise = Reshape((self.gen, self.n_feat), name="ReshapeNoise")(noise)
|
|
|
- synth = Add(name="AddNoise")([synth, noise])
|
|
|
+ noise = noiseGenerator(synth)
|
|
|
+ noise = Reshape((gen, n_feat), name="ReshapeNoise")(noise)
|
|
|
+ synth = Add(name="AddNoise")([synth, noise])
|
|
|
|
|
|
## finally we compile the generator with an arbitrary minortiy neighbourhood batch
|
|
|
## as input and a covex space transformation of the same number of samples as output
|
|
|
@@ -286,7 +352,7 @@ class XConvGeN(GanBaseClass):
|
|
|
|
|
|
## takes as input synthetic sample generated as input stacked upon a batch of
|
|
|
## borderline majority samples
|
|
|
- samples = Input(shape=(self.n_feat,))
|
|
|
+ samples = Input(shape=(self.config.n_feat,))
|
|
|
|
|
|
## passed through two dense layers
|
|
|
y = Dense(250, activation='relu')(samples)
|
|
|
@@ -309,6 +375,11 @@ class XConvGeN(GanBaseClass):
|
|
|
conv_coeff_generator-> generator network instance
|
|
|
maj_min_discriminator -> discriminator network instance
|
|
|
"""
|
|
|
+
|
|
|
+ n_feat = self.config.n_feat
|
|
|
+ neb = self.config.neb
|
|
|
+ gen = self.config.gen
|
|
|
+
|
|
|
## by default the discriminator trainability is switched off.
|
|
|
## Thus training ConvGeN means training the generator network as per previously
|
|
|
## trained discriminator network.
|
|
|
@@ -319,7 +390,7 @@ class XConvGeN(GanBaseClass):
|
|
|
|
|
|
## input receives a neighbourhood minority batch
|
|
|
## and a proximal majority batch concatenated
|
|
|
- batch_data = Input(shape=(2, self.gen, self.n_feat,))
|
|
|
+ batch_data = Input(shape=(2, gen, n_feat))
|
|
|
# batch_data: (batchSize, 2, gen, n_feat)
|
|
|
|
|
|
## extract minority batch
|
|
|
@@ -329,23 +400,23 @@ class XConvGeN(GanBaseClass):
|
|
|
## extract majority batch
|
|
|
maj_batch = Lambda(lambda x: x[:, 1, :, :], name="SplitForDisc")(batch_data)
|
|
|
# maj_batch: (batchSize, gen, n_feat)
|
|
|
- maj_batch = tf.reshape(maj_batch, (-1, self.n_feat), name="ReshapeForDisc")
|
|
|
+ maj_batch = tf.reshape(maj_batch, (-1, n_feat), name="ReshapeForDisc")
|
|
|
# maj_batch: (batchSize * gen, n_feat)
|
|
|
|
|
|
## pass minority batch into generator to obtain convex space transformation
|
|
|
## (synthetic samples) of the minority neighbourhood input batch
|
|
|
conv_samples = generator(min_batch)
|
|
|
# conv_batch: (batchSize, gen, n_feat)
|
|
|
- conv_samples = tf.reshape(conv_samples, (-1, self.n_feat), name="ReshapeGenOutput")
|
|
|
+ conv_samples = tf.reshape(conv_samples, (-1, n_feat), name="ReshapeGenOutput")
|
|
|
# conv_batch: (batchSize * gen, n_feat)
|
|
|
|
|
|
## pass samples into the discriminator to know its decisions
|
|
|
conv_samples = discriminator(conv_samples)
|
|
|
- conv_samples = tf.reshape(conv_samples, (-1, self.gen, 2), name="ReshapeGenDiscOutput")
|
|
|
+ conv_samples = tf.reshape(conv_samples, (-1, gen, 2), name="ReshapeGenDiscOutput")
|
|
|
# conv_batch: (batchSize * gen, 2)
|
|
|
|
|
|
maj_batch = discriminator(maj_batch)
|
|
|
- maj_batch = tf.reshape(maj_batch, (-1, self.gen, 2), name="ReshapeMajDiscOutput")
|
|
|
+ maj_batch = tf.reshape(maj_batch, (-1, gen, 2), name="ReshapeMajDiscOutput")
|
|
|
# conv_batch: (batchSize * gen, 2)
|
|
|
|
|
|
## concatenate the decisions
|
|
|
@@ -361,6 +432,11 @@ class XConvGeN(GanBaseClass):
|
|
|
|
|
|
# Training
|
|
|
def _rough_learning(self, data, discTrainCount, batchSize=32):
|
|
|
+
|
|
|
+ n_feat = self.config.n_feat
|
|
|
+ neb = self.config.neb
|
|
|
+ gen = self.config.gen
|
|
|
+
|
|
|
generator = self.conv_sample_generator
|
|
|
discriminator = self.maj_min_discriminator
|
|
|
convGeN = self.cg
|
|
|
@@ -368,8 +444,8 @@ class XConvGeN(GanBaseClass):
|
|
|
minSetSize = len(data)
|
|
|
|
|
|
## Create labels for one neighborhood training.
|
|
|
- nLabels = 2 * self.gen
|
|
|
- labels = np.array(create01Labels(nLabels, self.gen))
|
|
|
+ nLabels = 2 * gen
|
|
|
+ labels = np.array(create01Labels(nLabels, gen))
|
|
|
labelsGeN = np.array([labels])
|
|
|
|
|
|
def getNeighborhoods():
|
|
|
@@ -401,11 +477,11 @@ class XConvGeN(GanBaseClass):
|
|
|
for x in labels:
|
|
|
yield x
|
|
|
|
|
|
- padd = np.zeros((self.gen - self.neb, self.n_feat))
|
|
|
+ padd = np.zeros((gen - neb, n_feat))
|
|
|
discTrainCount = 1 + max(0, discTrainCount)
|
|
|
|
|
|
- for neb_epoch_count in range(self.neb_epochs):
|
|
|
- self.progressBar([(neb_epoch_count + 1) / self.neb_epochs, 0.5, 0.5])
|
|
|
+ for neb_epoch_count in range(self.config.neb_epochs):
|
|
|
+ self.progressBar([(neb_epoch_count + 1) / self.config.neb_epochs, 0.5, 0.5])
|
|
|
|
|
|
## Training of the discriminator.
|
|
|
#
|
|
|
@@ -428,7 +504,7 @@ class XConvGeN(GanBaseClass):
|
|
|
b = tf.data.Dataset.from_tensor_slices(labels).repeat()
|
|
|
|
|
|
# Zip data and matching labels together for training.
|
|
|
- samples = tf.data.Dataset.zip((a, b)).batch(batchSize * 2 * self.gen)
|
|
|
+ samples = tf.data.Dataset.zip((a, b)).batch(batchSize * 2 * gen)
|
|
|
|
|
|
# train the discriminator with the concatenated samples and the one-hot encoded labels
|
|
|
self.timing["Fit"].start()
|
|
|
@@ -479,7 +555,7 @@ class XConvGeN(GanBaseClass):
|
|
|
## min_idxs -> indices of points in minority class
|
|
|
## gen -> convex combinations generated from each neighbourhood
|
|
|
self.timing["BMB"].start()
|
|
|
- indices = randomIndices(self.minSetSize, outputSize=self.gen, indicesToIgnore=min_idxs)
|
|
|
+ indices = randomIndices(self.minSetSize, outputSize=self.config.gen, indicesToIgnore=min_idxs)
|
|
|
r = self.nmbMin.basePoints[indices]
|
|
|
self.timing["BMB"].stop()
|
|
|
return r
|
|
|
@@ -488,7 +564,7 @@ class XConvGeN(GanBaseClass):
|
|
|
def retrainDiscriminitor(self, data, labels):
|
|
|
self.maj_min_discriminator.trainable = True
|
|
|
labels = np.array([ [x, 1 - x] for x in labels])
|
|
|
- self.maj_min_discriminator.fit(x=data, y=labels, batch_size=20, epochs=self.neb_epochs)
|
|
|
+ self.maj_min_discriminator.fit(x=data, y=labels, batch_size=20, epochs=self.config.neb_epochs)
|
|
|
self.maj_min_discriminator.trainable = False
|
|
|
|
|
|
def progressBar(self, x):
|
|
|
@@ -527,8 +603,8 @@ class XConvGeN(GanBaseClass):
|
|
|
if len(columns) == 0:
|
|
|
return voidFunction
|
|
|
|
|
|
- neb = self.neb
|
|
|
- n_feat = self.n_feat
|
|
|
+ neb = self.config.neb
|
|
|
+ n_feat = self.config.n_feat
|
|
|
nn = tf.constant([(1.0 if x in columns else 0.0) for x in range(n_feat)])
|
|
|
if n_feat is None:
|
|
|
print("ERRROR n_feat is None")
|