Просмотр исходного кода

Added Class for configuration.

Kristian Schultz 3 лет назад
Родитель
Сommit
0e5a51946e
2 измененных файлов с 171 добавлено и 235 удалено
  1. 33 173
      XConvGeN-Example.ipynb
  2. 138 62
      library/generators/XConvGeN.py

Разница между файлами не показана из-за своего большого размера
+ 33 - 173
XConvGeN-Example.ipynb


+ 138 - 62
library/generators/XConvGeN.py

@@ -32,24 +32,89 @@ def create01Labels(totalSize, sizeFirstHalf):
     labels.extend(repeat(np.array([0,1]), totalSize - sizeFirstHalf))
     labels.extend(repeat(np.array([0,1]), totalSize - sizeFirstHalf))
     return np.array(labels)
     return np.array(labels)
 
 
+
class GeneratorConfig:
    """Hyper-parameter container for the XConvGeN generator/discriminator networks.

    Attributes:
        n_feat: number of features per data point (columns of the minority data).
        neb: neighbourhood size, i.e. minority points fed to the generator per batch.
        gen: number of synthetic samples produced per neighbourhood (neb <= gen).
        neb_epochs: number of training epochs over the neighbourhoods.
        genLayerSizes: widths of the generator's output planes; must sum to `gen`.
        genAddNoise: whether a random-noise layer is appended to the generator.
    """

    def __init__(self, n_feat=None, neb=5, gen=None, neb_epochs=10, genLayerSizes=None, genAddNoise=True):
        self.n_feat = n_feat
        self.neb = neb
        self.gen = gen
        self.neb_epochs = neb_epochs
        self.genAddNoise = genAddNoise
        self.genLayerSizes = genLayerSizes

    def isConfigMissing(self):
        """Return True if any configuration value is still None."""
        return any(x is None for x in
            [ self.n_feat
            , self.neb
            , self.gen
            , self.genAddNoise
            , self.genLayerSizes
            , self.neb_epochs
            ])

    def checkForValidConfig(self):
        """Validate the configuration.

        Raises:
            ValueError: if a value is missing, neb > gen, or the layer
                sizes do not sum up to gen.

        Returns:
            True when the configuration is complete and consistent.
        """
        if self.isConfigMissing():
            # Plain string: there are no placeholders to interpolate here.
            raise ValueError("Some configuration is missing.")

        if self.neb > self.gen:
            raise ValueError(f"Expected neb <= gen but got neb={self.neb} and gen={self.gen}.")

        if sum(self.genLayerSizes) != self.gen:
            raise ValueError(f"Expected the layer sizes to sum up to gen={self.gen}.")

        return True

    def fixMissingValuesByInputData(self, data):
        """Return a new config where missing values are derived from `data`.

        `data` is expected to be a 2-D array-like with a `.shape` attribute
        (rows = minority points, columns = features); it may be None.
        The receiver is left unmodified.
        """
        # Copy ALL user-supplied values.  Previously n_feat and neb_epochs were
        # dropped here, so an explicitly configured n_feat was ignored (making
        # the config invalid whenever data is None) and neb_epochs silently
        # fell back to the default.  genLayerSizes is copied by value so later
        # in-place mutation cannot leak back into the default config.
        config = GeneratorConfig(
            n_feat=self.n_feat,
            neb=self.neb,
            gen=self.gen,
            neb_epochs=self.neb_epochs,
            genLayerSizes=list(self.genLayerSizes) if self.genLayerSizes is not None else None,
            genAddNoise=self.genAddNoise,
        )

        if data is not None:
            if config.n_feat is None:
                config.n_feat = data.shape[1]

            if config.neb is None:
                config.neb = data.shape[0]
            else:
                # A neighbourhood cannot be larger than the data set itself.
                config.neb = min(config.neb, data.shape[0])

        if config.gen is None:
            config.gen = config.neb

        if config.genLayerSizes is None:
            config.genLayerSizes = [config.gen]

        return config

    def nebShape(self, aboveSize=None):
        """Shape of one neighbourhood batch: (neb, n_feat), optionally with a leading axis."""
        if aboveSize is None:
            return (self.neb, self.n_feat)
        else:
            return (aboveSize, self.neb, self.n_feat)

    def genShape(self, aboveSize=None):
        """Shape of one generated batch: (gen, n_feat), optionally with a leading axis."""
        if aboveSize is None:
            return (self.gen, self.n_feat)
        else:
            return (aboveSize, self.gen, self.n_feat)
+
+
+
 class XConvGeN(GanBaseClass):
 class XConvGeN(GanBaseClass):
     """
     """
     This is the ConvGeN class. ConvGeN is a synthetic point generator for imbalanced datasets.
     This is the ConvGeN class. ConvGeN is a synthetic point generator for imbalanced datasets.
     """
     """
-    def __init__(self, n_feat, neb=5, gen=None, neb_epochs=10, fdc=None, maj_proximal=False, debug=False):
+    def __init__(self, config=None, fdc=None, debug=False):
         self.isTrained = False
         self.isTrained = False
-        self.n_feat = n_feat
-        self.neb = neb
-        self.nebInitial = neb
-        self.genInitial = gen
-        self.gen = gen if gen is not None else self.neb
-        self.neb_epochs = neb_epochs
+        self.config = config
+        self.defaultConfig = config
         self.loss_history = None
         self.loss_history = None
         self.debug = debug
         self.debug = debug
         self.minSetSize = 0
         self.minSetSize = 0
         self.conv_sample_generator = None
         self.conv_sample_generator = None
         self.maj_min_discriminator = None
         self.maj_min_discriminator = None
-        self.maj_proximal = maj_proximal
         self.cg = None
         self.cg = None
         self.canPredict = True
         self.canPredict = True
         self.fdc = fdc
         self.fdc = fdc
@@ -59,8 +124,8 @@ class XConvGeN(GanBaseClass):
             "Train", "BMB", "NbhSearch", "NBH", "GenSamples", "Fit", "FixType"
             "Train", "BMB", "NbhSearch", "NBH", "GenSamples", "Fit", "FixType"
             ] }
             ] }
 
 
-        if self.neb is not None and self.gen is not None and self.neb > self.gen:
-            raise ValueError(f"Expected neb <= gen but got neb={neb} and gen={gen}.")
+        if not self.config.isConfigMissing():
+            self.config.checkForValidConfig()
 
 
     def reset(self, data):
     def reset(self, data):
         """
         """
@@ -72,16 +137,8 @@ class XConvGeN(GanBaseClass):
         """
         """
         self.isTrained = False
         self.isTrained = False
 
 
-        if data is not None:
-            nMinoryPoints = data.shape[0]
-            if self.nebInitial is None:
-                self.neb = nMinoryPoints
-            else:
-                self.neb = min(self.nebInitial, nMinoryPoints)
-        else:
-            self.neb = self.nebInitial
-
-        self.gen = self.genInitial if self.genInitial is not None else self.neb
+        self.config = self.defaultConfig.fixMissingValuesByInputData(data)
+        self.config.checkForValidConfig()
 
 
         ## instanciate generator network and visualize architecture
         ## instanciate generator network and visualize architecture
         self.conv_sample_generator = self._conv_sample_gen()
         self.conv_sample_generator = self._conv_sample_gen()
@@ -94,7 +151,7 @@ class XConvGeN(GanBaseClass):
 
 
         self.lastProgress = (-1,-1,-1)
         self.lastProgress = (-1,-1,-1)
         if self.debug:
         if self.debug:
-            print(f"neb={self.neb}, gen={self.gen}")
+            print(f"neb={self.config.neb}, gen={self.config.gen}")
 
 
             print(self.conv_sample_generator.summary())
             print(self.conv_sample_generator.summary())
             print('\n')
             print('\n')
@@ -124,12 +181,9 @@ class XConvGeN(GanBaseClass):
         if self.fdc is not None:
         if self.fdc is not None:
             normalizedData = self.fdc.normalize(data)
             normalizedData = self.fdc.normalize(data)
             
             
-        print(f"|N| = {normalizedData.shape}")
-        print(f"|D| = {data.shape}")
-        
         self.timing["NbhSearch"].start()
         self.timing["NbhSearch"].start()
         # Precalculate neighborhoods
         # Precalculate neighborhoods
-        self.nmbMin = NNSearch(self.neb).fit(haystack=normalizedData)
+        self.nmbMin = NNSearch(self.config.neb).fit(haystack=normalizedData)
         self.nmbMin.basePoints = np.array([ [x.astype(np.float32) for x in p] for p in data])
         self.nmbMin.basePoints = np.array([ [x.astype(np.float32) for x in p] for p in data])
         self.timing["NbhSearch"].stop()
         self.timing["NbhSearch"].stop()
 
 
@@ -158,7 +212,7 @@ class XConvGeN(GanBaseClass):
 
 
         ## roughly calculate the upper bound of the synthetic samples to be generated from each neighbourhood
         ## roughly calculate the upper bound of the synthetic samples to be generated from each neighbourhood
         synth_num = (numOfSamples // self.minSetSize) + 1
         synth_num = (numOfSamples // self.minSetSize) + 1
-        runs = (synth_num // self.gen) + 1
+        runs = (synth_num // self.config.gen) + 1
 
 
         ## Get a random list of all indices
         ## Get a random list of all indices
         indices = randomIndices(self.minSetSize)
         indices = randomIndices(self.minSetSize)
@@ -184,7 +238,7 @@ class XConvGeN(GanBaseClass):
         corrected = pairs.map(self.correct_feature_types())
         corrected = pairs.map(self.correct_feature_types())
 
 
         ## extract the exact number of synthetic samples needed to exactly balance the two classes
         ## extract the exact number of synthetic samples needed to exactly balance the two classes
-        r = np.concatenate(np.array(list(corrected.take(1 + (numOfSamples // self.gen)))), axis=0)[:numOfSamples]
+        r = np.concatenate(np.array(list(corrected.take(1 + (numOfSamples // self.config.gen)))), axis=0)[:numOfSamples]
 
 
         return r
         return r
 
 
@@ -202,48 +256,59 @@ class XConvGeN(GanBaseClass):
     # ###############################################################
     # ###############################################################
 
 
     # Creating the Network: Generator
     # Creating the Network: Generator
-    def _conv_sample_gen(self, layerSize=None):
+    def _conv_sample_gen(self):
         """
         """
         The generator network to generate synthetic samples from the convex space
         The generator network to generate synthetic samples from the convex space
         of arbitrary minority neighbourhoods
         of arbitrary minority neighbourhoods
         """
         """
 
 
-        if layerSize is None:
-            layerSize = (self.gen // 2) + 1
+        n_feat = self.config.n_feat
+        neb = self.config.neb
+        gen = self.config.gen
+        genLayerSizes = self.config.genLayerSizes
 
 
         ## takes minority batch as input
         ## takes minority batch as input
-        min_neb_batch = Input(shape=(self.neb, self.n_feat,))
+        min_neb_batch = Input(shape=(neb, n_feat))
 
 
         ## using 1-D convolution, feature dimension remains the same
         ## using 1-D convolution, feature dimension remains the same
-        x = Conv1D(self.n_feat, 3, activation='relu', name="UnsharpenInput")(min_neb_batch)
+        x = Conv1D(n_feat, 3, activation='relu', name="UnsharpenInput")(min_neb_batch)
         ## flatten after convolution
         ## flatten after convolution
         x = Flatten(name="InputMatrixToVector")(x)
         x = Flatten(name="InputMatrixToVector")(x)
 
 
         synth = []
         synth = []
         n = 0
         n = 0
-        while n < self.gen:
-            w = min(layerSize, self.gen - n)
+        if sum(genLayerSizes) < gen:
+            genLayerSizes.append(gen)
+
+        for layerSize in genLayerSizes:
+            w = min(layerSize, gen - n)
             if w <= 0:
             if w <= 0:
                 break
                 break
             n += w
             n += w
     
     
             ## add dense layer to transform the vector to a convenient dimension
             ## add dense layer to transform the vector to a convenient dimension
-            y = Dense(self.neb * w, activation='relu', name=f"P{n}_dense")(x)
+            y = Dense(neb * w, activation='relu', name=f"P{n}_dense")(x)
 
 
             ## again, switching to 2-D tensor once we have the convenient shape
             ## again, switching to 2-D tensor once we have the convenient shape
-            y = Reshape((self.neb, w), name=f"P{n}_reshape")(y)
+            y = Reshape((neb, w), name=f"P{n}_reshape")(y)
+
             ## column wise sum
             ## column wise sum
             s = K.sum(y, axis=1)
             s = K.sum(y, axis=1)
+
             ## adding a small constant to always ensure the column sums are non zero.
             ## adding a small constant to always ensure the column sums are non zero.
             ## if this is not done then during initialization the sum can be zero.
             ## if this is not done then during initialization the sum can be zero.
             s_non_zero = Lambda(lambda x: x + .000001, name=f"P{n}_make_non_zero")(s)
             s_non_zero = Lambda(lambda x: x + .000001, name=f"P{n}_make_non_zero")(s)
+
             ## reprocals of the approximated column sum
             ## reprocals of the approximated column sum
             sinv = tf.math.reciprocal(s_non_zero, name=f"P{n}_invert")
             sinv = tf.math.reciprocal(s_non_zero, name=f"P{n}_invert")
+
             ## At this step we ensure that column sum is 1 for every row in x.
             ## At this step we ensure that column sum is 1 for every row in x.
             ## That means, each column is set of convex co-efficient
             ## That means, each column is set of convex co-efficient
             y = Multiply(name=f"P{n}_normalize")([sinv, y])
             y = Multiply(name=f"P{n}_normalize")([sinv, y])
+
             ## Now we transpose the matrix. So each row is now a set of convex coefficients
             ## Now we transpose the matrix. So each row is now a set of convex coefficients
             aff = tf.transpose(y[0], name=f"P{n}_transpose")
             aff = tf.transpose(y[0], name=f"P{n}_transpose")
+
             ## We now do matrix multiplication of the affine combinations with the original
             ## We now do matrix multiplication of the affine combinations with the original
             ## minority batch taken as input. This generates a convex transformation
             ## minority batch taken as input. This generates a convex transformation
             ## of the input minority batch
             ## of the input minority batch
@@ -252,18 +317,19 @@ class XConvGeN(GanBaseClass):
 
 
         synth = tf.concat(synth, axis=1, name="collect_planes")
         synth = tf.concat(synth, axis=1, name="collect_planes")
 
 
-        nOut = self.gen * self.n_feat
+        nOut = gen * n_feat
 
 
-        noiseGenerator = Sequential([
-          InputLayer(input_shape=(self.gen, self.n_feat)),
-          Flatten(),
-          Dense(tfp.layers.IndependentNormal.params_size(nOut)),
-          tfp.layers.IndependentNormal(nOut)
-        ], name="RandomNoise")
+        if self.config.genAddNoise:
+            noiseGenerator = Sequential([
+              InputLayer(input_shape=(gen, n_feat)),
+              Flatten(),
+              Dense(tfp.layers.IndependentNormal.params_size(nOut)),
+              tfp.layers.IndependentNormal(nOut)
+            ], name="RandomNoise")
 
 
-        noise = noiseGenerator(synth)
-        noise = Reshape((self.gen, self.n_feat), name="ReshapeNoise")(noise)
-        synth = Add(name="AddNoise")([synth, noise])
+            noise = noiseGenerator(synth)
+            noise = Reshape((gen, n_feat), name="ReshapeNoise")(noise)
+            synth = Add(name="AddNoise")([synth, noise])
 
 
         ## finally we compile the generator with an arbitrary minortiy neighbourhood batch
         ## finally we compile the generator with an arbitrary minortiy neighbourhood batch
         ## as input and a covex space transformation of the same number of samples as output
         ## as input and a covex space transformation of the same number of samples as output
@@ -286,7 +352,7 @@ class XConvGeN(GanBaseClass):
 
 
         ## takes as input synthetic sample generated as input stacked upon a batch of
         ## takes as input synthetic sample generated as input stacked upon a batch of
         ## borderline majority samples
         ## borderline majority samples
-        samples = Input(shape=(self.n_feat,))
+        samples = Input(shape=(self.config.n_feat,))
         
         
         ## passed through two dense layers
         ## passed through two dense layers
         y = Dense(250, activation='relu')(samples)
         y = Dense(250, activation='relu')(samples)
@@ -309,6 +375,11 @@ class XConvGeN(GanBaseClass):
         conv_coeff_generator-> generator network instance
         conv_coeff_generator-> generator network instance
         maj_min_discriminator -> discriminator network instance
         maj_min_discriminator -> discriminator network instance
         """
         """
+
+        n_feat = self.config.n_feat
+        neb = self.config.neb
+        gen = self.config.gen
+
         ## by default the discriminator trainability is switched off.
         ## by default the discriminator trainability is switched off.
         ## Thus training ConvGeN means training the generator network as per previously
         ## Thus training ConvGeN means training the generator network as per previously
         ## trained discriminator network.
         ## trained discriminator network.
@@ -319,7 +390,7 @@ class XConvGeN(GanBaseClass):
 
 
         ## input receives a neighbourhood minority batch
         ## input receives a neighbourhood minority batch
         ## and a proximal majority batch concatenated
         ## and a proximal majority batch concatenated
-        batch_data = Input(shape=(2, self.gen, self.n_feat,))
+        batch_data = Input(shape=(2, gen, n_feat))
         # batch_data: (batchSize, 2, gen, n_feat)
         # batch_data: (batchSize, 2, gen, n_feat)
         
         
         ## extract minority batch
         ## extract minority batch
@@ -329,23 +400,23 @@ class XConvGeN(GanBaseClass):
         ## extract majority batch
         ## extract majority batch
         maj_batch = Lambda(lambda x: x[:, 1, :, :], name="SplitForDisc")(batch_data)
         maj_batch = Lambda(lambda x: x[:, 1, :, :], name="SplitForDisc")(batch_data)
         # maj_batch: (batchSize, gen, n_feat)
         # maj_batch: (batchSize, gen, n_feat)
-        maj_batch = tf.reshape(maj_batch, (-1, self.n_feat), name="ReshapeForDisc")
+        maj_batch = tf.reshape(maj_batch, (-1, n_feat), name="ReshapeForDisc")
         # maj_batch: (batchSize * gen, n_feat)
         # maj_batch: (batchSize * gen, n_feat)
         
         
         ## pass minority batch into generator to obtain convex space transformation
         ## pass minority batch into generator to obtain convex space transformation
         ## (synthetic samples) of the minority neighbourhood input batch
         ## (synthetic samples) of the minority neighbourhood input batch
         conv_samples = generator(min_batch)
         conv_samples = generator(min_batch)
         # conv_batch: (batchSize, gen, n_feat)
         # conv_batch: (batchSize, gen, n_feat)
-        conv_samples = tf.reshape(conv_samples, (-1, self.n_feat), name="ReshapeGenOutput")
+        conv_samples = tf.reshape(conv_samples, (-1, n_feat), name="ReshapeGenOutput")
         # conv_batch: (batchSize * gen, n_feat)
         # conv_batch: (batchSize * gen, n_feat)
 
 
         ## pass samples into the discriminator to know its decisions
         ## pass samples into the discriminator to know its decisions
         conv_samples = discriminator(conv_samples)
         conv_samples = discriminator(conv_samples)
-        conv_samples = tf.reshape(conv_samples, (-1, self.gen, 2), name="ReshapeGenDiscOutput")
+        conv_samples = tf.reshape(conv_samples, (-1, gen, 2), name="ReshapeGenDiscOutput")
         # conv_batch: (batchSize * gen, 2)
         # conv_batch: (batchSize * gen, 2)
 
 
         maj_batch = discriminator(maj_batch)
         maj_batch = discriminator(maj_batch)
-        maj_batch = tf.reshape(maj_batch, (-1, self.gen, 2), name="ReshapeMajDiscOutput")
+        maj_batch = tf.reshape(maj_batch, (-1, gen, 2), name="ReshapeMajDiscOutput")
         # conv_batch: (batchSize * gen, 2)
         # conv_batch: (batchSize * gen, 2)
         
         
         ## concatenate the decisions
         ## concatenate the decisions
@@ -361,6 +432,11 @@ class XConvGeN(GanBaseClass):
 
 
     # Training
     # Training
     def _rough_learning(self, data, discTrainCount, batchSize=32):
     def _rough_learning(self, data, discTrainCount, batchSize=32):
+
+        n_feat = self.config.n_feat
+        neb = self.config.neb
+        gen = self.config.gen
+
         generator = self.conv_sample_generator
         generator = self.conv_sample_generator
         discriminator = self.maj_min_discriminator
         discriminator = self.maj_min_discriminator
         convGeN = self.cg
         convGeN = self.cg
@@ -368,8 +444,8 @@ class XConvGeN(GanBaseClass):
         minSetSize = len(data)
         minSetSize = len(data)
 
 
         ## Create labels for one neighborhood training.
         ## Create labels for one neighborhood training.
-        nLabels = 2 * self.gen
-        labels = np.array(create01Labels(nLabels, self.gen))
+        nLabels = 2 * gen
+        labels = np.array(create01Labels(nLabels, gen))
         labelsGeN = np.array([labels])
         labelsGeN = np.array([labels])
 
 
         def getNeighborhoods():
         def getNeighborhoods():
@@ -401,11 +477,11 @@ class XConvGeN(GanBaseClass):
                 for x in labels:
                 for x in labels:
                     yield x
                     yield x
         
         
-        padd = np.zeros((self.gen - self.neb, self.n_feat))
+        padd = np.zeros((gen - neb, n_feat))
         discTrainCount = 1 + max(0, discTrainCount)    
         discTrainCount = 1 + max(0, discTrainCount)    
 
 
-        for neb_epoch_count in range(self.neb_epochs):
-            self.progressBar([(neb_epoch_count + 1) / self.neb_epochs, 0.5, 0.5])
+        for neb_epoch_count in range(self.config.neb_epochs):
+            self.progressBar([(neb_epoch_count + 1) / self.config.neb_epochs, 0.5, 0.5])
 
 
             ## Training of the discriminator.
             ## Training of the discriminator.
             #
             #
@@ -428,7 +504,7 @@ class XConvGeN(GanBaseClass):
             b = tf.data.Dataset.from_tensor_slices(labels).repeat()
             b = tf.data.Dataset.from_tensor_slices(labels).repeat()
 
 
             # Zip data and matching labels together for training. 
             # Zip data and matching labels together for training. 
-            samples = tf.data.Dataset.zip((a, b)).batch(batchSize * 2 * self.gen)
+            samples = tf.data.Dataset.zip((a, b)).batch(batchSize * 2 * gen)
 
 
             # train the discriminator with the concatenated samples and the one-hot encoded labels
             # train the discriminator with the concatenated samples and the one-hot encoded labels
             self.timing["Fit"].start()
             self.timing["Fit"].start()
@@ -479,7 +555,7 @@ class XConvGeN(GanBaseClass):
         ## min_idxs -> indices of points in minority class
         ## min_idxs -> indices of points in minority class
         ## gen -> convex combinations generated from each neighbourhood
         ## gen -> convex combinations generated from each neighbourhood
         self.timing["BMB"].start()
         self.timing["BMB"].start()
-        indices = randomIndices(self.minSetSize, outputSize=self.gen, indicesToIgnore=min_idxs)
+        indices = randomIndices(self.minSetSize, outputSize=self.config.gen, indicesToIgnore=min_idxs)
         r = self.nmbMin.basePoints[indices]
         r = self.nmbMin.basePoints[indices]
         self.timing["BMB"].stop()
         self.timing["BMB"].stop()
         return r
         return r
@@ -488,7 +564,7 @@ class XConvGeN(GanBaseClass):
     def retrainDiscriminitor(self, data, labels):
     def retrainDiscriminitor(self, data, labels):
         self.maj_min_discriminator.trainable = True
         self.maj_min_discriminator.trainable = True
         labels = np.array([ [x, 1 - x] for x in labels])
         labels = np.array([ [x, 1 - x] for x in labels])
-        self.maj_min_discriminator.fit(x=data, y=labels, batch_size=20, epochs=self.neb_epochs)
+        self.maj_min_discriminator.fit(x=data, y=labels, batch_size=20, epochs=self.config.neb_epochs)
         self.maj_min_discriminator.trainable = False
         self.maj_min_discriminator.trainable = False
 
 
     def progressBar(self, x):
     def progressBar(self, x):
@@ -527,8 +603,8 @@ class XConvGeN(GanBaseClass):
         if len(columns) == 0:
         if len(columns) == 0:
             return voidFunction
             return voidFunction
         
         
-        neb = self.neb
-        n_feat = self.n_feat
+        neb = self.config.neb
+        n_feat = self.config.n_feat
         nn = tf.constant([(1.0 if x in columns else 0.0) for x in range(n_feat)])
         nn = tf.constant([(1.0 if x in columns else 0.0) for x in range(n_feat)])
         if n_feat is None:
         if n_feat is None:
             print("ERRROR n_feat is None")
             print("ERRROR n_feat is None")

Некоторые файлы не были показаны из-за большого количества измененных файлов