Преглед изворни кода

Added Class for configuration.

Kristian Schultz пре 3 година
родитељ
комит
0e5a51946e
2 измењених фајлова са 171 додато и 235 уклоњено
  1. 33 173
      XConvGeN-Example.ipynb
  2. 138 62
      library/generators/XConvGeN.py

Разлика између датотека није приказана због своје велике величине
+ 33 - 173
XConvGeN-Example.ipynb


+ 138 - 62
library/generators/XConvGeN.py

@@ -32,24 +32,89 @@ def create01Labels(totalSize, sizeFirstHalf):
     labels.extend(repeat(np.array([0,1]), totalSize - sizeFirstHalf))
     return np.array(labels)
 
+
class GeneratorConfig:
    """Configuration container for the XConvGeN generator.

    Attributes:
        n_feat: number of input features; None means "derive from the data".
        neb: minority neighbourhood size (samples per input batch).
        gen: number of synthetic samples generated per neighbourhood.
        neb_epochs: number of training epochs over the neighbourhoods.
        genLayerSizes: sizes of the generator output planes; must sum to gen.
        genAddNoise: whether a random-noise layer is appended to the generator.
    """

    def __init__(self, n_feat=None, neb=5, gen=None, neb_epochs=10, genLayerSizes=None, genAddNoise=True):
        self.n_feat = n_feat
        self.neb = neb
        self.gen = gen
        self.neb_epochs = neb_epochs
        self.genAddNoise = genAddNoise
        self.genLayerSizes = genLayerSizes

    def isConfigMissing(self):
        """Return True if any configuration value is still unset (None)."""
        return any( x is None for x in
            [ self.n_feat
            , self.neb
            , self.gen
            , self.genAddNoise
            , self.genLayerSizes
            , self.neb_epochs
            ])

    def checkForValidConfig(self):
        """Validate the configuration.

        Raises:
            ValueError: if a value is missing, neb exceeds gen, or the
                layer sizes do not sum up to gen.

        Returns:
            True when the configuration is complete and consistent.
        """
        if self.isConfigMissing():
            # BUG FIX: was an f-string with no placeholders.
            raise ValueError("Some configuration is missing.")

        if self.neb > self.gen:
            raise ValueError(f"Expected neb <= gen but got neb={self.neb} and gen={self.gen}.")

        if sum(self.genLayerSizes) != self.gen:
            raise ValueError(f"Expected the layer sizes to sum up to gen={self.gen}.")

        return True

    def fixMissingValuesByInputData(self, data):
        """Return a new config whose missing values are derived from `data`.

        `self` is left untouched; `data` is expected to be a 2-D array-like
        with `shape == (n_samples, n_features)` (may be None).
        """
        config = GeneratorConfig()
        # BUG FIX: the original forgot to copy n_feat and neb_epochs, so a
        # user-supplied neb_epochs was silently reset to the default and an
        # explicit n_feat was lost whenever data was None.
        config.n_feat = self.n_feat
        config.neb = self.neb
        config.gen = self.gen
        config.neb_epochs = self.neb_epochs
        config.genAddNoise = self.genAddNoise
        # BUG FIX: copy the list so later in-place mutation (the generator
        # appends to genLayerSizes) cannot leak back into this config.
        config.genLayerSizes = list(self.genLayerSizes) if self.genLayerSizes is not None else None

        if data is not None:
            if config.n_feat is None:
                config.n_feat = data.shape[1]

            if config.neb is None:
                config.neb = data.shape[0]
            else:
                # Never ask for a larger neighbourhood than there are samples.
                config.neb = min(config.neb, data.shape[0])

        if config.gen is None:
            config.gen = config.neb

        if config.genLayerSizes is None:
            config.genLayerSizes = [config.gen]

        return config

    def nebShape(self, aboveSize=None):
        """Shape of a neighbourhood batch, optionally with a leading axis."""
        if aboveSize is None:
            return (self.neb, self.n_feat)
        else:
            return (aboveSize, self.neb, self.n_feat)

    def genShape(self, aboveSize=None):
        """Shape of a generated batch, optionally with a leading axis."""
        if aboveSize is None:
            return (self.gen, self.n_feat)
        else:
            return (aboveSize, self.gen, self.n_feat)
+
+
+
 class XConvGeN(GanBaseClass):
     """
     This is the ConvGeN class. ConvGeN is a synthetic point generator for imbalanced datasets.
     """
-    def __init__(self, n_feat, neb=5, gen=None, neb_epochs=10, fdc=None, maj_proximal=False, debug=False):
+    def __init__(self, config=None, fdc=None, debug=False):
         self.isTrained = False
-        self.n_feat = n_feat
-        self.neb = neb
-        self.nebInitial = neb
-        self.genInitial = gen
-        self.gen = gen if gen is not None else self.neb
-        self.neb_epochs = neb_epochs
+        self.config = config
+        self.defaultConfig = config
         self.loss_history = None
         self.debug = debug
         self.minSetSize = 0
         self.conv_sample_generator = None
         self.maj_min_discriminator = None
-        self.maj_proximal = maj_proximal
         self.cg = None
         self.canPredict = True
         self.fdc = fdc
@@ -59,8 +124,8 @@ class XConvGeN(GanBaseClass):
             "Train", "BMB", "NbhSearch", "NBH", "GenSamples", "Fit", "FixType"
             ] }
 
-        if self.neb is not None and self.gen is not None and self.neb > self.gen:
-            raise ValueError(f"Expected neb <= gen but got neb={neb} and gen={gen}.")
+        if not self.config.isConfigMissing():
+            self.config.checkForValidConfig()
 
     def reset(self, data):
         """
@@ -72,16 +137,8 @@ class XConvGeN(GanBaseClass):
         """
         self.isTrained = False
 
-        if data is not None:
-            nMinoryPoints = data.shape[0]
-            if self.nebInitial is None:
-                self.neb = nMinoryPoints
-            else:
-                self.neb = min(self.nebInitial, nMinoryPoints)
-        else:
-            self.neb = self.nebInitial
-
-        self.gen = self.genInitial if self.genInitial is not None else self.neb
+        self.config = self.defaultConfig.fixMissingValuesByInputData(data)
+        self.config.checkForValidConfig()
 
         ## instanciate generator network and visualize architecture
         self.conv_sample_generator = self._conv_sample_gen()
@@ -94,7 +151,7 @@ class XConvGeN(GanBaseClass):
 
         self.lastProgress = (-1,-1,-1)
         if self.debug:
-            print(f"neb={self.neb}, gen={self.gen}")
+            print(f"neb={self.config.neb}, gen={self.config.gen}")
 
             print(self.conv_sample_generator.summary())
             print('\n')
@@ -124,12 +181,9 @@ class XConvGeN(GanBaseClass):
         if self.fdc is not None:
             normalizedData = self.fdc.normalize(data)
             
-        print(f"|N| = {normalizedData.shape}")
-        print(f"|D| = {data.shape}")
-        
         self.timing["NbhSearch"].start()
         # Precalculate neighborhoods
-        self.nmbMin = NNSearch(self.neb).fit(haystack=normalizedData)
+        self.nmbMin = NNSearch(self.config.neb).fit(haystack=normalizedData)
         self.nmbMin.basePoints = np.array([ [x.astype(np.float32) for x in p] for p in data])
         self.timing["NbhSearch"].stop()
 
@@ -158,7 +212,7 @@ class XConvGeN(GanBaseClass):
 
         ## roughly claculate the upper bound of the synthetic samples to be generated from each neighbourhood
         synth_num = (numOfSamples // self.minSetSize) + 1
-        runs = (synth_num // self.gen) + 1
+        runs = (synth_num // self.config.gen) + 1
 
         ## Get a random list of all indices
         indices = randomIndices(self.minSetSize)
@@ -184,7 +238,7 @@ class XConvGeN(GanBaseClass):
         corrected = pairs.map(self.correct_feature_types())
 
         ## extract the exact number of synthetic samples needed to exactly balance the two classes
-        r = np.concatenate(np.array(list(corrected.take(1 + (numOfSamples // self.gen)))), axis=0)[:numOfSamples]
+        r = np.concatenate(np.array(list(corrected.take(1 + (numOfSamples // self.config.gen)))), axis=0)[:numOfSamples]
 
         return r
 
@@ -202,48 +256,59 @@ class XConvGeN(GanBaseClass):
     # ###############################################################
 
     # Creating the Network: Generator
-    def _conv_sample_gen(self, layerSize=None):
+    def _conv_sample_gen(self):
         """
         The generator network to generate synthetic samples from the convex space
         of arbitrary minority neighbourhoods
         """
 
-        if layerSize is None:
-            layerSize = (self.gen // 2) + 1
+        n_feat = self.config.n_feat
+        neb = self.config.neb
+        gen = self.config.gen
+        genLayerSizes = self.config.genLayerSizes
 
         ## takes minority batch as input
-        min_neb_batch = Input(shape=(self.neb, self.n_feat,))
+        min_neb_batch = Input(shape=(neb, n_feat))
 
         ## using 1-D convolution, feature dimension remains the same
-        x = Conv1D(self.n_feat, 3, activation='relu', name="UnsharpenInput")(min_neb_batch)
+        x = Conv1D(n_feat, 3, activation='relu', name="UnsharpenInput")(min_neb_batch)
         ## flatten after convolution
         x = Flatten(name="InputMatrixToVector")(x)
 
         synth = []
         n = 0
-        while n < self.gen:
-            w = min(layerSize, self.gen - n)
+        if sum(genLayerSizes) < gen:
+            genLayerSizes.append(gen)
+
+        for layerSize in genLayerSizes:
+            w = min(layerSize, gen - n)
             if w <= 0:
                 break
             n += w
     
             ## add dense layer to transform the vector to a convenient dimension
-            y = Dense(self.neb * w, activation='relu', name=f"P{n}_dense")(x)
+            y = Dense(neb * w, activation='relu', name=f"P{n}_dense")(x)
 
             ## again, witching to 2-D tensor once we have the convenient shape
-            y = Reshape((self.neb, w), name=f"P{n}_reshape")(y)
+            y = Reshape((neb, w), name=f"P{n}_reshape")(y)
+
             ## column wise sum
             s = K.sum(y, axis=1)
+
             ## adding a small constant to always ensure the column sums are non zero.
             ## if this is not done then during initialization the sum can be zero.
             s_non_zero = Lambda(lambda x: x + .000001, name=f"P{n}_make_non_zero")(s)
+
             ## reprocals of the approximated column sum
             sinv = tf.math.reciprocal(s_non_zero, name=f"P{n}_invert")
+
             ## At this step we ensure that column sum is 1 for every row in x.
             ## That means, each column is set of convex co-efficient
             y = Multiply(name=f"P{n}_normalize")([sinv, y])
+
             ## Now we transpose the matrix. So each row is now a set of convex coefficients
             aff = tf.transpose(y[0], name=f"P{n}_transpose")
+
             ## We now do matrix multiplication of the affine combinations with the original
             ## minority batch taken as input. This generates a convex transformation
             ## of the input minority batch
@@ -252,18 +317,19 @@ class XConvGeN(GanBaseClass):
 
         synth = tf.concat(synth, axis=1, name="collect_planes")
 
-        nOut = self.gen * self.n_feat
+        nOut = gen * n_feat
 
-        noiseGenerator = Sequential([
-          InputLayer(input_shape=(self.gen, self.n_feat)),
-          Flatten(),
-          Dense(tfp.layers.IndependentNormal.params_size(nOut)),
-          tfp.layers.IndependentNormal(nOut)
-        ], name="RandomNoise")
+        if self.config.genAddNoise:
+            noiseGenerator = Sequential([
+              InputLayer(input_shape=(gen, n_feat)),
+              Flatten(),
+              Dense(tfp.layers.IndependentNormal.params_size(nOut)),
+              tfp.layers.IndependentNormal(nOut)
+            ], name="RandomNoise")
 
-        noise = noiseGenerator(synth)
-        noise = Reshape((self.gen, self.n_feat), name="ReshapeNoise")(noise)
-        synth = Add(name="AddNoise")([synth, noise])
+            noise = noiseGenerator(synth)
+            noise = Reshape((gen, n_feat), name="ReshapeNoise")(noise)
+            synth = Add(name="AddNoise")([synth, noise])
 
         ## finally we compile the generator with an arbitrary minortiy neighbourhood batch
         ## as input and a covex space transformation of the same number of samples as output
@@ -286,7 +352,7 @@ class XConvGeN(GanBaseClass):
 
         ## takes as input synthetic sample generated as input stacked upon a batch of
         ## borderline majority samples
-        samples = Input(shape=(self.n_feat,))
+        samples = Input(shape=(self.config.n_feat,))
         
         ## passed through two dense layers
         y = Dense(250, activation='relu')(samples)
@@ -309,6 +375,11 @@ class XConvGeN(GanBaseClass):
         conv_coeff_generator-> generator network instance
         maj_min_discriminator -> discriminator network instance
         """
+
+        n_feat = self.config.n_feat
+        neb = self.config.neb
+        gen = self.config.gen
+
         ## by default the discriminator trainability is switched off.
         ## Thus training ConvGeN means training the generator network as per previously
         ## trained discriminator network.
@@ -319,7 +390,7 @@ class XConvGeN(GanBaseClass):
 
         ## input receives a neighbourhood minority batch
         ## and a proximal majority batch concatenated
-        batch_data = Input(shape=(2, self.gen, self.n_feat,))
+        batch_data = Input(shape=(2, gen, n_feat))
         # batch_data: (batchSize, 2, gen, n_feat)
         
         ## extract minority batch
@@ -329,23 +400,23 @@ class XConvGeN(GanBaseClass):
         ## extract majority batch
         maj_batch = Lambda(lambda x: x[:, 1, :, :], name="SplitForDisc")(batch_data)
         # maj_batch: (batchSize, gen, n_feat)
-        maj_batch = tf.reshape(maj_batch, (-1, self.n_feat), name="ReshapeForDisc")
+        maj_batch = tf.reshape(maj_batch, (-1, n_feat), name="ReshapeForDisc")
         # maj_batch: (batchSize * gen, n_feat)
         
         ## pass minority batch into generator to obtain convex space transformation
         ## (synthetic samples) of the minority neighbourhood input batch
         conv_samples = generator(min_batch)
         # conv_batch: (batchSize, gen, n_feat)
-        conv_samples = tf.reshape(conv_samples, (-1, self.n_feat), name="ReshapeGenOutput")
+        conv_samples = tf.reshape(conv_samples, (-1, n_feat), name="ReshapeGenOutput")
         # conv_batch: (batchSize * gen, n_feat)
 
         ## pass samples into the discriminator to know its decisions
         conv_samples = discriminator(conv_samples)
-        conv_samples = tf.reshape(conv_samples, (-1, self.gen, 2), name="ReshapeGenDiscOutput")
+        conv_samples = tf.reshape(conv_samples, (-1, gen, 2), name="ReshapeGenDiscOutput")
         # conv_batch: (batchSize * gen, 2)
 
         maj_batch = discriminator(maj_batch)
-        maj_batch = tf.reshape(maj_batch, (-1, self.gen, 2), name="ReshapeMajDiscOutput")
+        maj_batch = tf.reshape(maj_batch, (-1, gen, 2), name="ReshapeMajDiscOutput")
         # conv_batch: (batchSize * gen, 2)
         
         ## concatenate the decisions
@@ -361,6 +432,11 @@ class XConvGeN(GanBaseClass):
 
     # Training
     def _rough_learning(self, data, discTrainCount, batchSize=32):
+
+        n_feat = self.config.n_feat
+        neb = self.config.neb
+        gen = self.config.gen
+
         generator = self.conv_sample_generator
         discriminator = self.maj_min_discriminator
         convGeN = self.cg
@@ -368,8 +444,8 @@ class XConvGeN(GanBaseClass):
         minSetSize = len(data)
 
         ## Create labels for one neighborhood training.
-        nLabels = 2 * self.gen
-        labels = np.array(create01Labels(nLabels, self.gen))
+        nLabels = 2 * gen
+        labels = np.array(create01Labels(nLabels, gen))
         labelsGeN = np.array([labels])
 
         def getNeighborhoods():
@@ -401,11 +477,11 @@ class XConvGeN(GanBaseClass):
                 for x in labels:
                     yield x
         
-        padd = np.zeros((self.gen - self.neb, self.n_feat))
+        padd = np.zeros((gen - neb, n_feat))
         discTrainCount = 1 + max(0, discTrainCount)    
 
-        for neb_epoch_count in range(self.neb_epochs):
-            self.progressBar([(neb_epoch_count + 1) / self.neb_epochs, 0.5, 0.5])
+        for neb_epoch_count in range(self.config.neb_epochs):
+            self.progressBar([(neb_epoch_count + 1) / self.config.neb_epochs, 0.5, 0.5])
 
             ## Training of the discriminator.
             #
@@ -428,7 +504,7 @@ class XConvGeN(GanBaseClass):
             b = tf.data.Dataset.from_tensor_slices(labels).repeat()
 
             # Zip data and matching labels together for training. 
-            samples = tf.data.Dataset.zip((a, b)).batch(batchSize * 2 * self.gen)
+            samples = tf.data.Dataset.zip((a, b)).batch(batchSize * 2 * gen)
 
             # train the discriminator with the concatenated samples and the one-hot encoded labels
             self.timing["Fit"].start()
@@ -479,7 +555,7 @@ class XConvGeN(GanBaseClass):
         ## min_idxs -> indices of points in minority class
         ## gen -> convex combinations generated from each neighbourhood
         self.timing["BMB"].start()
-        indices = randomIndices(self.minSetSize, outputSize=self.gen, indicesToIgnore=min_idxs)
+        indices = randomIndices(self.minSetSize, outputSize=self.config.gen, indicesToIgnore=min_idxs)
         r = self.nmbMin.basePoints[indices]
         self.timing["BMB"].stop()
         return r
@@ -488,7 +564,7 @@ class XConvGeN(GanBaseClass):
     def retrainDiscriminitor(self, data, labels):
         self.maj_min_discriminator.trainable = True
         labels = np.array([ [x, 1 - x] for x in labels])
-        self.maj_min_discriminator.fit(x=data, y=labels, batch_size=20, epochs=self.neb_epochs)
+        self.maj_min_discriminator.fit(x=data, y=labels, batch_size=20, epochs=self.config.neb_epochs)
         self.maj_min_discriminator.trainable = False
 
     def progressBar(self, x):
@@ -527,8 +603,8 @@ class XConvGeN(GanBaseClass):
         if len(columns) == 0:
             return voidFunction
         
-        neb = self.neb
-        n_feat = self.n_feat
+        neb = self.config.neb
+        n_feat = self.config.n_feat
         nn = tf.constant([(1.0 if x in columns else 0.0) for x in range(n_feat)])
         if n_feat is None:
             print("ERRROR n_feat is None")

Неке датотеке нису приказане због велике количине промена