Просмотр исходного кода

Added Class for configuration.

Kristian Schultz 3 лет назад
Родитель
Сommit
0e5a51946e
2 измененных файлов с 171 добавлено и 235 удалено
  1. 33 173
      XConvGeN-Example.ipynb
  2. 138 62
      library/generators/XConvGeN.py

Разница между файлами не показана из-за своего большого размера
+ 33 - 173
XConvGeN-Example.ipynb


+ 138 - 62
library/generators/XConvGeN.py

@@ -32,24 +32,89 @@ def create01Labels(totalSize, sizeFirstHalf):
     labels.extend(repeat(np.array([0,1]), totalSize - sizeFirstHalf))
     labels.extend(repeat(np.array([0,1]), totalSize - sizeFirstHalf))
     return np.array(labels)
     return np.array(labels)
 
 
+
class GeneratorConfig:
    """Hyper-parameter container for the XConvGeN generator/discriminator networks.

    Attributes:
        n_feat: number of features per data point (columns of the minority data).
        neb: neighbourhood size, i.e. minority points fed to the generator per batch.
        gen: number of synthetic samples produced per neighbourhood (neb <= gen).
        neb_epochs: number of training epochs over the neighbourhoods.
        genLayerSizes: widths of the generator's output planes; must sum to `gen`.
        genAddNoise: whether a random-noise layer is appended to the generator.
    """

    def __init__(self, n_feat=None, neb=5, gen=None, neb_epochs=10, genLayerSizes=None, genAddNoise=True):
        self.n_feat = n_feat
        self.neb = neb
        self.gen = gen
        self.neb_epochs = neb_epochs
        self.genAddNoise = genAddNoise
        self.genLayerSizes = genLayerSizes

    def isConfigMissing(self):
        """Return True if any configuration value is still None."""
        return any(x is None for x in
            [ self.n_feat
            , self.neb
            , self.gen
            , self.genAddNoise
            , self.genLayerSizes
            , self.neb_epochs
            ])

    def checkForValidConfig(self):
        """Validate the configuration.

        Raises:
            ValueError: if a value is missing, neb > gen, or the layer
                sizes do not sum up to gen.

        Returns:
            True when the configuration is complete and consistent.
        """
        if self.isConfigMissing():
            # Plain string: there are no placeholders to interpolate here.
            raise ValueError("Some configuration is missing.")

        if self.neb > self.gen:
            raise ValueError(f"Expected neb <= gen but got neb={self.neb} and gen={self.gen}.")

        if sum(self.genLayerSizes) != self.gen:
            raise ValueError(f"Expected the layer sizes to sum up to gen={self.gen}.")

        return True

    def fixMissingValuesByInputData(self, data):
        """Return a new config where missing values are derived from `data`.

        `data` is expected to be a 2-D array-like with a `.shape` attribute
        (rows = minority points, columns = features); it may be None.
        The receiver is left unmodified.
        """
        # Copy ALL user-supplied values.  Previously n_feat and neb_epochs were
        # dropped here, so an explicitly configured n_feat was ignored (making
        # the config invalid whenever data is None) and neb_epochs silently
        # fell back to the default.  genLayerSizes is copied by value so later
        # in-place mutation cannot leak back into the default config.
        config = GeneratorConfig(
            n_feat=self.n_feat,
            neb=self.neb,
            gen=self.gen,
            neb_epochs=self.neb_epochs,
            genLayerSizes=list(self.genLayerSizes) if self.genLayerSizes is not None else None,
            genAddNoise=self.genAddNoise,
        )

        if data is not None:
            if config.n_feat is None:
                config.n_feat = data.shape[1]

            if config.neb is None:
                config.neb = data.shape[0]
            else:
                # A neighbourhood cannot be larger than the data set itself.
                config.neb = min(config.neb, data.shape[0])

        if config.gen is None:
            config.gen = config.neb

        if config.genLayerSizes is None:
            config.genLayerSizes = [config.gen]

        return config

    def nebShape(self, aboveSize=None):
        """Shape of one neighbourhood batch: (neb, n_feat), optionally with a leading axis."""
        if aboveSize is None:
            return (self.neb, self.n_feat)
        else:
            return (aboveSize, self.neb, self.n_feat)

    def genShape(self, aboveSize=None):
        """Shape of one generated batch: (gen, n_feat), optionally with a leading axis."""
        if aboveSize is None:
            return (self.gen, self.n_feat)
        else:
            return (aboveSize, self.gen, self.n_feat)
+
+
+
 class XConvGeN(GanBaseClass):
 class XConvGeN(GanBaseClass):
     """
     """
     This is the ConvGeN class. ConvGeN is a synthetic point generator for imbalanced datasets.
     This is the ConvGeN class. ConvGeN is a synthetic point generator for imbalanced datasets.
     """
     """
-    def __init__(self, n_feat, neb=5, gen=None, neb_epochs=10, fdc=None, maj_proximal=False, debug=False):
+    def __init__(self, config=None, fdc=None, debug=False):
         self.isTrained = False
         self.isTrained = False
-        self.n_feat = n_feat
-        self.neb = neb
-        self.nebInitial = neb
-        self.genInitial = gen
-        self.gen = gen if gen is not None else self.neb
-        self.neb_epochs = neb_epochs
+        self.config = config
+        self.defaultConfig = config
         self.loss_history = None
         self.loss_history = None
         self.debug = debug
         self.debug = debug
         self.minSetSize = 0
         self.minSetSize = 0
         self.conv_sample_generator = None
         self.conv_sample_generator = None
         self.maj_min_discriminator = None
         self.maj_min_discriminator = None
-        self.maj_proximal = maj_proximal
         self.cg = None
         self.cg = None
         self.canPredict = True
         self.canPredict = True
         self.fdc = fdc
         self.fdc = fdc
@@ -59,8 +124,8 @@ class XConvGeN(GanBaseClass):
             "Train", "BMB", "NbhSearch", "NBH", "GenSamples", "Fit", "FixType"
             "Train", "BMB", "NbhSearch", "NBH", "GenSamples", "Fit", "FixType"
             ] }
             ] }
 
 
-        if self.neb is not None and self.gen is not None and self.neb > self.gen:
-            raise ValueError(f"Expected neb <= gen but got neb={neb} and gen={gen}.")
+        if not self.config.isConfigMissing():
+            self.config.checkForValidConfig()
 
 
     def reset(self, data):
     def reset(self, data):
         """
         """
@@ -72,16 +137,8 @@ class XConvGeN(GanBaseClass):
         """
         """
         self.isTrained = False
         self.isTrained = False
 
 
-        if data is not None:
-            nMinoryPoints = data.shape[0]
-            if self.nebInitial is None:
-                self.neb = nMinoryPoints
-            else:
-                self.neb = min(self.nebInitial, nMinoryPoints)
-        else:
-            self.neb = self.nebInitial
-
-        self.gen = self.genInitial if self.genInitial is not None else self.neb
+        self.config = self.defaultConfig.fixMissingValuesByInputData(data)
+        self.config.checkForValidConfig()
 
 
         ## instanciate generator network and visualize architecture
         ## instanciate generator network and visualize architecture
         self.conv_sample_generator = self._conv_sample_gen()
         self.conv_sample_generator = self._conv_sample_gen()
@@ -94,7 +151,7 @@ class XConvGeN(GanBaseClass):
 
 
         self.lastProgress = (-1,-1,-1)
         self.lastProgress = (-1,-1,-1)
         if self.debug:
         if self.debug:
-            print(f"neb={self.neb}, gen={self.gen}")
+            print(f"neb={self.config.neb}, gen={self.config.gen}")
 
 
             print(self.conv_sample_generator.summary())
             print(self.conv_sample_generator.summary())
             print('\n')
             print('\n')
@@ -124,12 +181,9 @@ class XConvGeN(GanBaseClass):
         if self.fdc is not None:
         if self.fdc is not None:
             normalizedData = self.fdc.normalize(data)
             normalizedData = self.fdc.normalize(data)
             
             
-        print(f"|N| = {normalizedData.shape}")
-        print(f"|D| = {data.shape}")
-        
         self.timing["NbhSearch"].start()
         self.timing["NbhSearch"].start()
         # Precalculate neighborhoods
         # Precalculate neighborhoods
-        self.nmbMin = NNSearch(self.neb).fit(haystack=normalizedData)
+        self.nmbMin = NNSearch(self.config.neb).fit(haystack=normalizedData)
         self.nmbMin.basePoints = np.array([ [x.astype(np.float32) for x in p] for p in data])
         self.nmbMin.basePoints = np.array([ [x.astype(np.float32) for x in p] for p in data])
         self.timing["NbhSearch"].stop()
         self.timing["NbhSearch"].stop()
 
 
@@ -158,7 +212,7 @@ class XConvGeN(GanBaseClass):
 
 
         ## roughly calculate the upper bound of the synthetic samples to be generated from each neighbourhood
         ## roughly calculate the upper bound of the synthetic samples to be generated from each neighbourhood
         synth_num = (numOfSamples // self.minSetSize) + 1
         synth_num = (numOfSamples // self.minSetSize) + 1
-        runs = (synth_num // self.gen) + 1
+        runs = (synth_num // self.config.gen) + 1
 
 
         ## Get a random list of all indices
         ## Get a random list of all indices
         indices = randomIndices(self.minSetSize)
         indices = randomIndices(self.minSetSize)
@@ -184,7 +238,7 @@ class XConvGeN(GanBaseClass):
         corrected = pairs.map(self.correct_feature_types())
         corrected = pairs.map(self.correct_feature_types())
 
 
         ## extract the exact number of synthetic samples needed to exactly balance the two classes
         ## extract the exact number of synthetic samples needed to exactly balance the two classes
-        r = np.concatenate(np.array(list(corrected.take(1 + (numOfSamples // self.gen)))), axis=0)[:numOfSamples]
+        r = np.concatenate(np.array(list(corrected.take(1 + (numOfSamples // self.config.gen)))), axis=0)[:numOfSamples]
 
 
         return r
         return r
 
 
@@ -202,48 +256,59 @@ class XConvGeN(GanBaseClass):
     # ###############################################################
     # ###############################################################
 
 
     # Creating the Network: Generator
     # Creating the Network: Generator
-    def _conv_sample_gen(self, layerSize=None):
+    def _conv_sample_gen(self):
         """
         """
         The generator network to generate synthetic samples from the convex space
         The generator network to generate synthetic samples from the convex space
         of arbitrary minority neighbourhoods
         of arbitrary minority neighbourhoods
         """
         """
 
 
-        if layerSize is None:
-            layerSize = (self.gen // 2) + 1
+        n_feat = self.config.n_feat
+        neb = self.config.neb
+        gen = self.config.gen
+        genLayerSizes = self.config.genLayerSizes
 
 
         ## takes minority batch as input
         ## takes minority batch as input
-        min_neb_batch = Input(shape=(self.neb, self.n_feat,))
+        min_neb_batch = Input(shape=(neb, n_feat))
 
 
         ## using 1-D convolution, feature dimension remains the same
         ## using 1-D convolution, feature dimension remains the same
-        x = Conv1D(self.n_feat, 3, activation='relu', name="UnsharpenInput")(min_neb_batch)
+        x = Conv1D(n_feat, 3, activation='relu', name="UnsharpenInput")(min_neb_batch)
         ## flatten after convolution
         ## flatten after convolution
         x = Flatten(name="InputMatrixToVector")(x)
         x = Flatten(name="InputMatrixToVector")(x)
 
 
         synth = []
         synth = []
         n = 0
         n = 0
-        while n < self.gen:
-            w = min(layerSize, self.gen - n)
+        if sum(genLayerSizes) < gen:
+            genLayerSizes.append(gen)
+
+        for layerSize in genLayerSizes:
+            w = min(layerSize, gen - n)
             if w <= 0:
             if w <= 0:
                 break
                 break
             n += w
             n += w
     
     
             ## add dense layer to transform the vector to a convenient dimension
             ## add dense layer to transform the vector to a convenient dimension
-            y = Dense(self.neb * w, activation='relu', name=f"P{n}_dense")(x)
+            y = Dense(neb * w, activation='relu', name=f"P{n}_dense")(x)
 
 
             ## again, switching to 2-D tensor once we have the convenient shape
             ## again, switching to 2-D tensor once we have the convenient shape
-            y = Reshape((self.neb, w), name=f"P{n}_reshape")(y)
+            y = Reshape((neb, w), name=f"P{n}_reshape")(y)
+
             ## column wise sum
             ## column wise sum
             s = K.sum(y, axis=1)
             s = K.sum(y, axis=1)
+
             ## adding a small constant to always ensure the column sums are non zero.
             ## adding a small constant to always ensure the column sums are non zero.
             ## if this is not done then during initialization the sum can be zero.
             ## if this is not done then during initialization the sum can be zero.
             s_non_zero = Lambda(lambda x: x + .000001, name=f"P{n}_make_non_zero")(s)
             s_non_zero = Lambda(lambda x: x + .000001, name=f"P{n}_make_non_zero")(s)
+
             ## reprocals of the approximated column sum
             ## reprocals of the approximated column sum
             sinv = tf.math.reciprocal(s_non_zero, name=f"P{n}_invert")
             sinv = tf.math.reciprocal(s_non_zero, name=f"P{n}_invert")
+
             ## At this step we ensure that column sum is 1 for every row in x.
             ## At this step we ensure that column sum is 1 for every row in x.
             ## That means, each column is set of convex co-efficient
             ## That means, each column is set of convex co-efficient
             y = Multiply(name=f"P{n}_normalize")([sinv, y])
             y = Multiply(name=f"P{n}_normalize")([sinv, y])
+
             ## Now we transpose the matrix. So each row is now a set of convex coefficients
             ## Now we transpose the matrix. So each row is now a set of convex coefficients
             aff = tf.transpose(y[0], name=f"P{n}_transpose")
             aff = tf.transpose(y[0], name=f"P{n}_transpose")
+
             ## We now do matrix multiplication of the affine combinations with the original
             ## We now do matrix multiplication of the affine combinations with the original
             ## minority batch taken as input. This generates a convex transformation
             ## minority batch taken as input. This generates a convex transformation
             ## of the input minority batch
             ## of the input minority batch
@@ -252,18 +317,19 @@ class XConvGeN(GanBaseClass):
 
 
         synth = tf.concat(synth, axis=1, name="collect_planes")
         synth = tf.concat(synth, axis=1, name="collect_planes")
 
 
-        nOut = self.gen * self.n_feat
+        nOut = gen * n_feat
 
 
-        noiseGenerator = Sequential([
-          InputLayer(input_shape=(self.gen, self.n_feat)),
-          Flatten(),
-          Dense(tfp.layers.IndependentNormal.params_size(nOut)),
-          tfp.layers.IndependentNormal(nOut)
-        ], name="RandomNoise")
+        if self.config.genAddNoise:
+            noiseGenerator = Sequential([
+              InputLayer(input_shape=(gen, n_feat)),
+              Flatten(),
+              Dense(tfp.layers.IndependentNormal.params_size(nOut)),
+              tfp.layers.IndependentNormal(nOut)
+            ], name="RandomNoise")
 
 
-        noise = noiseGenerator(synth)
-        noise = Reshape((self.gen, self.n_feat), name="ReshapeNoise")(noise)
-        synth = Add(name="AddNoise")([synth, noise])
+            noise = noiseGenerator(synth)
+            noise = Reshape((gen, n_feat), name="ReshapeNoise")(noise)
+            synth = Add(name="AddNoise")([synth, noise])
 
 
         ## finally we compile the generator with an arbitrary minortiy neighbourhood batch
         ## finally we compile the generator with an arbitrary minortiy neighbourhood batch
         ## as input and a covex space transformation of the same number of samples as output
         ## as input and a covex space transformation of the same number of samples as output
@@ -286,7 +352,7 @@ class XConvGeN(GanBaseClass):
 
 
         ## takes as input synthetic sample generated as input stacked upon a batch of
         ## takes as input synthetic sample generated as input stacked upon a batch of
         ## borderline majority samples
         ## borderline majority samples
-        samples = Input(shape=(self.n_feat,))
+        samples = Input(shape=(self.config.n_feat,))
         
         
         ## passed through two dense layers
         ## passed through two dense layers
         y = Dense(250, activation='relu')(samples)
         y = Dense(250, activation='relu')(samples)
@@ -309,6 +375,11 @@ class XConvGeN(GanBaseClass):
         conv_coeff_generator-> generator network instance
         conv_coeff_generator-> generator network instance
         maj_min_discriminator -> discriminator network instance
         maj_min_discriminator -> discriminator network instance
         """
         """
+
+        n_feat = self.config.n_feat
+        neb = self.config.neb
+        gen = self.config.gen
+
         ## by default the discriminator trainability is switched off.
         ## by default the discriminator trainability is switched off.
         ## Thus training ConvGeN means training the generator network as per previously
         ## Thus training ConvGeN means training the generator network as per previously
         ## trained discriminator network.
         ## trained discriminator network.
@@ -319,7 +390,7 @@ class XConvGeN(GanBaseClass):
 
 
         ## input receives a neighbourhood minority batch
         ## input receives a neighbourhood minority batch
         ## and a proximal majority batch concatenated
         ## and a proximal majority batch concatenated
-        batch_data = Input(shape=(2, self.gen, self.n_feat,))
+        batch_data = Input(shape=(2, gen, n_feat))
         # batch_data: (batchSize, 2, gen, n_feat)
         # batch_data: (batchSize, 2, gen, n_feat)
         
         
         ## extract minority batch
         ## extract minority batch
@@ -329,23 +400,23 @@ class XConvGeN(GanBaseClass):
         ## extract majority batch
         ## extract majority batch
         maj_batch = Lambda(lambda x: x[:, 1, :, :], name="SplitForDisc")(batch_data)
         maj_batch = Lambda(lambda x: x[:, 1, :, :], name="SplitForDisc")(batch_data)
         # maj_batch: (batchSize, gen, n_feat)
         # maj_batch: (batchSize, gen, n_feat)
-        maj_batch = tf.reshape(maj_batch, (-1, self.n_feat), name="ReshapeForDisc")
+        maj_batch = tf.reshape(maj_batch, (-1, n_feat), name="ReshapeForDisc")
         # maj_batch: (batchSize * gen, n_feat)
         # maj_batch: (batchSize * gen, n_feat)
         
         
         ## pass minority batch into generator to obtain convex space transformation
         ## pass minority batch into generator to obtain convex space transformation
         ## (synthetic samples) of the minority neighbourhood input batch
         ## (synthetic samples) of the minority neighbourhood input batch
         conv_samples = generator(min_batch)
         conv_samples = generator(min_batch)
         # conv_batch: (batchSize, gen, n_feat)
         # conv_batch: (batchSize, gen, n_feat)
-        conv_samples = tf.reshape(conv_samples, (-1, self.n_feat), name="ReshapeGenOutput")
+        conv_samples = tf.reshape(conv_samples, (-1, n_feat), name="ReshapeGenOutput")
         # conv_batch: (batchSize * gen, n_feat)
         # conv_batch: (batchSize * gen, n_feat)
 
 
         ## pass samples into the discriminator to know its decisions
         ## pass samples into the discriminator to know its decisions
         conv_samples = discriminator(conv_samples)
         conv_samples = discriminator(conv_samples)
-        conv_samples = tf.reshape(conv_samples, (-1, self.gen, 2), name="ReshapeGenDiscOutput")
+        conv_samples = tf.reshape(conv_samples, (-1, gen, 2), name="ReshapeGenDiscOutput")
         # conv_batch: (batchSize * gen, 2)
         # conv_batch: (batchSize * gen, 2)
 
 
         maj_batch = discriminator(maj_batch)
         maj_batch = discriminator(maj_batch)
-        maj_batch = tf.reshape(maj_batch, (-1, self.gen, 2), name="ReshapeMajDiscOutput")
+        maj_batch = tf.reshape(maj_batch, (-1, gen, 2), name="ReshapeMajDiscOutput")
         # conv_batch: (batchSize * gen, 2)
         # conv_batch: (batchSize * gen, 2)
         
         
         ## concatenate the decisions
         ## concatenate the decisions
@@ -361,6 +432,11 @@ class XConvGeN(GanBaseClass):
 
 
     # Training
     # Training
     def _rough_learning(self, data, discTrainCount, batchSize=32):
     def _rough_learning(self, data, discTrainCount, batchSize=32):
+
+        n_feat = self.config.n_feat
+        neb = self.config.neb
+        gen = self.config.gen
+
         generator = self.conv_sample_generator
         generator = self.conv_sample_generator
         discriminator = self.maj_min_discriminator
         discriminator = self.maj_min_discriminator
         convGeN = self.cg
         convGeN = self.cg
@@ -368,8 +444,8 @@ class XConvGeN(GanBaseClass):
         minSetSize = len(data)
         minSetSize = len(data)
 
 
         ## Create labels for one neighborhood training.
         ## Create labels for one neighborhood training.
-        nLabels = 2 * self.gen
-        labels = np.array(create01Labels(nLabels, self.gen))
+        nLabels = 2 * gen
+        labels = np.array(create01Labels(nLabels, gen))
         labelsGeN = np.array([labels])
         labelsGeN = np.array([labels])
 
 
         def getNeighborhoods():
         def getNeighborhoods():
@@ -401,11 +477,11 @@ class XConvGeN(GanBaseClass):
                 for x in labels:
                 for x in labels:
                     yield x
                     yield x
         
         
-        padd = np.zeros((self.gen - self.neb, self.n_feat))
+        padd = np.zeros((gen - neb, n_feat))
         discTrainCount = 1 + max(0, discTrainCount)    
         discTrainCount = 1 + max(0, discTrainCount)    
 
 
-        for neb_epoch_count in range(self.neb_epochs):
-            self.progressBar([(neb_epoch_count + 1) / self.neb_epochs, 0.5, 0.5])
+        for neb_epoch_count in range(self.config.neb_epochs):
+            self.progressBar([(neb_epoch_count + 1) / self.config.neb_epochs, 0.5, 0.5])
 
 
             ## Training of the discriminator.
             ## Training of the discriminator.
             #
             #
@@ -428,7 +504,7 @@ class XConvGeN(GanBaseClass):
             b = tf.data.Dataset.from_tensor_slices(labels).repeat()
             b = tf.data.Dataset.from_tensor_slices(labels).repeat()
 
 
             # Zip data and matching labels together for training. 
             # Zip data and matching labels together for training. 
-            samples = tf.data.Dataset.zip((a, b)).batch(batchSize * 2 * self.gen)
+            samples = tf.data.Dataset.zip((a, b)).batch(batchSize * 2 * gen)
 
 
             # train the discriminator with the concatenated samples and the one-hot encoded labels
             # train the discriminator with the concatenated samples and the one-hot encoded labels
             self.timing["Fit"].start()
             self.timing["Fit"].start()
@@ -479,7 +555,7 @@ class XConvGeN(GanBaseClass):
         ## min_idxs -> indices of points in minority class
         ## min_idxs -> indices of points in minority class
         ## gen -> convex combinations generated from each neighbourhood
         ## gen -> convex combinations generated from each neighbourhood
         self.timing["BMB"].start()
         self.timing["BMB"].start()
-        indices = randomIndices(self.minSetSize, outputSize=self.gen, indicesToIgnore=min_idxs)
+        indices = randomIndices(self.minSetSize, outputSize=self.config.gen, indicesToIgnore=min_idxs)
         r = self.nmbMin.basePoints[indices]
         r = self.nmbMin.basePoints[indices]
         self.timing["BMB"].stop()
         self.timing["BMB"].stop()
         return r
         return r
@@ -488,7 +564,7 @@ class XConvGeN(GanBaseClass):
     def retrainDiscriminitor(self, data, labels):
     def retrainDiscriminitor(self, data, labels):
         self.maj_min_discriminator.trainable = True
         self.maj_min_discriminator.trainable = True
         labels = np.array([ [x, 1 - x] for x in labels])
         labels = np.array([ [x, 1 - x] for x in labels])
-        self.maj_min_discriminator.fit(x=data, y=labels, batch_size=20, epochs=self.neb_epochs)
+        self.maj_min_discriminator.fit(x=data, y=labels, batch_size=20, epochs=self.config.neb_epochs)
         self.maj_min_discriminator.trainable = False
         self.maj_min_discriminator.trainable = False
 
 
     def progressBar(self, x):
     def progressBar(self, x):
@@ -527,8 +603,8 @@ class XConvGeN(GanBaseClass):
         if len(columns) == 0:
         if len(columns) == 0:
             return voidFunction
             return voidFunction
         
         
-        neb = self.neb
-        n_feat = self.n_feat
+        neb = self.config.neb
+        n_feat = self.config.n_feat
         nn = tf.constant([(1.0 if x in columns else 0.0) for x in range(n_feat)])
         nn = tf.constant([(1.0 if x in columns else 0.0) for x in range(n_feat)])
         if n_feat is None:
         if n_feat is None:
             print("ERRROR n_feat is None")
             print("ERRROR n_feat is None")

Некоторые файлы не были показаны из-за большого количества измененных файлов