Просмотр исходного кода

Merge branch 'classifyByDiscriminator' of fyrr/LoGAN into master

fyrr 4 года назад
Родитель
Commit
f520ffdea3
8 изменённых файлов: 246 добавлений и 65 удалений
  1. 132 0
      convGAN-predict.ipynb
  2. 11 11
      library/analysis.py
  3. 10 1
      library/exercise.py
  4. 68 42
      library/generators/convGAN.py
  5. 15 0
      library/interfaces.py
  6. 4 5
      library/testers.py
  7. 4 4
      run_all_exercises.ipynb
  8. 2 2
      statistics.py

Разница между файлами не показана из-за большого размера
+ 132 - 0
convGAN-predict.ipynb


+ 11 - 11
library/analysis.py

@@ -260,14 +260,14 @@ def runAllTestSets(dataSetList):
 
 
 
-generators = [ ("ProWRAS",       lambda _data: ProWRAS())
-             , ("Repeater",      lambda _data: Repeater())
-             #, ("SpheredNoise",  lambda _data: SpheredNoise())
-             , ("SimpleGAN",     lambda data: SimpleGan(numOfFeatures=data.data0.shape[1]))
-             , ("ctGAN",         lambda data: CtGAN(data.data0.shape[1]))
-             , ("CTAB-GAN",      lambda _data: CtabGan())
-             , ("convGAN",       lambda data: ConvGAN(data.data0.shape[1], neb=5, gen=5))
-             , ("convGAN-full",  lambda data: ConvGAN(data.data0.shape[1], neb=data.data0.shape[1], gen=data.data0.shape[1]))
-             , ("convGAN-proximary-5",       lambda data: ConvGAN(data.data0.shape[1], neb=5, gen=5, withMajorhoodNbSearch=True))
-             , ("convGAN-proxymary-full",  lambda data: ConvGAN(data.data0.shape[1], neb=data.data0.shape[1], gen=data.data0.shape[1], withMajorhoodNbSearch=True))
-             ]
+generators = { "ProWRAS":                 lambda _data: ProWRAS()
+             , "Repeater":                lambda _data: Repeater()
+             , "SpheredNoise":            lambda _data: SpheredNoise()
+             , "SimpleGAN":               lambda data: SimpleGan(numOfFeatures=data.data0.shape[1])
+             , "ctGAN":                   lambda data: CtGAN(data.data0.shape[1])
+             , "CTAB-GAN":                lambda _data: CtabGan()
+             , "convGAN":                 lambda data: ConvGAN(data.data0.shape[1], neb=5, gen=5)
+             , "convGAN-full":            lambda data: ConvGAN(data.data0.shape[1], neb=data.data0.shape[1], gen=data.data0.shape[1])
+             , "convGAN-proximary-5":     lambda data: ConvGAN(data.data0.shape[1], neb=5, gen=5, withMajorhoodNbSearch=True)
+             , "convGAN-proxymary-full":  lambda data: ConvGAN(data.data0.shape[1], neb=data.data0.shape[1], gen=data.data0.shape[1], withMajorhoodNbSearch=True)
+             }

+ 10 - 1
library/exercise.py

@@ -11,7 +11,7 @@ from sklearn.preprocessing import StandardScaler
 import matplotlib.pyplot as plt
 
 from library.dataset import DataSet, TrainTestData
-from library.testers import lr,knn, gb, TestResult
+from library.testers import lr, knn, gb, TestResult, runTester
 
 
 class Exercise:
@@ -85,6 +85,9 @@ class Exercise:
         # Reset results array.
         self.results = { name: [] for name in self.testFunctions }
 
+        if gan.canPredict and "GAN" not in self.testFunctions.keys():
+            self.results["GAN"] = []
+
         # If a shuffle function is given then shuffle the data before the
         # exercise starts.
         if self.shuffleFunction is not None:
@@ -184,6 +187,12 @@ class Exercise:
 
         # Test this dataset with every given test-function.
         # The results are printed out and stored to the results dictionary.
+        if gan.canPredict and "GAN" not in self.testFunctions.keys():
+            self.debug(f"-> test with GAN.predict")
+            testResult = runTester(dataSlice, gan)
+            self.debug(str(testResult))
+            self.results["GAN"].append(testResult)
+
         for testerName in self.testFunctions:
             self.debug(f"-> test with '{testerName}'")
             testResult = (self.testFunctions[testerName])(dataSlice)

+ 68 - 42
library/generators/convGAN.py

@@ -46,6 +46,7 @@ class ConvGAN(GanBaseClass):
         self.maj_min_discriminator = None
         self.withMajorhoodNbSearch = withMajorhoodNbSearch
         self.cg = None
+        self.canPredict = True
 
         if neb > gen:
             raise ValueError(f"Expected neb <= gen but got neb={neb} and gen={gen}.")
@@ -74,7 +75,7 @@ class ConvGAN(GanBaseClass):
             print(self.cg.summary())
             print('\n')
 
-    def train(self, dataSet):
+    def train(self, dataSet, discTrainCount=5):
         """
         Trains the GAN.
 
@@ -97,7 +98,7 @@ class ConvGAN(GanBaseClass):
             self.nmbMaj = None
 
         # Do the training.
-        self._rough_learning(dataSet.data1, dataSet.data0)
+        self._rough_learning(dataSet.data1, dataSet.data0, discTrainCount)
         
         # Neighborhood in majority class is no longer needed. So save memory.
         self.nmbMaj = None
@@ -132,6 +133,10 @@ class ConvGAN(GanBaseClass):
 
         return synth_set
 
+    def predictReal(self, data):
+        prediction = self.maj_min_discriminator.predict(data)
+        return np.array([x[0] for x in prediction])
+
     # ###############################################################
     # Hidden internal functions
     # ###############################################################
@@ -199,6 +204,7 @@ class ConvGAN(GanBaseClass):
         ## passed through two dense layers
         y = Dense(250, activation='relu')(samples)
         y = Dense(125, activation='relu')(y)
+        y = Dense(75, activation='relu')(y)
         
         ## two output nodes. outputs have to be one-hot coded (see labels variable before)
         output = Dense(2, activation='sigmoid')(y)
@@ -273,53 +279,73 @@ class ConvGAN(GanBaseClass):
 
 
     # Training
-    def _rough_learning(self, data_min, data_maj):
+    def _rough_learning(self, data_min, data_maj, discTrainCount):
         generator = self.conv_sample_generator
         discriminator = self.maj_min_discriminator
         GAN = self.cg
         loss_history = [] ## this is for stroring the loss for every run
-        min_idx = 0
-        neb_epoch_count = 1
+        step = 0
+        minSetSize = len(data_min)
 
         labels = tf.convert_to_tensor(create01Labels(2 * self.gen, self.gen))
 
-        for step in range(self.neb_epochs * len(data_min)):
-            ## generate minority neighbourhood batch for every minority class sampls by index
-            min_batch_indices = self.nmbMin.neighbourhoodOfItem(min_idx)
-            min_batch = self.nmbMin.getPointsFromIndices(min_batch_indices)
-            min_idx = min_idx + 1
-            ## generate random proximal majority batch
-            maj_batch = self._BMB(data_maj, min_batch_indices)
-
-            ## generate synthetic samples from convex space
-            ## of minority neighbourhood batch using generator
-            conv_samples = generator.predict(min_batch)
-            ## concatenate them with the majority batch
-            concat_sample = tf.concat([conv_samples, maj_batch], axis=0)
-
-            ## switch on discriminator training
-            discriminator.trainable = True
-            ## train the discriminator with the concatenated samples and the one-hot encoded labels
-            discriminator.fit(x=concat_sample, y=labels, verbose=0)
-            ## switch off the discriminator training again
-            discriminator.trainable = False
-
-            ## use the GAN to make the generator learn on the decisions
-            ## made by the previous discriminator training
-            ##- print(f"concat sample shape: {concat_sample.shape}/{labels.shape}")
-            gan_loss_history = GAN.fit(concat_sample, y=labels, verbose=0)
-
-            ## store the loss for the step
-            loss_history.append(gan_loss_history.history['loss'])
-
-            if self.debug and ((step + 1) % 10 == 0):
-                print(f"{step + 1} neighbourhood batches trained; running neighbourhood epoch {neb_epoch_count}")
-
-            if min_idx == len(data_min) - 1:
-                if self.debug:
-                    print(f"Neighbourhood epoch {neb_epoch_count} complete")
-                neb_epoch_count = neb_epoch_count + 1
-                min_idx = 0
+        for neb_epoch_count in range(self.neb_epochs):
+            if discTrainCount > 0:
+                for n in range(discTrainCount):
+                    for min_idx in range(minSetSize):
+                        ## generate minority neighbourhood batch for every minority class sampls by index
+                        min_batch_indices = self.nmbMin.neighbourhoodOfItem(min_idx)
+                        min_batch = self.nmbMin.getPointsFromIndices(min_batch_indices)
+                        ## generate random proximal majority batch
+                        maj_batch = self._BMB(data_maj, min_batch_indices)
+
+                        ## generate synthetic samples from convex space
+                        ## of minority neighbourhood batch using generator
+                        conv_samples = generator.predict(min_batch)
+                        ## concatenate them with the majority batch
+                        concat_sample = tf.concat([conv_samples, maj_batch], axis=0)
+
+                        ## switch on discriminator training
+                        discriminator.trainable = True
+                        ## train the discriminator with the concatenated samples and the one-hot encoded labels
+                        discriminator.fit(x=concat_sample, y=labels, verbose=0)
+                        ## switch off the discriminator training again
+                        discriminator.trainable = False
+
+            for min_idx in range(minSetSize):
+                ## generate minority neighbourhood batch for every minority class sampls by index
+                min_batch_indices = self.nmbMin.neighbourhoodOfItem(min_idx)
+                min_batch = self.nmbMin.getPointsFromIndices(min_batch_indices)
+                ## generate random proximal majority batch
+                maj_batch = self._BMB(data_maj, min_batch_indices)
+
+                ## generate synthetic samples from convex space
+                ## of minority neighbourhood batch using generator
+                conv_samples = generator.predict(min_batch)
+                ## concatenate them with the majority batch
+                concat_sample = tf.concat([conv_samples, maj_batch], axis=0)
+
+                ## switch on discriminator training
+                discriminator.trainable = True
+                ## train the discriminator with the concatenated samples and the one-hot encoded labels
+                discriminator.fit(x=concat_sample, y=labels, verbose=0)
+                ## switch off the discriminator training again
+                discriminator.trainable = False
+
+                ## use the GAN to make the generator learn on the decisions
+                ## made by the previous discriminator training
+                ##- print(f"concat sample shape: {concat_sample.shape}/{labels.shape}")
+                gan_loss_history = GAN.fit(concat_sample, y=labels, verbose=0)
+
+                ## store the loss for the step
+                loss_history.append(gan_loss_history.history['loss'])
+
+                step += 1
+                if self.debug and (step % 10 == 0):
+                    print(f"{step} neighbourhood batches trained; running neighbourhood epoch {neb_epoch_count}")
+
+            if self.debug:
+                print(f"Neighbourhood epoch {neb_epoch_count + 1} complete")
 
         if self.debug:
             run_range = range(1, len(loss_history) + 1)

+ 15 - 0
library/interfaces.py

@@ -1,6 +1,7 @@
 """
 This module contains used interfaces for testing the Generative Adversarial Networks.
 """
+import numpy as np
 
 
 class GanBaseClass:
@@ -13,6 +14,7 @@ class GanBaseClass:
         """
         Initializes the class.
         """
+        self.canPredict = False
 
     def reset(self):
         """
@@ -39,3 +41,16 @@ class GanBaseClass:
         *numOfSamples* is an integer > 0. It gives the number of generated samples.
         """
         raise NotImplementedError
+
+    def predict(self, data, limit=0.5):
+        """
+        Takes a list (numpy array) of data points.
+        Returns a list with real values in [0,1] for the propapility
+        that a point is in the minority dataset. With:
+          0.0: point is in majority set
+          1.0: point is in minority set
+        """
+        return np.array([max(0, min(1, int(x + 1.0 - limit))) for x in self.predictReal(data)])
+
+    def predictReal(self, data):
+        raise NotImplemented

+ 4 - 5
library/testers.py

@@ -182,9 +182,7 @@ def knn(ttd):
     checkType(ttd)
     knnTester = KNeighborsClassifier(n_neighbors=10)
     knnTester.fit(ttd.train.data, ttd.train.labels)
-
-    prediction = knnTester.predict(ttd.test.data)
-    return TestResult("KNN", ttd.test.labels, prediction)
+    return runTester(ttd, knnTester, "KNN")
 
 
 def gb(ttd):
@@ -197,10 +195,11 @@ def gb(ttd):
     checkType(ttd)
     tester = GradientBoostingClassifier()
     tester.fit(ttd.train.data, ttd.train.labels)
+    return runTester(ttd, tester, "GB")
 
+def runTester(ttd, tester, name="GAN"):
     prediction = tester.predict(ttd.test.data)
-    return TestResult("GB", ttd.test.labels, prediction)
-
+    return TestResult(name, ttd.test.labels, prediction)
 
 def checkType(t):
     if str(type(t)) == "<class 'numpy.ndarray'>":

+ 4 - 4
run_all_exercises.ipynb

@@ -20,14 +20,14 @@
    "outputs": [],
    "source": [
     "for dataset in testSets:\n",
-    "    for f in generators:\n",
-    "        runExercise(dataset, None, name, f)"
+    "    for name in generators.keys():\n",
+    "        runExercise(dataset, None, name, generators[name])"
    ]
   }
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python 3 (ipykernel)",
    "language": "python",
    "name": "python3"
   },
@@ -41,7 +41,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.5"
+   "version": "3.9.7"
   }
  },
  "nbformat": 4,

+ 2 - 2
statistics.py

@@ -10,8 +10,8 @@ f1Score = "f1 score"
 
 ignoreSet = ["yeast_me2"]
 
-gans = [g[0] for g in generators]
-algs = {"LR", "GB", "KNN"}
+gans = generators.keys()
+algs = {"LR", "GB", "KNN", "GAN"}
 
 dataset  = [
     "folding_abalone9-18",

Некоторые файлы не были показаны из-за большого количества измененных файлов