Kaynağa Gözat

Merge branch 'master' with 'master' from server.

Kristian Schultz 3 yıl önce
ebeveyn
işleme
0ceeba2b69

Dosya farkı çok büyük olduğundan ihmal edildi
+ 132 - 0
convGAN-predict.ipynb


+ 36 - 5
library/NNSearch.py

@@ -3,6 +3,7 @@ import math
 import tensorflow as tf
 import numpy as np
 from sklearn.neighbors import NearestNeighbors
+from sklearn.utils import shuffle
 from library.timing import timing
 
 
@@ -11,6 +12,7 @@ class NNSearch:
         self.nebSize = nebSize
         self.neighbourhoods = []
         self.timingDict = timingDict
+        self.basePoints = []
 
 
     def timerStart(self, name):
@@ -28,21 +30,50 @@ class NNSearch:
     def neighbourhoodOfItem(self, i):
         return self.neighbourhoods[i]
 
+    def getNbhPointsOfItem(self, index):
+        return self.getPointsFromIndices(self.neighbourhoodOfItem(index))
 
-    def fit(self, X, nebSize=None):
+    def getPointsFromIndices(self, indices):
+        nmbi = shuffle(np.array([indices]))
+        nmb = self.basePoints[nmbi]
+        return tf.convert_to_tensor(nmb[0])
+
+    def neighbourhoodOfItemList(self, items, maxCount=None):
+        nbhIndices = set()
+        duplicates = []
+        for i in items:
+            for x in self.neighbourhoodOfItem(i):
+                if x in nbhIndices:
+                    duplicates.append(x)
+                else:
+                    nbhIndices.add(x)
+
+        nbhIndices = list(nbhIndices)
+        if maxCount is not None:
+            if len(nbhIndices) < maxCount:
+                nbhIndices.extend(duplicates)
+            nbhIndices = nbhIndices[0:maxCount]
+
+        return self.getPointsFromIndices(nbhIndices)
+
+
+    def fit(self, haystack, needles=None, nebSize=None):
         self.timerStart("NN_fit_chained_init")
         if nebSize == None:
             nebSize = self.nebSize
 
-        nPoints = len(X)
-        nFeatures = len(X[0])
+        if needles is None:
+            needles = haystack
+
+        self.basePoints = haystack
 
         neigh = NearestNeighbors(n_neighbors=nebSize)
-        neigh.fit(X)
+        neigh.fit(haystack)
         self.timerStop("NN_fit_chained_init")
         self.timerStart("NN_fit_chained_toList")
         self.neighbourhoods = [
                 (neigh.kneighbors([x], nebSize, return_distance=False))[0]
-                for (i, x) in enumerate(X)
+                for (i, x) in enumerate(needles)
                 ]
         self.timerStop("NN_fit_chained_toList")
+        return self

+ 11 - 9
library/analysis.py

@@ -260,12 +260,14 @@ def runAllTestSets(dataSetList):
 
 
 
-generators = [ ("ProWRAS",       lambda _data: ProWRAS())
-             , ("Repeater",      lambda _data: Repeater())
-             #, ("SpheredNoise",  lambda _data: SpheredNoise())
-             , ("SimpleGAN",     lambda data: SimpleGan(numOfFeatures=data.data0.shape[1]))
-             , ("ctGAN",         lambda data: CtGAN(data.data0.shape[1]))
-             , ("CTAB-GAN",      lambda _data: CtabGan())
-             , ("convGAN",       lambda data: ConvGAN(data.data0.shape[1], neb=5, gen=5))
-             , ("convGAN-full",  lambda data: ConvGAN(data.data0.shape[1], neb=data.data0.shape[1], gen=data.data0.shape[1]))
-             ]
+generators = { "ProWRAS":                 lambda _data: ProWRAS()
+             , "Repeater":                lambda _data: Repeater()
+             , "SpheredNoise":            lambda _data: SpheredNoise()
+             , "SimpleGAN":               lambda data: SimpleGan(numOfFeatures=data.data0.shape[1])
+             , "ctGAN":                   lambda data: CtGAN(data.data0.shape[1])
+             , "CTAB-GAN":                lambda _data: CtabGan()
+             , "convGAN":                 lambda data: ConvGAN(data.data0.shape[1], neb=5, gen=5)
+             , "convGAN-full":            lambda data: ConvGAN(data.data0.shape[1], neb=data.data0.shape[1], gen=data.data0.shape[1])
+             , "convGAN-proximary-5":     lambda data: ConvGAN(data.data0.shape[1], neb=5, gen=5, withMajorhoodNbSearch=True)
+             , "convGAN-proxymary-full":  lambda data: ConvGAN(data.data0.shape[1], neb=data.data0.shape[1], gen=data.data0.shape[1], withMajorhoodNbSearch=True)
+             }

+ 10 - 1
library/exercise.py

@@ -11,7 +11,7 @@ from sklearn.preprocessing import StandardScaler
 import matplotlib.pyplot as plt
 
 from library.dataset import DataSet, TrainTestData
-from library.testers import lr,knn, gb, TestResult
+from library.testers import lr, knn, gb, TestResult, runTester
 
 
 class Exercise:
@@ -85,6 +85,9 @@ class Exercise:
         # Reset results array.
         self.results = { name: [] for name in self.testFunctions }
 
+        if gan.canPredict and "GAN" not in self.testFunctions.keys():
+            self.results["GAN"] = []
+
         # If a shuffle function is given then shuffle the data before the
         # exercise starts.
         if self.shuffleFunction is not None:
@@ -184,6 +187,12 @@ class Exercise:
 
         # Test this dataset with every given test-function.
         # The results are printed out and stored to the results dictionary.
+        if gan.canPredict and "GAN" not in self.testFunctions.keys():
+            self.debug(f"-> test with GAN.predict")
+            testResult = runTester(dataSlice, gan)
+            self.debug(str(testResult))
+            self.results["GAN"].append(testResult)
+
         for testerName in self.testFunctions:
             self.debug(f"-> test with '{testerName}'")
             testResult = (self.testFunctions[testerName])(dataSlice)

+ 96 - 91
library/generators/convGAN.py

@@ -1,21 +1,9 @@
 import numpy as np
-from numpy.random import seed
-import pandas as pd
 import matplotlib.pyplot as plt
 
 from library.interfaces import GanBaseClass
 from library.dataset import DataSet
 
-from sklearn.decomposition import PCA
-from sklearn.metrics import confusion_matrix
-from sklearn.metrics import f1_score
-from sklearn.metrics import cohen_kappa_score
-from sklearn.metrics import precision_score
-from sklearn.metrics import recall_score
-from sklearn.neighbors import NearestNeighbors
-from sklearn.utils import shuffle
-from imblearn.datasets import fetch_datasets
-
 from keras.layers import Dense, Input, Multiply, Flatten, Conv1D, Reshape
 from keras.models import Model
 from keras import backend as K
@@ -45,7 +33,7 @@ class ConvGAN(GanBaseClass):
     This is a toy example of a GAN.
     It repeats the first point of the training-data-set.
     """
-    def __init__(self, n_feat, neb=5, gen=5, neb_epochs=10, debug=True):
+    def __init__(self, n_feat, neb=5, gen=5, neb_epochs=10, withMajorhoodNbSearch=False, debug=False):
         self.isTrained = False
         self.n_feat = n_feat
         self.neb = neb
@@ -53,10 +41,12 @@ class ConvGAN(GanBaseClass):
         self.neb_epochs = 10
         self.loss_history = None
         self.debug = debug
-        self.dataSet = None
+        self.minSetSize = 0
         self.conv_sample_generator = None
         self.maj_min_discriminator = None
+        self.withMajorhoodNbSearch = withMajorhoodNbSearch
         self.cg = None
+        self.canPredict = True
 
         if neb > gen:
             raise ValueError(f"Expected neb <= gen but got neb={neb} and gen={gen}.")
@@ -85,7 +75,7 @@ class ConvGAN(GanBaseClass):
             print(self.cg.summary())
             print('\n')
 
-    def train(self, dataSet):
+    def train(self, dataSet, discTrainCount=5):
         """
         Trains the GAN.
 
@@ -97,9 +87,21 @@ class ConvGAN(GanBaseClass):
         if dataSet.data1.shape[0] <= 0:
             raise AttributeError("Train: Expected data class 1 to contain at least one point.")
 
-        self.dataSet = dataSet
-        self.nmb = self._NMB_prepare(dataSet.data1)
-        self._rough_learning(dataSet.data1, dataSet.data0)
+        # Store size of minority class. This is needed during point generation.
+        self.minSetSize = dataSet.data1.shape[0]
+
+        # Precalculate neighborhoods
+        self.nmbMin = NNSearch(self.neb).fit(haystack=dataSet.data1)
+        if self.withMajorhoodNbSearch:
+            self.nmbMaj = NNSearch(self.neb).fit(haystack=dataSet.data0, needles=dataSet.data1)
+        else:
+            self.nmbMaj = None
+
+        # Do the training.
+        self._rough_learning(dataSet.data1, dataSet.data0, discTrainCount)
+        
+        # The neighborhood in the majority class is no longer needed, so drop it to save memory.
+        self.nmbMaj = None
         self.isTrained = True
 
     def generateDataPoint(self):
@@ -118,14 +120,12 @@ class ConvGAN(GanBaseClass):
         if not self.isTrained:
             raise ValueError("Try to generate data with untrained Re.")
 
-        data_min = self.dataSet.data1
-
         ## roughly calculate the upper bound of the synthetic samples to be generated from each neighbourhood
-        synth_num = (numOfSamples // len(data_min)) + 1
+        synth_num = (numOfSamples // self.minSetSize) + 1
 
         ## generate synth_num synthetic samples from each minority neighbourhood
         synth_set=[]
-        for i in range(len(data_min)):
+        for i in range(self.minSetSize):
             synth_set.extend(self._generate_data_for_min_point(i, synth_num))
 
         ## extract the exact number of synthetic samples needed to exactly balance the two classes
@@ -133,6 +133,10 @@ class ConvGAN(GanBaseClass):
 
         return synth_set
 
+    def predictReal(self, data):
+        prediction = self.maj_min_discriminator.predict(data)
+        return np.array([x[0] for x in prediction])
+
     # ###############################################################
     # Hidden internal functions
     # ###############################################################
@@ -200,6 +204,7 @@ class ConvGAN(GanBaseClass):
         ## passed through two dense layers
         y = Dense(250, activation='relu')(samples)
         y = Dense(125, activation='relu')(y)
+        y = Dense(75, activation='relu')(y)
         
         ## two output nodes. outputs have to be one-hot coded (see labels variable before)
         output = Dense(2, activation='sigmoid')(y)
@@ -265,7 +270,7 @@ class ConvGAN(GanBaseClass):
         runs = int(synth_num / self.neb) + 1
         synth_set = []
         for _run in range(runs):
-            batch = self._NMB_guided(index)
+            batch = self.nmbMin.getNbhPointsOfItem(index)
             synth_batch = self.conv_sample_generator.predict(batch)
             synth_set.extend(synth_batch)
 
@@ -274,52 +279,73 @@ class ConvGAN(GanBaseClass):
 
 
     # Training
-    def _rough_learning(self, data_min, data_maj):
+    def _rough_learning(self, data_min, data_maj, discTrainCount):
         generator = self.conv_sample_generator
         discriminator = self.maj_min_discriminator
         GAN = self.cg
         loss_history = [] ## this is for storing the loss for every run
-        min_idx = 0
-        neb_epoch_count = 1
+        step = 0
+        minSetSize = len(data_min)
 
         labels = tf.convert_to_tensor(create01Labels(2 * self.gen, self.gen))
 
-        for step in range(self.neb_epochs * len(data_min)):
-            ## generate minority neighbourhood batch for every minority class sampls by index
-            min_batch = self._NMB_guided(min_idx)
-            min_idx = min_idx + 1
-            ## generate random proximal majority batch
-            maj_batch = self._BMB(data_min, data_maj)
-
-            ## generate synthetic samples from convex space
-            ## of minority neighbourhood batch using generator
-            conv_samples = generator.predict(min_batch)
-            ## concatenate them with the majority batch
-            concat_sample = tf.concat([conv_samples, maj_batch], axis=0)
-
-            ## switch on discriminator training
-            discriminator.trainable = True
-            ## train the discriminator with the concatenated samples and the one-hot encoded labels
-            discriminator.fit(x=concat_sample, y=labels, verbose=0)
-            ## switch off the discriminator training again
-            discriminator.trainable = False
-
-            ## use the GAN to make the generator learn on the decisions
-            ## made by the previous discriminator training
-            ##- print(f"concat sample shape: {concat_sample.shape}/{labels.shape}")
-            gan_loss_history = GAN.fit(concat_sample, y=labels, verbose=0)
-
-            ## store the loss for the step
-            loss_history.append(gan_loss_history.history['loss'])
-
-            if self.debug and ((step + 1) % 10 == 0):
-                print(f"{step + 1} neighbourhood batches trained; running neighbourhood epoch {neb_epoch_count}")
-
-            if min_idx == len(data_min) - 1:
-                if self.debug:
-                    print(f"Neighbourhood epoch {neb_epoch_count} complete")
-                neb_epoch_count = neb_epoch_count + 1
-                min_idx = 0
+        for neb_epoch_count in range(self.neb_epochs):
+            if discTrainCount > 0:
+                for n in range(discTrainCount):
+                    for min_idx in range(minSetSize):
+                        ## generate minority neighbourhood batch for every minority class sample by index
+                        min_batch_indices = self.nmbMin.neighbourhoodOfItem(min_idx)
+                        min_batch = self.nmbMin.getPointsFromIndices(min_batch_indices)
+                        ## generate random proximal majority batch
+                        maj_batch = self._BMB(data_maj, min_batch_indices)
+
+                        ## generate synthetic samples from convex space
+                        ## of minority neighbourhood batch using generator
+                        conv_samples = generator.predict(min_batch)
+                        ## concatenate them with the majority batch
+                        concat_sample = tf.concat([conv_samples, maj_batch], axis=0)
+
+                        ## switch on discriminator training
+                        discriminator.trainable = True
+                        ## train the discriminator with the concatenated samples and the one-hot encoded labels
+                        discriminator.fit(x=concat_sample, y=labels, verbose=0)
+                        ## switch off the discriminator training again
+                        discriminator.trainable = False
+
+            for min_idx in range(minSetSize):
+                ## generate minority neighbourhood batch for every minority class sample by index
+                min_batch_indices = self.nmbMin.neighbourhoodOfItem(min_idx)
+                min_batch = self.nmbMin.getPointsFromIndices(min_batch_indices)
+                ## generate random proximal majority batch
+                maj_batch = self._BMB(data_maj, min_batch_indices)
+
+                ## generate synthetic samples from convex space
+                ## of minority neighbourhood batch using generator
+                conv_samples = generator.predict(min_batch)
+                ## concatenate them with the majority batch
+                concat_sample = tf.concat([conv_samples, maj_batch], axis=0)
+
+                ## switch on discriminator training
+                discriminator.trainable = True
+                ## train the discriminator with the concatenated samples and the one-hot encoded labels
+                discriminator.fit(x=concat_sample, y=labels, verbose=0)
+                ## switch off the discriminator training again
+                discriminator.trainable = False
+
+                ## use the GAN to make the generator learn on the decisions
+                ## made by the previous discriminator training
+                ##- print(f"concat sample shape: {concat_sample.shape}/{labels.shape}")
+                gan_loss_history = GAN.fit(concat_sample, y=labels, verbose=0)
+
+                ## store the loss for the step
+                loss_history.append(gan_loss_history.history['loss'])
+
+                step += 1
+                if self.debug and (step % 10 == 0):
+                    print(f"{step} neighbourhood batches trained; running neighbourhood epoch {neb_epoch_count}")
+
+            if self.debug:
+                print(f"Neighbourhood epoch {neb_epoch_count + 1} complete")
 
         if self.debug:
             run_range = range(1, len(loss_history) + 1)
@@ -340,37 +366,16 @@ class ConvGAN(GanBaseClass):
 
 
     ## convGAN
-    def _BMB(self, data_min, data_maj):
+    def _BMB(self, data_maj, min_idxs):
 
         ## Generate a borderline majority batch
-        ## data_min -> minority class data
         ## data_maj -> majority class data
-        ## neb -> oversampling neighbourhood
+        ## min_idxs -> indices of points in minority class
         ## gen -> convex combinations generated from each neighbourhood
 
-        return tf.convert_to_tensor(
-            data_maj[np.random.randint(len(data_maj), size=self.gen)]
-            )
-
-    def _NMB_prepare(self, data_min):
-        neigh = NNSearch(self.neb)
-        neigh.fit(data_min)
-        return (data_min, neigh)
-
-
-    def _NMB_guided(self, index):
-
-        ## generate a minority neighbourhood batch for a particular minority sample
-        ## we need this for minority data generation
-        ## we will generate synthetic samples for each training data neighbourhood
-        ## index -> index of the minority sample in a training data whose neighbourhood we want to obtain
-        ## data_min -> minority class data
-        ## neb -> oversampling neighbourhood
-        (data_min, neigh) = self.nmb
-
-        nmbi = np.array([neigh.neighbourhoodOfItem(index)])
-        nmbi = shuffle(nmbi)
-        nmb = data_min[nmbi]
-        nmb = tf.convert_to_tensor(nmb[0])
-        return nmb
-
+        if self.nmbMaj is not None:
+            return self.nmbMaj.neighbourhoodOfItemList(min_idxs, maxCount=self.gen)
+        else:
+            return tf.convert_to_tensor(
+                data_maj[np.random.randint(len(data_maj), size=self.gen)]
+                )

+ 15 - 0
library/interfaces.py

@@ -1,6 +1,7 @@
 """
 This module contains used interfaces for testing the Generative Adversarial Networks.
 """
+import numpy as np
 
 
 class GanBaseClass:
@@ -13,6 +14,7 @@ class GanBaseClass:
         """
         Initializes the class.
         """
+        self.canPredict = False
 
     def reset(self):
         """
@@ -39,3 +41,16 @@ class GanBaseClass:
         *numOfSamples* is an integer > 0. It gives the number of generated samples.
         """
         raise NotImplementedError
+
+    def predict(self, data, limit=0.5):
+        """
+        Takes a list (numpy array) of data points.
+        Returns a list with real values in [0,1] for the probability
+        that a point is in the minority dataset. With:
+          0.0: point is in majority set
+          1.0: point is in minority set
+        """
+        return np.array([max(0, min(1, int(x + 1.0 - limit))) for x in self.predictReal(data)])
+
+    def predictReal(self, data):
+        raise NotImplemented

+ 4 - 5
library/testers.py

@@ -182,9 +182,7 @@ def knn(ttd):
     checkType(ttd)
     knnTester = KNeighborsClassifier(n_neighbors=10)
     knnTester.fit(ttd.train.data, ttd.train.labels)
-
-    prediction = knnTester.predict(ttd.test.data)
-    return TestResult("KNN", ttd.test.labels, prediction)
+    return runTester(ttd, knnTester, "KNN")
 
 
 def gb(ttd):
@@ -197,10 +195,11 @@ def gb(ttd):
     checkType(ttd)
     tester = GradientBoostingClassifier()
     tester.fit(ttd.train.data, ttd.train.labels)
+    return runTester(ttd, tester, "GB")
 
+def runTester(ttd, tester, name="GAN"):
     prediction = tester.predict(ttd.test.data)
-    return TestResult("GB", ttd.test.labels, prediction)
-
+    return TestResult(name, ttd.test.labels, prediction)
 
 def checkType(t):
     if str(type(t)) == "<class 'numpy.ndarray'>":

+ 4 - 4
run_all_exercises.ipynb

@@ -20,14 +20,14 @@
    "outputs": [],
    "source": [
     "for dataset in testSets:\n",
-    "    for f in generators:\n",
-    "        runExercise(dataset, None, name, f)"
+    "    for name in generators.keys():\n",
+    "        runExercise(dataset, None, name, generators[name])"
    ]
   }
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python 3 (ipykernel)",
    "language": "python",
    "name": "python3"
   },
@@ -41,7 +41,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.5"
+   "version": "3.9.7"
   }
  },
  "nbformat": 4,

+ 2 - 2
statistics.py

@@ -10,8 +10,8 @@ f1Score = "f1 score"
 
 ignoreSet = ["yeast_me2"]
 
-gans = [g[0] for g in generators]
-algs = {"LR", "GB", "KNN"}
+gans = generators.keys()
+algs = {"LR", "GB", "KNN", "GAN"}
 
 dataset  = [
     "folding_abalone9-18",

Bu fark içinde çok fazla dosya değişikliği olduğu için bazı dosyalar gösterilmiyor