Преглед на файлове

Fixed missing shuffling / added RF classifier

Kristian Schultz преди 3 години
родител
ревизия
b364a61053

+ 8 - 11
library/analysis.py

@@ -243,11 +243,11 @@ testSets = [
     "folding_yeast4",
     "folding_yeast5",
     "folding_yeast6",
-    "imblearn_webpage",
-    "imblearn_mammography",
-    "imblearn_protein_homo",
-    "imblearn_ozone_level",
-    "kaggle_creditcard"
+    #"imblearn_webpage",
+    #"imblearn_mammography",
+    #"imblearn_protein_homo",
+    #"imblearn_ozone_level",
+    #"kaggle_creditcard"
     ]
 
 def runAllTestSets(dataSetList):
@@ -260,14 +260,11 @@ def runAllTestSets(dataSetList):
 
 
 
-generators = { "ProWRAS":                 lambda _data: ProWRAS()
-             , "Repeater":                lambda _data: Repeater()
-             #, "SpheredNoise":            lambda _data: SpheredNoise()
-             , "SimpleGAN":               lambda data: SimpleGan(numOfFeatures=data.data0.shape[1])
+generators = { "Repeater":                lambda _data: Repeater()
+             , "ProWRAS":                 lambda _data: ProWRAS()
+             , "GAN":                     lambda data: SimpleGan(numOfFeatures=data.data0.shape[1])
              , "ctGAN":                   lambda data: CtGAN(data.data0.shape[1])
              , "CTAB-GAN":                lambda _data: CtabGan()
-             # , "convGAN-old-5":      lambda data: ConvGAN(data.data0.shape[1], neb=5, gen=5)
-             # , "convGAN-old-full":   lambda data: ConvGAN(data.data0.shape[1], neb=data.data0.shape[1], gen=data.data0.shape[1])
              , "convGAN-majority-5":      lambda data: ConvGAN(data.data0.shape[1], neb=5, gen=5)
              , "convGAN-majority-full":   lambda data: ConvGAN(data.data0.shape[1], neb=None)
              , "convGAN-proximary-5":     lambda data: ConvGAN(data.data0.shape[1], neb=5, gen=5, withMajorhoodNbSearch=True)

+ 13 - 1
library/exercise.py

@@ -8,10 +8,11 @@ import os
 import numpy as np
 from sklearn.decomposition import PCA
 from sklearn.preprocessing import StandardScaler
+from sklearn.utils import shuffle
 import matplotlib.pyplot as plt
 
 from library.dataset import DataSet, TrainTestData
-from library.testers import lr, knn, gb, TestResult, runTester
+from library.testers import lr, knn, gb, rf, TestResult, runTester
 
 
 class Exercise:
@@ -45,6 +46,7 @@ class Exercise:
         if self.testFunctions is None:
             self.testFunctions = {
                 "LR": lr,
+                "RF": rf,
                 "GB": gb,
                 "KNN": knn
                 }
@@ -188,6 +190,16 @@ class Exercise:
         # Test this dataset with every given test-function.
         # The results are printed out and stored to the results dictionary.
         if gan.canPredict and "GAN" not in self.testFunctions.keys():
+            self.debug(f"-> retrain GAN for predict")
+            trainData = np.concatenate((dataSlice.train.data0, dataSlice.train.data1))
+            trainLabels  = np.concatenate((np.zeros(len(dataSlice.train.data0)), np.zeros(len(dataSlice.train.data1)) + 1))
+            indices = shuffle(np.array(range(len(trainData))))
+            trainData = trainData[indices]
+            trainLabels = trainLabels[indices]
+            indices = None
+            gan.retrainDiscriminitor(trainData, trainLabels)
+            trainData = None
+            trainLabels = None
             self.debug(f"-> test with GAN.predict")
             testResult = runTester(dataSlice, gan)
             self.debug(str(testResult))

+ 1 - 0
library/generators/LoRAS_ProWRAS.py

@@ -32,6 +32,7 @@ class ProWRAS(GanBaseClass):
         self.sigma = sigma
         self.n_jobs = n_jobs
         self.debug = debug
+        self.canPredict = False
 
     def reset(self, _dataSet):
         """

+ 1 - 0
library/generators/Repeater.py

@@ -19,6 +19,7 @@ class Repeater(GanBaseClass):
     It repeats the first point of the training-data-set.
     """
     def __init__(self):
+        self.canPredict = False
         self.isTrained = False
         self.exampleItems = None
         self.nextIndex = 0

+ 1 - 0
library/generators/SpheredNoise.py

@@ -87,6 +87,7 @@ class SpheredNoise(GanBaseClass):
     A class for a simple GAN.
     """
     def __init__(self, noiseSize=101):
+        self.canPredict = False
         self.isTrained = False
         self.noiseSize = noiseSize
         self.disc = []

+ 1 - 0
library/generators/autoencoder.py

@@ -49,6 +49,7 @@ class Autoencoder(GanBaseClass):
     It repeats the first point of the training-data-set.
     """
     def __init__(self, n_feat, middleSize=4, eps=0.0001, debug=True):
+        self.canPredict = False
         self.isTrained = False
         self.n_feat = n_feat
         self.middleSize = middleSize

+ 15 - 8
library/generators/convGAN.py

@@ -13,6 +13,8 @@ import tensorflow as tf
 from tensorflow.keras.optimizers import Adam
 from tensorflow.keras.layers import Lambda
 
+from sklearn.utils import shuffle
+
 from library.NNSearch import NNSearch
 
 import warnings
@@ -311,7 +313,7 @@ class ConvGAN(GanBaseClass):
                 for n in range(discTrainCount):
                     for min_idx in range(minSetSize):
                         ## generate minority neighbourhood batch for every minority class sampls by index
-                        min_batch_indices = self.nmbMin.neighbourhoodOfItem(min_idx)
+                        min_batch_indices = shuffle(self.nmbMin.neighbourhoodOfItem(min_idx))
                         min_batch = self.nmbMin.getPointsFromIndices(min_batch_indices)
                         ## generate random proximal majority batch
                         maj_batch = self._BMB(data_maj, min_batch_indices)
@@ -325,13 +327,13 @@ class ConvGAN(GanBaseClass):
                         ## switch on discriminator training
                         discriminator.trainable = True
                         ## train the discriminator with the concatenated samples and the one-hot encoded labels
-                        discriminator.fit(x=concat_sample, y=labels, verbose=0, batch_size=nLabels)
+                        discriminator.fit(x=concat_sample, y=labels, verbose=0, batch_size=20)
                         ## switch off the discriminator training again
                         discriminator.trainable = False
 
             for min_idx in range(minSetSize):
                 ## generate minority neighbourhood batch for every minority class sampls by index
-                min_batch_indices = self.nmbMin.neighbourhoodOfItem(min_idx)
+                min_batch_indices = shuffle(self.nmbMin.neighbourhoodOfItem(min_idx))
                 min_batch = self.nmbMin.getPointsFromIndices(min_batch_indices)
                 ## generate random proximal majority batch
                 maj_batch = self._BMB(data_maj, min_batch_indices)
@@ -345,7 +347,7 @@ class ConvGAN(GanBaseClass):
                 ## switch on discriminator training
                 discriminator.trainable = True
                 ## train the discriminator with the concatenated samples and the one-hot encoded labels
-                discriminator.fit(x=concat_sample, y=labels, verbose=0, batch_size=nLabels)
+                discriminator.fit(x=concat_sample, y=labels, verbose=0, batch_size=20)
                 ## switch off the discriminator training again
                 discriminator.trainable = False
 
@@ -391,8 +393,13 @@ class ConvGAN(GanBaseClass):
         ## gen -> convex combinations generated from each neighbourhood
 
         if self.nmbMaj is not None:
-            return self.nmbMaj.neighbourhoodOfItemList(min_idxs, maxCount=self.gen)
+            return self.nmbMaj.neighbourhoodOfItemList(shuffle(min_idxs), maxCount=self.gen)
         else:
-            return tf.convert_to_tensor(
-                data_maj[np.random.randint(len(data_maj), size=self.gen)]
-                )
+            return tf.convert_to_tensor(data_maj[np.random.randint(len(data_maj), size=self.gen)])
+
+
+    def retrainDiscriminitor(self, data, labels):
+        self.maj_min_discriminator.trainable = True
+        labels = np.array([ [x, 1 - x] for x in labels])
+        self.maj_min_discriminator.fit(x=data, y=labels, batch_size=20, epochs=self.neb_epochs)
+        self.maj_min_discriminator.trainable = False

+ 1 - 0
library/generators/ctgan.py

@@ -16,6 +16,7 @@ class CtGAN(GanBaseClass):
         self.epochs = epochs
         self.debug = debug
         self.ctgan = None
+        self.canPredict = False
 
     def reset(self, _dataSet):
         """

+ 3 - 0
library/interfaces.py

@@ -54,3 +54,6 @@ class GanBaseClass:
 
     def predictReal(self, data):
         raise NotImplemented
+
+    def retrainDiscriminitor(self, data, labels):
+        pass

+ 18 - 0
library/testers.py

@@ -7,6 +7,7 @@ results of the tests.
 
 import sklearn
 # needed in function lr
+from sklearn.ensemble import RandomForestClassifier 
 from sklearn.neighbors import KNeighborsClassifier
 from sklearn.linear_model import LogisticRegression
 from sklearn.metrics import confusion_matrix
@@ -20,6 +21,7 @@ _tTN = "TN"
 _tTP = "TP"
 _tFN = "FN"
 _tFP = "FP"
+_tRF = "RF"
 _tAps = "average precision score"
 _tCks = "cohens kappa score"
 
@@ -197,6 +199,22 @@ def gb(ttd):
     tester.fit(ttd.train.data, ttd.train.labels)
     return runTester(ttd, tester, "GB")
 
+
+
+def rf(ttd):
+    """
+    Runs a test for a dataset with the random forest algorithm.
+    It returns a /TestResult./
+
+    *ttd* is a /library.dataset.TrainTestData/ instance containing data to test.
+    """
+    checkType(ttd)
+    tester = RandomForestClassifier()
+    tester.fit(ttd.train.data, ttd.train.labels)
+    return runTester(ttd, tester, "RF")
+
+
+
 def runTester(ttd, tester, name="GAN"):
     prediction = tester.predict(ttd.test.data)
     return TestResult(name, ttd.test.labels, prediction)