Forráskód Böngészése

Added caching for results. Added G-Mean.

Kristian Schultz 2 éve
szülő
commit
03187e985d
5 módosított fájl, 155 hozzáadás és 76 törlés
  1. 2 0
      Makefile
  2. 25 0
      library/cache.py
  3. 62 39
      library/exercise.py
  4. 3 0
      library/generators/ConvGeN.py
  5. 63 37
      library/testers.py

+ 2 - 0
Makefile

@@ -8,6 +8,8 @@ benchmark-gpu: docker/container.ok
 	docker container run --rm --gpus all -it -v `(pwd)`:/benchmark/data convgenbenchmark python3 /benchmark/data/run_all_exercises.py
 	docker container run --rm -it -v `(pwd)`:/benchmark/data convgenbenchmark chown -R `(./getMyUid)` /benchmark/data/data_result
 
# fix: re-own files that the container wrote as root back to the invoking
# host user, so data_result stays editable outside docker.
fix:
	docker container run --rm -it -v `(pwd)`:/benchmark/data convgenbenchmark chown -R `(./getMyUid)` /benchmark/data/data_result
 
 docker/container.ok: docker/Dockerfile docker/run.sh docker/requirements.txt
 	docker build -t convgenbenchmark docker/.

+ 25 - 0
library/cache.py

@@ -0,0 +1,25 @@
+import os.path
+import json
+
+
def dataCache(fileName, dataGenerator, x=None):
    """Return cached results from *fileName*, or generate and persist them.

    *fileName* is the path of a JSON cache file, or None to disable caching.
    *dataGenerator* is a callable taking one argument; it must return a dict
    whose values are scalars or (possibly nested) numpy arrays.
    *x* is passed through to *dataGenerator* on a cache miss.

    NOTE(review): on a cache hit the values come back as whatever json
    stored (plain lists/floats), while on a miss the generator's original
    objects (e.g. numpy arrays) are returned unchanged — callers must cope
    with both representations.
    """
    def _jsonable(value):
        # Array-likes (ndim > 0) are converted element-wise; everything else
        # is forced to float so json.dump never sees numpy scalar types.
        # (Checking ndim also handles 0-d arrays and numpy scalars, which the
        # previous string-based type check did not.)
        if getattr(value, "ndim", 0) > 0:
            return [_jsonable(item) for item in value]
        return float(value)

    if fileName is not None and os.path.exists(fileName):
        print(f"load data from previous session '{fileName}'")
        with open(fileName) as f:
            return json.load(f)

    d = dataGenerator(x)

    if fileName is not None:
        print(f"save data for '{fileName}'")
        with open(fileName, 'w') as f:
            json.dump({k: _jsonable(v) for k, v in d.items()}, f)

    return d

+ 62 - 39
library/exercise.py

@@ -5,6 +5,7 @@ in generating synthetic samples for datasets with a minority class.
 
 
 import os
+import os.path
 import numpy as np
 from sklearn.decomposition import PCA
 from sklearn.preprocessing import StandardScaler
@@ -13,6 +14,7 @@ import matplotlib.pyplot as plt
 
 from library.dataset import DataSet, TrainTestData
 from library.testers import lr, knn, gb, rf, TestResult, runTester
+from library.cache import dataCache
 import json
 
 
@@ -126,11 +128,10 @@ class Exercise:
                 sliceTitle = f"Slice {sliceNr + 1}/{self.numOfSlices}"
                 self.debug(f"\n------ {stepTitle}: {sliceTitle} -------")
                 imageFileName = None
-                pickleFileName = None
+                jsonFileName = None
                 if resultsFileName is not None:
-                    imageFileName = f"{resultsFileName}/Step{shuffleStep + 1}_Slice{sliceNr + 1}.pdf"
-                    pickleFileName = f"{resultsFileName}/Step{shuffleStep + 1}_Slice{sliceNr + 1}.json"
-                self._exerciseWithDataSlice(gan, sliceData, imageFileName, pickleFileName)
+                    imageFileName = f"{resultsFileName}/Step{shuffleStep + 1}_Slice{sliceNr + 1}"
+                self._exerciseWithDataSlice(gan, sliceData, imageFileName)
 
         self.debug("### Exercise is done.")
 
@@ -156,7 +157,7 @@ class Exercise:
 
         return {}
 
-    def _exerciseWithDataSlice(self, gan, dataSlice, imageFileName=None, pickleFileName=None):
+    def _exerciseWithDataSlice(self, gan, dataSlice, imageFileName=None):
         """
         Runs one test for the given gan and dataSlice.
 
@@ -167,62 +168,84 @@ class Exercise:
         one data slice with training and testing data.
         """
 
-        # Start over with a new GAN instance.
-        self.debug("-> Reset the GAN")
-        gan.reset(dataSlice.train)
-
-        # Train the gan so it can produce synthetic samples.
-        self.debug("-> Train generator for synthetic samples")
-        gan.train(dataSlice.train)
+        jsonFileName = f"{imageFileName}.json"
 
        # Count how many synthetic samples are needed.
         numOfNeededSamples = dataSlice.train.size0 - dataSlice.train.size1
 
+        # Start over with a new GAN instance.
+        self.debug("-> Reset the GAN")
+        gan.reset(dataSlice.train)
+
         # Add synthetic samples (generated by the GAN) to the minority class.
         if numOfNeededSamples > 0:
-            self.debug(f"-> create {numOfNeededSamples} synthetic samples")
-            newSamples = gan.generateData(numOfNeededSamples)
+            def synth(params):
+                me = params["self"]
+                train = params["train"]
+
+                # Train the gan so it can produce synthetic samples.
+                me.debug("-> Train generator for synthetic samples")
+                gan.train(train)
 
-            if pickleFileName is not None:
-                with open(pickleFileName, 'w') as f:
-                    json.dump({
-                        "majority": [[float(z) for z in x] for x in dataSlice.train.data0],
-                        "minority": [[float(z) for z in x] for x in dataSlice.train.data1],
-                        "synthetic": [[float(z) for z in x] for x in newSamples]
-                        }, f)
+                me.debug(f"-> create {numOfNeededSamples} synthetic samples")
+                newSamples = gan.generateData(numOfNeededSamples)
 
-            # Print out an overview of the new dataset.
-            plotCloud(dataSlice.train.data0, dataSlice.train.data1, newSamples, outputFile=imageFileName, doShow=False)
+                # Print out an overview of the new dataset.
+                plotCloud(train.data0, train.data1, newSamples, outputFile=imageFileName, doShow=False)
 
+                return {
+                    "majority": train.data0,
+                    "minority": train.data1,
+                    "synthetic": newSamples
+                    }
+
+            j = dataCache(jsonFileName, synth, {"self": self, "train":dataSlice.train})
             dataSlice.train = DataSet(
-                data0=dataSlice.train.data0,
-                data1=np.concatenate((dataSlice.train.data1, newSamples))
+                data0=j["majority"],
+                data1=np.concatenate((j["minority"], j["synthetic"]))
                 )
+            j = None
+
+        if imageFileName is not None:
+            fig_pr, ax_pr = plt.subplots()
+            fig_roc, ax_roc = plt.subplots()
 
         # Test this dataset with every given test-function.
         # The results are printed out and stored to the results dictionary.
         if gan.canPredict and "GAN" not in self.testFunctions.keys():
-            self.debug(f"-> retrain GAN for predict")
-            trainData = np.concatenate((dataSlice.train.data0, dataSlice.train.data1))
-            trainLabels  = np.concatenate((np.zeros(len(dataSlice.train.data0)), np.zeros(len(dataSlice.train.data1)) + 1))
-            indices = shuffle(np.array(range(len(trainData))))
-            trainData = trainData[indices]
-            trainLabels = trainLabels[indices]
-            indices = None
-            gan.retrainDiscriminitor(trainData, trainLabels)
-            trainData = None
-            trainLabels = None
-            self.debug(f"-> test with GAN.predict")
-            testResult = runTester(dataSlice, gan)
+            #self.debug(f"-> retrain GAN for predict")
+            #trainData = np.concatenate((dataSlice.train.data0, dataSlice.train.data1))
+            #trainLabels  = np.concatenate((np.zeros(len(dataSlice.train.data0)), np.zeros(len(dataSlice.train.data1)) + 1))
+            #indices = shuffle(np.array(range(len(trainData))))
+            #trainData = trainData[indices]
+            #trainLabels = trainLabels[indices]
+            #indices = None
+            #gan.retrainDiscriminitor(trainData, trainLabels)
+            #trainData = None
+            #trainLabels = None
+            self.debug(f"-> test with 'GAN'")
+            testResult = runTester(dataSlice, gan, f"{imageFileName}-GAN.json")
             self.debug(str(testResult))
             self.results["GAN"].append(testResult)
 
+            if imageFileName is not None:
+                testResult.plotPR(ax_pr)
+                testResult.plotROC(ax_roc)
+
         for testerName in self.testFunctions:
             self.debug(f"-> test with '{testerName}'")
-            testResult = (self.testFunctions[testerName])(dataSlice)
+            testResult = (self.testFunctions[testerName])(dataSlice, f"{imageFileName}-{testerName}.json")
             self.debug(str(testResult))
             self.results[testerName].append(testResult)
 
+            if imageFileName is not None:
+                testResult.plotPR(ax_pr)
+                testResult.plotROC(ax_roc)
+
+        if imageFileName is not None:
+            fig_pr.savefig(imageFileName + "_PR.pdf")
+            fig_roc.savefig(imageFileName + "_ROC.pdf")
+
 
     def saveResultsTo(self, fileName):
         avgResults = {}
@@ -313,4 +336,4 @@ def plotCloud(data0, data1, dataNew=None, outputFile=None, title="", doShow=True
         plt.show()
 
     if outputFile is not None:
-        fig.savefig(outputFile)
+        fig.savefig(outputFile + ".pdf")

+ 3 - 0
library/generators/ConvGeN.py

@@ -407,3 +407,6 @@ class ConvGeN(GanBaseClass):
         labels = np.array([ [x, 1 - x] for x in labels])
         self.maj_min_discriminator.fit(x=data, y=labels, batch_size=20, epochs=self.neb_epochs)
         self.maj_min_discriminator.trainable = False
+    
    def fit(self, data, labels):
        # Scikit-learn-style alias: lets this GAN be used wherever an
        # estimator with a fit(X, y) interface is expected (e.g. runTester
        # in library/testers.py calls tester.fit(...)).
        # Delegates to retrainDiscriminitor (sic) and returns its result.
        return self.retrainDiscriminitor(data, labels)

+ 63 - 37
library/testers.py

@@ -14,7 +14,12 @@ from sklearn.metrics import confusion_matrix
 from sklearn.metrics import average_precision_score
 from sklearn.metrics import f1_score
 from sklearn.metrics import cohen_kappa_score
+from sklearn.metrics import RocCurveDisplay
+from sklearn.metrics import PrecisionRecallDisplay
 from sklearn.ensemble import GradientBoostingClassifier
+from imblearn.metrics import geometric_mean_score
+
+from library.cache import dataCache
 
 _tF1 = "f1 score"
 _tTN = "TN"
@@ -24,6 +29,7 @@ _tFP = "FP"
 _tFP = "RF"
 _tAps = "average precision score"
 _tCks = "cohens kappa score"
+_tGMean = "G-Mean score"
 
 class TestResult:
     """
@@ -45,11 +51,13 @@ class TestResult:
         *aps* is a real number representing the average precision score.
         """
         self.title = title
-        self.heading = [_tTN, _tTP, _tFN, _tFP, _tF1, _tCks]
-        if aps is not None:
-            self.heading.append(_tAps)
+        self.heading = [_tTN, _tTP, _tFN, _tFP, _tF1, _tCks, _tAps, _tGMean]
         self.data = { n: 0.0 for n in self.heading }
 
+
+        self.labels = labels
+        self.prediction = prediction
+
         if labels is not None and prediction is not None:
             self.data[_tF1]     = f1_score(labels, prediction)
             self.data[_tCks]    = cohen_kappa_score(labels, prediction)
@@ -59,10 +67,14 @@ class TestResult:
             self.data[_tTP] = tp
             self.data[_tFN] = fn
             self.data[_tFP] = fp
+            self.data[_tGMean] = geometric_mean_score(labels, prediction)
+            if aps is None:
+                self.data[_tAps] = average_precision_score(labels, prediction)
 
         if aps is not None:
             self.data[_tAps] = aps
 
+
     def __str__(self):
         """
         Generates a text representing this result.
@@ -146,78 +158,92 @@ class TestResult:
                     a.data[k] = 0.0
             return (mi, ma, a)
 
    def plotPR(self, ax):
        """Draw this result's precision-recall curve onto the axes *ax*,
        using the stored test labels/predictions; the curve is labelled
        with this result's title."""
        PrecisionRecallDisplay.from_predictions(self.labels, self.prediction, name=self.title, ax=ax)
+
    def plotROC(self, ax):
        """Draw this result's ROC curve onto the axes *ax*, using the
        stored test labels/predictions; the curve is labelled with this
        result's title."""
        RocCurveDisplay.from_predictions(self.labels, self.prediction, name=self.title, ax=ax)
+
         
 
 
def lr(ttd, jsonFileName=None):
    """
    Runs a test for a dataset with the logistic regression algorithm.
    It returns a /TestResult./

    *ttd* is a /library.dataset.TrainTestData/ instance containing data to test.
    *jsonFileName* optionally names a JSON file used to cache the raw
    labels/predictions/score between sessions (see library.cache.dataCache).
    """
    def compute(_unused):
        checkType(ttd)
        classifier = LogisticRegression(
            C=1e5,
            solver='lbfgs',
            max_iter=10000,
            multi_class='multinomial',
            class_weight={0: 1, 1: 1.3},
            )
        classifier.fit(ttd.train.data, ttd.train.labels)
        predicted = classifier.predict(ttd.test.data)
        probabilities = classifier.predict_proba(ttd.test.data)
        return {
            "labels": ttd.test.labels,
            "prediction": predicted,
            "aps_lr": average_precision_score(ttd.test.labels, probabilities[:,1]),
            }

    cached = dataCache(jsonFileName, compute)
    return TestResult("LR", cached["labels"], cached["prediction"], cached["aps_lr"])
+
+
+
def knn(ttd, jsonFileName=None):
    """
    Runs a test for a dataset with the k-next neighbourhood algorithm.
    It returns a /TestResult./

    *ttd* is a /library.dataset.TrainTestData/ instance containing data to test.
    *jsonFileName* optionally names a JSON cache file (see runTester).
    """
    return runTester(ttd, KNeighborsClassifier(n_neighbors=10), "KNN", jsonFileName)
 
 
def gb(ttd, jsonFileName=None):
    """
    Runs a test for a dataset with the gradient boosting algorithm.
    It returns a /TestResult./

    *ttd* is a /library.dataset.TrainTestData/ instance containing data to test.
    *jsonFileName* optionally names a JSON cache file (see runTester).
    """
    return runTester(ttd, GradientBoostingClassifier(), "GB", jsonFileName)
 
 
 
def rf(ttd, jsonFileName=None):
    """
    Runs a test for a dataset with the random forest algorithm.
    It returns a /TestResult./

    *ttd* is a /library.dataset.TrainTestData/ instance containing data to test.
    *jsonFileName* optionally names a JSON cache file (see runTester).
    """
    return runTester(ttd, RandomForestClassifier(), "RF", jsonFileName)
+
 
 
def runTester(ttd, tester, name="GAN", jsonFileName=None):
    """
    Fits *tester* on the training split of *ttd* and evaluates it on the
    test split, returning a /TestResult/ titled *name*.

    *jsonFileName* optionally names a JSON file; when it already exists the
    cached labels/predictions are reused instead of fitting the tester
    again (see library.cache.dataCache).
    """
    def compute(_unused):
        checkType(ttd)
        tester.fit(ttd.train.data, ttd.train.labels)
        return {
            "labels": ttd.test.labels,
            "prediction": tester.predict(ttd.test.data),
            }

    cached = dataCache(jsonFileName, compute)
    return TestResult(name, cached["labels"], cached["prediction"])
 
 def checkType(t):
     if str(type(t)) == "<class 'numpy.ndarray'>":