Forráskód Böngészése

Added caching for results. Added G-Mean.

Kristian Schultz 2 éve
szülő
commit
03187e985d
5 módosított fájl, 155 hozzáadás és 76 törlés
  1. 2 0
      Makefile
  2. 25 0
      library/cache.py
  3. 62 39
      library/exercise.py
  4. 3 0
      library/generators/ConvGeN.py
  5. 63 37
      library/testers.py

+ 2 - 0
Makefile

@@ -8,6 +8,8 @@ benchmark-gpu: docker/container.ok
 	docker container run --rm --gpus all -it -v `(pwd)`:/benchmark/data convgenbenchmark python3 /benchmark/data/run_all_exercises.py
 	docker container run --rm -it -v `(pwd)`:/benchmark/data convgenbenchmark chown -R `(./getMyUid)` /benchmark/data/data_result
 
# fix: re-own files that the container wrote as root back to the invoking
# host user, so data_result stays editable outside docker.
fix:
	docker container run --rm -it -v `(pwd)`:/benchmark/data convgenbenchmark chown -R `(./getMyUid)` /benchmark/data/data_result
 
 docker/container.ok: docker/Dockerfile docker/run.sh docker/requirements.txt
 	docker build -t convgenbenchmark docker/.

+ 25 - 0
library/cache.py

@@ -0,0 +1,25 @@
+import os.path
+import json
+
+
def dataCache(fileName, dataGenerator, x=None):
    """Return cached results from *fileName*, or generate and persist them.

    *fileName* is the path of a JSON cache file, or None to disable caching.
    *dataGenerator* is a callable taking one argument; it must return a dict
    whose values are scalars or (possibly nested) numpy arrays.
    *x* is passed through to *dataGenerator* on a cache miss.

    NOTE(review): on a cache hit the values come back as whatever json
    stored (plain lists/floats), while on a miss the generator's original
    objects (e.g. numpy arrays) are returned unchanged — callers must cope
    with both representations.
    """
    def _jsonable(value):
        # Array-likes (ndim > 0) are converted element-wise; everything else
        # is forced to float so json.dump never sees numpy scalar types.
        # (Checking ndim also handles 0-d arrays and numpy scalars, which the
        # previous string-based type check did not.)
        if getattr(value, "ndim", 0) > 0:
            return [_jsonable(item) for item in value]
        return float(value)

    if fileName is not None and os.path.exists(fileName):
        print(f"load data from previous session '{fileName}'")
        with open(fileName) as f:
            return json.load(f)

    d = dataGenerator(x)

    if fileName is not None:
        print(f"save data for '{fileName}'")
        with open(fileName, 'w') as f:
            json.dump({k: _jsonable(v) for k, v in d.items()}, f)

    return d

+ 62 - 39
library/exercise.py

@@ -5,6 +5,7 @@ in generating synthetic samples for datasets with a minority class.
 
 
 import os
+import os.path
 import numpy as np
 from sklearn.decomposition import PCA
 from sklearn.preprocessing import StandardScaler
@@ -13,6 +14,7 @@ import matplotlib.pyplot as plt
 
 from library.dataset import DataSet, TrainTestData
 from library.testers import lr, knn, gb, rf, TestResult, runTester
+from library.cache import dataCache
 import json
 
 
@@ -126,11 +128,10 @@ class Exercise:
                 sliceTitle = f"Slice {sliceNr + 1}/{self.numOfSlices}"
                 self.debug(f"\n------ {stepTitle}: {sliceTitle} -------")
                 imageFileName = None
-                pickleFileName = None
+                jsonFileName = None
                 if resultsFileName is not None:
-                    imageFileName = f"{resultsFileName}/Step{shuffleStep + 1}_Slice{sliceNr + 1}.pdf"
-                    pickleFileName = f"{resultsFileName}/Step{shuffleStep + 1}_Slice{sliceNr + 1}.json"
-                self._exerciseWithDataSlice(gan, sliceData, imageFileName, pickleFileName)
+                    imageFileName = f"{resultsFileName}/Step{shuffleStep + 1}_Slice{sliceNr + 1}"
+                self._exerciseWithDataSlice(gan, sliceData, imageFileName)
 
         self.debug("### Exercise is done.")
 
@@ -156,7 +157,7 @@ class Exercise:
 
         return {}
 
-    def _exerciseWithDataSlice(self, gan, dataSlice, imageFileName=None, pickleFileName=None):
+    def _exerciseWithDataSlice(self, gan, dataSlice, imageFileName=None):
         """
         Runs one test for the given gan and dataSlice.
 
@@ -167,62 +168,84 @@ class Exercise:
         one data slice with training and testing data.
         """
 
-        # Start over with a new GAN instance.
-        self.debug("-> Reset the GAN")
-        gan.reset(dataSlice.train)
-
-        # Train the gan so it can produce synthetic samples.
-        self.debug("-> Train generator for synthetic samples")
-        gan.train(dataSlice.train)
+        jsonFileName = f"{imageFileName}.json"
 
        # Count how many synthetic samples are needed.
         numOfNeededSamples = dataSlice.train.size0 - dataSlice.train.size1
 
+        # Start over with a new GAN instance.
+        self.debug("-> Reset the GAN")
+        gan.reset(dataSlice.train)
+
         # Add synthetic samples (generated by the GAN) to the minority class.
         if numOfNeededSamples > 0:
-            self.debug(f"-> create {numOfNeededSamples} synthetic samples")
-            newSamples = gan.generateData(numOfNeededSamples)
+            def synth(params):
+                me = params["self"]
+                train = params["train"]
+
+                # Train the gan so it can produce synthetic samples.
+                me.debug("-> Train generator for synthetic samples")
+                gan.train(train)
 
-            if pickleFileName is not None:
-                with open(pickleFileName, 'w') as f:
-                    json.dump({
-                        "majority": [[float(z) for z in x] for x in dataSlice.train.data0],
-                        "minority": [[float(z) for z in x] for x in dataSlice.train.data1],
-                        "synthetic": [[float(z) for z in x] for x in newSamples]
-                        }, f)
+                me.debug(f"-> create {numOfNeededSamples} synthetic samples")
+                newSamples = gan.generateData(numOfNeededSamples)
 
-            # Print out an overview of the new dataset.
-            plotCloud(dataSlice.train.data0, dataSlice.train.data1, newSamples, outputFile=imageFileName, doShow=False)
+                # Print out an overview of the new dataset.
+                plotCloud(train.data0, train.data1, newSamples, outputFile=imageFileName, doShow=False)
 
+                return {
+                    "majority": train.data0,
+                    "minority": train.data1,
+                    "synthetic": newSamples
+                    }
+
+            j = dataCache(jsonFileName, synth, {"self": self, "train":dataSlice.train})
             dataSlice.train = DataSet(
-                data0=dataSlice.train.data0,
-                data1=np.concatenate((dataSlice.train.data1, newSamples))
+                data0=j["majority"],
+                data1=np.concatenate((j["minority"], j["synthetic"]))
                 )
+            j = None
+
+        if imageFileName is not None:
+            fig_pr, ax_pr = plt.subplots()
+            fig_roc, ax_roc = plt.subplots()
 
         # Test this dataset with every given test-function.
         # The results are printed out and stored to the results dictionary.
         if gan.canPredict and "GAN" not in self.testFunctions.keys():
-            self.debug(f"-> retrain GAN for predict")
-            trainData = np.concatenate((dataSlice.train.data0, dataSlice.train.data1))
-            trainLabels  = np.concatenate((np.zeros(len(dataSlice.train.data0)), np.zeros(len(dataSlice.train.data1)) + 1))
-            indices = shuffle(np.array(range(len(trainData))))
-            trainData = trainData[indices]
-            trainLabels = trainLabels[indices]
-            indices = None
-            gan.retrainDiscriminitor(trainData, trainLabels)
-            trainData = None
-            trainLabels = None
-            self.debug(f"-> test with GAN.predict")
-            testResult = runTester(dataSlice, gan)
+            #self.debug(f"-> retrain GAN for predict")
+            #trainData = np.concatenate((dataSlice.train.data0, dataSlice.train.data1))
+            #trainLabels  = np.concatenate((np.zeros(len(dataSlice.train.data0)), np.zeros(len(dataSlice.train.data1)) + 1))
+            #indices = shuffle(np.array(range(len(trainData))))
+            #trainData = trainData[indices]
+            #trainLabels = trainLabels[indices]
+            #indices = None
+            #gan.retrainDiscriminitor(trainData, trainLabels)
+            #trainData = None
+            #trainLabels = None
+            self.debug(f"-> test with 'GAN'")
+            testResult = runTester(dataSlice, gan, f"{imageFileName}-GAN.json")
             self.debug(str(testResult))
             self.results["GAN"].append(testResult)
 
+            if imageFileName is not None:
+                testResult.plotPR(ax_pr)
+                testResult.plotROC(ax_roc)
+
         for testerName in self.testFunctions:
             self.debug(f"-> test with '{testerName}'")
-            testResult = (self.testFunctions[testerName])(dataSlice)
+            testResult = (self.testFunctions[testerName])(dataSlice, f"{imageFileName}-{testerName}.json")
             self.debug(str(testResult))
             self.results[testerName].append(testResult)
 
+            if imageFileName is not None:
+                testResult.plotPR(ax_pr)
+                testResult.plotROC(ax_roc)
+
+        if imageFileName is not None:
+            fig_pr.savefig(imageFileName + "_PR.pdf")
+            fig_roc.savefig(imageFileName + "_ROC.pdf")
+
 
     def saveResultsTo(self, fileName):
         avgResults = {}
@@ -313,4 +336,4 @@ def plotCloud(data0, data1, dataNew=None, outputFile=None, title="", doShow=True
         plt.show()
 
     if outputFile is not None:
-        fig.savefig(outputFile)
+        fig.savefig(outputFile + ".pdf")

+ 3 - 0
library/generators/ConvGeN.py

@@ -407,3 +407,6 @@ class ConvGeN(GanBaseClass):
         labels = np.array([ [x, 1 - x] for x in labels])
         self.maj_min_discriminator.fit(x=data, y=labels, batch_size=20, epochs=self.neb_epochs)
         self.maj_min_discriminator.trainable = False
+    
    def fit(self, data, labels):
        # Scikit-learn-style alias: lets this GAN be used wherever an
        # estimator with a fit(X, y) interface is expected (e.g. runTester
        # in library/testers.py calls tester.fit(...)).
        # Delegates to retrainDiscriminitor (sic) and returns its result.
        return self.retrainDiscriminitor(data, labels)

+ 63 - 37
library/testers.py

@@ -14,7 +14,12 @@ from sklearn.metrics import confusion_matrix
 from sklearn.metrics import average_precision_score
 from sklearn.metrics import f1_score
 from sklearn.metrics import cohen_kappa_score
+from sklearn.metrics import RocCurveDisplay
+from sklearn.metrics import PrecisionRecallDisplay
 from sklearn.ensemble import GradientBoostingClassifier
+from imblearn.metrics import geometric_mean_score
+
+from library.cache import dataCache
 
 _tF1 = "f1 score"
 _tTN = "TN"
@@ -24,6 +29,7 @@ _tFP = "FP"
 _tFP = "RF"
 _tAps = "average precision score"
 _tCks = "cohens kappa score"
+_tGMean = "G-Mean score"
 
 class TestResult:
     """
@@ -45,11 +51,13 @@ class TestResult:
         *aps* is a real number representing the average precision score.
         """
         self.title = title
-        self.heading = [_tTN, _tTP, _tFN, _tFP, _tF1, _tCks]
-        if aps is not None:
-            self.heading.append(_tAps)
+        self.heading = [_tTN, _tTP, _tFN, _tFP, _tF1, _tCks, _tAps, _tGMean]
         self.data = { n: 0.0 for n in self.heading }
 
+
+        self.labels = labels
+        self.prediction = prediction
+
         if labels is not None and prediction is not None:
             self.data[_tF1]     = f1_score(labels, prediction)
             self.data[_tCks]    = cohen_kappa_score(labels, prediction)
@@ -59,10 +67,14 @@ class TestResult:
             self.data[_tTP] = tp
             self.data[_tFN] = fn
             self.data[_tFP] = fp
+            self.data[_tGMean] = geometric_mean_score(labels, prediction)
+            if aps is None:
+                self.data[_tAps] = average_precision_score(labels, prediction)
 
         if aps is not None:
             self.data[_tAps] = aps
 
+
     def __str__(self):
         """
         Generates a text representing this result.
@@ -146,78 +158,92 @@ class TestResult:
                     a.data[k] = 0.0
             return (mi, ma, a)
 
    def plotPR(self, ax):
        """Draw this result's precision-recall curve onto the axes *ax*,
        using the stored test labels/predictions; the curve is labelled
        with this result's title."""
        PrecisionRecallDisplay.from_predictions(self.labels, self.prediction, name=self.title, ax=ax)
+
    def plotROC(self, ax):
        """Draw this result's ROC curve onto the axes *ax*, using the
        stored test labels/predictions; the curve is labelled with this
        result's title."""
        RocCurveDisplay.from_predictions(self.labels, self.prediction, name=self.title, ax=ax)
+
         
 
 
def lr(ttd, jsonFileName=None):
    """
    Runs a test for a dataset with the logistic regression algorithm.
    It returns a /TestResult./

    *ttd* is a /library.dataset.TrainTestData/ instance containing data to test.
    *jsonFileName* optionally names a JSON file used to cache the raw
    labels/predictions/score between sessions (see library.cache.dataCache).
    """
    def compute(_unused):
        checkType(ttd)
        classifier = LogisticRegression(
            C=1e5,
            solver='lbfgs',
            max_iter=10000,
            multi_class='multinomial',
            class_weight={0: 1, 1: 1.3},
            )
        classifier.fit(ttd.train.data, ttd.train.labels)
        predicted = classifier.predict(ttd.test.data)
        probabilities = classifier.predict_proba(ttd.test.data)
        return {
            "labels": ttd.test.labels,
            "prediction": predicted,
            "aps_lr": average_precision_score(ttd.test.labels, probabilities[:,1]),
            }

    cached = dataCache(jsonFileName, compute)
    return TestResult("LR", cached["labels"], cached["prediction"], cached["aps_lr"])
+
+
+
def knn(ttd, jsonFileName=None):
    """
    Runs a test for a dataset with the k-next neighbourhood algorithm.
    It returns a /TestResult./

    *ttd* is a /library.dataset.TrainTestData/ instance containing data to test.
    *jsonFileName* optionally names a JSON cache file (see runTester).
    """
    return runTester(ttd, KNeighborsClassifier(n_neighbors=10), "KNN", jsonFileName)
 
 
def gb(ttd, jsonFileName=None):
    """
    Runs a test for a dataset with the gradient boosting algorithm.
    It returns a /TestResult./

    *ttd* is a /library.dataset.TrainTestData/ instance containing data to test.
    *jsonFileName* optionally names a JSON cache file (see runTester).
    """
    return runTester(ttd, GradientBoostingClassifier(), "GB", jsonFileName)
 
 
 
def rf(ttd, jsonFileName=None):
    """
    Runs a test for a dataset with the random forest algorithm.
    It returns a /TestResult./

    *ttd* is a /library.dataset.TrainTestData/ instance containing data to test.
    *jsonFileName* optionally names a JSON cache file (see runTester).
    """
    return runTester(ttd, RandomForestClassifier(), "RF", jsonFileName)
+
 
 
def runTester(ttd, tester, name="GAN", jsonFileName=None):
    """
    Fits *tester* on the training split of *ttd* and evaluates it on the
    test split, returning a /TestResult/ titled *name*.

    *jsonFileName* optionally names a JSON file; when it already exists the
    cached labels/predictions are reused instead of fitting the tester
    again (see library.cache.dataCache).
    """
    def compute(_unused):
        checkType(ttd)
        tester.fit(ttd.train.data, ttd.train.labels)
        return {
            "labels": ttd.test.labels,
            "prediction": tester.predict(ttd.test.data),
            }

    cached = dataCache(jsonFileName, compute)
    return TestResult(name, cached["labels"], cached["prediction"])
 
 def checkType(t):
     if str(type(t)) == "<class 'numpy.ndarray'>":