3 yıl önce · 284937f721
--- a/CreateTables-all.ipynb
+++ b/CreateTables-all.ipynb
--- a/Makefile
+++ b/Makefile
@@ -2,13 +2,26 @@ all: benchmark
 
															 benchmark: docker/container.ok
														
 
															 	docker container run --rm -it -v `(pwd)`:/benchmark/data convgenbenchmark python3 /benchmark/data/run_all_exercises.py
														
 
															-	docker container run --rm -it -v `(pwd)`:/benchmark/data convgenbenchmark chown -R `(./getMyUid)` /benchmark/data/data_result
														
 
															+	make fix
														
 
															 benchmark-gpu: docker/container.ok
														
 
															 	docker container run --rm --gpus all -it -v `(pwd)`:/benchmark/data convgenbenchmark python3 /benchmark/data/run_all_exercises.py
														
 
															+	make fix
														
 
															+
														
 
															+tables: docker/container.ok
														
 
															+	docker container run --rm -it -v `(pwd)`:/benchmark/data convgenbenchmark python3 /benchmark/data/CreateTables.py
														
 
															+	make fix
														
 
															+
														
 
															+
														
 
															+fix: docker/container.ok
														
 
															 	docker container run --rm -it -v `(pwd)`:/benchmark/data convgenbenchmark chown -R `(./getMyUid)` /benchmark/data/data_result
														
 
															+clean: fix
														
 
															+	rm -f data_result/*/folding_*.csv
														
 
															+	rm -f data_result/*/folding_*.log
														
 
															+	rm -f data_result/*/folding_*.log.time
														
 
															+
														
 
															 docker/container.ok: docker/Dockerfile docker/run.sh docker/requirements.txt
														
 
															 	docker build -t convgenbenchmark docker/.
														
 
															 	date > $@
														
--- a/library/cache.py
+++ b/library/cache.py
@@ -0,0 +1,25 @@
 
															+import os.path
														
 
															+import json
														
 
															+
														
 
															+
														
 
															+def dataCache(fileName, dataGenerator, x=None):
														
 
															+    def flatten(z):
														
 
															+        if str(type(z)) == "<class 'numpy.ndarray'>":
														
 
															+            return [flatten(x) for x in z]
														
 
															+        else:
														
 
															+            return float(z)
														
 
															+
														
 
															+    if fileName is not None and os.path.exists(fileName):
														
 
															+        print(f"load data from previous session '{fileName}'")
														
 
															+        with open(fileName) as f:
														
 
															+            return json.load(f)
														
 
															+    else:
														
 
															+        d = dataGenerator(x)
														
 
															+
														
 
															+        if fileName is not None:
														
 
															+            print(f"save data for '{fileName}'")
														
 
															+            with open(fileName, 'w') as f:
														
 
															+                json.dump({k: flatten(d[k]) for k in d.keys() }, f)
														
 
															+
														
 
															+        return d
														
 
															+                
														
--- a/library/exercise.py
+++ b/library/exercise.py
@@ -5,6 +5,7 @@ in generating synthetic samples for datasets with a minority class.
 
															 import os
														
 
															+import os.path
														
 
															 import numpy as np
														
 
															 from sklearn.decomposition import PCA
														
 
															 from sklearn.preprocessing import StandardScaler
														
@@ -13,6 +14,7 @@ import matplotlib.pyplot as plt
 
															 from library.dataset import DataSet, TrainTestData
														
 
															 from library.testers import lr, knn, gb, rf, TestResult, runTester
														
 
															+from library.cache import dataCache
														
 
															 import json
														
@@ -126,11 +128,10 @@ class Exercise:
 
															                 sliceTitle = f"Slice {sliceNr + 1}/{self.numOfSlices}"
														
 
															                 self.debug(f"\n------ {stepTitle}: {sliceTitle} -------")
														
 
															                 imageFileName = None
														
 
															-                pickleFileName = None
														
 
															+                jsonFileName = None
														
 
															                 if resultsFileName is not None:
														
 
															-                    imageFileName = f"{resultsFileName}/Step{shuffleStep + 1}_Slice{sliceNr + 1}.pdf"
														
 
															-                    pickleFileName = f"{resultsFileName}/Step{shuffleStep + 1}_Slice{sliceNr + 1}.json"
														
 
															-                self._exerciseWithDataSlice(gan, sliceData, imageFileName, pickleFileName)
														
 
															+                    imageFileName = f"{resultsFileName}/Step{shuffleStep + 1}_Slice{sliceNr + 1}"
														
 
															+                self._exerciseWithDataSlice(gan, sliceData, imageFileName)
														
 
															         self.debug("### Exercise is done.")
														
@@ -156,7 +157,7 @@ class Exercise:
 
															         return {}
														
 
															-    def _exerciseWithDataSlice(self, gan, dataSlice, imageFileName=None, pickleFileName=None):
														
 
															+    def _exerciseWithDataSlice(self, gan, dataSlice, imageFileName=None):
														
 
															         """
														
 
															         Runs one test for the given gan and dataSlice.
														
@@ -167,62 +168,74 @@ class Exercise:
 
															         one data slice with training and testing data.
														
 
															         """
														
 
															-        # Start over with a new GAN instance.
														
 
															-        self.debug("-> Reset the GAN")
														
 
															-        gan.reset(dataSlice.train)
														
 
															-
														
 
															-        # Train the gan so it can produce synthetic samples.
														
 
															-        self.debug("-> Train generator for synthetic samples")
														
 
															-        gan.train(dataSlice.train)
														
 
															+        jsonFileName = f"{imageFileName}.json"
														
 
															         # Count how many synthetic samples are needed.
														
 
															         numOfNeededSamples = dataSlice.train.size0 - dataSlice.train.size1
														
 
															+        # Start over with a new GAN instance.
														
 
															+        self.debug("-> Reset the GAN")
														
 
															+        gan.reset(dataSlice.train)
														
 
															+
														
 
															         # Add synthetic samples (generated by the GAN) to the minority class.
														
 
															         if numOfNeededSamples > 0:
														
 
															-            self.debug(f"-> create {numOfNeededSamples} synthetic samples")
														
 
															-            newSamples = gan.generateData(numOfNeededSamples)
														
 
															+            def synth(params):
														
 
															+                me = params["self"]
														
 
															+                train = params["train"]
														
 
															+
														
 
															+                # Train the gan so it can produce synthetic samples.
														
 
															+                me.debug("-> Train generator for synthetic samples")
														
 
															+                gan.train(train)
														
 
															-            if pickleFileName is not None:
														
 
															-                with open(pickleFileName, 'w') as f:
														
 
															-                    json.dump({
														
 
															-                        "majority": [[float(z) for z in x] for x in dataSlice.train.data0],
														
 
															-                        "minority": [[float(z) for z in x] for x in dataSlice.train.data1],
														
 
															-                        "synthetic": [[float(z) for z in x] for x in newSamples]
														
 
															-                        }, f)
														
 
															+                me.debug(f"-> create {numOfNeededSamples} synthetic samples")
														
 
															+                newSamples = gan.generateData(numOfNeededSamples)
														
 
															-            # Print out an overview of the new dataset.
														
 
															-            plotCloud(dataSlice.train.data0, dataSlice.train.data1, newSamples, outputFile=imageFileName, doShow=False)
														
 
															+                # Print out an overview of the new dataset.
														
 
															+                plotCloud(train.data0, train.data1, newSamples, outputFile=imageFileName, doShow=False)
														
 
															+                return {
														
 
															+                    "majority": train.data0,
														
 
															+                    "minority": train.data1,
														
 
															+                    "synthetic": newSamples
														
 
															+                    }
														
 
															+
														
 
															+            j = dataCache(jsonFileName, synth, {"self": self, "train":dataSlice.train})
														
 
															             dataSlice.train = DataSet(
														
 
															-                data0=dataSlice.train.data0,
														
 
															-                data1=np.concatenate((dataSlice.train.data1, newSamples))
														
 
															+                data0=j["majority"],
														
 
															+                data1=np.concatenate((j["minority"], j["synthetic"]))
														
 
															                 )
														
 
															+            j = None
														
 
															+
														
 
															+        if imageFileName is not None:
														
 
															+            fig_pr, ax_pr = plt.subplots()
														
 
															+            fig_roc, ax_roc = plt.subplots()
														
 
															         # Test this dataset with every given test-function.
														
 
															         # The results are printed out and stored to the results dictionary.
														
 
															         if gan.canPredict and "GAN" not in self.testFunctions.keys():
														
 
															-            self.debug(f"-> retrain GAN for predict")
														
 
															-            trainData = np.concatenate((dataSlice.train.data0, dataSlice.train.data1))
														
 
															-            trainLabels  = np.concatenate((np.zeros(len(dataSlice.train.data0)), np.zeros(len(dataSlice.train.data1)) + 1))
														
 
															-            indices = shuffle(np.array(range(len(trainData))))
														
 
															-            trainData = trainData[indices]
														
 
															-            trainLabels = trainLabels[indices]
														
 
															-            indices = None
														
 
															-            gan.retrainDiscriminitor(trainData, trainLabels)
														
 
															-            trainData = None
														
 
															-            trainLabels = None
														
 
															-            self.debug(f"-> test with GAN.predict")
														
 
															-            testResult = runTester(dataSlice, gan)
														
 
															+            self.debug(f"-> test with 'GAN'")
														
 
															+            testResult = runTester(dataSlice, gan, "GAN", f"{imageFileName}-GAN.json")
														
 
															             self.debug(str(testResult))
														
 
															             self.results["GAN"].append(testResult)
														
 
															+            if imageFileName is not None:
														
 
															+                testResult.plotPR(ax_pr)
														
 
															+                testResult.plotROC(ax_roc)
														
 
															+
														
 
															         for testerName in self.testFunctions:
														
 
															             self.debug(f"-> test with '{testerName}'")
														
 
															-            testResult = (self.testFunctions[testerName])(dataSlice)
														
 
															+            testResult = (self.testFunctions[testerName])(dataSlice, f"{imageFileName}-{testerName}.json")
														
 
															             self.debug(str(testResult))
														
 
															             self.results[testerName].append(testResult)
														
 
															+            if imageFileName is not None:
														
 
															+                testResult.plotPR(ax_pr)
														
 
															+                testResult.plotROC(ax_roc)
														
 
															+
														
 
															+        if imageFileName is not None:
														
 
															+            fig_pr.savefig(imageFileName + "_PR.pdf")
														
 
															+            fig_roc.savefig(imageFileName + "_ROC.pdf")
														
 
															+
														
 
															     def saveResultsTo(self, fileName):
														
 
															         avgResults = {}
														
@@ -313,4 +326,4 @@ def plotCloud(data0, data1, dataNew=None, outputFile=None, title="", doShow=True
 
															         plt.show()
														
 
															     if outputFile is not None:
														
 
															-        fig.savefig(outputFile)
														
 
															+        fig.savefig(outputFile + ".pdf")
														
--- a/library/generators/ConvGeN.py
+++ b/library/generators/ConvGeN.py
@@ -407,3 +407,6 @@ class ConvGeN(GanBaseClass):
 
															         labels = np.array([ [x, 1 - x] for x in labels])
														
 
															         self.maj_min_discriminator.fit(x=data, y=labels, batch_size=20, epochs=self.neb_epochs)
														
 
															         self.maj_min_discriminator.trainable = False
														
 
															+    
														
 
															+    def fit(self, data, labels):
														
 
															+        return self.retrainDiscriminitor(data, labels)
														
--- a/library/testers.py
+++ b/library/testers.py
@@ -14,7 +14,12 @@ from sklearn.metrics import confusion_matrix
 
															 from sklearn.metrics import average_precision_score
														
 
															 from sklearn.metrics import f1_score
														
 
															 from sklearn.metrics import cohen_kappa_score
														
 
															+from sklearn.metrics import RocCurveDisplay
														
 
															+from sklearn.metrics import PrecisionRecallDisplay
														
 
															 from sklearn.ensemble import GradientBoostingClassifier
														
 
															+from imblearn.metrics import geometric_mean_score
														
 
															+
														
 
															+from library.cache import dataCache
														
 
															 _tF1 = "f1 score"
														
 
															 _tTN = "TN"
														
@@ -24,6 +29,7 @@ _tFP = "FP"
 
															 _tFP = "RF"
														
 
															 _tAps = "average precision score"
														
 
															 _tCks = "cohens kappa score"
														
 
															+_tGMean = "G-Mean score"
														
 
															 class TestResult:
														
 
															     """
														
@@ -45,11 +51,13 @@ class TestResult:
 
															         *aps* is a real number representing the average precision score.
														
 
															         """
														
 
															         self.title = title
														
 
															-        self.heading = [_tTN, _tTP, _tFN, _tFP, _tF1, _tCks]
														
 
															-        if aps is not None:
														
 
															-            self.heading.append(_tAps)
														
 
															+        self.heading = [_tTN, _tTP, _tFN, _tFP, _tF1, _tCks, _tAps, _tGMean]
														
 
															         self.data = { n: 0.0 for n in self.heading }
														
 
															+
														
 
															+        self.labels = labels
														
 
															+        self.prediction = prediction
														
 
															+
														
 
															         if labels is not None and prediction is not None:
														
 
															             self.data[_tF1]     = f1_score(labels, prediction)
														
 
															             self.data[_tCks]    = cohen_kappa_score(labels, prediction)
														
@@ -59,10 +67,14 @@ class TestResult:
 
															             self.data[_tTP] = tp
														
 
															             self.data[_tFN] = fn
														
 
															             self.data[_tFP] = fp
														
 
															+            self.data[_tGMean] = geometric_mean_score(labels, prediction)
														
 
															+            if aps is None:
														
 
															+                self.data[_tAps] = average_precision_score(labels, prediction)
														
 
															         if aps is not None:
														
 
															             self.data[_tAps] = aps
														
 
															+
														
 
															     def __str__(self):
														
 
															         """
														
 
															         Generates a text representing this result.
														
@@ -146,78 +158,92 @@ class TestResult:
 
															                     a.data[k] = 0.0
														
 
															             return (mi, ma, a)
														
 
															+    def plotPR(self, ax):
														
 
															+        PrecisionRecallDisplay.from_predictions(self.labels, self.prediction, name=self.title, ax=ax)
														
 
															+
														
 
															+    def plotROC(self, ax):
														
 
															+        RocCurveDisplay.from_predictions(self.labels, self.prediction, name=self.title, ax=ax)
														
 
															+
														
 
															-def lr(ttd):
														
 
															+def lr(ttd, jsonFileName=None):
														
 
															     """
														
 
															     Runs a test for a dataset with the logistic regression algorithm.
														
 
															     It returns a /TestResult./
														
 
															     *ttd* is a /library.dataset.TrainTestData/ instance containing data to test.
														
 
															     """
														
 
															-    checkType(ttd)
														
 
															-    logreg = LogisticRegression(
														
 
															-        C=1e5,
														
 
															-        solver='lbfgs',
														
 
															-        max_iter=10000,
														
 
															-        multi_class='multinomial',
														
 
															-        class_weight={0: 1, 1: 1.3}
														
 
															-        )
														
 
															-    logreg.fit(ttd.train.data, ttd.train.labels)
														
 
															-
														
 
															-    prediction = logreg.predict(ttd.test.data)
														
 
															-
														
 
															-    prob_lr = logreg.predict_proba(ttd.test.data)
														
 
															-    aps_lr = average_precision_score(ttd.test.labels, prob_lr[:,1])
														
 
															-    return TestResult("LR", ttd.test.labels, prediction, aps_lr)
														
 
															-
														
 
															-
														
 
															-
														
 
															-def knn(ttd):
														
 
															+    def g(nothing):
														
 
															+        checkType(ttd)
														
 
															+        logreg = LogisticRegression(
														
 
															+            C=1e5,
														
 
															+            solver='lbfgs',
														
 
															+            max_iter=10000,
														
 
															+            multi_class='multinomial',
														
 
															+            class_weight={0: 1, 1: 1.3}
														
 
															+            )
														
 
															+        logreg.fit(ttd.train.data, ttd.train.labels)
														
 
															+        prediction = logreg.predict(ttd.test.data)
														
 
															+        prob_lr = logreg.predict_proba(ttd.test.data)
														
 
															+        aps_lr = average_precision_score(ttd.test.labels, prob_lr[:,1])
														
 
															+        return {
														
 
															+            "labels": ttd.test.labels,
														
 
															+            "prediction": prediction,
														
 
															+            "aps_lr": aps_lr
														
 
															+            }
														
 
															+
														
 
															+    d = dataCache(jsonFileName, g)
														
 
															+    return TestResult("LR", d["labels"], d["prediction"], d["aps_lr"])
														
 
															+
														
 
															+
														
 
															+
														
 
															+def knn(ttd, jsonFileName=None):
														
 
															     """
														
 
															     Runs a test for a dataset with the k-next neighbourhood algorithm.
														
 
															     It returns a /TestResult./
														
 
															     *ttd* is a /library.dataset.TrainTestData/ instance containing data to test.
														
 
															     """
														
 
															-    checkType(ttd)
														
 
															     knnTester = KNeighborsClassifier(n_neighbors=10)
														
 
															-    knnTester.fit(ttd.train.data, ttd.train.labels)
														
 
															-    return runTester(ttd, knnTester, "KNN")
														
 
															+    return runTester(ttd, knnTester, "KNN", jsonFileName)
														
 
															-def gb(ttd):
														
 
															+def gb(ttd, jsonFileName=None):
														
 
															     """
														
 
															     Runs a test for a dataset with the gradient boosting algorithm.
														
 
															     It returns a /TestResult./
														
 
															     *ttd* is a /library.dataset.TrainTestData/ instance containing data to test.
														
 
															     """
														
 
															-    checkType(ttd)
														
 
															     tester = GradientBoostingClassifier()
														
 
															-    tester.fit(ttd.train.data, ttd.train.labels)
														
 
															-    return runTester(ttd, tester, "GB")
														
 
															+    return runTester(ttd, tester, "GB", jsonFileName)
														
 
															-def rf(ttd):
														
 
															+def rf(ttd, jsonFileName=None):
														
 
															     """
														
 
															     Runs a test for a dataset with the random forest algorithm.
														
 
															     It returns a /TestResult./
														
 
															     *ttd* is a /library.dataset.TrainTestData/ instance containing data to test.
														
 
															     """
														
 
															-    checkType(ttd)
														
 
															     tester = RandomForestClassifier()
														
 
															-    tester.fit(ttd.train.data, ttd.train.labels)
														
 
															-    return runTester(ttd, tester, "RF")
														
 
															+    return runTester(ttd, tester, "RF", jsonFileName)
														
 
															+
														
 
															+def runTester(ttd, tester, name="GAN", jsonFileName=None):
														
 
															+    def g(nothing):
														
 
															+        checkType(ttd)
														
 
															+        tester.fit(ttd.train.data, ttd.train.labels)
														
 
															+        return {
														
 
															+            "labels": ttd.test.labels,
														
 
															+            "prediction": tester.predict(ttd.test.data)
														
 
															+            }
														
 
															-def runTester(ttd, tester, name="GAN"):
														
 
															-    prediction = tester.predict(ttd.test.data)
														
 
															-    return TestResult(name, ttd.test.labels, prediction)
														
 
															+    d = dataCache(jsonFileName, g)
														
 
															+    return TestResult(name, d["labels"], d["prediction"])
														
 
															 def checkType(t):
														
 
															     if str(type(t)) == "<class 'numpy.ndarray'>":