Kaynağa Gözat

Merge branch 'ConvGen1_PaperRevision1' of fyrr/ConvGeNCode into master

fyrr 2 yıl önce
ebeveyn
işleme
284937f721
6 değiştirilmiş dosya ile 902 ekleme ve 77 silme
  1. 745 0
      CreateTables-all.ipynb
  2. 14 1
      Makefile
  3. 25 0
      library/cache.py
  4. 52 39
      library/exercise.py
  5. 3 0
      library/generators/ConvGeN.py
  6. 63 37
      library/testers.py

Dosya farkı çok büyük olduğundan ihmal edildi
+ 745 - 0
CreateTables-all.ipynb


+ 14 - 1
Makefile

@@ -2,13 +2,26 @@ all: benchmark
 
 
 benchmark: docker/container.ok
 benchmark: docker/container.ok
 	docker container run --rm -it -v `(pwd)`:/benchmark/data convgenbenchmark python3 /benchmark/data/run_all_exercises.py
 	docker container run --rm -it -v `(pwd)`:/benchmark/data convgenbenchmark python3 /benchmark/data/run_all_exercises.py
-	docker container run --rm -it -v `(pwd)`:/benchmark/data convgenbenchmark chown -R `(./getMyUid)` /benchmark/data/data_result
+	make fix
 
 
 benchmark-gpu: docker/container.ok
 benchmark-gpu: docker/container.ok
 	docker container run --rm --gpus all -it -v `(pwd)`:/benchmark/data convgenbenchmark python3 /benchmark/data/run_all_exercises.py
 	docker container run --rm --gpus all -it -v `(pwd)`:/benchmark/data convgenbenchmark python3 /benchmark/data/run_all_exercises.py
+	make fix
+
+# Build the result tables inside the benchmark container, then hand the
+# generated files back to the calling user via the 'fix' target.
+tables: docker/container.ok
+	docker container run --rm -it -v `(pwd)`:/benchmark/data convgenbenchmark python3 /benchmark/data/CreateTables.py
+	# Use $(MAKE) for the recursive call so flags such as -n, -k and -j are passed on.
+	$(MAKE) fix
+
+
+fix: docker/container.ok
 	docker container run --rm -it -v `(pwd)`:/benchmark/data convgenbenchmark chown -R `(./getMyUid)` /benchmark/data/data_result
 	docker container run --rm -it -v `(pwd)`:/benchmark/data convgenbenchmark chown -R `(./getMyUid)` /benchmark/data/data_result
 
 
 
 
+# Remove the per-fold result tables, logs and timing files below data_result/.
+# Depends on 'fix' so the files belong to the calling user before deletion.
+clean: fix
+	rm -f data_result/*/folding_*.csv \
+	      data_result/*/folding_*.log \
+	      data_result/*/folding_*.log.time
+
 docker/container.ok: docker/Dockerfile docker/run.sh docker/requirements.txt
 docker/container.ok: docker/Dockerfile docker/run.sh docker/requirements.txt
 	docker build -t convgenbenchmark docker/.
 	docker build -t convgenbenchmark docker/.
 	date > $@
 	date > $@

+ 25 - 0
library/cache.py

@@ -0,0 +1,25 @@
+import os.path
+import json
+
+
+def dataCache(fileName, dataGenerator, x=None):
+    """
+    Returns the data stored in the JSON file *fileName*. If that file does
+    not exist the data is created by *dataGenerator* and written to the file.
+
+    *fileName* is the path of the cache file. If it is None then nothing is
+    loaded or saved and *dataGenerator* is always called.
+
+    *dataGenerator* is a function that takes *x* and returns a dictionary.
+    Its values have to be numbers or (nested) arrays / lists of numbers.
+
+    *x* is passed to *dataGenerator* unchanged.
+
+    NOTE: On a cache hit the result is what json.load produces (nested lists
+    and floats). On a cache miss the generator's own dictionary is returned.
+    """
+    def flatten(z):
+        # Arrays (and numpy scalars) provide tolist(). Using it keeps this
+        # module free of a numpy import and also copes with 0-d arrays.
+        if hasattr(z, "tolist"):
+            z = z.tolist()
+        if isinstance(z, (list, tuple)):
+            return [flatten(item) for item in z]
+        return float(z)
+
+    if fileName is not None and os.path.exists(fileName):
+        print(f"load data from previous session '{fileName}'")
+        with open(fileName) as f:
+            return json.load(f)
+
+    d = dataGenerator(x)
+
+    if fileName is not None:
+        print(f"save data for '{fileName}'")
+        # Serialize before touching the disk and move the finished file into
+        # place afterwards. A failure can then never leave a truncated cache
+        # file behind that the next session would try to load.
+        text = json.dumps({k: flatten(v) for k, v in d.items()})
+        tmpName = f"{fileName}.tmp"
+        with open(tmpName, 'w') as f:
+            f.write(text)
+        os.replace(tmpName, fileName)
+
+    return d
+                

+ 52 - 39
library/exercise.py

@@ -5,6 +5,7 @@ in generating synthetic samples for datasets with a minority class.
 
 
 
 
 import os
 import os
+import os.path
 import numpy as np
 import numpy as np
 from sklearn.decomposition import PCA
 from sklearn.decomposition import PCA
 from sklearn.preprocessing import StandardScaler
 from sklearn.preprocessing import StandardScaler
@@ -13,6 +14,7 @@ import matplotlib.pyplot as plt
 
 
 from library.dataset import DataSet, TrainTestData
 from library.dataset import DataSet, TrainTestData
 from library.testers import lr, knn, gb, rf, TestResult, runTester
 from library.testers import lr, knn, gb, rf, TestResult, runTester
+from library.cache import dataCache
 import json
 import json
 
 
 
 
@@ -126,11 +128,10 @@ class Exercise:
                 sliceTitle = f"Slice {sliceNr + 1}/{self.numOfSlices}"
                 sliceTitle = f"Slice {sliceNr + 1}/{self.numOfSlices}"
                 self.debug(f"\n------ {stepTitle}: {sliceTitle} -------")
                 self.debug(f"\n------ {stepTitle}: {sliceTitle} -------")
                 imageFileName = None
                 imageFileName = None
-                pickleFileName = None
+                jsonFileName = None
                 if resultsFileName is not None:
                 if resultsFileName is not None:
-                    imageFileName = f"{resultsFileName}/Step{shuffleStep + 1}_Slice{sliceNr + 1}.pdf"
-                    pickleFileName = f"{resultsFileName}/Step{shuffleStep + 1}_Slice{sliceNr + 1}.json"
-                self._exerciseWithDataSlice(gan, sliceData, imageFileName, pickleFileName)
+                    imageFileName = f"{resultsFileName}/Step{shuffleStep + 1}_Slice{sliceNr + 1}"
+                self._exerciseWithDataSlice(gan, sliceData, imageFileName)
 
 
         self.debug("### Exercise is done.")
         self.debug("### Exercise is done.")
 
 
@@ -156,7 +157,7 @@ class Exercise:
 
 
         return {}
         return {}
 
 
-    def _exerciseWithDataSlice(self, gan, dataSlice, imageFileName=None, pickleFileName=None):
+    def _exerciseWithDataSlice(self, gan, dataSlice, imageFileName=None):
         """
         """
         Runs one test for the given gan and dataSlice.
         Runs one test for the given gan and dataSlice.
 
 
@@ -167,62 +168,74 @@ class Exercise:
         one data slice with training and testing data.
         one data slice with training and testing data.
         """
         """
 
 
-        # Start over with a new GAN instance.
-        self.debug("-> Reset the GAN")
-        gan.reset(dataSlice.train)
-
-        # Train the gan so it can produce synthetic samples.
-        self.debug("-> Train generator for synthetic samples")
-        gan.train(dataSlice.train)
+        jsonFileName = f"{imageFileName}.json"
 
 
         # Count how many synthetic samples are needed.
         # Count how many synthetic samples are needed.
         numOfNeededSamples = dataSlice.train.size0 - dataSlice.train.size1
         numOfNeededSamples = dataSlice.train.size0 - dataSlice.train.size1
 
 
+        # Start over with a new GAN instance.
+        self.debug("-> Reset the GAN")
+        gan.reset(dataSlice.train)
+
         # Add synthetic samples (generated by the GAN) to the minority class.
         # Add synthetic samples (generated by the GAN) to the minority class.
         if numOfNeededSamples > 0:
         if numOfNeededSamples > 0:
-            self.debug(f"-> create {numOfNeededSamples} synthetic samples")
-            newSamples = gan.generateData(numOfNeededSamples)
+            def synth(params):
+                me = params["self"]
+                train = params["train"]
+
+                # Train the gan so it can produce synthetic samples.
+                me.debug("-> Train generator for synthetic samples")
+                gan.train(train)
 
 
-            if pickleFileName is not None:
-                with open(pickleFileName, 'w') as f:
-                    json.dump({
-                        "majority": [[float(z) for z in x] for x in dataSlice.train.data0],
-                        "minority": [[float(z) for z in x] for x in dataSlice.train.data1],
-                        "synthetic": [[float(z) for z in x] for x in newSamples]
-                        }, f)
+                me.debug(f"-> create {numOfNeededSamples} synthetic samples")
+                newSamples = gan.generateData(numOfNeededSamples)
 
 
-            # Print out an overview of the new dataset.
-            plotCloud(dataSlice.train.data0, dataSlice.train.data1, newSamples, outputFile=imageFileName, doShow=False)
+                # Print out an overview of the new dataset.
+                plotCloud(train.data0, train.data1, newSamples, outputFile=imageFileName, doShow=False)
 
 
+                return {
+                    "majority": train.data0,
+                    "minority": train.data1,
+                    "synthetic": newSamples
+                    }
+
+            j = dataCache(jsonFileName, synth, {"self": self, "train":dataSlice.train})
             dataSlice.train = DataSet(
             dataSlice.train = DataSet(
-                data0=dataSlice.train.data0,
-                data1=np.concatenate((dataSlice.train.data1, newSamples))
+                data0=j["majority"],
+                data1=np.concatenate((j["minority"], j["synthetic"]))
                 )
                 )
+            j = None
+
+        if imageFileName is not None:
+            fig_pr, ax_pr = plt.subplots()
+            fig_roc, ax_roc = plt.subplots()
 
 
         # Test this dataset with every given test-function.
         # Test this dataset with every given test-function.
         # The results are printed out and stored to the results dictionary.
         # The results are printed out and stored to the results dictionary.
         if gan.canPredict and "GAN" not in self.testFunctions.keys():
         if gan.canPredict and "GAN" not in self.testFunctions.keys():
-            self.debug(f"-> retrain GAN for predict")
-            trainData = np.concatenate((dataSlice.train.data0, dataSlice.train.data1))
-            trainLabels  = np.concatenate((np.zeros(len(dataSlice.train.data0)), np.zeros(len(dataSlice.train.data1)) + 1))
-            indices = shuffle(np.array(range(len(trainData))))
-            trainData = trainData[indices]
-            trainLabels = trainLabels[indices]
-            indices = None
-            gan.retrainDiscriminitor(trainData, trainLabels)
-            trainData = None
-            trainLabels = None
-            self.debug(f"-> test with GAN.predict")
-            testResult = runTester(dataSlice, gan)
+            self.debug(f"-> test with 'GAN'")
+            testResult = runTester(dataSlice, gan, "GAN", f"{imageFileName}-GAN.json")
             self.debug(str(testResult))
             self.debug(str(testResult))
             self.results["GAN"].append(testResult)
             self.results["GAN"].append(testResult)
 
 
+            if imageFileName is not None:
+                testResult.plotPR(ax_pr)
+                testResult.plotROC(ax_roc)
+
         for testerName in self.testFunctions:
         for testerName in self.testFunctions:
             self.debug(f"-> test with '{testerName}'")
             self.debug(f"-> test with '{testerName}'")
-            testResult = (self.testFunctions[testerName])(dataSlice)
+            testResult = (self.testFunctions[testerName])(dataSlice, f"{imageFileName}-{testerName}.json")
             self.debug(str(testResult))
             self.debug(str(testResult))
             self.results[testerName].append(testResult)
             self.results[testerName].append(testResult)
 
 
+            if imageFileName is not None:
+                testResult.plotPR(ax_pr)
+                testResult.plotROC(ax_roc)
+
+        if imageFileName is not None:
+            fig_pr.savefig(imageFileName + "_PR.pdf")
+            fig_roc.savefig(imageFileName + "_ROC.pdf")
+
 
 
     def saveResultsTo(self, fileName):
     def saveResultsTo(self, fileName):
         avgResults = {}
         avgResults = {}
@@ -313,4 +326,4 @@ def plotCloud(data0, data1, dataNew=None, outputFile=None, title="", doShow=True
         plt.show()
         plt.show()
 
 
     if outputFile is not None:
     if outputFile is not None:
-        fig.savefig(outputFile)
+        fig.savefig(outputFile + ".pdf")

+ 3 - 0
library/generators/ConvGeN.py

@@ -407,3 +407,6 @@ class ConvGeN(GanBaseClass):
         labels = np.array([ [x, 1 - x] for x in labels])
         labels = np.array([ [x, 1 - x] for x in labels])
         self.maj_min_discriminator.fit(x=data, y=labels, batch_size=20, epochs=self.neb_epochs)
         self.maj_min_discriminator.fit(x=data, y=labels, batch_size=20, epochs=self.neb_epochs)
         self.maj_min_discriminator.trainable = False
         self.maj_min_discriminator.trainable = False
+    
+    def fit(self, data, labels):
+        """
+        Trains the discriminator with the given data.
+        This is a wrapper around /retrainDiscriminitor/ so that the instance
+        offers a *fit* method like the other testers.
+
+        *data* and *labels* are handed over to /retrainDiscriminitor/ unchanged.
+
+        It returns whatever /retrainDiscriminitor/ returns.
+        """
+        result = self.retrainDiscriminitor(data, labels)
+        return result

+ 63 - 37
library/testers.py

@@ -14,7 +14,12 @@ from sklearn.metrics import confusion_matrix
 from sklearn.metrics import average_precision_score
 from sklearn.metrics import average_precision_score
 from sklearn.metrics import f1_score
 from sklearn.metrics import f1_score
 from sklearn.metrics import cohen_kappa_score
 from sklearn.metrics import cohen_kappa_score
+from sklearn.metrics import RocCurveDisplay
+from sklearn.metrics import PrecisionRecallDisplay
 from sklearn.ensemble import GradientBoostingClassifier
 from sklearn.ensemble import GradientBoostingClassifier
+from imblearn.metrics import geometric_mean_score
+
+from library.cache import dataCache
 
 
 _tF1 = "f1 score"
 _tF1 = "f1 score"
 _tTN = "TN"
 _tTN = "TN"
@@ -24,6 +29,7 @@ _tFP = "FP"
 _tFP = "RF"
 _tFP = "RF"
 _tAps = "average precision score"
 _tAps = "average precision score"
 _tCks = "cohens kappa score"
 _tCks = "cohens kappa score"
+_tGMean = "G-Mean score"
 
 
 class TestResult:
 class TestResult:
     """
     """
@@ -45,11 +51,13 @@ class TestResult:
         *aps* is a real number representing the average precision score.
         *aps* is a real number representing the average precision score.
         """
         """
         self.title = title
         self.title = title
-        self.heading = [_tTN, _tTP, _tFN, _tFP, _tF1, _tCks]
-        if aps is not None:
-            self.heading.append(_tAps)
+        self.heading = [_tTN, _tTP, _tFN, _tFP, _tF1, _tCks, _tAps, _tGMean]
         self.data = { n: 0.0 for n in self.heading }
         self.data = { n: 0.0 for n in self.heading }
 
 
+
+        self.labels = labels
+        self.prediction = prediction
+
         if labels is not None and prediction is not None:
         if labels is not None and prediction is not None:
             self.data[_tF1]     = f1_score(labels, prediction)
             self.data[_tF1]     = f1_score(labels, prediction)
             self.data[_tCks]    = cohen_kappa_score(labels, prediction)
             self.data[_tCks]    = cohen_kappa_score(labels, prediction)
@@ -59,10 +67,14 @@ class TestResult:
             self.data[_tTP] = tp
             self.data[_tTP] = tp
             self.data[_tFN] = fn
             self.data[_tFN] = fn
             self.data[_tFP] = fp
             self.data[_tFP] = fp
+            self.data[_tGMean] = geometric_mean_score(labels, prediction)
+            if aps is None:
+                self.data[_tAps] = average_precision_score(labels, prediction)
 
 
         if aps is not None:
         if aps is not None:
             self.data[_tAps] = aps
             self.data[_tAps] = aps
 
 
+
     def __str__(self):
     def __str__(self):
         """
         """
         Generates a text representing this result.
         Generates a text representing this result.
@@ -146,78 +158,92 @@ class TestResult:
                     a.data[k] = 0.0
                     a.data[k] = 0.0
             return (mi, ma, a)
             return (mi, ma, a)
 
 
+    def plotPR(self, ax):
+        """
+        Adds the precision-recall curve of this result to a plot.
+        The curve is named after the title of this result.
+
+        *ax* is the axes object to draw on.
+
+        NOTE(review): uses the stored labels and prediction, so both have to
+        be set. The prediction presumably holds class labels and not scores;
+        confirm that the resulting curve is the intended one.
+        """
+        PrecisionRecallDisplay.from_predictions(
+            self.labels,
+            self.prediction,
+            name=self.title,
+            ax=ax
+            )
+
+    def plotROC(self, ax):
+        """
+        Adds the ROC curve of this result to a plot.
+        The curve is named after the title of this result.
+
+        *ax* is the axes object to draw on.
+
+        NOTE(review): uses the stored labels and prediction, so both have to
+        be set. The prediction presumably holds class labels and not scores;
+        confirm that the resulting curve is the intended one.
+        """
+        RocCurveDisplay.from_predictions(
+            self.labels,
+            self.prediction,
+            name=self.title,
+            ax=ax
+            )
+
         
         
 
 
 
 
-def lr(ttd):
+def lr(ttd, jsonFileName=None):
     """
     """
     Runs a test for a dataset with the logistic regression algorithm.
     Runs a test for a dataset with the logistic regression algorithm.
     It returns a /TestResult./
     It returns a /TestResult./
 
 
     *ttd* is a /library.dataset.TrainTestData/ instance containing data to test.
     *ttd* is a /library.dataset.TrainTestData/ instance containing data to test.
     """
     """
-    checkType(ttd)
-    logreg = LogisticRegression(
-        C=1e5,
-        solver='lbfgs',
-        max_iter=10000,
-        multi_class='multinomial',
-        class_weight={0: 1, 1: 1.3}
-        )
-    logreg.fit(ttd.train.data, ttd.train.labels)
-
-    prediction = logreg.predict(ttd.test.data)
-
-    prob_lr = logreg.predict_proba(ttd.test.data)
-    aps_lr = average_precision_score(ttd.test.labels, prob_lr[:,1])
-    return TestResult("LR", ttd.test.labels, prediction, aps_lr)
-
-
-
-def knn(ttd):
+    def g(nothing):
+        checkType(ttd)
+        logreg = LogisticRegression(
+            C=1e5,
+            solver='lbfgs',
+            max_iter=10000,
+            multi_class='multinomial',
+            class_weight={0: 1, 1: 1.3}
+            )
+        logreg.fit(ttd.train.data, ttd.train.labels)
+        prediction = logreg.predict(ttd.test.data)
+        prob_lr = logreg.predict_proba(ttd.test.data)
+        aps_lr = average_precision_score(ttd.test.labels, prob_lr[:,1])
+        return {
+            "labels": ttd.test.labels,
+            "prediction": prediction,
+            "aps_lr": aps_lr
+            }
+
+    d = dataCache(jsonFileName, g)
+    return TestResult("LR", d["labels"], d["prediction"], d["aps_lr"])
+
+
+
+def knn(ttd, jsonFileName=None):
     """
     """
     Runs a test for a dataset with the k-nearest neighbours algorithm.
     Runs a test for a dataset with the k-nearest neighbours algorithm.
     It returns a /TestResult./
     It returns a /TestResult./
 
 
     *ttd* is a /library.dataset.TrainTestData/ instance containing data to test.
     *ttd* is a /library.dataset.TrainTestData/ instance containing data to test.
     """
     """
-    checkType(ttd)
     knnTester = KNeighborsClassifier(n_neighbors=10)
     knnTester = KNeighborsClassifier(n_neighbors=10)
-    knnTester.fit(ttd.train.data, ttd.train.labels)
-    return runTester(ttd, knnTester, "KNN")
+    return runTester(ttd, knnTester, "KNN", jsonFileName)
 
 
 
 
-def gb(ttd):
+def gb(ttd, jsonFileName=None):
     """
     """
     Runs a test for a dataset with the gradient boosting algorithm.
     Runs a test for a dataset with the gradient boosting algorithm.
     It returns a /TestResult./
     It returns a /TestResult./
 
 
     *ttd* is a /library.dataset.TrainTestData/ instance containing data to test.
     *ttd* is a /library.dataset.TrainTestData/ instance containing data to test.
     """
     """
-    checkType(ttd)
     tester = GradientBoostingClassifier()
     tester = GradientBoostingClassifier()
-    tester.fit(ttd.train.data, ttd.train.labels)
-    return runTester(ttd, tester, "GB")
+    return runTester(ttd, tester, "GB", jsonFileName)
 
 
 
 
 
 
-def rf(ttd):
+def rf(ttd, jsonFileName=None):
     """
     """
     Runs a test for a dataset with the random forest algorithm.
     Runs a test for a dataset with the random forest algorithm.
     It returns a /TestResult./
     It returns a /TestResult./
 
 
     *ttd* is a /library.dataset.TrainTestData/ instance containing data to test.
     *ttd* is a /library.dataset.TrainTestData/ instance containing data to test.
     """
     """
-    checkType(ttd)
     tester = RandomForestClassifier()
     tester = RandomForestClassifier()
-    tester.fit(ttd.train.data, ttd.train.labels)
-    return runTester(ttd, tester, "RF")
+    return runTester(ttd, tester, "RF", jsonFileName)
+
 
 
 
 
+def runTester(ttd, tester, name="GAN", jsonFileName=None):
+    def g(nothing):
+        checkType(ttd)
+        tester.fit(ttd.train.data, ttd.train.labels)
+        return {
+            "labels": ttd.test.labels,
+            "prediction": tester.predict(ttd.test.data)
+            }
 
 
-def runTester(ttd, tester, name="GAN"):
-    prediction = tester.predict(ttd.test.data)
-    return TestResult(name, ttd.test.labels, prediction)
+    d = dataCache(jsonFileName, g)
+    return TestResult(name, d["labels"], d["prediction"])
 
 
 def checkType(t):
 def checkType(t):
     if str(type(t)) == "<class 'numpy.ndarray'>":
     if str(type(t)) == "<class 'numpy.ndarray'>":

Bu fark içinde çok fazla dosya değişikliği olduğu için bazı dosyalar gösterilmiyor