|
@@ -5,6 +5,7 @@ in generating synthetic samples for datasets with a minority class.
|
|
|
|
|
|
|
|
|
|
|
|
|
import os
|
|
import os
|
|
|
|
|
+import os.path
|
|
|
import numpy as np
|
|
import numpy as np
|
|
|
from sklearn.decomposition import PCA
|
|
from sklearn.decomposition import PCA
|
|
|
from sklearn.preprocessing import StandardScaler
|
|
from sklearn.preprocessing import StandardScaler
|
|
@@ -13,6 +14,7 @@ import matplotlib.pyplot as plt
|
|
|
|
|
|
|
|
from library.dataset import DataSet, TrainTestData
|
|
from library.dataset import DataSet, TrainTestData
|
|
|
from library.testers import lr, knn, gb, rf, TestResult, runTester
|
|
from library.testers import lr, knn, gb, rf, TestResult, runTester
|
|
|
|
|
+from library.cache import dataCache
|
|
|
import json
|
|
import json
|
|
|
|
|
|
|
|
|
|
|
|
@@ -126,11 +128,10 @@ class Exercise:
|
|
|
sliceTitle = f"Slice {sliceNr + 1}/{self.numOfSlices}"
|
|
sliceTitle = f"Slice {sliceNr + 1}/{self.numOfSlices}"
|
|
|
self.debug(f"\n------ {stepTitle}: {sliceTitle} -------")
|
|
self.debug(f"\n------ {stepTitle}: {sliceTitle} -------")
|
|
|
imageFileName = None
|
|
imageFileName = None
|
|
|
- pickleFileName = None
|
|
|
|
|
|
|
+ jsonFileName = None
|
|
|
if resultsFileName is not None:
|
|
if resultsFileName is not None:
|
|
|
- imageFileName = f"{resultsFileName}/Step{shuffleStep + 1}_Slice{sliceNr + 1}.pdf"
|
|
|
|
|
- pickleFileName = f"{resultsFileName}/Step{shuffleStep + 1}_Slice{sliceNr + 1}.json"
|
|
|
|
|
- self._exerciseWithDataSlice(gan, sliceData, imageFileName, pickleFileName)
|
|
|
|
|
|
|
+ imageFileName = f"{resultsFileName}/Step{shuffleStep + 1}_Slice{sliceNr + 1}"
|
|
|
|
|
+ self._exerciseWithDataSlice(gan, sliceData, imageFileName)
|
|
|
|
|
|
|
|
self.debug("### Exercise is done.")
|
|
self.debug("### Exercise is done.")
|
|
|
|
|
|
|
@@ -156,7 +157,7 @@ class Exercise:
|
|
|
|
|
|
|
|
return {}
|
|
return {}
|
|
|
|
|
|
|
|
- def _exerciseWithDataSlice(self, gan, dataSlice, imageFileName=None, pickleFileName=None):
|
|
|
|
|
|
|
+ def _exerciseWithDataSlice(self, gan, dataSlice, imageFileName=None):
|
|
|
"""
|
|
"""
|
|
|
Runs one test for the given gan and dataSlice.
|
|
Runs one test for the given gan and dataSlice.
|
|
|
|
|
|
|
@@ -167,62 +168,74 @@ class Exercise:
|
|
|
one data slice with training and testing data.
|
|
one data slice with training and testing data.
|
|
|
"""
|
|
"""
|
|
|
|
|
|
|
|
- # Start over with a new GAN instance.
|
|
|
|
|
- self.debug("-> Reset the GAN")
|
|
|
|
|
- gan.reset(dataSlice.train)
|
|
|
|
|
-
|
|
|
|
|
- # Train the gan so it can produce synthetic samples.
|
|
|
|
|
- self.debug("-> Train generator for synthetic samples")
|
|
|
|
|
- gan.train(dataSlice.train)
|
|
|
|
|
|
|
+ jsonFileName = f"{imageFileName}.json"
|
|
|
|
|
|
|
|
# Count how many synthetic samples are needed.
|
|
# Count how many synthetic samples are needed.
|
|
|
numOfNeededSamples = dataSlice.train.size0 - dataSlice.train.size1
|
|
numOfNeededSamples = dataSlice.train.size0 - dataSlice.train.size1
|
|
|
|
|
|
|
|
|
|
+ # Start over with a new GAN instance.
|
|
|
|
|
+ self.debug("-> Reset the GAN")
|
|
|
|
|
+ gan.reset(dataSlice.train)
|
|
|
|
|
+
|
|
|
# Add synthetic samples (generated by the GAN) to the minority class.
|
|
# Add synthetic samples (generated by the GAN) to the minority class.
|
|
|
if numOfNeededSamples > 0:
|
|
if numOfNeededSamples > 0:
|
|
|
- self.debug(f"-> create {numOfNeededSamples} synthetic samples")
|
|
|
|
|
- newSamples = gan.generateData(numOfNeededSamples)
|
|
|
|
|
|
|
+ def synth(params):
|
|
|
|
|
+ me = params["self"]
|
|
|
|
|
+ train = params["train"]
|
|
|
|
|
+
|
|
|
|
|
+ # Train the gan so it can produce synthetic samples.
|
|
|
|
|
+ me.debug("-> Train generator for synthetic samples")
|
|
|
|
|
+ gan.train(train)
|
|
|
|
|
|
|
|
- if pickleFileName is not None:
|
|
|
|
|
- with open(pickleFileName, 'w') as f:
|
|
|
|
|
- json.dump({
|
|
|
|
|
- "majority": [[float(z) for z in x] for x in dataSlice.train.data0],
|
|
|
|
|
- "minority": [[float(z) for z in x] for x in dataSlice.train.data1],
|
|
|
|
|
- "synthetic": [[float(z) for z in x] for x in newSamples]
|
|
|
|
|
- }, f)
|
|
|
|
|
|
|
+ me.debug(f"-> create {numOfNeededSamples} synthetic samples")
|
|
|
|
|
+ newSamples = gan.generateData(numOfNeededSamples)
|
|
|
|
|
|
|
|
- # Print out an overview of the new dataset.
|
|
|
|
|
- plotCloud(dataSlice.train.data0, dataSlice.train.data1, newSamples, outputFile=imageFileName, doShow=False)
|
|
|
|
|
|
|
+ # Print out an overview of the new dataset.
|
|
|
|
|
+ plotCloud(train.data0, train.data1, newSamples, outputFile=imageFileName, doShow=False)
|
|
|
|
|
|
|
|
|
|
+ return {
|
|
|
|
|
+ "majority": train.data0,
|
|
|
|
|
+ "minority": train.data1,
|
|
|
|
|
+ "synthetic": newSamples
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ j = dataCache(jsonFileName, synth, {"self": self, "train":dataSlice.train})
|
|
|
dataSlice.train = DataSet(
|
|
dataSlice.train = DataSet(
|
|
|
- data0=dataSlice.train.data0,
|
|
|
|
|
- data1=np.concatenate((dataSlice.train.data1, newSamples))
|
|
|
|
|
|
|
+ data0=j["majority"],
|
|
|
|
|
+ data1=np.concatenate((j["minority"], j["synthetic"]))
|
|
|
)
|
|
)
|
|
|
|
|
+ j = None
|
|
|
|
|
+
|
|
|
|
|
+ if imageFileName is not None:
|
|
|
|
|
+ fig_pr, ax_pr = plt.subplots()
|
|
|
|
|
+ fig_roc, ax_roc = plt.subplots()
|
|
|
|
|
|
|
|
# Test this dataset with every given test-function.
|
|
# Test this dataset with every given test-function.
|
|
|
# The results are printed out and stored to the results dictionary.
|
|
# The results are printed out and stored to the results dictionary.
|
|
|
if gan.canPredict and "GAN" not in self.testFunctions.keys():
|
|
if gan.canPredict and "GAN" not in self.testFunctions.keys():
|
|
|
- self.debug(f"-> retrain GAN for predict")
|
|
|
|
|
- trainData = np.concatenate((dataSlice.train.data0, dataSlice.train.data1))
|
|
|
|
|
- trainLabels = np.concatenate((np.zeros(len(dataSlice.train.data0)), np.zeros(len(dataSlice.train.data1)) + 1))
|
|
|
|
|
- indices = shuffle(np.array(range(len(trainData))))
|
|
|
|
|
- trainData = trainData[indices]
|
|
|
|
|
- trainLabels = trainLabels[indices]
|
|
|
|
|
- indices = None
|
|
|
|
|
- gan.retrainDiscriminitor(trainData, trainLabels)
|
|
|
|
|
- trainData = None
|
|
|
|
|
- trainLabels = None
|
|
|
|
|
- self.debug(f"-> test with GAN.predict")
|
|
|
|
|
- testResult = runTester(dataSlice, gan)
|
|
|
|
|
|
|
+ self.debug(f"-> test with 'GAN'")
|
|
|
|
|
+ testResult = runTester(dataSlice, gan, "GAN", f"{imageFileName}-GAN.json")
|
|
|
self.debug(str(testResult))
|
|
self.debug(str(testResult))
|
|
|
self.results["GAN"].append(testResult)
|
|
self.results["GAN"].append(testResult)
|
|
|
|
|
|
|
|
|
|
+ if imageFileName is not None:
|
|
|
|
|
+ testResult.plotPR(ax_pr)
|
|
|
|
|
+ testResult.plotROC(ax_roc)
|
|
|
|
|
+
|
|
|
for testerName in self.testFunctions:
|
|
for testerName in self.testFunctions:
|
|
|
self.debug(f"-> test with '{testerName}'")
|
|
self.debug(f"-> test with '{testerName}'")
|
|
|
- testResult = (self.testFunctions[testerName])(dataSlice)
|
|
|
|
|
|
|
+ testResult = (self.testFunctions[testerName])(dataSlice, f"{imageFileName}-{testerName}.json")
|
|
|
self.debug(str(testResult))
|
|
self.debug(str(testResult))
|
|
|
self.results[testerName].append(testResult)
|
|
self.results[testerName].append(testResult)
|
|
|
|
|
|
|
|
|
|
+ if imageFileName is not None:
|
|
|
|
|
+ testResult.plotPR(ax_pr)
|
|
|
|
|
+ testResult.plotROC(ax_roc)
|
|
|
|
|
+
|
|
|
|
|
+ if imageFileName is not None:
|
|
|
|
|
+ fig_pr.savefig(imageFileName + "_PR.pdf")
|
|
|
|
|
+ fig_roc.savefig(imageFileName + "_ROC.pdf")
|
|
|
|
|
+
|
|
|
|
|
|
|
|
def saveResultsTo(self, fileName):
|
|
def saveResultsTo(self, fileName):
|
|
|
avgResults = {}
|
|
avgResults = {}
|
|
@@ -313,4 +326,4 @@ def plotCloud(data0, data1, dataNew=None, outputFile=None, title="", doShow=True
|
|
|
plt.show()
|
|
plt.show()
|
|
|
|
|
|
|
|
if outputFile is not None:
|
|
if outputFile is not None:
|
|
|
- fig.savefig(outputFile)
|
|
|
|
|
|
|
+ fig.savefig(outputFile + ".pdf")
|