4 年之前 · 6da7e8b68b
--- a/Exercise.ipynb
+++ b/Exercise.ipynb
--- a/library/SimpleGan.py
+++ b/library/SimpleGan.py
@@ -22,10 +22,12 @@ class SimpleGan(GanBaseClass):
 
															     """
														
 
															     A class for a simple GAN.
														
 
															     """
														
 
															-    def __init__(self, numOfFeatures=786, noiseSize=100):
														
 
															+    def __init__(self, numOfFeatures=786, noiseSize=100, epochs=3, batchSize=128):
														
 
															+        """
														
 
															+        Creates an untrained GAN and stores its configuration.
														
 
															+
														
 
															+        *numOfFeatures* and *noiseSize* are stored unchanged on the instance.
														
 
															+        *epochs* is the number of epochs that *train* runs.
														
 
															+        *batchSize* is the number of samples per batch; *train* also uses it
														
 
															+        as the number of batches per epoch.
														
 
															+        """
														
 
															         self.isTrained = False
														
 
															         self.noiseSize = noiseSize
														
 
															         self.numOfFeatures = numOfFeatures
														
 
															+        self.epochs = epochs
														
 
															+        self.batchSize = batchSize
														
 
															     def reset(self):
														
 
															         """
														
@@ -82,41 +84,41 @@ class SimpleGan(GanBaseClass):
 
															         discriminator.compile(loss='binary_crossentropy', optimizer=self._adamOptimizer())
														
 
															         return discriminator
														
 
															-    def train(self, dataset, epochs=1, batchSize=128):
														
 
															+    def train(self, dataset):
														
 
															         trainData = dataset.data1
														
 
															         trainDataSize = trainData.shape[0]
														
 
															         if trainDataSize <= 0:
														
 
															             raise AttributeError("Train GAN: Expected data class 1 to contain at least one point.")
														
 
															-        for e in range(epochs):
														
 
															-            print(f"Epoch {e + 1}")
														
 
															-            for _ in range(batchSize):
														
 
															+        for e in range(self.epochs):
														
 
															+            print(f"Epoch {e + 1}/{self.epochs}")
														
 
															+            for _ in range(self.batchSize):
														
 
															                 #generate  random noise as an input  to  initialize the  generator
														
 
															-                noise= np.random.normal(0, 1, [batchSize, self.noiseSize])
														
 
															+                noise= np.random.normal(0, 1, [self.batchSize, self.noiseSize])
														
 
															                 # Generate fake MNIST images from noised input
														
 
															                 generatedImages = self.generator.predict(noise)
														
 
															                 # Get a random set of  real images
														
 
															                 image_batch = dataset.data1[
														
 
															-                    np.random.randint(low=0, high=trainDataSize, size=batchSize)
														
 
															+                    np.random.randint(low=0, high=trainDataSize, size=self.batchSize)
														
 
															                     ]
														
 
															                 #Construct different batches of  real and fake data
														
 
															                 X = np.concatenate([image_batch, generatedImages])
														
 
															                 # Labels for generated and real data
														
 
															-                y_dis=np.zeros(2 * batchSize)
														
 
															-                y_dis[:batchSize] = 0.9
														
 
															+                y_dis=np.zeros(2 * self.batchSize)
														
 
															+                y_dis[:self.batchSize] = 0.9
														
 
															                 #Pre train discriminator on  fake and real data  before starting the gan.
														
 
															                 self.discriminator.trainable = True
														
 
															                 self.discriminator.train_on_batch(X, y_dis)
														
 
															                 #Tricking the noised input of the Generator as real data
														
 
															-                noise = np.random.normal(0, 1, [batchSize, 100])
														
 
															-                y_gen = np.ones(batchSize)
														
 
															+                noise = np.random.normal(0, 1, [self.batchSize, 100])
														
 
															+                y_gen = np.ones(self.batchSize)
														
 
															                 # During the training of gan,
														
 
															                 # the weights of discriminator should be fixed.
														
--- a/library/exercise.py
+++ b/library/exercise.py
@@ -13,7 +13,7 @@ from sklearn.preprocessing import StandardScaler
 
															 import matplotlib.pyplot as plt
														
 
															 from library.dataset import DataSet, TrainTestData
														
 
															-from library.testers import lr, svm, knn
														
 
															+from library.testers import lr, svm, knn, gb, TestResult
														
 
															 class Exercise:
														
@@ -48,6 +48,7 @@ class Exercise:
 
															             self.testFunctions = {
														
 
															                 "LR": lr,
														
 
															                 "SVM": svm,
														
 
															+                "GB": gb,
														
 
															                 "KNN": knn
														
 
															                 }
														
@@ -117,6 +118,23 @@ class Exercise:
 
															         self.debug("### Exercise is done.")
														
 
															+        for (n, name) in enumerate(self.results):
														
 
															+            stats = None
														
 
															+            for (m, result) in enumerate(self.results[name]):
														
 
															+                stats = result.addMinMaxAvg(stats)
														
 
															+        
														
 
															+            (mi, mx, avg) = TestResult.finishMinMaxAvg(stats)
														
 
															+            self.debug("")
														
 
															+            self.debug(f"-----[ {avg.title} ]-----")
														
 
															+            self.debug("maximum:")
														
 
															+            self.debug(str(mx))
														
 
															+            self.debug("")
														
 
															+            self.debug("average:")
														
 
															+            self.debug(str(avg))
														
 
															+            self.debug("")
														
 
															+            self.debug("minimum:")
														
 
															+            self.debug(str(mi))
														
 
															+
														
 
															     def _exerciseWithDataSlice(self, gan, dataSlice):
														
 
															         """
														
 
															         Runs one test for the given gan and dataSlice.
														
@@ -143,14 +161,15 @@ class Exercise:
 
															         if numOfNeededSamples > 0:
														
 
															             self.debug(f"-> create {numOfNeededSamples} synthetic samples")
														
 
															             newSamples = gan.generateData(numOfNeededSamples)
														
 
															+
														
 
															+            # Print out an overview of the new dataset.
														
 
															+            plotCloud(dataSlice.train.data0, dataSlice.train.data1, newSamples)
														
 
															+
														
 
															             dataSlice.train = DataSet(
														
 
															                 data0=dataSlice.train.data0,
														
 
															                 data1=np.concatenate((dataSlice.train.data1, newSamples))
														
 
															                 )
														
 
															-        # Print out an overview of the new dataset.
														
 
															-        plotCloud(dataSlice.train)
														
 
															-
														
 
															         # Test this dataset with every given test-function.
														
 
															         # The results are printed out and stored to the results dictionary.
														
 
															         for testerName in self.testFunctions:
														
@@ -163,25 +182,34 @@ class Exercise:
 
															     def saveResultsTo(self, fileName):
														
 
															+        """
														
 
															+        Writes all collected results as semicolon-separated text to *fileName*.
														
 
															+
														
 
															+        Every group in *self.results* starts with a line holding its name. If the
														
 
															+        group has results, a heading line and one numbered line per result follow.
														
 
															+        The lines "max", "avg" and "min" close the group. Groups are separated by
														
 
															+        a line containing "---".
														
 
															+        """
														
 
															         with open(fileName, "w") as f:
														
 
															             for (n, name) in enumerate(self.results):
														
 
															-                if n == 0:
														
 
															-                    f.write("---")
														
 
															+                if n > 0:
														
 
															+                    f.write("---\n")
														
 
															                 f.write(name + "\n")
														
 
															                 isFirst = True
														
 
															-                for result in self.results[name]:
														
 
															+                stats = None
														
 
															+                for (m, result) in enumerate(self.results[name]):
														
 
															                     if isFirst:
														
 
															                         isFirst = False
														
 
															-                        f.write(result.csvHeading() + "\n")
														
 
															-                    f.write(result.toCSV() + "\n")
														
 
															+                        f.write("Nr.;" + result.csvHeading() + "\n")
														
 
															+
														
 
															+                    stats = result.addMinMaxAvg(stats)
														
 
															+
														
 
															+                    f.write(f"{m + 1};" + result.toCSV() + "\n")
														
 
															+                # Summary rows; the first column holds a label instead of a number.
														
 
															+                (mi, mx, avg) = TestResult.finishMinMaxAvg(stats)
														
 
															+                f.write("max;" + mx.toCSV() + "\n")
														
 
															+                f.write("avg;" + avg.toCSV() + "\n")
														
 
															+                f.write("min;" + mi.toCSV() + "\n")
														
 
															+
														
 
															-def plotCloud(dataset):
														
 
															+def plotCloud(data0, data1, dataNew):
														
 
															     """
														
 
															     Does a PCA analysis of the given data and plot the both important axis.
														
 
															     """
														
 
															     # Normalizes the data.
														
 
															-    data_t = StandardScaler().fit_transform(dataset.data)
														
 
															+    data_t = StandardScaler().fit_transform(np.concatenate([data0, data1, dataNew]))
														
 
															     # Run the PCA analysis.
														
 
															     pca = PCA(n_components=2)
														
@@ -189,7 +217,11 @@ def plotCloud(dataset):
 
															     # Create a DataFrame for plotting.
														
 
															     result = pd.DataFrame(data=pc, columns=['PCA0', 'PCA1'])
														
 
															-    result['Cluster'] = dataset.labels
														
 
															+    result['Cluster'] = np.concatenate([
														
 
															+        np.zeros(len(data0)),
														
 
															+        np.zeros(len(data1)) + 1,
														
 
															+        np.zeros(len(dataNew)) + 2
														
 
															+        ])
														
 
															     # Plot the analysis results.
														
 
															     sns.set( font_scale=1.2)
														
@@ -200,5 +232,5 @@ def plotCloud(dataset):
 
															       legend=False,
														
 
															       scatter_kws={"s": 3}, palette="Set1") # specify the point size
														
 
															-    plt.legend(title='', loc='upper left', labels=['0', '1'])
														
 
															+    plt.legend(title='', loc='upper left', labels=['0', '1', '2'])
														
 
															     plt.show()
														
--- a/library/testers.py
+++ b/library/testers.py
@@ -13,7 +13,17 @@ from sklearn.metrics import confusion_matrix
 
															 from sklearn.metrics import average_precision_score
														
 
															 from sklearn.metrics import f1_score
														
 
															 from sklearn.metrics import balanced_accuracy_score
														
 
															-
														
 
															+from sklearn.metrics import cohen_kappa_score
														
 
															+from sklearn.ensemble import GradientBoostingClassifier
														
 
															+
														
 
															+_tF1 = "f1 score"
														
 
															+_tBalAcc = "balanced accuracy"
														
 
															+_tTN = "TN"
														
 
															+_tTP = "TP"
														
 
															+_tFN = "FN"
														
 
															+_tFP = "FP"
														
 
															+_tAps = "average precision score"
														
 
															+_tCks = "cohens kappa score"
														
 
															 class TestResult:
														
 
															     """
														
@@ -22,7 +32,7 @@ class TestResult:
 
															     It stores its *title*, a confusion matrix (*con_mat*), the balanced accuracy score (*bal_acc*)
														
 
															     and the f1 score (*f1*). If given the average precision score is also stored (*aps*).
														
 
															     """
														
 
															-    def __init__(self, title, labels, prediction, aps=None):
														
 
															+    def __init__(self, title, labels=None, prediction=None, aps=None):
														
 
															         """
														
 
															         Creates an instance of this class. The stored data will be generated from the given values.
														
@@ -35,57 +45,109 @@ class TestResult:
 
															         *aps* is a real number representing the average precision score.
														
 
															         """
														
 
															         self.title = title
														
 
															-        self.con_mat = confusion_matrix(labels, prediction)
														
 
															-        self.bal_acc = balanced_accuracy_score(labels, prediction)
														
 
															-        self.f1 = f1_score(labels, prediction)
														
 
															-        self.aps = aps
														
 
															+        self.heading = [_tTN, _tTP, _tFN, _tFP, _tF1, _tBalAcc, _tCks]
														
 
															+        if aps is not None:
														
 
															+            self.heading.append(_tAps)
														
 
															+        self.data = { n: 0.0 for n in self.heading }
														
 
															+
														
 
															+        if labels is not None and prediction is not None:
														
 
															+            self.data[_tBalAcc] = balanced_accuracy_score(labels, prediction)
														
 
															+            self.data[_tF1]     = f1_score(labels, prediction)
														
 
															+            self.data[_tCks]    = cohen_kappa_score(labels, prediction)
														
 
															+            conMat = self._enshureConfusionMatrix(confusion_matrix(labels, prediction))
														
 
															+            [[tn, fp], [fn, tp]] = conMat
														
 
															+            self.data[_tTN] = tn
														
 
															+            self.data[_tTP] = tp
														
 
															+            self.data[_tFN] = fn
														
 
															+            self.data[_tFP] = fp
														
 
															+
														
 
															+        if aps is not None:
														
 
															+            self.data[_tAps] = aps
														
 
															     def __str__(self):
														
 
															         """
														
 
															         Generates a text representing this result.
														
 
															         """
														
 
															-        #tn, fp, fn, tp = con_mat.ravel()
														
 
															-        r = self.con_mat.ravel()
														
 
															-        text = f"tn, fp, fn, tp: {r}"
														
 
															+        text = ""
														
 
															-        if self.aps is not None:
														
 
															-            text += f"\naverage_pr_score: {self.aps}"
														
 
															+        tn = self.data[_tTN]
														
 
															+        tp = self.data[_tTP]
														
 
															+        fn = self.data[_tFN]
														
 
															+        fp = self.data[_tFP]
														
 
															+        # The confusion matrix is printed row by row: (tn, fp), then (fn, tp).
														
 
															+        text += f"{self.title} tn, fp: {tn}, {fp}\n"
														
 
															+        text += f"{self.title} fn, tp: {fn}, {tp}\n"
														
 
															-        text += f"\nf1 score_{self.title}: {self.f1}"
														
 
															-        text += f"\nbalanced accuracy_{self.title}: {self.bal_acc}"
														
 
															-        text += f"\nconfusion matrix_{self.title}\n {self.con_mat}"
														
 
															-        return text
														
 
															+        # All remaining values (everything but the four counts), one per line.
														
 
															+        for k in self.heading:
														
 
															+            if k not in [_tTP, _tTN, _tFP, _tFN]:
														
 
															+                text += f"{self.title} {k}: {self.data[k]:.3f}\n"
														
 
															+        return text
														
 
															     def csvHeading(self):
														
 
															-        r = [
														
 
															-            "F1 score",
														
 
															-            "balanced accuracy",
														
 
															-            "TN",
														
 
															-            "FP",
														
 
															-            "FN",
														
 
															-            "TP"
														
 
															-            ]
														
 
															+        return ";".join(self.heading)
														
 
															-        if self.aps is not None:
														
 
															-            r.append("Aps")
														
 
															+    def toCSV(self):
														
 
															+        return ";".join(map(lambda k: f"{self.data[k]:0.3f}", self.heading))
														
 
															-        return ";".join(r)
														
 
															+    @staticmethod
														
 
															+    def _enshureConfusionMatrix(c):
														
 
															+        c0 = [0.0, 0.0]
														
 
															+        c1 = [0.0, 0.0]
														
 
															-    def toCSV(self):
														
 
															-        r = map(str, [
														
 
															-            self.f1,
														
 
															-            self.bal_acc,
														
 
															-            self.con_mat[0] if len(self.con_mat) > 0 else float(self.con_mat),
														
 
															-            self.con_mat[1] if len(self.con_mat) > 1 else 0,
														
 
															-            self.con_mat[2] if len(self.con_mat) > 2 else 0,
														
 
															-            self.con_mat[3] if len(self.con_mat) > 3 else 0
														
 
															-            ])
														
 
															+        if len(c) > 0:
														
 
															+            if len(c[0]) > 0:
														
 
															+                c0[0] = c[0][0]
														
 
															+
														
 
															+            if len(c[0]) > 1:
														
 
															+                c0[1] = c[0][1]
														
 
															+
														
 
															+        if len(c) > 1 and len(c[1]) > 1:
														
 
															+            c1[0] = c[1][0]
														
 
															+            c1[1] = c[1][1]
														
 
															+
														
 
															+        return [c0, c1]
														
 
															+
														
 
															+    def copy(self):
														
 
															+        """
														
 
															+        Creates a new /TestResult/ with the same title whose *heading* and
														
 
															+        *data* are shallow copies of the ones stored in this instance.
														
 
															+        """
														
 
															+        duplicate = TestResult(self.title)
														
 
															+        duplicate.heading = list(self.heading)
														
 
															+        duplicate.data = dict(self.data)
														
 
															+        return duplicate
														
 
															+
														
 
															+
														
 
															+    def addMinMaxAvg(self, mma=None):
														
 
															+        if mma is None:
														
 
															+            return (1, self.copy(), self.copy(), self.copy())
														
 
															+
														
 
															+        (n, mi, mx, a) = mma
														
 
															+
														
 
															+        for k in a.heading:
														
 
															+            if k in self.heading:
														
 
															+                a.data[k] += self.data[k]
														
 
															-        if self.aps is not None:
														
 
															-            r.append(str(self.aps))
														
 
															+        for k in mi.heading:
														
 
															+            if k in self.heading:
														
 
															+                mi.data[k] = min(mi.data[k], self.data[k])
														
 
															-        return ";".join(r)
														
 
															+        for k in mx.heading:
														
 
															+            if k in self.heading:
														
 
															+                mx.data[k] = max(mx.data[k], self.data[k])
														
 
															+
														
 
															+        return (n + 1, mi, mx, a)
														
 
															+
														
 
															+    @staticmethod
														
 
															+    def finishMinMaxAvg(mma):
														
 
															+        """
														
 
															+        Turns an accumulator built with *addMinMaxAvg* into the final statistics.
														
 
															+
														
 
															+        *mma* is None or a tuple (count, minimum, maximum, sum) as returned
														
 
															+        by *addMinMaxAvg*.
														
 
															+
														
 
															+        Returns a tuple (minimum, maximum, average) of /TestResult/ instances.
														
 
															+        If *mma* is None, three empty results with the title "?" are returned.
														
 
															+        """
														
 
															+        if mma is None:
														
 
															+            return (TestResult("?"), TestResult("?"), TestResult("?"))
														
 
															+
														
 
															+        (n, mi, mx, total) = mma
														
 
															+
														
 
															+        # Divide a copy, not the accumulator itself: finishing the same
														
 
															+        # accumulator twice must not divide the sums a second time.
														
 
															+        avg = total.copy()
														
 
															+        for k in avg.heading:
														
 
															+            avg.data[k] = total.data[k] / n if n > 0 else 0.0
														
 
															+        return (mi, mx, avg)
														
 
															+
														
 
															+        
														
 
															 def lr(ttd):
														
@@ -146,6 +208,21 @@ def knn(ttd):
 
															     return TestResult("KNN", ttd.test.labels, prediction)
														
 
															+def gb(ttd):
														
 
															+    """
														
 
															+    Tests a dataset with a gradient boosting classifier.
														
 
															+    The classifier is fitted on the training part of *ttd* and predicts the
														
 
															+    labels of the test part. It returns a /TestResult/ with the title "GB".
														
 
															+
														
 
															+    *ttd* is a /library.dataset.TrainTestData/ instance containing data to test.
														
 
															+    """
														
 
															+    checkType(ttd)
														
 
															+    model = GradientBoostingClassifier().fit(ttd.train.data, ttd.train.labels)
														
 
															+    predicted = model.predict(ttd.test.data)
														
 
															+    return TestResult("GB", ttd.test.labels, predicted)
														
 
															+
														
 
															+
														
 
															 def checkType(t):
														
 
															     if str(type(t)) == "<class 'numpy.ndarray'>":
														
 
															         return t.shape[0] > 0 and all(map(checkType, t))