Browse the source code

Added example image creation.

Kristian Schultz 4 years ago
parent
commit
b770191f0c

File diff suppressed because it is too large
+ 190 - 0
SyntheticPointDistribution.ipynb


BIN
images/example_ProWRAS.pdf


BIN
images/example_basisData.pdf


BIN
images/example_convGAN.pdf


BIN
images/example_folding_car_good_ProWRAS.pdf


BIN
images/example_folding_car_good_basisData.pdf


BIN
images/example_folding_car_good_convGAN.pdf


BIN
images/example_folding_car_good_simpleGAN.pdf


BIN
images/example_folding_yeast4_ProWRAS.pdf


BIN
images/example_folding_yeast4_basisData.pdf


BIN
images/example_folding_yeast4_convGAN.pdf


BIN
images/example_folding_yeast4_simpleGAN.pdf


BIN
images/example_imblearn_ozone_level_ProWRAS.pdf


BIN
images/example_imblearn_ozone_level_basisData.pdf


BIN
images/example_imblearn_ozone_level_convGAN.pdf


BIN
images/example_imblearn_ozone_level_simpleGAN.pdf


BIN
images/example_simpleGAN.pdf


BIN
images/example_x_folding_car_good_ProWRAS.pdf


BIN
images/example_x_folding_car_good_basisData.pdf


BIN
images/example_x_folding_car_good_simpleGAN.pdf


+ 44 - 10
library/analysis.py

@@ -10,6 +10,8 @@ import pickle
 import numpy as np
 import time
 import random
+import csv
+import gzip
 from imblearn.datasets import fetch_datasets
 
 
@@ -36,16 +38,41 @@ def loadDataset(datasetName):
             return True
         return f
 
-    pickle_in = open(f"{datasetName}.pickle", "rb")
-    pickle_dict = pickle.load(pickle_in)
+    print(f"Load '{datasetName}'")
+    if datasetName.startswith("data_input/imblearn_"):
+        print("from imblearn")
+        ds = fetch_datasets()
+        myData = ds[datasetName[20:]]
+        ds = None
+
+        features = myData["data"]
+        labels = myData["target"]
+    elif datasetName.startswith("data_input/kaggle_"):
+        features = []
+        labels = []
+        c = csv.reader(gzip.open(f"{datasetName}.csv.gz", "rt")) 
+        for (n, row) in enumerate(c):
+            # Skip heading
+            if n > 0:
+                features.append([float(x) for x in row[:-1]])
+                labels.append(int(row[-1]))
+
+        features = np.array(features)
+        labels = np.array(labels)
+
+    else:
+        print("from pickle file")
+        pickle_in = open(f"{datasetName}.pickle", "rb")
+        pickle_dict = pickle.load(pickle_in)
+
+        myData = pickle_dict["folding"]
+        k = myData[0]
+
+        labels = np.concatenate((k[1], k[3]), axis=0).astype(float)
+        features = np.concatenate((k[0], k[2]), axis=0).astype(float)
 
-    myData = pickle_dict["folding"]
-    k = myData[0]
-
-    labels = np.concatenate((k[1], k[3]), axis=0).astype(float)
-    features = np.concatenate((k[0], k[2]), axis=0).astype(float)
     label_1 = list(np.where(labels == 1)[0])
-    label_0 = list(np.where(labels == 0)[0])
+    label_0 = list(np.where(labels != 1)[0])
     features_1 = features[label_1]
     features_0 = features[label_0]
     cut = np.array(list(filter(isIn(features_0), features_1)))
@@ -56,7 +83,9 @@ def loadDataset(datasetName):
     #    features_1 = np.array(list(filter(isNotIn(cut), features_1)))
     #    print(f"{len(features_0)}/{len(features_1)} points after")
     
-    return DataSet(data0=features_0, data1=features_1)
+    ds = DataSet(data0=features_0, data1=features_1)
+    print("Data loaded.")
+    return ds
 
 
 def getRandGen(initValue, incValue=257, multValue=101, modulus=65537):
@@ -195,7 +224,12 @@ testSets = [
     "folding_winequality-red-4",
     "folding_yeast4",
     "folding_yeast5",
-    "folding_yeast6"
+    "folding_yeast6",
+    "imblearn_webpage",
+    "imblearn_mammography",
+    "imblearn_protein_homo",
+    "imblearn_ozone_level",
+    "kaggle_creditcard"
     ]
 
 def runAllTestSets(dataSetList):

+ 37 - 23
library/exercise.py

@@ -5,9 +5,6 @@ in generating synthetic samples for datasets with a minority class.
 
 
 import numpy as np
-import pandas as pd
-
-import seaborn as sns
 from sklearn.decomposition import PCA
 from sklearn.preprocessing import StandardScaler
 import matplotlib.pyplot as plt
@@ -205,33 +202,50 @@ class Exercise:
         return avgResults
 
 
-def plotCloud(data0, data1, dataNew):
+def plotCloud(data0, data1, dataNew=None, outputFile=None, title=""):
     """
     Runs a PCA on the given data and plots the two principal components.
     """
+
     # Normalizes the data.
-    data_t = StandardScaler().fit_transform(np.concatenate([data0, data1, dataNew]))
+    if dataNew is None:
+        data_t = StandardScaler().fit_transform(np.concatenate([data0, data1]))
+    else:
+        data_t = StandardScaler().fit_transform(np.concatenate([data0, data1, dataNew]))
+
 
     # Run the PCA analysis.
     pca = PCA(n_components=2)
     pc = pca.fit_transform(data_t)
 
-    # Create a DataFrame for plotting.
-    result = pd.DataFrame(data=pc, columns=['PCA0', 'PCA1'])
-    result['Cluster'] = np.concatenate([
-        np.zeros(len(data0)),
-        np.zeros(len(data1)) + 1,
-        np.zeros(len(dataNew)) + 2
-        ])
-
-    # Plot the analysis results.
-    sns.set( font_scale=1.2)
-    sns.lmplot( x="PCA0", y="PCA1",
-      data=result,
-      fit_reg=False,
-      hue='Cluster', # color by cluster
-      legend=False,
-      scatter_kws={"s": 3}, palette="Set1") # specify the point size
-
-    plt.legend(title='', loc='upper left', labels=['0', '1', '2'])
+    fig, ax = plt.subplots(sharex=True, sharey=True)
+    fig.set_dpi(600)
+    fig.set_figwidth(10)
+    fig.set_figheight(10)
+    fig.set_facecolor("white")
+    ax.set_title(title)
+
+    def doSubplot(m, n, c):
+        pca0 = [x[0] for x in pc[m : m + n]]
+        pca1 = [x[1] for x in pc[m : m + n]]
+        s = ax.scatter(pca0, pca1, c=c)
+
+    m = 0
+    n = len(data0)
+    doSubplot(m, n, "gray")
+    
+    m += n
+    n = len(data1)
+    doSubplot(m, n, "red")
+
+    if dataNew is not None:
+        m += n
+        n = len(dataNew)
+        doSubplot(m, n, "blue")
+
+    ax.legend(title="", loc='upper left', labels=['majority', 'minority', 'synthetic minority'])
+    ax.set_xlabel("PCA0")
+    ax.set_ylabel("PCA1")
     plt.show()
+    if outputFile is not None:
+        fig.savefig(outputFile)

Some files were not shown because too many files changed in this diff