Selaa lähdekoodia

Added new data sources.

Kristian Schultz 4 vuotta sitten
vanhempi
commit
05bfbe5c4e
3 muutettua tiedostoa joissa 44 lisäystä ja 10 poistoa
  1. 1 0
      .gitignore
  2. BIN
      data_input/kaggle_creditcard.csv.gz
  3. 43 10
      library/analysis.py

+ 1 - 0
.gitignore

@@ -1,3 +1,4 @@
 .ipynb_checkpoints
 __pycache__
 *.swp
+temp

BIN
data_input/kaggle_creditcard.csv.gz


+ 43 - 10
library/analysis.py

@@ -10,6 +10,8 @@ import pickle
 import numpy as np
 import time
 import random
+import csv
+import gzip
 from imblearn.datasets import fetch_datasets
 
 
@@ -36,16 +38,41 @@ def loadDataset(datasetName):
             return True
         return f
 
-    pickle_in = open(f"{datasetName}.pickle", "rb")
-    pickle_dict = pickle.load(pickle_in)
+    print(f"Load '{datasetName}'")
+    if datasetName.startswith("data_input/imblearn_"):
+        print("from imblearn")
+        ds = fetch_datasets()
+        myData = ds[datasetName[20:]]
+        ds = None
+
+        features = myData["data"]
+        labels = myData["target"]
+    elif datasetName.startswith("data_input/kaggle_"):
+        features = []
+        labels = []
+        c = csv.reader(gzip.open(f"{datasetName}.csv.gz", "rt")) 
+        for (n, row) in enumerate(c):
+            # Skip heading
+            if n > 0:
+                features.append([float(x) for x in row[:-1]])
+                labels.append(int(row[-1]))
+
+        features = np.array(features)
+        labels = np.array(labels)
+
+    else:
+        print("from pickle file")
+        pickle_in = open(f"{datasetName}.pickle", "rb")
+        pickle_dict = pickle.load(pickle_in)
+
+        myData = pickle_dict["folding"]
+        k = myData[0]
+
+        labels = np.concatenate((k[1], k[3]), axis=0).astype(float)
+        features = np.concatenate((k[0], k[2]), axis=0).astype(float)
 
-    myData = pickle_dict["folding"]
-    k = myData[0]
-
-    labels = np.concatenate((k[1], k[3]), axis=0).astype(float)
-    features = np.concatenate((k[0], k[2]), axis=0).astype(float)
     label_1 = list(np.where(labels == 1)[0])
-    label_0 = list(np.where(labels == 0)[0])
+    label_0 = list(np.where(labels != 1)[0])
     features_1 = features[label_1]
     features_0 = features[label_0]
     cut = np.array(list(filter(isIn(features_0), features_1)))
@@ -56,7 +83,9 @@ def loadDataset(datasetName):
     #    features_1 = np.array(list(filter(isNotIn(cut), features_1)))
     #    print(f"{len(features_0)}/{len(features_1)} points after")
     
-    return DataSet(data0=features_0, data1=features_1)
+    ds = DataSet(data0=features_0, data1=features_1)
+    print("Data loaded.")
+    return ds
 
 
 def getRandGen(initValue, incValue=257, multValue=101, modulus=65537):
@@ -195,7 +224,11 @@ testSets = [
     "folding_winequality-red-4",
     "folding_yeast4",
     "folding_yeast5",
-    "folding_yeast6"
+    "folding_yeast6",
+    "imblearn_webpage",
+    "imblearn_mammography",
+    "imblearn_protein_homo",
+    "kaggle_creditcard"
     ]
 
 def runAllTestSets(dataSetList):