|
|
@@ -10,6 +10,8 @@ import pickle
|
|
|
import numpy as np
|
|
|
import time
|
|
|
import random
|
|
|
+import csv
|
|
|
+import gzip
|
|
|
from imblearn.datasets import fetch_datasets
|
|
|
|
|
|
|
|
|
@@ -36,16 +38,41 @@ def loadDataset(datasetName):
|
|
|
return True
|
|
|
return f
|
|
|
|
|
|
- pickle_in = open(f"{datasetName}.pickle", "rb")
|
|
|
- pickle_dict = pickle.load(pickle_in)
|
|
|
+ print(f"Load '{datasetName}'")
|
|
|
+ if datasetName.startswith("data_input/imblearn_"):
|
|
|
+ print("from imblearn")
|
|
|
+ ds = fetch_datasets()
|
|
|
+ myData = ds[datasetName[20:]]
|
|
|
+ ds = None
|
|
|
+
|
|
|
+ features = myData["data"]
|
|
|
+ labels = myData["target"]
|
|
|
+ elif datasetName.startswith("data_input/kaggle_"):
|
|
|
+ features = []
|
|
|
+ labels = []
|
|
|
+ c = csv.reader(gzip.open(f"{datasetName}.csv.gz", "rt"))
|
|
|
+ for (n, row) in enumerate(c):
|
|
|
+ # Skip heading
|
|
|
+ if n > 0:
|
|
|
+ features.append([float(x) for x in row[:-1]])
|
|
|
+ labels.append(int(row[-1]))
|
|
|
+
|
|
|
+ features = np.array(features)
|
|
|
+ labels = np.array(labels)
|
|
|
+
|
|
|
+ else:
|
|
|
+ print("from pickle file")
|
|
|
+ pickle_in = open(f"{datasetName}.pickle", "rb")
|
|
|
+ pickle_dict = pickle.load(pickle_in)
|
|
|
+
|
|
|
+ myData = pickle_dict["folding"]
|
|
|
+ k = myData[0]
|
|
|
+
|
|
|
+ labels = np.concatenate((k[1], k[3]), axis=0).astype(float)
|
|
|
+ features = np.concatenate((k[0], k[2]), axis=0).astype(float)
|
|
|
|
|
|
- myData = pickle_dict["folding"]
|
|
|
- k = myData[0]
|
|
|
-
|
|
|
- labels = np.concatenate((k[1], k[3]), axis=0).astype(float)
|
|
|
- features = np.concatenate((k[0], k[2]), axis=0).astype(float)
|
|
|
label_1 = list(np.where(labels == 1)[0])
|
|
|
- label_0 = list(np.where(labels == 0)[0])
|
|
|
+ label_0 = list(np.where(labels != 1)[0])
|
|
|
features_1 = features[label_1]
|
|
|
features_0 = features[label_0]
|
|
|
cut = np.array(list(filter(isIn(features_0), features_1)))
|
|
|
@@ -56,7 +83,9 @@ def loadDataset(datasetName):
|
|
|
# features_1 = np.array(list(filter(isNotIn(cut), features_1)))
|
|
|
# print(f"{len(features_0)}/{len(features_1)} points after")
|
|
|
|
|
|
- return DataSet(data0=features_0, data1=features_1)
|
|
|
+ ds = DataSet(data0=features_0, data1=features_1)
|
|
|
+ print("Data loaded.")
|
|
|
+ return ds
|
|
|
|
|
|
|
|
|
def getRandGen(initValue, incValue=257, multValue=101, modulus=65537):
|
|
|
@@ -195,7 +224,12 @@ testSets = [
|
|
|
"folding_winequality-red-4",
|
|
|
"folding_yeast4",
|
|
|
"folding_yeast5",
|
|
|
- "folding_yeast6"
|
|
|
+ "folding_yeast6",
|
|
|
+ "imblearn_webpage",
|
|
|
+ "imblearn_mammography",
|
|
|
+ "imblearn_protein_homo",
|
|
|
+ "imblearn_ozone_level",
|
|
|
+ "kaggle_creditcard"
|
|
|
]
|
|
|
|
|
|
def runAllTestSets(dataSetList):
|