from library.exercise import Exercise
from library.dataset import DataSet, TrainTestData
from library.GanExamples import StupidToyListGan
from library.SimpleGan import SimpleGan
from library.Repeater import Repeater
from library.SpheredNoise import SpheredNoise
from library.convGAN import ConvGAN

import pickle
import numpy as np
import time
import random
import csv
import gzip
from imblearn.datasets import fetch_datasets


def loadDataset(datasetName):
    """Load a dataset and split its rows by label into a ``DataSet``.

    The source is selected by the prefix of ``datasetName``:

    * ``data_input/imblearn_<name>``: entry ``<name>`` of
      ``imblearn.datasets.fetch_datasets()``.
    * ``data_input/kaggle_<name>``: the gzipped CSV file
      ``<datasetName>.csv.gz``; the first row is skipped as a heading, the
      last column is read as an integer label and all other columns as
      floats.
    * anything else: the pickle file ``<datasetName>.pickle``, using
      element 0 of its ``"folding"`` entry (items 0 and 2 are concatenated
      into the features, items 1 and 3 into the labels).

    Rows whose label equals 1 become ``data1``; all other rows become
    ``data0``. If a label-1 row is identical to some other-label row, the
    number of such rows is printed; the rows themselves are kept.

    Args:
        datasetName: Path-like name of the dataset without file extension.

    Returns:
        ``DataSet(data0=<rows with label != 1>, data1=<rows with label == 1>)``.
    """
    imblearnPrefix = "data_input/imblearn_"
    kagglePrefix = "data_input/kaggle_"

    print(f"Load '{datasetName}'")
    if datasetName.startswith(imblearnPrefix):
        print("from imblearn")
        # Only the requested entry is kept; the dictionary holding all the
        # other datasets is released as soon as this statement finishes.
        myData = fetch_datasets()[datasetName[len(imblearnPrefix):]]
        features = myData["data"]
        labels = myData["target"]
    elif datasetName.startswith(kagglePrefix):
        features = []
        labels = []
        # The context manager closes the file even if a row fails to parse.
        with gzip.open(f"{datasetName}.csv.gz", "rt") as csvFile:
            reader = csv.reader(csvFile)
            next(reader, None)  # Skip heading
            for row in reader:
                features.append([float(x) for x in row[:-1]])
                labels.append(int(row[-1]))
        features = np.array(features)
        labels = np.array(labels)
    else:
        print("from pickle file")
        # WARNING: pickle.load executes arbitrary code embedded in the file.
        # Only use this loader with pickle files from a trusted source.
        with open(f"{datasetName}.pickle", "rb") as pickleFile:
            pickleDict = pickle.load(pickleFile)
        k = pickleDict["folding"][0]
        labels = np.concatenate((k[1], k[3]), axis=0).astype(float)
        features = np.concatenate((k[0], k[2]), axis=0).astype(float)

    label_1 = list(np.where(labels == 1)[0])
    label_0 = list(np.where(labels != 1)[0])
    features_1 = features[label_1]
    features_0 = features[label_0]

    # Count the label-1 rows that also occur among the other rows. A set of
    # row tuples gives the same answer as comparing every pair of rows
    # element by element, without the quadratic Python-level loop.
    rows_0 = {tuple(row) for row in features_0}
    cutSize = sum(1 for row in features_1 if tuple(row) in rows_0)
    if cutSize > 0:
        # NOTE: overlapping points are only reported, not removed.
        print(f"non empty cut in {datasetName}! ({cutSize} points)")

    ds = DataSet(data0=features_0, data1=features_1)
    print("Data loaded.")
    return ds


def getRandGen(initValue, incValue=257, multValue=101, modulus=65537):
    """Yield an endless, deterministic sequence of pseudo random integers.

    Each value is computed from its predecessor as
    ``(multValue * value + incValue) % modulus``, starting from
    ``initValue`` (which itself is not yielded).

    Args:
        initValue: Seed of the sequence.
        incValue: Constant added in every step.
        multValue: Factor applied to the previous value in every step.
        modulus: Modulus applied in every step.

    Yields:
        The next integer of the sequence.
    """
    value = initValue
    while True:
        value = ((multValue * value) + incValue) % modulus
        yield value


def genShuffler():
    """Create a reproducible shuffle function.

    The returned function draws from one shared ``getRandGen(2021)``
    generator, so the n-th call on a freshly created shuffler always yields
    the same permutation for input of a given length.

    Returns:
        A function ``shuffler(data)`` that returns a ``numpy`` array holding
        the items of ``data`` in shuffled order. ``data`` is not modified.
    """
    randGen = getRandGen(2021)

    def shuffler(data):
        remaining = list(data)
        shuffled = []
        while remaining:
            p = next(randGen) % len(remaining)
            # pop() removes the chosen item in place instead of rebuilding
            # the remaining list from two slices on every step.
            shuffled.append(remaining.pop(p))
        return np.array(shuffled)

    return shuffler


def _printBanner(title):
    """Print the framed headline that precedes every run."""
    print()
    print()
    print("///////////////////////////////////////////")
    print(f"// {title}")
    print("///////////////////////////////////////////")
    print()


def _runExercise(ganName, datasetName, ganFactory, resultList=None):
    """Run one exercise for a generator on a dataset and store its results.

    Args:
        ganName: Name of the generator; used in the banner and file names.
        datasetName: Name of the dataset below ``data_input/``.
        ganFactory: Function that receives the loaded dataset and returns
            the generator object to test.
        resultList: Optional mapping; if given, the value returned by the
            first ``saveResultsTo`` call is stored under ``datasetName``.
    """
    _printBanner(f"Running {ganName} on {datasetName}")

    data = loadDataset(f"data_input/{datasetName}")
    gan = ganFactory(data)
    # Fixed seeds keep repeated runs comparable.
    random.seed(2021)
    shuffler = genShuffler()
    exercise = Exercise(shuffleFunction=shuffler, numOfShuffles=5, numOfSlices=5)
    exercise.run(gan, data)

    # The results are stored under both naming schemes.
    avg = exercise.saveResultsTo(f"data_result/{datasetName}-{ganName}.csv")
    exercise.saveResultsTo(f"data_result/{ganName}-{datasetName}.csv")
    if resultList is not None:
        resultList[datasetName] = avg


def runExerciseForSimpleGAN(datasetName):
    """Run the exercise for ``SimpleGan`` on the given dataset."""
    _runExercise(
        "SimpleGAN",
        datasetName,
        lambda data: SimpleGan(numOfFeatures=data.data0.shape[1]),
    )


def runExerciseForRepeater(datasetName):
    """Run the exercise for ``Repeater`` on the given dataset."""
    _runExercise("Repeater", datasetName, lambda _data: Repeater())


def runExerciseForSpheredNoise(datasetName, resultList=None):
    """Run the exercise for ``SpheredNoise`` on the given dataset.

    Args:
        datasetName: Name of the dataset below ``data_input/``.
        resultList: Optional mapping that receives the value returned by
            ``saveResultsTo`` under the key ``datasetName``.
    """
    _runExercise(
        "SpheredNoise",
        datasetName,
        lambda _data: SpheredNoise(),
        resultList,
    )


def runExerciseForConvGAN(datasetName, resultList=None):
    """Run the exercise for ``ConvGAN`` on the given dataset.

    Args:
        datasetName: Name of the dataset below ``data_input/``.
        resultList: Optional mapping that receives the value returned by
            ``saveResultsTo`` under the key ``datasetName``.
    """
    _runExercise(
        "convGAN",
        datasetName,
        lambda data: ConvGAN(data.data0.shape[1]),
        resultList,
    )


def runSpeedTestForConvGan(datasetName, ganGenerator):
    """Measure preparation and run time of an exercise with 3 shuffles and 3 slices.

    Args:
        datasetName: Name of the dataset below ``data_input/``.
        ganGenerator: Function that receives the number of feature columns
            and returns the generator object to test.

    Returns:
        A pair ``(d, gan)`` where ``d`` is the tuple
        ``(total time, preparation time, test time)`` in seconds and ``gan``
        is the object created by ``ganGenerator``.
    """
    ganName = "convGAN"
    _printBanner(f"Running speed test for {ganName} on {datasetName}")

    t1 = time.time()
    data = loadDataset(f"data_input/{datasetName}")
    gan = ganGenerator(data.data0.shape[1])
    random.seed(2021)
    shuffler = genShuffler()
    exercise = Exercise(shuffleFunction=shuffler, numOfShuffles=3, numOfSlices=3)
    # Silence the debug output of the exercise during the measurement.
    exercise.debug = (lambda _x: None)
    t2 = time.time()

    exercise.run(gan, data)
    t3 = time.time()

    d = (t3 - t1, t2 - t1, t3 - t2)
    print(f"Total Time: {d[0]}")
    print(f"Preparation Time: {d[1]}")
    print(f"Test Time: {d[2]}")
    return d, gan


testSets = [
    "folding_abalone_17_vs_7_8_9_10",
    "folding_abalone9-18",
    "folding_car_good",
    "folding_car-vgood",
    "folding_flare-F",
    "folding_hypothyroid",
    "folding_kddcup-guess_passwd_vs_satan",
    "folding_kr-vs-k-three_vs_eleven",
    "folding_kr-vs-k-zero-one_vs_draw",
    "folding_shuttle-2_vs_5",
    "folding_winequality-red-4",
    "folding_yeast4",
    "folding_yeast5",
    "folding_yeast6",
    "imblearn_webpage",
    "imblearn_mammography",
    "imblearn_protein_homo",
    "imblearn_ozone_level",
    "kaggle_creditcard"
    ]


def runAllTestSets(dataSetList):
    """Run the SimpleGAN and Repeater exercises on every dataset in the list.

    Args:
        dataSetList: Iterable of dataset names below ``data_input/``.
    """
    for dsFileName in dataSetList:
        # Pass the single dataset name, not the whole list.
        runExerciseForSimpleGAN(dsFileName)
        runExerciseForRepeater(dsFileName)