| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238 |
- from library.exercise import Exercise
- from library.dataset import DataSet, TrainTestData
- from library.GanExamples import StupidToyListGan
- from library.SimpleGan import SimpleGan
- from library.Repeater import Repeater
- from library.SpheredNoise import SpheredNoise
- from library.convGAN import ConvGAN
- import pickle
- import numpy as np
- import time
- import random
- import csv
- import gzip
- from imblearn.datasets import fetch_datasets
def loadDataset(datasetName):
    """Load a two-class data set and wrap it in a ``DataSet``.

    The loader is selected by the prefix of ``datasetName``:

    * ``data_input/imblearn_<name>``: the entry ``<name>`` of imblearn's
      ``fetch_datasets()``; its ``"data"`` and ``"target"`` items are used.
    * ``data_input/kaggle_<name>``: the file ``<datasetName>.csv.gz``; the
      first row is skipped as heading, the last column is parsed as integer
      label and all other columns as float features.
    * anything else: the file ``<datasetName>.pickle``; the first entry of
      its ``"folding"`` item supplies the features (items 0 and 2) and the
      labels (items 1 and 3).

    Rows whose label equals 1 become ``data1``, all other rows ``data0``.
    Class-1 rows that also occur in class 0 are only reported on stdout;
    they are not removed from either class.

    Args:
        datasetName: Path of the data set without file extension.

    Returns:
        The ``DataSet`` built from the two classes.
    """
    imblearnPrefix = "data_input/imblearn_"
    kagglePrefix = "data_input/kaggle_"

    print(f"Load '{datasetName}'")
    if datasetName.startswith(imblearnPrefix):
        print("from imblearn")
        # Only one entry is needed; the full collection is not bound to a
        # local name, so it can be released right after the lookup.
        myData = fetch_datasets()[datasetName[len(imblearnPrefix):]]
        features = myData["data"]
        labels = myData["target"]
    elif datasetName.startswith(kagglePrefix):
        features = []
        labels = []
        with gzip.open(f"{datasetName}.csv.gz", "rt") as csvFile:
            rows = csv.reader(csvFile)
            next(rows, None)  # Skip heading
            for row in rows:
                features.append([float(x) for x in row[:-1]])
                labels.append(int(row[-1]))
        features = np.array(features)
        labels = np.array(labels)
    else:
        print("from pickle file")
        # NOTE(review): pickle.load can execute arbitrary code stored in the
        # file; only use this loader on trusted data files.
        with open(f"{datasetName}.pickle", "rb") as pickleFile:
            pickleDict = pickle.load(pickleFile)
        k = pickleDict["folding"][0]
        labels = np.concatenate((k[1], k[3]), axis=0).astype(float)
        features = np.concatenate((k[0], k[2]), axis=0).astype(float)

    features_1 = features[np.where(labels == 1)[0]]
    features_0 = features[np.where(labels != 1)[0]]

    # Count the class-1 rows that are also present in class 0. A set of row
    # tuples keeps this linear in the number of rows.
    class0Rows = {tuple(row) for row in features_0}
    cut = [row for row in features_1 if tuple(row) in class0Rows]
    if cut:
        print(f"non empty cut in {datasetName}! ({len(cut)} points)")

    ds = DataSet(data0=features_0, data1=features_1)
    print("Data loaded.")
    return ds
def getRandGen(initValue, incValue=257, multValue=101, modulus=65537):
    """Yield an endless, deterministic sequence of pseudo-random integers.

    Every step replaces the current state by
    ``(multValue * state + incValue) % modulus`` and yields the new state.
    The start value itself is never yielded.

    Args:
        initValue: Start state of the sequence.
        incValue: Increment added in every step.
        multValue: Factor the state is multiplied with in every step.
        modulus: Modulus applied in every step.

    Yields:
        The next state of the sequence.
    """
    state = initValue
    while True:
        state = (state * multValue + incValue) % modulus
        yield state
-
def genShuffler():
    """Create a reproducible shuffle function.

    The returned function draws its random numbers from one generator
    created here by ``getRandGen(2021)``. The generator is shared by all
    calls of that function, so every call advances it.

    Returns:
        A function ``shuffler(data)`` that returns the items of ``data`` in
        shuffled order as a NumPy array.
    """
    randGen = getRandGen(2021)

    def shuffler(data):
        """Return the items of ``data`` in pseudo-random order."""
        remaining = list(data)
        shuffled = []
        while remaining:
            p = next(randGen) % len(remaining)
            # pop() moves the drawn item over without rebuilding the list of
            # remaining items from two slices on every draw.
            shuffled.append(remaining.pop(p))
        return np.array(shuffled)

    return shuffler
def runExerciseForSimpleGAN(datasetName):
    """Run the exercise with a ``SimpleGan`` on one data set.

    Loads ``data_input/<datasetName>``, runs an ``Exercise`` with 5 shuffles
    and 5 slices and saves the results under two file names in
    ``data_result/``.

    Args:
        datasetName: Name of the data set below ``data_input/``.
    """
    ganName = "SimpleGAN"
    banner = "///////////////////////////////////////////"
    for line in ("", "", banner, f"// Running {ganName} on {datasetName}", banner, ""):
        print(line)

    data = loadDataset(f"data_input/{datasetName}")
    gan = SimpleGan(numOfFeatures=data.data0.shape[1])

    random.seed(2021)
    exercise = Exercise(
        shuffleFunction=genShuffler(),
        numOfShuffles=5,
        numOfSlices=5,
    )
    exercise.run(gan, data)

    # The same results are stored twice, once per naming order.
    resultFiles = (
        f"data_result/{datasetName}-{ganName}.csv",
        f"data_result/{ganName}-{datasetName}.csv",
    )
    for resultFile in resultFiles:
        exercise.saveResultsTo(resultFile)
-
-
def runExerciseForRepeater(datasetName):
    """Run the exercise with a ``Repeater`` on one data set.

    Loads ``data_input/<datasetName>``, runs an ``Exercise`` with 5 shuffles
    and 5 slices and saves the results under two file names in
    ``data_result/``.

    Args:
        datasetName: Name of the data set below ``data_input/``.
    """
    ganName = "Repeater"
    banner = "///////////////////////////////////////////"
    for line in ("", "", banner, f"// Running {ganName} on {datasetName}", banner, ""):
        print(line)

    data = loadDataset(f"data_input/{datasetName}")
    gan = Repeater()

    random.seed(2021)
    exercise = Exercise(
        shuffleFunction=genShuffler(),
        numOfShuffles=5,
        numOfSlices=5,
    )
    exercise.run(gan, data)

    # The same results are stored twice, once per naming order.
    resultFiles = (
        f"data_result/{datasetName}-{ganName}.csv",
        f"data_result/{ganName}-{datasetName}.csv",
    )
    for resultFile in resultFiles:
        exercise.saveResultsTo(resultFile)
-
def runExerciseForSpheredNoise(datasetName, resultList=None):
    """Run the exercise with ``SpheredNoise`` on one data set.

    Loads ``data_input/<datasetName>``, runs an ``Exercise`` with 5 shuffles
    and 5 slices and saves the results under two file names in
    ``data_result/``.

    Args:
        datasetName: Name of the data set below ``data_input/``.
        resultList: Optional mapping. When given, the value returned by the
            first ``saveResultsTo`` call is stored in it under
            ``datasetName``.
    """
    ganName = "SpheredNoise"
    banner = "///////////////////////////////////////////"
    for line in ("", "", banner, f"// Running {ganName} on {datasetName}", banner, ""):
        print(line)

    data = loadDataset(f"data_input/{datasetName}")
    gan = SpheredNoise()

    random.seed(2021)
    exercise = Exercise(
        shuffleFunction=genShuffler(),
        numOfShuffles=5,
        numOfSlices=5,
    )
    exercise.run(gan, data)

    # The same results are stored twice, once per naming order; only the
    # return value of the first call is kept.
    avg = exercise.saveResultsTo(f"data_result/{datasetName}-{ganName}.csv")
    exercise.saveResultsTo(f"data_result/{ganName}-{datasetName}.csv")

    if resultList is None:
        return
    resultList[datasetName] = avg
def runExerciseForConvGAN(datasetName, resultList=None):
    """Run the exercise with a ``ConvGAN`` on one data set.

    Loads ``data_input/<datasetName>``, runs an ``Exercise`` with 5 shuffles
    and 5 slices and saves the results under two file names in
    ``data_result/``.

    Args:
        datasetName: Name of the data set below ``data_input/``.
        resultList: Optional mapping. When given, the value returned by the
            first ``saveResultsTo`` call is stored in it under
            ``datasetName``.
    """
    ganName = "convGAN"
    banner = "///////////////////////////////////////////"
    for line in ("", "", banner, f"// Running {ganName} on {datasetName}", banner, ""):
        print(line)

    data = loadDataset(f"data_input/{datasetName}")
    gan = ConvGAN(data.data0.shape[1])

    random.seed(2021)
    exercise = Exercise(
        shuffleFunction=genShuffler(),
        numOfShuffles=5,
        numOfSlices=5,
    )
    exercise.run(gan, data)

    # The same results are stored twice, once per naming order; only the
    # return value of the first call is kept.
    avg = exercise.saveResultsTo(f"data_result/{datasetName}-{ganName}.csv")
    exercise.saveResultsTo(f"data_result/{ganName}-{datasetName}.csv")

    if resultList is None:
        return
    resultList[datasetName] = avg
def runSpeedTestForConvGan(datasetName, ganGenerator):
    """Measure how long preparing and running an exercise takes.

    Loads ``data_input/<datasetName>``, builds the GAN with ``ganGenerator``
    and runs an ``Exercise`` with 3 shuffles and 3 slices. No results are
    saved.

    Args:
        datasetName: Name of the data set below ``data_input/``.
        ganGenerator: Callable that is given the number of feature columns
            of ``data0`` and returns the GAN to test.

    Returns:
        A pair ``(d, gan)`` where ``d`` is the tuple
        ``(total, preparation, test)`` of durations in seconds and ``gan``
        is the object returned by ``ganGenerator``.
    """
    ganName = "convGAN"
    print()
    print()
    print("///////////////////////////////////////////")
    print(f"// Running speed test for {ganName} on {datasetName}")
    print("///////////////////////////////////////////")
    print()

    # perf_counter() is a monotonic high-resolution clock, so the measured
    # durations cannot be distorted by adjustments of the system time.
    t1 = time.perf_counter()
    data = loadDataset(f"data_input/{datasetName}")
    gan = ganGenerator(data.data0.shape[1])
    random.seed(2021)
    shuffler = genShuffler()
    exercise = Exercise(shuffleFunction=shuffler, numOfShuffles=3, numOfSlices=3)
    # Replace the debug hook by a no-op (presumably to keep the exercise's
    # debug output out of the timing -- confirm against Exercise).
    exercise.debug = (lambda _x: None)
    t2 = time.perf_counter()

    exercise.run(gan, data)
    t3 = time.perf_counter()

    d = (t3 - t1, t2 - t1, t3 - t2)
    print(f"Total Time: {d[0]}")
    print(f"Preparation Time: {d[1]}")
    print(f"Test Time: {d[2]}")
    return d, gan
-
# Names of the data sets used for the exercises. The runner functions prepend
# "data_input/" to a name, and loadDataset() picks the loader from the prefix
# that follows ("imblearn_", "kaggle_" or anything else for pickle files).
testSets = [
    # Loaded from pickle files
    "folding_abalone_17_vs_7_8_9_10",
    "folding_abalone9-18",
    "folding_car_good",
    "folding_car-vgood",
    "folding_flare-F",
    "folding_hypothyroid",
    "folding_kddcup-guess_passwd_vs_satan",
    "folding_kr-vs-k-three_vs_eleven",
    "folding_kr-vs-k-zero-one_vs_draw",
    "folding_shuttle-2_vs_5",
    "folding_winequality-red-4",
    "folding_yeast4",
    "folding_yeast5",
    "folding_yeast6",
    # Loaded from imblearn
    "imblearn_webpage",
    "imblearn_mammography",
    "imblearn_protein_homo",
    "imblearn_ozone_level",
    # Loaded from a gzipped CSV file
    "kaggle_creditcard",
]
def runAllTestSets(dataSetList):
    """Run the SimpleGAN and the Repeater exercise on every given data set.

    Args:
        dataSetList: Iterable of data set names; each name is passed to
            ``runExerciseForSimpleGAN`` and ``runExerciseForRepeater``.
    """
    for dsFileName in dataSetList:
        # Pass the single data set name, not the whole list: the runners
        # build the input path from it.
        runExerciseForSimpleGAN(dsFileName)
        runExerciseForRepeater(dsFileName)
|