analysis.py 7.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252
  1. from library.exercise import Exercise
  2. from library.dataset import DataSet, TrainTestData
  3. from library.GanExamples import StupidToyListGan
  4. from library.generators import SimpleGan, Repeater, SpheredNoise, ConvGAN
  5. import pickle
  6. import numpy as np
  7. import time
  8. import random
  9. import csv
  10. import gzip
  11. import sys
  12. import os
  13. from imblearn.datasets import fetch_datasets
  14. def loadDataset(datasetName):
  15. def isSame(xs, ys):
  16. for (x, y) in zip(xs, ys):
  17. if x != y:
  18. return False
  19. return True
  20. def isIn(ys):
  21. def f(x):
  22. for y in ys:
  23. if isSame(x,y):
  24. return True
  25. return False
  26. return f
  27. def isNotIn(ys):
  28. def f(x):
  29. for y in ys:
  30. if isSame(x,y):
  31. return False
  32. return True
  33. return f
  34. print(f"Load '{datasetName}'")
  35. if datasetName.startswith("data_input/imblearn_"):
  36. print("from imblearn")
  37. ds = fetch_datasets()
  38. myData = ds[datasetName[20:]]
  39. ds = None
  40. features = myData["data"]
  41. labels = myData["target"]
  42. elif datasetName.startswith("data_input/kaggle_"):
  43. features = []
  44. labels = []
  45. c = csv.reader(gzip.open(f"{datasetName}.csv.gz", "rt"))
  46. for (n, row) in enumerate(c):
  47. # Skip heading
  48. if n > 0:
  49. features.append([float(x) for x in row[:-1]])
  50. labels.append(int(row[-1]))
  51. features = np.array(features)
  52. labels = np.array(labels)
  53. else:
  54. print("from pickle file")
  55. pickle_in = open(f"{datasetName}.pickle", "rb")
  56. pickle_dict = pickle.load(pickle_in)
  57. myData = pickle_dict["folding"]
  58. k = myData[0]
  59. labels = np.concatenate((k[1], k[3]), axis=0).astype(float)
  60. features = np.concatenate((k[0], k[2]), axis=0).astype(float)
  61. label_1 = list(np.where(labels == 1)[0])
  62. label_0 = list(np.where(labels != 1)[0])
  63. features_1 = features[label_1]
  64. features_0 = features[label_0]
  65. cut = np.array(list(filter(isIn(features_0), features_1)))
  66. if len(cut) > 0:
  67. print(f"non empty cut in {datasetName}! ({len(cut)} points)")
  68. # print(f"{len(features_0)}/{len(features_1)} point before")
  69. # features_0 = np.array(list(filter(isNotIn(cut), features_0)))
  70. # features_1 = np.array(list(filter(isNotIn(cut), features_1)))
  71. # print(f"{len(features_0)}/{len(features_1)} points after")
  72. ds = DataSet(data0=features_0, data1=features_1)
  73. print("Data loaded.")
  74. return ds
  75. def getRandGen(initValue, incValue=257, multValue=101, modulus=65537):
  76. value = initValue
  77. while True:
  78. value = ((multValue * value) + incValue) % modulus
  79. yield value
  80. def genShuffler():
  81. randGen = getRandGen(2021)
  82. def shuffler(data):
  83. data = list(data)
  84. size = len(data)
  85. shuffled = []
  86. while size > 0:
  87. p = next(randGen) % size
  88. size -= 1
  89. shuffled.append(data[p])
  90. data = data[0:p] + data[(p + 1):]
  91. return np.array(shuffled)
  92. return shuffler
  93. def showTime(t):
  94. s = int(t)
  95. m = s // 60
  96. h = m // 60
  97. d = h // 24
  98. s = s % 60
  99. m = m % 60
  100. h = h % 24
  101. if d > 0:
  102. return f"{d} days {h:02d}:{m:02d}:{s:02d}"
  103. else:
  104. return f"{h:02d}:{m:02d}:{s:02d}"
  105. def mkDirIfNotExists(name):
  106. try:
  107. os.mkdir(name)
  108. except FileExistsError as e:
  109. pass
  110. def runExercise(datasetName, resultList, ganName, ganCreator, skipIfCsvExists=True):
  111. print(f"* Running {ganName} on {datasetName}")
  112. oldStdOut = sys.stdout
  113. oldStdErr = sys.stderr
  114. resultsFileName = f"data_result/{ganName}"
  115. # Prepare Folder for result data
  116. mkDirIfNotExists("data_result")
  117. mkDirIfNotExists(resultsFileName)
  118. resultsFileName += f"/{datasetName}"
  119. try:
  120. os.stat(f"{resultsFileName}.csv")
  121. if skipIfCsvExists and resultList is None:
  122. print(" Resultfile exists => skip calculation.")
  123. return
  124. except FileNotFoundError as e:
  125. pass
  126. sys.stdout = open(resultsFileName + ".log", "w")
  127. sys.stderr = sys.stdout
  128. twStart = time.time()
  129. tpStart = time.process_time()
  130. print()
  131. print()
  132. print("///////////////////////////////////////////")
  133. print(f"// Running {ganName} on {datasetName}")
  134. print("///////////////////////////////////////////")
  135. print()
  136. data = loadDataset(f"data_input/{datasetName}")
  137. gan = ganCreator(data)
  138. random.seed(2021)
  139. shuffler = genShuffler()
  140. exercise = Exercise(shuffleFunction=shuffler, numOfShuffles=5, numOfSlices=5)
  141. avg = exercise.run(gan, data, resultsFileName=resultsFileName)
  142. tpEnd = time.process_time()
  143. twEnd = time.time()
  144. if resultList is not None:
  145. resultList[datasetName] = avg
  146. sys.stdout = oldStdOut
  147. sys.stderr = oldStdErr
  148. print(f" wall time: {showTime(twEnd - twStart)}s, process time: {showTime(tpEnd - tpStart)}")
  149. def runExerciseForSimpleGAN(datasetName, resultList=None):
  150. runExercise(datasetName, resultList, "SimpleGAN", lambda data: SimpleGan(numOfFeatures=data.data0.shape[1]))
  151. def runExerciseForRepeater(datasetName, resultList=None):
  152. runExercise(datasetName, resultList, "Repeater", lambda _data: Repeater())
  153. def runExerciseForSpheredNoise(datasetName, resultList=None):
  154. runExercise(datasetName, resultList, "SpheredNoise", lambda _data: SpheredNoise())
  155. def runExerciseForConvGAN(datasetName, resultList=None, debug=False):
  156. runExercise(datasetName, resultList, "convGAN", lambda data: ConvGAN(data.data0.shape[1], debug=debug))
  157. def runSpeedTestForConvGan(datasetName, ganGenerator):
  158. ganName = "convGAN"
  159. print()
  160. print()
  161. print("///////////////////////////////////////////")
  162. print(f"// Running speed test for {ganName} on {datasetName}")
  163. print("///////////////////////////////////////////")
  164. print()
  165. d = []
  166. t1 = time.time()
  167. data = loadDataset(f"data_input/{datasetName}")
  168. gan = ganGenerator(data.data0.shape[1])
  169. random.seed(2021)
  170. shuffler = genShuffler()
  171. exercise = Exercise(shuffleFunction=shuffler, numOfShuffles=3, numOfSlices=3)
  172. exercise.debug = (lambda _x: None)
  173. t2 = time.time()
  174. exercise.run(gan, data)
  175. t3 = time.time()
  176. d = (t3 - t1, t2 - t1, t3 - t2)
  177. print(f"Total Time: {d[0]}")
  178. print(f"Preparation Time: {d[1]}")
  179. print(f"Test Time: {d[2]}")
  180. return d, gan
# Benchmark dataset identifiers. loadDataset() resolves each one under
# data_input/ by prefix: folding_* -> local pickle files, imblearn_* ->
# imblearn's fetch_datasets() collection, kaggle_* -> a gzipped csv.
testSets = [
    "folding_abalone_17_vs_7_8_9_10",
    "folding_abalone9-18",
    "folding_car_good",
    "folding_car-vgood",
    "folding_flare-F",
    "folding_hypothyroid",
    "folding_kddcup-guess_passwd_vs_satan",
    "folding_kr-vs-k-three_vs_eleven",
    "folding_kr-vs-k-zero-one_vs_draw",
    "folding_shuttle-2_vs_5",
    "folding_winequality-red-4",
    "folding_yeast4",
    "folding_yeast5",
    "folding_yeast6",
    "imblearn_webpage",
    "imblearn_mammography",
    "imblearn_protein_homo",
    "imblearn_ozone_level",
    "kaggle_creditcard"
]
  202. def runAllTestSets(dataSetList):
  203. for dataset in testSets:
  204. runExerciseForRepeater(dataset)
  205. runExerciseForSpheredNoise(dataset)
  206. runExerciseForSimpleGAN(dataset)
  207. runExerciseForConvGAN(dataset)