analysis.py 7.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255
  1. from library.exercise import Exercise
  2. from library.dataset import DataSet, TrainTestData
  3. from library.generators import SimpleGan, Repeater, SpheredNoise, ConvGAN, StupidToyListGan, CtGAN
  4. import pickle
  5. import numpy as np
  6. import time
  7. import random
  8. import csv
  9. import gzip
  10. import sys
  11. import os
  12. from imblearn.datasets import fetch_datasets
  13. def loadDataset(datasetName):
  14. def isSame(xs, ys):
  15. for (x, y) in zip(xs, ys):
  16. if x != y:
  17. return False
  18. return True
  19. def isIn(ys):
  20. def f(x):
  21. for y in ys:
  22. if isSame(x,y):
  23. return True
  24. return False
  25. return f
  26. def isNotIn(ys):
  27. def f(x):
  28. for y in ys:
  29. if isSame(x,y):
  30. return False
  31. return True
  32. return f
  33. print(f"Load '{datasetName}'")
  34. if datasetName.startswith("data_input/imblearn_"):
  35. print("from imblearn")
  36. ds = fetch_datasets()
  37. myData = ds[datasetName[20:]]
  38. ds = None
  39. features = myData["data"]
  40. labels = myData["target"]
  41. elif datasetName.startswith("data_input/kaggle_"):
  42. features = []
  43. labels = []
  44. c = csv.reader(gzip.open(f"{datasetName}.csv.gz", "rt"))
  45. for (n, row) in enumerate(c):
  46. # Skip heading
  47. if n > 0:
  48. features.append([float(x) for x in row[:-1]])
  49. labels.append(int(row[-1]))
  50. features = np.array(features)
  51. labels = np.array(labels)
  52. else:
  53. print("from pickle file")
  54. pickle_in = open(f"{datasetName}.pickle", "rb")
  55. pickle_dict = pickle.load(pickle_in)
  56. myData = pickle_dict["folding"]
  57. k = myData[0]
  58. labels = np.concatenate((k[1], k[3]), axis=0).astype(float)
  59. features = np.concatenate((k[0], k[2]), axis=0).astype(float)
  60. label_1 = list(np.where(labels == 1)[0])
  61. label_0 = list(np.where(labels != 1)[0])
  62. features_1 = features[label_1]
  63. features_0 = features[label_0]
  64. cut = np.array(list(filter(isIn(features_0), features_1)))
  65. if len(cut) > 0:
  66. print(f"non empty cut in {datasetName}! ({len(cut)} points)")
  67. # print(f"{len(features_0)}/{len(features_1)} point before")
  68. # features_0 = np.array(list(filter(isNotIn(cut), features_0)))
  69. # features_1 = np.array(list(filter(isNotIn(cut), features_1)))
  70. # print(f"{len(features_0)}/{len(features_1)} points after")
  71. ds = DataSet(data0=features_0, data1=features_1)
  72. print("Data loaded.")
  73. return ds
  74. def getRandGen(initValue, incValue=257, multValue=101, modulus=65537):
  75. value = initValue
  76. while True:
  77. value = ((multValue * value) + incValue) % modulus
  78. yield value
  79. def genShuffler():
  80. randGen = getRandGen(2021)
  81. def shuffler(data):
  82. data = list(data)
  83. size = len(data)
  84. shuffled = []
  85. while size > 0:
  86. p = next(randGen) % size
  87. size -= 1
  88. shuffled.append(data[p])
  89. data = data[0:p] + data[(p + 1):]
  90. return np.array(shuffled)
  91. return shuffler
  92. def showTime(t):
  93. s = int(t)
  94. m = s // 60
  95. h = m // 60
  96. d = h // 24
  97. s = s % 60
  98. m = m % 60
  99. h = h % 24
  100. if d > 0:
  101. return f"{d} days {h:02d}:{m:02d}:{s:02d}"
  102. else:
  103. return f"{h:02d}:{m:02d}:{s:02d}"
  104. def mkDirIfNotExists(name):
  105. try:
  106. os.mkdir(name)
  107. except FileExistsError as e:
  108. pass
  109. def runExercise(datasetName, resultList, ganName, ganCreator, skipIfCsvExists=True):
  110. print(f"* Running {ganName} on {datasetName}")
  111. oldStdOut = sys.stdout
  112. oldStdErr = sys.stderr
  113. resultsFileName = f"data_result/{ganName}"
  114. # Prepare Folder for result data
  115. mkDirIfNotExists("data_result")
  116. mkDirIfNotExists(resultsFileName)
  117. resultsFileName += f"/{datasetName}"
  118. try:
  119. os.stat(f"{resultsFileName}.csv")
  120. if skipIfCsvExists and resultList is None:
  121. print(" Resultfile exists => skip calculation.")
  122. return
  123. except FileNotFoundError as e:
  124. pass
  125. sys.stdout = open(resultsFileName + ".log", "w")
  126. sys.stderr = sys.stdout
  127. twStart = time.time()
  128. tpStart = time.process_time()
  129. print()
  130. print()
  131. print("///////////////////////////////////////////")
  132. print(f"// Running {ganName} on {datasetName}")
  133. print("///////////////////////////////////////////")
  134. print()
  135. data = loadDataset(f"data_input/{datasetName}")
  136. gan = ganCreator(data)
  137. random.seed(2021)
  138. shuffler = genShuffler()
  139. exercise = Exercise(shuffleFunction=shuffler, numOfShuffles=5, numOfSlices=5)
  140. avg = exercise.run(gan, data, resultsFileName=resultsFileName)
  141. tpEnd = time.process_time()
  142. twEnd = time.time()
  143. if resultList is not None:
  144. resultList[datasetName] = avg
  145. sys.stdout = oldStdOut
  146. sys.stderr = oldStdErr
  147. print(f" wall time: {showTime(twEnd - twStart)}s, process time: {showTime(tpEnd - tpStart)}")
  148. def runExerciseForSimpleGAN(datasetName, resultList=None):
  149. runExercise(datasetName, resultList, "SimpleGAN", lambda data: SimpleGan(numOfFeatures=data.data0.shape[1]))
  150. def runExerciseForRepeater(datasetName, resultList=None):
  151. runExercise(datasetName, resultList, "Repeater", lambda _data: Repeater())
  152. def runExerciseForSpheredNoise(datasetName, resultList=None):
  153. runExercise(datasetName, resultList, "SpheredNoise", lambda _data: SpheredNoise())
  154. def runExerciseForCtGAN(datasetName, resultList=None, debug=False):
  155. runExercise(datasetName, resultList, "ctGAN", lambda data: CtGAN(data.data0.shape[1], debug=debug))
  156. def runExerciseForConvGAN(datasetName, resultList=None, debug=False):
  157. runExercise(datasetName, resultList, "convGAN", lambda data: ConvGAN(data.data0.shape[1], debug=debug))
  158. def runSpeedTestForConvGan(datasetName, ganGenerator):
  159. ganName = "convGAN"
  160. print()
  161. print()
  162. print("///////////////////////////////////////////")
  163. print(f"// Running speed test for {ganName} on {datasetName}")
  164. print("///////////////////////////////////////////")
  165. print()
  166. d = []
  167. t1 = time.time()
  168. data = loadDataset(f"data_input/{datasetName}")
  169. gan = ganGenerator(data.data0.shape[1])
  170. random.seed(2021)
  171. shuffler = genShuffler()
  172. exercise = Exercise(shuffleFunction=shuffler, numOfShuffles=3, numOfSlices=3)
  173. exercise.debug = (lambda _x: None)
  174. t2 = time.time()
  175. exercise.run(gan, data)
  176. t3 = time.time()
  177. d = (t3 - t1, t2 - t1, t3 - t2)
  178. print(f"Total Time: {d[0]}")
  179. print(f"Preparation Time: {d[1]}")
  180. print(f"Test Time: {d[2]}")
  181. return d, gan
# Names of all benchmark datasets (relative to data_input/, see loadDataset):
# "folding_*"  -> pickled fold files, "imblearn_*" -> fetched via imblearn,
# "kaggle_*"   -> gzipped CSV files.
testSets = [
    "folding_abalone_17_vs_7_8_9_10",
    "folding_abalone9-18",
    "folding_car_good",
    "folding_car-vgood",
    "folding_flare-F",
    "folding_hypothyroid",
    "folding_kddcup-guess_passwd_vs_satan",
    "folding_kr-vs-k-three_vs_eleven",
    "folding_kr-vs-k-zero-one_vs_draw",
    "folding_shuttle-2_vs_5",
    "folding_winequality-red-4",
    "folding_yeast4",
    "folding_yeast5",
    "folding_yeast6",
    "imblearn_webpage",
    "imblearn_mammography",
    "imblearn_protein_homo",
    "imblearn_ozone_level",
    "kaggle_creditcard"
]
  203. def runAllTestSets(dataSetList):
  204. for dataset in testSets:
  205. runExerciseForRepeater(dataset)
  206. runExerciseForSpheredNoise(dataset)
  207. runExerciseForSimpleGAN(dataset)
  208. runExerciseForConvGAN(dataset)