analysis.py 7.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251
  1. from library.exercise import Exercise
  2. from library.dataset import DataSet, TrainTestData
  3. from library.GanExamples import StupidToyListGan
  4. from library.SimpleGan import SimpleGan
  5. from library.Repeater import Repeater
  6. from library.SpheredNoise import SpheredNoise
  7. from library.convGAN import ConvGAN
  8. import pickle
  9. import numpy as np
  10. import time
  11. import random
  12. import csv
  13. import gzip
  14. import sys
  15. import os
  16. from imblearn.datasets import fetch_datasets
  17. def loadDataset(datasetName):
  18. def isSame(xs, ys):
  19. for (x, y) in zip(xs, ys):
  20. if x != y:
  21. return False
  22. return True
  23. def isIn(ys):
  24. def f(x):
  25. for y in ys:
  26. if isSame(x,y):
  27. return True
  28. return False
  29. return f
  30. def isNotIn(ys):
  31. def f(x):
  32. for y in ys:
  33. if isSame(x,y):
  34. return False
  35. return True
  36. return f
  37. print(f"Load '{datasetName}'")
  38. if datasetName.startswith("data_input/imblearn_"):
  39. print("from imblearn")
  40. ds = fetch_datasets()
  41. myData = ds[datasetName[20:]]
  42. ds = None
  43. features = myData["data"]
  44. labels = myData["target"]
  45. elif datasetName.startswith("data_input/kaggle_"):
  46. features = []
  47. labels = []
  48. c = csv.reader(gzip.open(f"{datasetName}.csv.gz", "rt"))
  49. for (n, row) in enumerate(c):
  50. # Skip heading
  51. if n > 0:
  52. features.append([float(x) for x in row[:-1]])
  53. labels.append(int(row[-1]))
  54. features = np.array(features)
  55. labels = np.array(labels)
  56. else:
  57. print("from pickle file")
  58. pickle_in = open(f"{datasetName}.pickle", "rb")
  59. pickle_dict = pickle.load(pickle_in)
  60. myData = pickle_dict["folding"]
  61. k = myData[0]
  62. labels = np.concatenate((k[1], k[3]), axis=0).astype(float)
  63. features = np.concatenate((k[0], k[2]), axis=0).astype(float)
  64. label_1 = list(np.where(labels == 1)[0])
  65. label_0 = list(np.where(labels != 1)[0])
  66. features_1 = features[label_1]
  67. features_0 = features[label_0]
  68. cut = np.array(list(filter(isIn(features_0), features_1)))
  69. if len(cut) > 0:
  70. print(f"non empty cut in {datasetName}! ({len(cut)} points)")
  71. # print(f"{len(features_0)}/{len(features_1)} point before")
  72. # features_0 = np.array(list(filter(isNotIn(cut), features_0)))
  73. # features_1 = np.array(list(filter(isNotIn(cut), features_1)))
  74. # print(f"{len(features_0)}/{len(features_1)} points after")
  75. ds = DataSet(data0=features_0, data1=features_1)
  76. print("Data loaded.")
  77. return ds
  78. def getRandGen(initValue, incValue=257, multValue=101, modulus=65537):
  79. value = initValue
  80. while True:
  81. value = ((multValue * value) + incValue) % modulus
  82. yield value
  83. def genShuffler():
  84. randGen = getRandGen(2021)
  85. def shuffler(data):
  86. data = list(data)
  87. size = len(data)
  88. shuffled = []
  89. while size > 0:
  90. p = next(randGen) % size
  91. size -= 1
  92. shuffled.append(data[p])
  93. data = data[0:p] + data[(p + 1):]
  94. return np.array(shuffled)
  95. return shuffler
  96. def showTime(t):
  97. s = int(t)
  98. m = s // 60
  99. h = m // 60
  100. d = h // 24
  101. s = s % 60
  102. m = m % 60
  103. h = h % 24
  104. if d > 0:
  105. return f"{d} days {h:02d}:{m:02d}:{s:02d}"
  106. else:
  107. return f"{h:02d}:{m:02d}:{s:02d}"
  108. def runExercise(datasetName, resultList, ganName, ganCreator, skipIfCsvExists=True):
  109. print(f"* Running {ganName} on {datasetName}")
  110. oldStdOut = sys.stdout
  111. oldStdErr = sys.stderr
  112. resultsFileName = f"data_result/{ganName}"
  113. # Prepare Folder for result data
  114. try:
  115. os.mkdir(resultsFileName)
  116. except FileExistsError as e:
  117. pass
  118. resultsFileName += f"/{datasetName}"
  119. try:
  120. os.stat(f"{resultsFileName}.csv")
  121. if skipIfCsvExists and resultList is None:
  122. print(" Resultfile exists => skip calculation.")
  123. return
  124. except FileNotFoundError as e:
  125. pass
  126. sys.stdout = open(resultsFileName + ".log", "w")
  127. sys.stderr = sys.stdout
  128. twStart = time.time()
  129. tpStart = time.process_time()
  130. print()
  131. print()
  132. print("///////////////////////////////////////////")
  133. print(f"// Running {ganName} on {datasetName}")
  134. print("///////////////////////////////////////////")
  135. print()
  136. data = loadDataset(f"data_input/{datasetName}")
  137. gan = ganCreator(data)
  138. random.seed(2021)
  139. shuffler = genShuffler()
  140. exercise = Exercise(shuffleFunction=shuffler, numOfShuffles=5, numOfSlices=5)
  141. avg = exercise.run(gan, data, resultsFileName=resultsFileName)
  142. tpEnd = time.process_time()
  143. twEnd = time.time()
  144. if resultList is not None:
  145. resultList[datasetName] = avg
  146. sys.stdout = oldStdOut
  147. sys.stderr = oldStdErr
  148. print(f" wall time: {showTime(twEnd - twStart)}s, process time: {showTime(tpEnd - tpStart)}")
  149. def runExerciseForSimpleGAN(datasetName, resultList=None):
  150. runExercise(datasetName, resultList, "SimpleGAN", lambda data: SimpleGan(numOfFeatures=data.data0.shape[1]))
  151. def runExerciseForRepeater(datasetName, resultList=None):
  152. runExercise(datasetName, resultList, "Repeater", lambda _data: Repeater())
  153. def runExerciseForSpheredNoise(datasetName, resultList=None):
  154. runExercise(datasetName, resultList, "SpheredNoise", lambda _data: SpheredNoise())
  155. def runExerciseForConvGAN(datasetName, resultList=None, debug=False):
  156. runExercise(datasetName, resultList, "convGAN", lambda data: ConvGAN(data.data0.shape[1], debug=debug))
  157. def runSpeedTestForConvGan(datasetName, ganGenerator):
  158. ganName = "convGAN"
  159. print()
  160. print()
  161. print("///////////////////////////////////////////")
  162. print(f"// Running speed test for {ganName} on {datasetName}")
  163. print("///////////////////////////////////////////")
  164. print()
  165. d = []
  166. t1 = time.time()
  167. data = loadDataset(f"data_input/{datasetName}")
  168. gan = ganGenerator(data.data0.shape[1])
  169. random.seed(2021)
  170. shuffler = genShuffler()
  171. exercise = Exercise(shuffleFunction=shuffler, numOfShuffles=3, numOfSlices=3)
  172. exercise.debug = (lambda _x: None)
  173. t2 = time.time()
  174. exercise.run(gan, data)
  175. t3 = time.time()
  176. d = (t3 - t1, t2 - t1, t3 - t2)
  177. print(f"Total Time: {d[0]}")
  178. print(f"Preparation Time: {d[1]}")
  179. print(f"Test Time: {d[2]}")
  180. return d, gan
  181. testSets = [
  182. "folding_abalone_17_vs_7_8_9_10",
  183. "folding_abalone9-18",
  184. "folding_car_good",
  185. "folding_car-vgood",
  186. "folding_flare-F",
  187. "folding_hypothyroid",
  188. "folding_kddcup-guess_passwd_vs_satan",
  189. "folding_kr-vs-k-three_vs_eleven",
  190. "folding_kr-vs-k-zero-one_vs_draw",
  191. "folding_shuttle-2_vs_5",
  192. "folding_winequality-red-4",
  193. "folding_yeast4",
  194. "folding_yeast5",
  195. "folding_yeast6",
  196. "imblearn_webpage",
  197. "imblearn_mammography",
  198. "imblearn_protein_homo",
  199. "imblearn_ozone_level",
  200. "kaggle_creditcard"
  201. ]
  202. def runAllTestSets(dataSetList):
  203. for dataset in testSets:
  204. runExerciseForRepeater(dataset)
  205. runExerciseForSpheredNoise(dataset)
  206. runExerciseForSimpleGAN(dataset)
  207. runExerciseForConvGAN(dataset)