# analysis.py
  1. from library.exercise import Exercise
  2. from library.dataset import DataSet, TrainTestData
  3. from library.GanExamples import StupidToyListGan
  4. from library.SimpleGan import SimpleGan
  5. from library.Repeater import Repeater
  6. from library.SpheredNoise import SpheredNoise
  7. from library.convGAN import ConvGAN
  8. import pickle
  9. import numpy as np
  10. import time
  11. import random
  12. import csv
  13. import gzip
  14. import sys
  15. import os
  16. from imblearn.datasets import fetch_datasets
  17. def loadDataset(datasetName):
  18. def isSame(xs, ys):
  19. for (x, y) in zip(xs, ys):
  20. if x != y:
  21. return False
  22. return True
  23. def isIn(ys):
  24. def f(x):
  25. for y in ys:
  26. if isSame(x,y):
  27. return True
  28. return False
  29. return f
  30. def isNotIn(ys):
  31. def f(x):
  32. for y in ys:
  33. if isSame(x,y):
  34. return False
  35. return True
  36. return f
  37. print(f"Load '{datasetName}'")
  38. if datasetName.startswith("data_input/imblearn_"):
  39. print("from imblearn")
  40. ds = fetch_datasets()
  41. myData = ds[datasetName[20:]]
  42. ds = None
  43. features = myData["data"]
  44. labels = myData["target"]
  45. elif datasetName.startswith("data_input/kaggle_"):
  46. features = []
  47. labels = []
  48. c = csv.reader(gzip.open(f"{datasetName}.csv.gz", "rt"))
  49. for (n, row) in enumerate(c):
  50. # Skip heading
  51. if n > 0:
  52. features.append([float(x) for x in row[:-1]])
  53. labels.append(int(row[-1]))
  54. features = np.array(features)
  55. labels = np.array(labels)
  56. else:
  57. print("from pickle file")
  58. pickle_in = open(f"{datasetName}.pickle", "rb")
  59. pickle_dict = pickle.load(pickle_in)
  60. myData = pickle_dict["folding"]
  61. k = myData[0]
  62. labels = np.concatenate((k[1], k[3]), axis=0).astype(float)
  63. features = np.concatenate((k[0], k[2]), axis=0).astype(float)
  64. label_1 = list(np.where(labels == 1)[0])
  65. label_0 = list(np.where(labels != 1)[0])
  66. features_1 = features[label_1]
  67. features_0 = features[label_0]
  68. cut = np.array(list(filter(isIn(features_0), features_1)))
  69. if len(cut) > 0:
  70. print(f"non empty cut in {datasetName}! ({len(cut)} points)")
  71. # print(f"{len(features_0)}/{len(features_1)} point before")
  72. # features_0 = np.array(list(filter(isNotIn(cut), features_0)))
  73. # features_1 = np.array(list(filter(isNotIn(cut), features_1)))
  74. # print(f"{len(features_0)}/{len(features_1)} points after")
  75. ds = DataSet(data0=features_0, data1=features_1)
  76. print("Data loaded.")
  77. return ds
  78. def getRandGen(initValue, incValue=257, multValue=101, modulus=65537):
  79. value = initValue
  80. while True:
  81. value = ((multValue * value) + incValue) % modulus
  82. yield value
  83. def genShuffler():
  84. randGen = getRandGen(2021)
  85. def shuffler(data):
  86. data = list(data)
  87. size = len(data)
  88. shuffled = []
  89. while size > 0:
  90. p = next(randGen) % size
  91. size -= 1
  92. shuffled.append(data[p])
  93. data = data[0:p] + data[(p + 1):]
  94. return np.array(shuffled)
  95. return shuffler
  96. def showTime(t):
  97. s = int(t)
  98. m = s // 60
  99. h = m // 60
  100. d = h // 24
  101. s = s % 60
  102. m = m % 60
  103. h = h % 24
  104. if d > 0:
  105. return f"{d} days {h:02d}:{m:02d}:{s:02d}"
  106. else:
  107. return f"{h:02d}:{m:02d}:{s:02d}"
  108. def mkDirIfNotExists(name):
  109. try:
  110. os.mkdir(name)
  111. except FileExistsError as e:
  112. pass
  113. def runExercise(datasetName, resultList, ganName, ganCreator, skipIfCsvExists=True):
  114. print(f"* Running {ganName} on {datasetName}")
  115. oldStdOut = sys.stdout
  116. oldStdErr = sys.stderr
  117. resultsFileName = f"data_result/{ganName}"
  118. # Prepare Folder for result data
  119. mkDirIfNotExists("data_result")
  120. mkDirIfNotExists(resultsFileName)
  121. resultsFileName += f"/{datasetName}"
  122. try:
  123. os.stat(f"{resultsFileName}.csv")
  124. if skipIfCsvExists and resultList is None:
  125. print(" Resultfile exists => skip calculation.")
  126. return
  127. except FileNotFoundError as e:
  128. pass
  129. sys.stdout = open(resultsFileName + ".log", "w")
  130. sys.stderr = sys.stdout
  131. twStart = time.time()
  132. tpStart = time.process_time()
  133. print()
  134. print()
  135. print("///////////////////////////////////////////")
  136. print(f"// Running {ganName} on {datasetName}")
  137. print("///////////////////////////////////////////")
  138. print()
  139. data = loadDataset(f"data_input/{datasetName}")
  140. gan = ganCreator(data)
  141. random.seed(2021)
  142. shuffler = genShuffler()
  143. exercise = Exercise(shuffleFunction=shuffler, numOfShuffles=5, numOfSlices=5)
  144. avg = exercise.run(gan, data, resultsFileName=resultsFileName)
  145. tpEnd = time.process_time()
  146. twEnd = time.time()
  147. if resultList is not None:
  148. resultList[datasetName] = avg
  149. sys.stdout = oldStdOut
  150. sys.stderr = oldStdErr
  151. print(f" wall time: {showTime(twEnd - twStart)}s, process time: {showTime(tpEnd - tpStart)}")
  152. def runExerciseForSimpleGAN(datasetName, resultList=None):
  153. runExercise(datasetName, resultList, "SimpleGAN", lambda data: SimpleGan(numOfFeatures=data.data0.shape[1]))
  154. def runExerciseForRepeater(datasetName, resultList=None):
  155. runExercise(datasetName, resultList, "Repeater", lambda _data: Repeater())
  156. def runExerciseForSpheredNoise(datasetName, resultList=None):
  157. runExercise(datasetName, resultList, "SpheredNoise", lambda _data: SpheredNoise())
  158. def runExerciseForConvGAN(datasetName, resultList=None, debug=False):
  159. runExercise(datasetName, resultList, "convGAN", lambda data: ConvGAN(data.data0.shape[1], debug=debug))
  160. def runSpeedTestForConvGan(datasetName, ganGenerator):
  161. ganName = "convGAN"
  162. print()
  163. print()
  164. print("///////////////////////////////////////////")
  165. print(f"// Running speed test for {ganName} on {datasetName}")
  166. print("///////////////////////////////////////////")
  167. print()
  168. d = []
  169. t1 = time.time()
  170. data = loadDataset(f"data_input/{datasetName}")
  171. gan = ganGenerator(data.data0.shape[1])
  172. random.seed(2021)
  173. shuffler = genShuffler()
  174. exercise = Exercise(shuffleFunction=shuffler, numOfShuffles=3, numOfSlices=3)
  175. exercise.debug = (lambda _x: None)
  176. t2 = time.time()
  177. exercise.run(gan, data)
  178. t3 = time.time()
  179. d = (t3 - t1, t2 - t1, t3 - t2)
  180. print(f"Total Time: {d[0]}")
  181. print(f"Preparation Time: {d[1]}")
  182. print(f"Test Time: {d[2]}")
  183. return d, gan
# Benchmark dataset names, resolved by loadDataset() under "data_input/":
# "imblearn_*" entries are fetched via imblearn, "kaggle_*" is read from a
# gzipped CSV, and the "folding_*" entries fall through to pickle files.
testSets = [
    "folding_abalone_17_vs_7_8_9_10",
    "folding_abalone9-18",
    "folding_car_good",
    "folding_car-vgood",
    "folding_flare-F",
    "folding_hypothyroid",
    "folding_kddcup-guess_passwd_vs_satan",
    "folding_kr-vs-k-three_vs_eleven",
    "folding_kr-vs-k-zero-one_vs_draw",
    "folding_shuttle-2_vs_5",
    "folding_winequality-red-4",
    "folding_yeast4",
    "folding_yeast5",
    "folding_yeast6",
    "imblearn_webpage",
    "imblearn_mammography",
    "imblearn_protein_homo",
    "imblearn_ozone_level",
    "kaggle_creditcard"
]
  205. def runAllTestSets(dataSetList):
  206. for dataset in testSets:
  207. runExerciseForRepeater(dataset)
  208. runExerciseForSpheredNoise(dataset)
  209. runExerciseForSimpleGAN(dataset)
  210. runExerciseForConvGAN(dataset)