analysis.py 7.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237
  1. from library.exercise import Exercise
  2. from library.dataset import DataSet, TrainTestData
  3. from library.GanExamples import StupidToyListGan
  4. from library.SimpleGan import SimpleGan
  5. from library.Repeater import Repeater
  6. from library.SpheredNoise import SpheredNoise
  7. from library.convGAN import ConvGAN
  8. import pickle
  9. import numpy as np
  10. import time
  11. import random
  12. import csv
  13. import gzip
  14. from imblearn.datasets import fetch_datasets
  15. def loadDataset(datasetName):
  16. def isSame(xs, ys):
  17. for (x, y) in zip(xs, ys):
  18. if x != y:
  19. return False
  20. return True
  21. def isIn(ys):
  22. def f(x):
  23. for y in ys:
  24. if isSame(x,y):
  25. return True
  26. return False
  27. return f
  28. def isNotIn(ys):
  29. def f(x):
  30. for y in ys:
  31. if isSame(x,y):
  32. return False
  33. return True
  34. return f
  35. print(f"Load '{datasetName}'")
  36. if datasetName.startswith("data_input/imblearn_"):
  37. print("from imblearn")
  38. ds = fetch_datasets()
  39. myData = ds[datasetName[20:]]
  40. ds = None
  41. features = myData["data"]
  42. labels = myData["target"]
  43. elif datasetName.startswith("data_input/kaggle_"):
  44. features = []
  45. labels = []
  46. c = csv.reader(gzip.open(f"{datasetName}.csv.gz", "rt"))
  47. for (n, row) in enumerate(c):
  48. # Skip heading
  49. if n > 0:
  50. features.append([float(x) for x in row[:-1]])
  51. labels.append(int(row[-1]))
  52. features = np.array(features)
  53. labels = np.array(labels)
  54. else:
  55. print("from pickle file")
  56. pickle_in = open(f"{datasetName}.pickle", "rb")
  57. pickle_dict = pickle.load(pickle_in)
  58. myData = pickle_dict["folding"]
  59. k = myData[0]
  60. labels = np.concatenate((k[1], k[3]), axis=0).astype(float)
  61. features = np.concatenate((k[0], k[2]), axis=0).astype(float)
  62. label_1 = list(np.where(labels == 1)[0])
  63. label_0 = list(np.where(labels != 1)[0])
  64. features_1 = features[label_1]
  65. features_0 = features[label_0]
  66. cut = np.array(list(filter(isIn(features_0), features_1)))
  67. if len(cut) > 0:
  68. print(f"non empty cut in {datasetName}! ({len(cut)} points)")
  69. # print(f"{len(features_0)}/{len(features_1)} point before")
  70. # features_0 = np.array(list(filter(isNotIn(cut), features_0)))
  71. # features_1 = np.array(list(filter(isNotIn(cut), features_1)))
  72. # print(f"{len(features_0)}/{len(features_1)} points after")
  73. ds = DataSet(data0=features_0, data1=features_1)
  74. print("Data loaded.")
  75. return ds
  76. def getRandGen(initValue, incValue=257, multValue=101, modulus=65537):
  77. value = initValue
  78. while True:
  79. value = ((multValue * value) + incValue) % modulus
  80. yield value
  81. def genShuffler():
  82. randGen = getRandGen(2021)
  83. def shuffler(data):
  84. data = list(data)
  85. size = len(data)
  86. shuffled = []
  87. while size > 0:
  88. p = next(randGen) % size
  89. size -= 1
  90. shuffled.append(data[p])
  91. data = data[0:p] + data[(p + 1):]
  92. return np.array(shuffled)
  93. return shuffler
  94. def runExerciseForSimpleGAN(datasetName):
  95. ganName = "SimpleGAN"
  96. print()
  97. print()
  98. print("///////////////////////////////////////////")
  99. print(f"// Running {ganName} on {datasetName}")
  100. print("///////////////////////////////////////////")
  101. print()
  102. data = loadDataset(f"data_input/{datasetName}")
  103. gan = SimpleGan(numOfFeatures=data.data0.shape[1])
  104. random.seed(2021)
  105. shuffler = genShuffler()
  106. exercise = Exercise(shuffleFunction=shuffler, numOfShuffles=5, numOfSlices=5)
  107. exercise.run(gan, data)
  108. exercise.saveResultsTo(f"data_result/{datasetName}-{ganName}.csv")
  109. exercise.saveResultsTo(f"data_result/{ganName}-{datasetName}.csv")
  110. def runExerciseForRepeater(datasetName):
  111. ganName = "Repeater"
  112. print()
  113. print()
  114. print("///////////////////////////////////////////")
  115. print(f"// Running {ganName} on {datasetName}")
  116. print("///////////////////////////////////////////")
  117. print()
  118. data = loadDataset(f"data_input/{datasetName}")
  119. gan = Repeater()
  120. random.seed(2021)
  121. shuffler = genShuffler()
  122. exercise = Exercise(shuffleFunction=shuffler, numOfShuffles=5, numOfSlices=5)
  123. exercise.run(gan, data)
  124. exercise.saveResultsTo(f"data_result/{datasetName}-{ganName}.csv")
  125. exercise.saveResultsTo(f"data_result/{ganName}-{datasetName}.csv")
  126. def runExerciseForSpheredNoise(datasetName, resultList=None):
  127. ganName = "SpheredNoise"
  128. print()
  129. print()
  130. print("///////////////////////////////////////////")
  131. print(f"// Running {ganName} on {datasetName}")
  132. print("///////////////////////////////////////////")
  133. print()
  134. data = loadDataset(f"data_input/{datasetName}")
  135. gan = SpheredNoise()
  136. random.seed(2021)
  137. shuffler = genShuffler()
  138. exercise = Exercise(shuffleFunction=shuffler, numOfShuffles=5, numOfSlices=5)
  139. exercise.run(gan, data)
  140. avg = exercise.saveResultsTo(f"data_result/{datasetName}-{ganName}.csv")
  141. exercise.saveResultsTo(f"data_result/{ganName}-{datasetName}.csv")
  142. if resultList is not None:
  143. resultList[datasetName] = avg
  144. def runExerciseForConvGAN(datasetName, resultList=None):
  145. ganName = "convGAN"
  146. print()
  147. print()
  148. print("///////////////////////////////////////////")
  149. print(f"// Running {ganName} on {datasetName}")
  150. print("///////////////////////////////////////////")
  151. print()
  152. data = loadDataset(f"data_input/{datasetName}")
  153. gan = ConvGAN(data.data0.shape[1])
  154. random.seed(2021)
  155. shuffler = genShuffler()
  156. exercise = Exercise(shuffleFunction=shuffler, numOfShuffles=5, numOfSlices=5)
  157. exercise.run(gan, data)
  158. avg = exercise.saveResultsTo(f"data_result/{datasetName}-{ganName}.csv")
  159. exercise.saveResultsTo(f"data_result/{ganName}-{datasetName}.csv")
  160. if resultList is not None:
  161. resultList[datasetName] = avg
  162. def runSpeedTestForConvGan(datasetName, ganGenerator):
  163. ganName = "convGAN"
  164. print()
  165. print()
  166. print("///////////////////////////////////////////")
  167. print(f"// Running speed test for {ganName} on {datasetName}")
  168. print("///////////////////////////////////////////")
  169. print()
  170. d = []
  171. t1 = time.time()
  172. data = loadDataset(f"data_input/{datasetName}")
  173. gan = ganGenerator(data.data0.shape[1])
  174. random.seed(2021)
  175. shuffler = genShuffler()
  176. exercise = Exercise(shuffleFunction=shuffler, numOfShuffles=3, numOfSlices=3)
  177. exercise.debug = (lambda _x: None)
  178. t2 = time.time()
  179. exercise.run(gan, data)
  180. t3 = time.time()
  181. d = (t3 - t1, t2 - t1, t3 - t2)
  182. print(f"Total Time: {d[0]}")
  183. print(f"Preparation Time: {d[1]}")
  184. print(f"Test Time: {d[2]}")
  185. return d, gan
  186. testSets = [
  187. "folding_abalone_17_vs_7_8_9_10",
  188. "folding_abalone9-18",
  189. "folding_car_good",
  190. "folding_car-vgood",
  191. "folding_flare-F",
  192. "folding_hypothyroid",
  193. "folding_kddcup-guess_passwd_vs_satan",
  194. "folding_kr-vs-k-three_vs_eleven",
  195. "folding_kr-vs-k-zero-one_vs_draw",
  196. "folding_shuttle-2_vs_5",
  197. "folding_winequality-red-4",
  198. "folding_yeast4",
  199. "folding_yeast5",
  200. "folding_yeast6",
  201. "imblearn_webpage",
  202. "imblearn_mammography",
  203. "imblearn_protein_homo",
  204. "kaggle_creditcard"
  205. ]
  206. def runAllTestSets(dataSetList):
  207. for dsFileName in dataSetList:
  208. runExerciseForSimpleGAN(dataSetList)
  209. runExerciseForRepeater(dataSetList)