# analysis.py
  1. from library.exercise import Exercise
  2. from library.dataset import DataSet, TrainTestData
  3. from library.GanExamples import StupidToyListGan
  4. from library.SimpleGan import SimpleGan
  5. from library.Repeater import Repeater
  6. from library.SpheredNoise import SpheredNoise
  7. import pickle
  8. import numpy as np
  9. import random
  10. from imblearn.datasets import fetch_datasets
  11. def loadDataset(datasetName):
  12. def isSame(xs, ys):
  13. for (x, y) in zip(xs, ys):
  14. if x != y:
  15. return False
  16. return True
  17. def isIn(ys):
  18. def f(x):
  19. for y in ys:
  20. if isSame(x,y):
  21. return True
  22. return False
  23. return f
  24. def isNotIn(ys):
  25. def f(x):
  26. for y in ys:
  27. if isSame(x,y):
  28. return False
  29. return True
  30. return f
  31. pickle_in = open(f"{datasetName}.pickle", "rb")
  32. pickle_dict = pickle.load(pickle_in)
  33. myData = pickle_dict["folding"]
  34. k = myData[0]
  35. labels = np.concatenate((k[1], k[3]), axis=0).astype(float)
  36. features = np.concatenate((k[0], k[2]), axis=0).astype(float)
  37. label_1 = list(np.where(labels == 1)[0])
  38. label_0 = list(np.where(labels == 0)[0])
  39. features_1 = features[label_1]
  40. features_0 = features[label_0]
  41. cut = np.array(list(filter(isIn(features_0), features_1)))
  42. if len(cut) > 0:
  43. print(f"non empty cut in {datasetName}! ({len(cut)} points)")
  44. # print(f"{len(features_0)}/{len(features_1)} point before")
  45. # features_0 = np.array(list(filter(isNotIn(cut), features_0)))
  46. # features_1 = np.array(list(filter(isNotIn(cut), features_1)))
  47. # print(f"{len(features_0)}/{len(features_1)} points after")
  48. return DataSet(data0=features_0, data1=features_1)
  49. def getRandGen(initValue, incValue=257, multValue=101, modulus=65537):
  50. value = initValue
  51. while True:
  52. value = ((multValue * value) + incValue) % modulus
  53. yield value
  54. def genShuffler():
  55. randGen = getRandGen(2021)
  56. def shuffler(data):
  57. data = list(data)
  58. size = len(data)
  59. shuffled = []
  60. while size > 0:
  61. p = next(randGen) % size
  62. size -= 1
  63. shuffled.append(data[p])
  64. data = data[0:p] + data[(p + 1):]
  65. return np.array(shuffled)
  66. return shuffler
  67. def runExerciseForSimpleGAN(datasetName):
  68. ganName = "SimpleGAN"
  69. print()
  70. print()
  71. print("///////////////////////////////////////////")
  72. print(f"// Running {ganName} on {datasetName}")
  73. print("///////////////////////////////////////////")
  74. print()
  75. data = loadDataset(f"data_input/{datasetName}")
  76. gan = SimpleGan(numOfFeatures=data.data0.shape[1])
  77. random.seed(2021)
  78. shuffler = genShuffler()
  79. exercise = Exercise(shuffleFunction=shuffler, numOfShuffles=5, numOfSlices=5)
  80. exercise.run(gan, data)
  81. exercise.saveResultsTo(f"data_result/{datasetName}-{ganName}.csv")
  82. exercise.saveResultsTo(f"data_result/{ganName}-{datasetName}.csv")
  83. def runExerciseForRepeater(datasetName):
  84. ganName = "Repeater"
  85. print()
  86. print()
  87. print("///////////////////////////////////////////")
  88. print(f"// Running {ganName} on {datasetName}")
  89. print("///////////////////////////////////////////")
  90. print()
  91. data = loadDataset(f"data_input/{datasetName}")
  92. gan = Repeater()
  93. random.seed(2021)
  94. shuffler = genShuffler()
  95. exercise = Exercise(shuffleFunction=shuffler, numOfShuffles=5, numOfSlices=5)
  96. exercise.run(gan, data)
  97. exercise.saveResultsTo(f"data_result/{datasetName}-{ganName}.csv")
  98. exercise.saveResultsTo(f"data_result/{ganName}-{datasetName}.csv")
  99. def runExerciseForSpheredNoise(datasetName, resultList=None):
  100. ganName = "SpheredNoise"
  101. print()
  102. print()
  103. print("///////////////////////////////////////////")
  104. print(f"// Running {ganName} on {datasetName}")
  105. print("///////////////////////////////////////////")
  106. print()
  107. data = loadDataset(f"data_input/{datasetName}")
  108. gan = SpheredNoise()
  109. random.seed(2021)
  110. shuffler = genShuffler()
  111. exercise = Exercise(shuffleFunction=shuffler, numOfShuffles=5, numOfSlices=5)
  112. exercise.run(gan, data)
  113. avg = exercise.saveResultsTo(f"data_result/{datasetName}-{ganName}.csv")
  114. exercise.saveResultsTo(f"data_result/{ganName}-{datasetName}.csv")
  115. if resultList is not None:
  116. resultList[datasetName] = avg
# Benchmark dataset names evaluated by the run* helpers above; each entry
# names a "folding" pickle expected under data_input/. Presumably these are
# imbalanced-classification sets from imblearn.fetch_datasets — TODO confirm.
testSets = [
    "folding_abalone_17_vs_7_8_9_10",
    "folding_abalone9-18",
    "folding_car_good",
    "folding_car-vgood",
    "folding_flare-F",
    "folding_hypothyroid",
    "folding_kddcup-guess_passwd_vs_satan",
    "folding_kr-vs-k-three_vs_eleven",
    "folding_kr-vs-k-zero-one_vs_draw",
    "folding_shuttle-2_vs_5",
    "folding_winequality-red-4",
    "folding_yeast4",
    "folding_yeast5",
    "folding_yeast6"
]
  133. def runAllTestSets(dataSetList):
  134. for dsFileName in dataSetList:
  135. runExerciseForSimpleGAN(dataSetList)
  136. runExerciseForRepeater(dataSetList)