# analysis.py
#
# Benchmark driver: runs a set of synthetic-data generators over a list of
# imbalanced data sets and records timing and result files under data_result/.
import csv
import gzip
import os
import pickle
import random
import sys
import time

import numpy as np
from imblearn.datasets import fetch_datasets

from library.exercise import Exercise
from library.dataset import DataSet, TrainTestData
from library.generators import ProWRAS, SimpleGan, Repeater, ConvGeN, CtGAN, CtabGan
  13. def loadDataset(datasetName):
  14. def isSame(xs, ys):
  15. for (x, y) in zip(xs, ys):
  16. if x != y:
  17. return False
  18. return True
  19. def isIn(ys):
  20. def f(x):
  21. for y in ys:
  22. if isSame(x,y):
  23. return True
  24. return False
  25. return f
  26. print(f"Load '{datasetName}'")
  27. if datasetName.startswith("imblearn_"):
  28. print("from imblearn")
  29. ds = fetch_datasets()
  30. myData = ds[datasetName[9:]]
  31. ds = None
  32. features = myData["data"]
  33. labels = myData["target"]
  34. elif datasetName.startswith("kaggle_"):
  35. features = []
  36. labels = []
  37. c = csv.reader(gzip.open(f"data_input/{datasetName}.csv.gz", "rt"))
  38. for (n, row) in enumerate(c):
  39. # Skip heading
  40. if n > 0:
  41. features.append([float(x) for x in row[:-1]])
  42. labels.append(int(row[-1]))
  43. features = np.array(features)
  44. labels = np.array(labels)
  45. else:
  46. print("from pickle file")
  47. pickle_in = open(f"data_input/{datasetName}.pickle", "rb")
  48. pickle_dict = pickle.load(pickle_in)
  49. myData = pickle_dict["folding"]
  50. k = myData[0]
  51. labels = np.concatenate((k[1], k[3]), axis=0).astype(float)
  52. features = np.concatenate((k[0], k[2]), axis=0).astype(float)
  53. label_1 = list(np.where(labels == 1)[0])
  54. label_0 = list(np.where(labels != 1)[0])
  55. features_1 = features[label_1]
  56. features_0 = features[label_0]
  57. cut = np.array(list(filter(isIn(features_1), features_0)))
  58. if len(cut) > 0:
  59. print(f"non empty cut in {datasetName}! ({len(cut)} points)")
  60. ds = DataSet(data0=features_0, data1=features_1)
  61. print("Data loaded.")
  62. return ds
  63. def getRandGen(initValue, incValue=257, multValue=101, modulus=65537):
  64. value = initValue
  65. while True:
  66. value = ((multValue * value) + incValue) % modulus
  67. yield value
  68. def genShuffler():
  69. randGen = getRandGen(2021)
  70. def shuffler(data):
  71. data = list(data)
  72. size = len(data)
  73. shuffled = []
  74. while size > 0:
  75. p = next(randGen) % size
  76. size -= 1
  77. shuffled.append(data[p])
  78. data = data[0:p] + data[(p + 1):]
  79. return np.array(shuffled)
  80. return shuffler
  81. def showTime(t):
  82. s = int(t)
  83. m = s // 60
  84. h = m // 60
  85. d = h // 24
  86. s = s % 60
  87. m = m % 60
  88. h = h % 24
  89. if d > 0:
  90. return f"{d} days {h:02d}:{m:02d}:{s:02d}"
  91. else:
  92. return f"{h:02d}:{m:02d}:{s:02d}"
  93. def mkDirIfNotExists(name):
  94. try:
  95. os.mkdir(name)
  96. except FileExistsError as e:
  97. pass
  98. def runExercise(datasetName, resultList, ganName, ganCreator, skipIfCsvExists=True):
  99. print(f"* Running {ganName} on {datasetName}")
  100. oldStdOut = sys.stdout
  101. oldStdErr = sys.stderr
  102. resultsFileName = f"data_result/{ganName}"
  103. # Prepare Folder for result data
  104. mkDirIfNotExists("data_result")
  105. mkDirIfNotExists(resultsFileName)
  106. resultsFileName += f"/{datasetName}"
  107. try:
  108. os.stat(f"{resultsFileName}.csv")
  109. if skipIfCsvExists and resultList is None:
  110. print(" Resultfile exists => skip calculation.")
  111. return
  112. except FileNotFoundError as e:
  113. pass
  114. sys.stdout = open(resultsFileName + ".log", "w")
  115. sys.stderr = sys.stdout
  116. twStart = time.time()
  117. tpStart = time.process_time()
  118. print()
  119. print()
  120. print("///////////////////////////////////////////")
  121. print(f"// Running {ganName} on {datasetName}")
  122. print("///////////////////////////////////////////")
  123. print()
  124. data = loadDataset(f"{datasetName}")
  125. gan = ganCreator(data)
  126. random.seed(2021)
  127. shuffler = genShuffler()
  128. exercise = Exercise(shuffleFunction=shuffler, numOfShuffles=5, numOfSlices=5)
  129. avg = exercise.run(gan, data, resultsFileName=resultsFileName)
  130. tpEnd = time.process_time()
  131. twEnd = time.time()
  132. if resultList is not None:
  133. resultList[datasetName] = avg
  134. print(f" wall time: {showTime(twEnd - twStart)}s, process time: {showTime(tpEnd - tpStart)}s")
  135. sys.stdout = open(resultsFileName + ".log.time", "w")
  136. print(f"Running {ganName} on {datasetName}")
  137. print(f"wall time (s): {showTime(twEnd - twStart)}\nprocess time (s): {showTime(tpEnd - tpStart)}")
  138. sys.stdout = oldStdOut
  139. sys.stderr = oldStdErr
  140. print(f" wall time: {showTime(twEnd - twStart)}s, process time: {showTime(tpEnd - tpStart)}s")
  141. testSets = [
  142. "folding_abalone_17_vs_7_8_9_10",
  143. "folding_abalone9-18",
  144. "folding_car_good",
  145. "folding_car-vgood",
  146. "folding_flare-F",
  147. "folding_hypothyroid",
  148. "folding_kddcup-guess_passwd_vs_satan",
  149. "folding_kr-vs-k-three_vs_eleven",
  150. "folding_kr-vs-k-zero-one_vs_draw",
  151. "folding_shuttle-2_vs_5",
  152. "folding_winequality-red-4",
  153. "folding_yeast4",
  154. "folding_yeast5",
  155. "folding_yeast6",
  156. #"imblearn_webpage",
  157. #"imblearn_mammography",
  158. #"imblearn_protein_homo",
  159. #"imblearn_ozone_level",
  160. #"kaggle_creditcard"
  161. ]
  162. generators = { "Repeater": lambda _data: Repeater()
  163. , "ProWRAS": lambda _data: ProWRAS()
  164. , "GAN": lambda data: SimpleGan(numOfFeatures=data.data0.shape[1], epochs=300)
  165. , "CTGAN": lambda data: CtGAN(epochs=300)
  166. , "CTAB-GAN": lambda _data: CtabGan(epochs=300)
  167. , "ConvGeN-majority-5": lambda data: ConvGeN(data.data0.shape[1], neb=5, gen=5, neb_epochs=30)
  168. , "ConvGeN-majority-full": lambda data: ConvGeN(data.data0.shape[1], neb=None, neb_epochs=30)
  169. , "ConvGeN-proximity-5": lambda data: ConvGeN(data.data0.shape[1], neb=5, gen=5, maj_proximal=True, neb_epochs=30)
  170. , "ConvGeN-proximity-full": lambda data: ConvGeN(data.data0.shape[1], neb=None, maj_proximal=True, neb_epochs=30)
  171. }