testers.py 6.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236
  1. """
  2. This module contains test functions for datasets using logistic regression, the support vector
  3. machine, the k-nearest-neighbours, gradient boosting and random forest algorithms. Additionally,
  4. it contains a class for storing the results of the tests.
  5. """
  6. import sklearn
  7. # needed in function lr
  8. from sklearn.ensemble import RandomForestClassifier
  9. from sklearn.neighbors import KNeighborsClassifier
  10. from sklearn.linear_model import LogisticRegression
  11. from sklearn.metrics import confusion_matrix
  12. from sklearn.metrics import average_precision_score
  13. from sklearn.metrics import f1_score
  14. from sklearn.metrics import cohen_kappa_score
  15. from sklearn.ensemble import GradientBoostingClassifier
  16. _tF1 = "f1 score"
  17. _tTN = "TN"
  18. _tTP = "TP"
  19. _tFN = "FN"
  20. _tFP = "FP"
  21. _tFP = "RF"
  22. _tAps = "average precision score"
  23. _tCks = "cohens kappa score"
  24. class TestResult:
  25. """
  26. This class represents the result of one test.
  27. It stores its *title*, a confusion matrix (*con_mat*), the balanced accuracy score (*bal_acc*)
  28. and the f1 score (*f1*). If given the average precision score is also stored (*aps*).
  29. """
  30. def __init__(self, title, labels=None, prediction=None, aps=None):
  31. """
  32. Creates an instance of this class. The stored data will be generated from the given values.
  33. *title* is a text to identify this result.
  34. *labels* is a /numpy.array/ containing the labels of the test-data-set.
  35. *prediction* is a /numpy.array/ containing the done prediction for the test-data-set.
  36. *aps* is a real number representing the average precision score.
  37. """
  38. self.title = title
  39. self.heading = [_tTN, _tTP, _tFN, _tFP, _tF1, _tCks]
  40. if aps is not None:
  41. self.heading.append(_tAps)
  42. self.data = { n: 0.0 for n in self.heading }
  43. if labels is not None and prediction is not None:
  44. self.data[_tF1] = f1_score(labels, prediction)
  45. self.data[_tCks] = cohen_kappa_score(labels, prediction)
  46. conMat = self._enshureConfusionMatrix(confusion_matrix(labels, prediction))
  47. [[tn, fp], [fn, tp]] = conMat
  48. self.data[_tTN] = tn
  49. self.data[_tTP] = tp
  50. self.data[_tFN] = fn
  51. self.data[_tFP] = fp
  52. if aps is not None:
  53. self.data[_tAps] = aps
  54. def __str__(self):
  55. """
  56. Generates a text representing this result.
  57. """
  58. text = ""
  59. tn = self.data[_tTN]
  60. tp = self.data[_tTP]
  61. fn = self.data[_tFN]
  62. fp = self.data[_tFP]
  63. text += f"{self.title} tn, fp: {tn}, {fp}\n"
  64. text += f"{self.title} fn, tp: {fn}, {tp}\n"
  65. for k in self.heading:
  66. if k not in [_tTP, _tTN, _tFP, _tFN]:
  67. text += f"{self.title} {k}: {self.data[k]:.3f}\n"
  68. return text
  69. def csvHeading(self):
  70. return ";".join(self.heading)
  71. def toCSV(self):
  72. return ";".join(map(lambda k: f"{self.data[k]:0.3f}", self.heading))
  73. @staticmethod
  74. def _enshureConfusionMatrix(c):
  75. c0 = [0.0, 0.0]
  76. c1 = [0.0, 0.0]
  77. if len(c) > 0:
  78. if len(c[0]) > 0:
  79. c0[0] = c[0][0]
  80. if len(c[0]) > 1:
  81. c0[1] = c[0][1]
  82. if len(c) > 1 and len(c[1]) > 1:
  83. c1[0] = c[1][0]
  84. c1[1] = c[1][1]
  85. return [c0, c1]
  86. def copy(self):
  87. r = TestResult(self.title)
  88. r.data = self.data.copy()
  89. r.heading = self.heading.copy()
  90. return r
  91. def addMinMaxAvg(self, mma=None):
  92. if mma is None:
  93. return (1, self.copy(), self.copy(), self.copy())
  94. (n, mi, mx, a) = mma
  95. for k in a.heading:
  96. if k in self.heading:
  97. a.data[k] += self.data[k]
  98. for k in mi.heading:
  99. if k in self.heading:
  100. mi.data[k] = min(mi.data[k], self.data[k])
  101. for k in mx.heading:
  102. if k in self.heading:
  103. mx.data[k] = max(mx.data[k], self.data[k])
  104. return (n + 1, mi, mx, a)
  105. @staticmethod
  106. def finishMinMaxAvg(mma):
  107. if mma is None:
  108. return (TestResult("?"), TestResult("?"), TestResult("?"))
  109. else:
  110. (n, mi, ma, a) = mma
  111. for k in a.heading:
  112. if n > 0:
  113. a.data[k] = a.data[k] / n
  114. else:
  115. a.data[k] = 0.0
  116. return (mi, ma, a)
  117. def lr(ttd):
  118. """
  119. Runs a test for a dataset with the logistic regression algorithm.
  120. It returns a /TestResult./
  121. *ttd* is a /library.dataset.TrainTestData/ instance containing data to test.
  122. """
  123. checkType(ttd)
  124. logreg = LogisticRegression(
  125. C=1e5,
  126. solver='lbfgs',
  127. max_iter=10000,
  128. multi_class='multinomial',
  129. class_weight={0: 1, 1: 1.3}
  130. )
  131. logreg.fit(ttd.train.data, ttd.train.labels)
  132. prediction = logreg.predict(ttd.test.data)
  133. prob_lr = logreg.predict_proba(ttd.test.data)
  134. aps_lr = average_precision_score(ttd.test.labels, prob_lr[:,1])
  135. return TestResult("LR", ttd.test.labels, prediction, aps_lr)
  136. def knn(ttd):
  137. """
  138. Runs a test for a dataset with the k-next neighbourhood algorithm.
  139. It returns a /TestResult./
  140. *ttd* is a /library.dataset.TrainTestData/ instance containing data to test.
  141. """
  142. checkType(ttd)
  143. knnTester = KNeighborsClassifier(n_neighbors=10)
  144. knnTester.fit(ttd.train.data, ttd.train.labels)
  145. return runTester(ttd, knnTester, "KNN")
  146. def gb(ttd):
  147. """
  148. Runs a test for a dataset with the gradient boosting algorithm.
  149. It returns a /TestResult./
  150. *ttd* is a /library.dataset.TrainTestData/ instance containing data to test.
  151. """
  152. checkType(ttd)
  153. tester = GradientBoostingClassifier()
  154. tester.fit(ttd.train.data, ttd.train.labels)
  155. return runTester(ttd, tester, "GB")
  156. def rf(ttd):
  157. """
  158. Runs a test for a dataset with the random forest algorithm.
  159. It returns a /TestResult./
  160. *ttd* is a /library.dataset.TrainTestData/ instance containing data to test.
  161. """
  162. checkType(ttd)
  163. tester = RandomForestClassifier()
  164. tester.fit(ttd.train.data, ttd.train.labels)
  165. return runTester(ttd, tester, "RF")
  166. def runTester(ttd, tester, name="GAN"):
  167. prediction = tester.predict(ttd.test.data)
  168. return TestResult(name, ttd.test.labels, prediction)
  169. def checkType(t):
  170. if str(type(t)) == "<class 'numpy.ndarray'>":
  171. return t.shape[0] > 0 and all(map(checkType, t))
  172. elif str(type(t)) == "<class 'list'>":
  173. return len(t) > 0 and all(map(checkType, t))
  174. elif str(type(t)) in ["<class 'int'>", "<class 'float'>", "<class 'numpy.float64'>"]:
  175. return True
  176. elif str(type(t)) == "<class 'library.dataset.DataSet'>":
  177. return checkType(t.data0) and checkType(t.data1)
  178. elif str(type(t)) == "<class 'library.dataset.TrainTestData'>":
  179. return checkType(t.train) and checkType(t.test)
  180. else:
  181. raise ValueError("expected int, float, or list, dataset of int, float but got " + str(type(t)))
  182. return False