testers.py 6.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218
  1. """
This module contains test functions for datasets using the logistic regression, the support vector
machine and the k-nearest-neighbours algorithm. Additionally it contains a class for storing the
results of the tests.
  5. """
import numbers

import sklearn
# needed in function lr
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import average_precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import cohen_kappa_score
from sklearn.ensemble import GradientBoostingClassifier
  15. _tF1 = "f1 score"
  16. _tTN = "TN"
  17. _tTP = "TP"
  18. _tFN = "FN"
  19. _tFP = "FP"
  20. _tAps = "average precision score"
  21. _tCks = "cohens kappa score"
  22. class TestResult:
  23. """
  24. This class represents the result of one test.
  25. It stores its *title*, a confusion matrix (*con_mat*), the balanced accuracy score (*bal_acc*)
  26. and the f1 score (*f1*). If given the average precision score is also stored (*aps*).
  27. """
  28. def __init__(self, title, labels=None, prediction=None, aps=None):
  29. """
  30. Creates an instance of this class. The stored data will be generated from the given values.
  31. *title* is a text to identify this result.
  32. *labels* is a /numpy.array/ containing the labels of the test-data-set.
  33. *prediction* is a /numpy.array/ containing the done prediction for the test-data-set.
  34. *aps* is a real number representing the average precision score.
  35. """
  36. self.title = title
  37. self.heading = [_tTN, _tTP, _tFN, _tFP, _tF1, _tCks]
  38. if aps is not None:
  39. self.heading.append(_tAps)
  40. self.data = { n: 0.0 for n in self.heading }
  41. if labels is not None and prediction is not None:
  42. self.data[_tF1] = f1_score(labels, prediction)
  43. self.data[_tCks] = cohen_kappa_score(labels, prediction)
  44. conMat = self._enshureConfusionMatrix(confusion_matrix(labels, prediction))
  45. [[tn, fp], [fn, tp]] = conMat
  46. self.data[_tTN] = tn
  47. self.data[_tTP] = tp
  48. self.data[_tFN] = fn
  49. self.data[_tFP] = fp
  50. if aps is not None:
  51. self.data[_tAps] = aps
  52. def __str__(self):
  53. """
  54. Generates a text representing this result.
  55. """
  56. text = ""
  57. tn = self.data[_tTN]
  58. tp = self.data[_tTP]
  59. fn = self.data[_tFN]
  60. fp = self.data[_tFP]
  61. text += f"{self.title} tn, fp: {tn}, {fp}\n"
  62. text += f"{self.title} fn, tp: {fn}, {tp}\n"
  63. for k in self.heading:
  64. if k not in [_tTP, _tTN, _tFP, _tFN]:
  65. text += f"{self.title} {k}: {self.data[k]:.3f}\n"
  66. return text
  67. def csvHeading(self):
  68. return ";".join(self.heading)
  69. def toCSV(self):
  70. return ";".join(map(lambda k: f"{self.data[k]:0.3f}", self.heading))
  71. @staticmethod
  72. def _enshureConfusionMatrix(c):
  73. c0 = [0.0, 0.0]
  74. c1 = [0.0, 0.0]
  75. if len(c) > 0:
  76. if len(c[0]) > 0:
  77. c0[0] = c[0][0]
  78. if len(c[0]) > 1:
  79. c0[1] = c[0][1]
  80. if len(c) > 1 and len(c[1]) > 1:
  81. c1[0] = c[1][0]
  82. c1[1] = c[1][1]
  83. return [c0, c1]
  84. def copy(self):
  85. r = TestResult(self.title)
  86. r.data = self.data.copy()
  87. r.heading = self.heading.copy()
  88. return r
  89. def addMinMaxAvg(self, mma=None):
  90. if mma is None:
  91. return (1, self.copy(), self.copy(), self.copy())
  92. (n, mi, mx, a) = mma
  93. for k in a.heading:
  94. if k in self.heading:
  95. a.data[k] += self.data[k]
  96. for k in mi.heading:
  97. if k in self.heading:
  98. mi.data[k] = min(mi.data[k], self.data[k])
  99. for k in mx.heading:
  100. if k in self.heading:
  101. mx.data[k] = max(mx.data[k], self.data[k])
  102. return (n + 1, mi, mx, a)
  103. @staticmethod
  104. def finishMinMaxAvg(mma):
  105. if mma is None:
  106. return (TestResult("?"), TestResult("?"), TestResult("?"))
  107. else:
  108. (n, mi, ma, a) = mma
  109. for k in a.heading:
  110. if n > 0:
  111. a.data[k] = a.data[k] / n
  112. else:
  113. a.data[k] = 0.0
  114. return (mi, ma, a)
  115. def lr(ttd):
  116. """
  117. Runs a test for a dataset with the logistic regression algorithm.
  118. It returns a /TestResult./
  119. *ttd* is a /library.dataset.TrainTestData/ instance containing data to test.
  120. """
  121. checkType(ttd)
  122. logreg = LogisticRegression(
  123. C=1e5,
  124. solver='lbfgs',
  125. max_iter=10000,
  126. multi_class='multinomial',
  127. class_weight={0: 1, 1: 1.3}
  128. )
  129. logreg.fit(ttd.train.data, ttd.train.labels)
  130. prediction = logreg.predict(ttd.test.data)
  131. prob_lr = logreg.predict_proba(ttd.test.data)
  132. aps_lr = average_precision_score(ttd.test.labels, prob_lr[:,1])
  133. return TestResult("LR", ttd.test.labels, prediction, aps_lr)
  134. def knn(ttd):
  135. """
  136. Runs a test for a dataset with the k-next neighbourhood algorithm.
  137. It returns a /TestResult./
  138. *ttd* is a /library.dataset.TrainTestData/ instance containing data to test.
  139. """
  140. checkType(ttd)
  141. knnTester = KNeighborsClassifier(n_neighbors=10)
  142. knnTester.fit(ttd.train.data, ttd.train.labels)
  143. return runTester(ttd, knnTester, "KNN")
  144. def gb(ttd):
  145. """
  146. Runs a test for a dataset with the gradient boosting algorithm.
  147. It returns a /TestResult./
  148. *ttd* is a /library.dataset.TrainTestData/ instance containing data to test.
  149. """
  150. checkType(ttd)
  151. tester = GradientBoostingClassifier()
  152. tester.fit(ttd.train.data, ttd.train.labels)
  153. return runTester(ttd, tester, "GB")
  154. def runTester(ttd, tester, name="GAN"):
  155. prediction = tester.predict(ttd.test.data)
  156. return TestResult(name, ttd.test.labels, prediction)
  157. def checkType(t):
  158. if str(type(t)) == "<class 'numpy.ndarray'>":
  159. return t.shape[0] > 0 and all(map(checkType, t))
  160. elif str(type(t)) == "<class 'list'>":
  161. return len(t) > 0 and all(map(checkType, t))
  162. elif str(type(t)) in ["<class 'int'>", "<class 'float'>", "<class 'numpy.float64'>"]:
  163. return True
  164. elif str(type(t)) == "<class 'library.dataset.DataSet'>":
  165. return checkType(t.data0) and checkType(t.data1)
  166. elif str(type(t)) == "<class 'library.dataset.TrainTestData'>":
  167. return checkType(t.train) and checkType(t.test)
  168. else:
  169. raise ValueError("expected int, float, or list, dataset of int, float but got " + str(type(t)))
  170. return False