# testers.py
  1. """
  2. This module contains test function for datasets using the logistic regression, the support vector
  3. machine and the k-next-neighbourhood algoritm. Additionally it contains a class for storing the
  4. results of the tests.
  5. """
import sklearn
import sklearn.svm  # svm() uses sklearn.svm.SVC; a bare "import sklearn" does not guarantee the submodule is loaded

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import average_precision_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.neighbors import KNeighborsClassifier
  16. _tF1 = "f1 score"
  17. _tBalAcc = "balanced accuracy"
  18. _tTN = "TN"
  19. _tTP = "TP"
  20. _tFN = "FN"
  21. _tFP = "FP"
  22. _tAps = "average precision score"
  23. _tCks = "cohens kappa score"
  24. class TestResult:
  25. """
  26. This class represents the result of one test.
  27. It stores its *title*, a confusion matrix (*con_mat*), the balanced accuracy score (*bal_acc*)
  28. and the f1 score (*f1*). If given the average precision score is also stored (*aps*).
  29. """
  30. def __init__(self, title, labels=None, prediction=None, aps=None):
  31. """
  32. Creates an instance of this class. The stored data will be generated from the given values.
  33. *title* is a text to identify this result.
  34. *labels* is a /numpy.array/ containing the labels of the test-data-set.
  35. *prediction* is a /numpy.array/ containing the done prediction for the test-data-set.
  36. *aps* is a real number representing the average precision score.
  37. """
  38. self.title = title
  39. self.heading = [_tTN, _tTP, _tFN, _tFP, _tF1, _tBalAcc, _tCks]
  40. if aps is not None:
  41. self.heading.append(_tAps)
  42. self.data = { n: 0.0 for n in self.heading }
  43. if labels is not None and prediction is not None:
  44. self.data[_tBalAcc] = balanced_accuracy_score(labels, prediction)
  45. self.data[_tF1] = f1_score(labels, prediction)
  46. self.data[_tCks] = cohen_kappa_score(labels, prediction)
  47. conMat = self._enshureConfusionMatrix(confusion_matrix(labels, prediction))
  48. [[tn, fp], [fn, tp]] = conMat
  49. self.data[_tTN] = tn
  50. self.data[_tTP] = tp
  51. self.data[_tFN] = fn
  52. self.data[_tFP] = fp
  53. if aps is not None:
  54. self.data[_tAps] = aps
  55. def __str__(self):
  56. """
  57. Generates a text representing this result.
  58. """
  59. text = ""
  60. tn = self.data[_tTN]
  61. tp = self.data[_tTP]
  62. fn = self.data[_tFN]
  63. fp = self.data[_tFP]
  64. text += f"{self.title} tn, fp: {tn}, {tp}\n"
  65. text += f"{self.title} fn, tp: {fn}, {tp}"
  66. for k in self.heading:
  67. if k not in [_tTP, _tTN, _tFP, _tFN]:
  68. text += f"{self.title} {k}: {self.data[k]:.3f}\n"
  69. return text
  70. def csvHeading(self):
  71. return ";".join(self.heading)
  72. def toCSV(self):
  73. return ";".join(map(lambda k: f"{self.data[k]:0.3f}", self.heading))
  74. @staticmethod
  75. def _enshureConfusionMatrix(c):
  76. c0 = [0.0, 0.0]
  77. c1 = [0.0, 0.0]
  78. if len(c) > 0:
  79. if len(c[0]) > 0:
  80. c0[0] = c[0][0]
  81. if len(c[0]) > 1:
  82. c0[1] = c[0][1]
  83. if len(c) > 1 and len(c[1]) > 1:
  84. c1[0] = c[1][0]
  85. c1[1] = c[1][1]
  86. return [c0, c1]
  87. def copy(self):
  88. r = TestResult(self.title)
  89. r.data = self.data.copy()
  90. r.heading = self.heading.copy()
  91. return r
  92. def addMinMaxAvg(self, mma=None):
  93. if mma is None:
  94. return (1, self.copy(), self.copy(), self.copy())
  95. (n, mi, mx, a) = mma
  96. for k in a.heading:
  97. if k in self.heading:
  98. a.data[k] += self.data[k]
  99. for k in mi.heading:
  100. if k in self.heading:
  101. mi.data[k] = min(mi.data[k], self.data[k])
  102. for k in mx.heading:
  103. if k in self.heading:
  104. mx.data[k] = max(mx.data[k], self.data[k])
  105. return (n + 1, mi, mx, a)
  106. @staticmethod
  107. def finishMinMaxAvg(mma):
  108. if mma is None:
  109. return (TestResult("?"), TestResult("?"), TestResult("?"))
  110. else:
  111. (n, mi, ma, a) = mma
  112. for k in a.heading:
  113. if n > 0:
  114. a.data[k] = a.data[k] / n
  115. else:
  116. a.data[k] = 0.0
  117. return (mi, ma, a)
  118. def lr(ttd):
  119. """
  120. Runs a test for a dataset with the logistic regression algorithm.
  121. It returns a /TestResult./
  122. *ttd* is a /library.dataset.TrainTestData/ instance containing data to test.
  123. """
  124. checkType(ttd)
  125. logreg = LogisticRegression(
  126. C=1e5,
  127. solver='lbfgs',
  128. multi_class='multinomial',
  129. class_weight={0: 1, 1: 1.3}
  130. )
  131. logreg.fit(ttd.train.data, ttd.train.labels)
  132. prediction = logreg.predict(ttd.test.data)
  133. prob_lr = logreg.predict_proba(ttd.test.data)
  134. aps_lr = average_precision_score(ttd.test.labels, prob_lr[:,1])
  135. return TestResult("LR", ttd.test.labels, prediction, aps_lr)
  136. def svm(ttd):
  137. """
  138. Runs a test for a dataset with the support vector machine algorithm.
  139. It returns a /TestResult./
  140. *ttd* is a /library.dataset.TrainTestData/ instance containing data to test.
  141. """
  142. checkType(ttd)
  143. svmTester = sklearn.svm.SVC(
  144. kernel='linear',
  145. decision_function_shape='ovo',
  146. class_weight={0: 1., 1: 1.},
  147. probability=True
  148. )
  149. svmTester.fit(ttd.train.data, ttd.train.labels)
  150. prediction = svmTester.predict(ttd.test.data)
  151. return TestResult("SVM", ttd.test.labels, prediction)
  152. def knn(ttd):
  153. """
  154. Runs a test for a dataset with the k-next neighbourhood algorithm.
  155. It returns a /TestResult./
  156. *ttd* is a /library.dataset.TrainTestData/ instance containing data to test.
  157. """
  158. checkType(ttd)
  159. knnTester = KNeighborsClassifier(n_neighbors=10)
  160. knnTester.fit(ttd.train.data, ttd.train.labels)
  161. prediction = knnTester.predict(ttd.test.data)
  162. return TestResult("KNN", ttd.test.labels, prediction)
  163. def gb(ttd):
  164. """
  165. Runs a test for a dataset with the gradient boosting algorithm.
  166. It returns a /TestResult./
  167. *ttd* is a /library.dataset.TrainTestData/ instance containing data to test.
  168. """
  169. checkType(ttd)
  170. tester = GradientBoostingClassifier()
  171. tester.fit(ttd.train.data, ttd.train.labels)
  172. prediction = tester.predict(ttd.test.data)
  173. return TestResult("GB", ttd.test.labels, prediction)
  174. def checkType(t):
  175. if str(type(t)) == "<class 'numpy.ndarray'>":
  176. return t.shape[0] > 0 and all(map(checkType, t))
  177. elif str(type(t)) == "<class 'list'>":
  178. return len(t) > 0 and all(map(checkType, t))
  179. elif str(type(t)) in ["<class 'int'>", "<class 'float'>", "<class 'numpy.float64'>"]:
  180. return True
  181. elif str(type(t)) == "<class 'library.dataset.DataSet'>":
  182. return checkType(t.data0) and checkType(t.data1)
  183. elif str(type(t)) == "<class 'library.dataset.TrainTestData'>":
  184. return checkType(t.train) and checkType(t.test)
  185. else:
  186. raise ValueError("expected int, float, or list, dataset of int, float but got " + str(type(t)))
  187. return False