testers.py 7.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262
"""
This module contains test functions for datasets using the logistic regression, the
k-nearest-neighbour, the gradient boosting and the random forest algorithm. Additionally it
contains a class for storing the results of the tests.
"""
  6. import sklearn
  7. # needed in function lr
  8. from sklearn.ensemble import RandomForestClassifier
  9. from sklearn.neighbors import KNeighborsClassifier
  10. from sklearn.linear_model import LogisticRegression
  11. from sklearn.metrics import confusion_matrix
  12. from sklearn.metrics import average_precision_score
  13. from sklearn.metrics import f1_score
  14. from sklearn.metrics import cohen_kappa_score
  15. from sklearn.metrics import RocCurveDisplay
  16. from sklearn.metrics import PrecisionRecallDisplay
  17. from sklearn.ensemble import GradientBoostingClassifier
  18. from imblearn.metrics import geometric_mean_score
  19. from library.cache import dataCache
  20. _tF1 = "f1 score"
  21. _tTN = "TN"
  22. _tTP = "TP"
  23. _tFN = "FN"
  24. _tFP = "FP"
  25. _tFP = "RF"
  26. _tAps = "average precision score"
  27. _tCks = "cohens kappa score"
  28. _tGMean = "G-Mean score"
  29. class TestResult:
  30. """
  31. This class represents the result of one test.
  32. It stores its *title*, a confusion matrix (*con_mat*), the balanced accuracy score (*bal_acc*)
  33. and the f1 score (*f1*). If given the average precision score is also stored (*aps*).
  34. """
  35. def __init__(self, title, labels=None, prediction=None, aps=None):
  36. """
  37. Creates an instance of this class. The stored data will be generated from the given values.
  38. *title* is a text to identify this result.
  39. *labels* is a /numpy.array/ containing the labels of the test-data-set.
  40. *prediction* is a /numpy.array/ containing the done prediction for the test-data-set.
  41. *aps* is a real number representing the average precision score.
  42. """
  43. self.title = title
  44. self.heading = [_tTN, _tTP, _tFN, _tFP, _tF1, _tCks, _tAps, _tGMean]
  45. self.data = { n: 0.0 for n in self.heading }
  46. self.labels = labels
  47. self.prediction = prediction
  48. if labels is not None and prediction is not None:
  49. self.data[_tF1] = f1_score(labels, prediction)
  50. self.data[_tCks] = cohen_kappa_score(labels, prediction)
  51. conMat = self._enshureConfusionMatrix(confusion_matrix(labels, prediction))
  52. [[tn, fp], [fn, tp]] = conMat
  53. self.data[_tTN] = tn
  54. self.data[_tTP] = tp
  55. self.data[_tFN] = fn
  56. self.data[_tFP] = fp
  57. self.data[_tGMean] = geometric_mean_score(labels, prediction)
  58. if aps is None:
  59. self.data[_tAps] = average_precision_score(labels, prediction)
  60. if aps is not None:
  61. self.data[_tAps] = aps
  62. def __str__(self):
  63. """
  64. Generates a text representing this result.
  65. """
  66. text = ""
  67. tn = self.data[_tTN]
  68. tp = self.data[_tTP]
  69. fn = self.data[_tFN]
  70. fp = self.data[_tFP]
  71. text += f"{self.title} tn, fp: {tn}, {fp}\n"
  72. text += f"{self.title} fn, tp: {fn}, {tp}\n"
  73. for k in self.heading:
  74. if k not in [_tTP, _tTN, _tFP, _tFN]:
  75. text += f"{self.title} {k}: {self.data[k]:.3f}\n"
  76. return text
  77. def csvHeading(self):
  78. return ";".join(self.heading)
  79. def toCSV(self):
  80. return ";".join(map(lambda k: f"{self.data[k]:0.3f}", self.heading))
  81. @staticmethod
  82. def _enshureConfusionMatrix(c):
  83. c0 = [0.0, 0.0]
  84. c1 = [0.0, 0.0]
  85. if len(c) > 0:
  86. if len(c[0]) > 0:
  87. c0[0] = c[0][0]
  88. if len(c[0]) > 1:
  89. c0[1] = c[0][1]
  90. if len(c) > 1 and len(c[1]) > 1:
  91. c1[0] = c[1][0]
  92. c1[1] = c[1][1]
  93. return [c0, c1]
  94. def copy(self):
  95. r = TestResult(self.title)
  96. r.data = self.data.copy()
  97. r.heading = self.heading.copy()
  98. return r
  99. def addMinMaxAvg(self, mma=None):
  100. if mma is None:
  101. return (1, self.copy(), self.copy(), self.copy())
  102. (n, mi, mx, a) = mma
  103. for k in a.heading:
  104. if k in self.heading:
  105. a.data[k] += self.data[k]
  106. for k in mi.heading:
  107. if k in self.heading:
  108. mi.data[k] = min(mi.data[k], self.data[k])
  109. for k in mx.heading:
  110. if k in self.heading:
  111. mx.data[k] = max(mx.data[k], self.data[k])
  112. return (n + 1, mi, mx, a)
  113. @staticmethod
  114. def finishMinMaxAvg(mma):
  115. if mma is None:
  116. return (TestResult("?"), TestResult("?"), TestResult("?"))
  117. else:
  118. (n, mi, ma, a) = mma
  119. for k in a.heading:
  120. if n > 0:
  121. a.data[k] = a.data[k] / n
  122. else:
  123. a.data[k] = 0.0
  124. return (mi, ma, a)
  125. def plotPR(self, ax):
  126. PrecisionRecallDisplay.from_predictions(self.labels, self.prediction, name=self.title, ax=ax)
  127. def plotROC(self, ax):
  128. RocCurveDisplay.from_predictions(self.labels, self.prediction, name=self.title, ax=ax)
  129. def lr(ttd, jsonFileName=None):
  130. """
  131. Runs a test for a dataset with the logistic regression algorithm.
  132. It returns a /TestResult./
  133. *ttd* is a /library.dataset.TrainTestData/ instance containing data to test.
  134. """
  135. def g(nothing):
  136. checkType(ttd)
  137. logreg = LogisticRegression(
  138. C=1e5,
  139. solver='lbfgs',
  140. max_iter=10000,
  141. multi_class='multinomial',
  142. class_weight={0: 1, 1: 1.3}
  143. )
  144. logreg.fit(ttd.train.data, ttd.train.labels)
  145. prediction = logreg.predict(ttd.test.data)
  146. prob_lr = logreg.predict_proba(ttd.test.data)
  147. aps_lr = average_precision_score(ttd.test.labels, prob_lr[:,1])
  148. return {
  149. "labels": ttd.test.labels,
  150. "prediction": prediction,
  151. "aps_lr": aps_lr
  152. }
  153. d = dataCache(jsonFileName, g)
  154. return TestResult("LR", d["labels"], d["prediction"], d["aps_lr"])
  155. def knn(ttd, jsonFileName=None):
  156. """
  157. Runs a test for a dataset with the k-next neighbourhood algorithm.
  158. It returns a /TestResult./
  159. *ttd* is a /library.dataset.TrainTestData/ instance containing data to test.
  160. """
  161. knnTester = KNeighborsClassifier(n_neighbors=10)
  162. return runTester(ttd, knnTester, "KNN", jsonFileName)
  163. def gb(ttd, jsonFileName=None):
  164. """
  165. Runs a test for a dataset with the gradient boosting algorithm.
  166. It returns a /TestResult./
  167. *ttd* is a /library.dataset.TrainTestData/ instance containing data to test.
  168. """
  169. tester = GradientBoostingClassifier()
  170. return runTester(ttd, tester, "GB", jsonFileName)
  171. def rf(ttd, jsonFileName=None):
  172. """
  173. Runs a test for a dataset with the random forest algorithm.
  174. It returns a /TestResult./
  175. *ttd* is a /library.dataset.TrainTestData/ instance containing data to test.
  176. """
  177. tester = RandomForestClassifier()
  178. return runTester(ttd, tester, "RF", jsonFileName)
  179. def runTester(ttd, tester, name="GAN", jsonFileName=None):
  180. def g(nothing):
  181. checkType(ttd)
  182. tester.fit(ttd.train.data, ttd.train.labels)
  183. return {
  184. "labels": ttd.test.labels,
  185. "prediction": tester.predict(ttd.test.data)
  186. }
  187. d = dataCache(jsonFileName, g)
  188. return TestResult(name, d["labels"], d["prediction"])
  189. def checkType(t):
  190. if str(type(t)) == "<class 'numpy.ndarray'>":
  191. return t.shape[0] > 0 and all(map(checkType, t))
  192. elif str(type(t)) == "<class 'list'>":
  193. return len(t) > 0 and all(map(checkType, t))
  194. elif str(type(t)) in ["<class 'int'>", "<class 'float'>", "<class 'numpy.float64'>"]:
  195. return True
  196. elif str(type(t)) == "<class 'library.dataset.DataSet'>":
  197. return checkType(t.data0) and checkType(t.data1)
  198. elif str(type(t)) == "<class 'library.dataset.TrainTestData'>":
  199. return checkType(t.train) and checkType(t.test)
  200. else:
  201. raise ValueError("expected int, float, or list, dataset of int, float but got " + str(type(t)))
  202. return False