import math

import numpy as np
class DataSet:
    """
    Stores data points and their binary class labels.

    Samples for class 0 and class 1 are kept separately (``data0`` /
    ``data1``) and also merged into ``self.data`` with matching
    ``self.labels``. The merged order is class-1 samples first, then
    class-0 samples.
    """

    def __init__(self, data0=None, data1=None):
        """
        :param data0: numpy array of class-0 samples, or None.
        :param data1: numpy array of class-1 samples, or None.
        :raises AttributeError: if both data0 and data1 are None.
        """
        # Bug fix: the original checked this in a trailing ``else`` that was
        # unreachable (the ``elif data0 is None`` branch ran first and
        # silently set self.data = None). Validate up front instead.
        if data0 is None and data1 is None:
            raise AttributeError("Expected data, data0 or data1 to be a numpy.array")
        self.data0 = data0
        self.data1 = data1
        self.size0 = len(data0) if data0 is not None else 0
        self.size1 = len(data1) if data1 is not None else 0
        if data0 is None:
            self.data = data1
            self.labels = self.labels1()
        elif data1 is None:
            self.data = data0
            self.labels = self.labels0()
        else:
            # Class-1 samples first, then class-0; labels follow the same order.
            self.data = np.concatenate([data1, data0])
            self.labels = np.concatenate([self.labels1(), self.labels0()])

    def shuffleWith(self, shuffleFn):
        """
        Shuffle each class's samples with ``shuffleFn`` and rebuild the
        merged ``self.data`` array (labels are unchanged: shuffling happens
        within each class, so the class-1-then-class-0 layout is preserved).

        :param shuffleFn: callable taking a numpy array and returning a
            permuted array of the same shape.
        """
        if self.data0 is not None:
            self.data0 = shuffleFn(self.data0)
        if self.data1 is not None:
            self.data1 = shuffleFn(self.data1)
        if self.data0 is None:
            self.data = self.data1
        elif self.data1 is None:
            self.data = self.data0
        else:
            self.data = np.concatenate((self.data1, self.data0))

    def labels0(self):
        """Return an array of class-0 labels (zeros), one per class-0 sample."""
        return np.zeros(self.size0)

    def labels1(self):
        """Return an array of class-1 labels (ones), one per class-1 sample."""
        # Bug fix: original returned ``np.zeros(self.size0) + 1`` — it used
        # the class-0 count, producing the wrong number of class-1 labels
        # whenever size0 != size1.
        return np.ones(self.size1)
class TrainTestData:
    """
    Holds a train/test pair of DataSet instances, with helpers to split
    raw per-class feature arrays by a train fraction or into CV folds.
    """

    def __init__(self, train, test):
        """
        :param train: DataSet used for training.
        :param test: DataSet used for testing.
        """
        self.train = train
        self.test = test

    @staticmethod
    def splitUpData(data, trainFactor=0.9):
        """
        Split ``data`` into a leading train part and a trailing test part.

        :param data: numpy array of samples.
        :param trainFactor: fraction of samples (rounded up) for training.
        :return: ``(train, test)`` tuple of array slices.
        """
        size = len(data)
        trainSize = math.ceil(size * trainFactor)
        # Plain slicing replaces the original fancy indexing via
        # ``data[list(range(...))]`` — same result, no index-list allocation.
        return data[:trainSize], data[trainSize:]

    @classmethod
    def splitDataByFactor(cls, features0, features1, trainFactor=0.9):
        """
        Split both classes' feature arrays by ``trainFactor`` and wrap the
        parts in train/test DataSets.

        :param features0: numpy array of class-0 samples.
        :param features1: numpy array of class-1 samples.
        :param trainFactor: fraction of each class used for training.
        :return: TrainTestData with the two resulting DataSets.
        """
        features_0_trn, features_0_tst = cls.splitUpData(features0, trainFactor)
        features_1_trn, features_1_tst = cls.splitUpData(features1, trainFactor)
        return cls(
            test=DataSet(data1=features_1_tst, data0=features_0_tst),
            train=DataSet(data1=features_1_trn, data0=features_0_trn)
        )

    @classmethod
    def splitDataToSlices(cls, bigData, numOfSlices=5):
        """
        Generate ``numOfSlices`` cross-validation folds from ``bigData``.

        Each yielded TrainTestData uses one slice of each class as the test
        set and the concatenation of the remaining slices as the train set.

        :param bigData: DataSet providing ``data0`` and ``data1`` arrays.
        :param numOfSlices: number of folds.
        """
        data0slices = cls._arrayToSlices(bigData.data0, numOfSlices)
        data1slices = cls._arrayToSlices(bigData.data1, numOfSlices)
        for n in range(numOfSlices):
            data0 = np.concatenate([data0slices[k] for k in range(numOfSlices) if n != k])
            data1 = np.concatenate([data1slices[k] for k in range(numOfSlices) if n != k])
            train = DataSet(data0=data0, data1=data1)
            test = DataSet(data0=data0slices[n], data1=data1slices[n])
            yield cls(train=train, test=test)

    @staticmethod
    def _arrayToSlices(data, numOfSlices):
        """
        Split ``data`` into ``numOfSlices`` nearly-equal, non-empty slices.

        Bug fix: the original ceil-based slice sizing could produce empty
        trailing slices (e.g. 11 points into 5 slices gave sizes
        3, 3, 3, 2, 0 — an empty CV test fold). ``np.array_split`` yields
        balanced sizes (3, 2, 2, 2, 2) and never an empty slice as long as
        ``len(data) >= numOfSlices``.

        :raises AttributeError: if ``data`` has fewer points than
            ``numOfSlices``.
        """
        size = len(data)
        if size < numOfSlices:
            raise AttributeError(
                f"Expected data set to contain at least {numOfSlices} points"
                + f" but got {size} points."
            )
        return list(np.array_split(data, numOfSlices))
|