dataset.py

  1. import math
  2. import numpy as np
  3. class DataSet:
  4. """
  5. Stores data and Labels.
  6. """
  7. def __init__(self, data0=None, data1=None):
  8. self.data0 = data0
  9. self.data1 = data1
  10. self.size0 = len(data0) if data0 is not None else 0
  11. self.size1 = len(data1) if data1 is not None else 0
  12. if data0 is not None and data1 is not None:
  13. self.data = np.concatenate( [data1, data0] )
  14. self.labels = np.concatenate( [self.labels1(), self.labels0()] )
  15. elif data0 is None:
  16. self.data = data1
  17. self.labels = self.labels1()
  18. elif data1 is None:
  19. self.data = data0
  20. self.labels = self.labels0()
  21. else:
  22. raise AttributeError("Expected data, data0 or data1 to be a numpy.array")
  23. def shuffleWith(self, shuffleFn):
  24. if self.data0 is not None:
  25. self.data0 = shuffleFn(self.data0)
  26. if self.data1 is not None:
  27. self.data1 = shuffleFn(self.data1)
  28. if self.data0 is None:
  29. self.data = self.data1
  30. elif self.data1 is None:
  31. self.data = self.data0
  32. else:
  33. self.data = np.concatenate((self.data1, self.data0))
  34. def labels0(self):
  35. return np.zeros(self.size0)
  36. def labels1(self):
  37. return np.zeros(self.size1) + 1
  38. class TrainTestData:
  39. """
  40. Stores features, data and labels for class 0 and class 1.
  41. """
  42. def __init__(self, train, test):
  43. self.train = train
  44. self.test = test
  45. @staticmethod
  46. def splitUpData(data, trainFactor=0.9):
  47. size = len(data)
  48. trainSize = math.ceil(size * trainFactor)
  49. trn = data[list(range(0, trainSize))]
  50. tst = data[list(range(trainSize, size))]
  51. return trn, tst
  52. @classmethod
  53. def splitDataByFactor(cls, features0, features1, trainFactor=0.9):
  54. features_0_trn, features_0_tst = cls.splitUpData(features0, trainFactor)
  55. features_1_trn, features_1_tst = cls.splitUpData(features1, trainFactor)
  56. return cls(
  57. test=DataSet(data1=features_1_tst, data0=features_0_tst),
  58. train=DataSet(data1=features_1_trn, data0=features_0_trn)
  59. )
  60. @classmethod
  61. def splitDataToSlices(cls, bigData, numOfSlices=5):
  62. data0slices = cls._arrayToSlices(bigData.data0, numOfSlices)
  63. data1slices = cls._arrayToSlices(bigData.data1, numOfSlices)
  64. for n in range(numOfSlices):
  65. data0 = np.concatenate([data0slices[k] for k in range(numOfSlices) if n != k])
  66. data1 = np.concatenate([data1slices[k] for k in range(numOfSlices) if n != k])
  67. train = DataSet(data0=data0, data1=data1)
  68. test = DataSet(data0=data0slices[n], data1=data1slices[n])
  69. yield cls(train=train, test=test)
  70. @staticmethod
  71. def _arrayToSlices(data, numOfSlices):
  72. size = len(data)
  73. if size < numOfSlices:
  74. raise AttributeError(
  75. f"Expected data set to contain at least {numOfSlices} points"
  76. + f" but got {size} points."
  77. )
  78. sliceSize = (size // numOfSlices) + (0 if size % numOfSlices == 0 else 1)
  79. return [
  80. data[n * sliceSize : min(size, (n+1) * sliceSize)]
  81. for n in range(numOfSlices)
  82. ]