dataset.py 6.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177
  1. """
  2. This module contains classes to collect data for testing and training.
  3. """
  4. import math
  5. import numpy as np
  6. class DataSet:
  7. """
  8. This class stores data and labels for a test or training dataset.
  9. *data0*, *data1* are instances of /numpy.array/. Containg the data for the class 0 (majority
  10. class) and the class 1 (minority class).
  11. *size0*, *size1* are integers, giving the size of the classes 0 and 1.
  12. *data* is an instance of /numpy.array/ containing the combined classes 0 and 1.
  13. *labels* is a /numpy.array/ containing the labels for *data*.
  14. """
  15. def __init__(self, data0=None, data1=None):
  16. """
  17. Initializes one instance of this class and fills *data* and *labels*.
  18. """
  19. self.data0 = data0
  20. self.data1 = data1
  21. self.size0 = len(data0) if data0 is not None else 0
  22. self.size1 = len(data1) if data1 is not None else 0
  23. if data0 is not None and data1 is not None:
  24. self.data = np.concatenate( [data1, data0] )
  25. self.labels = np.concatenate( [self.labels1(), self.labels0()] )
  26. elif data0 is None:
  27. self.data = data1
  28. self.labels = self.labels1()
  29. elif data1 is None:
  30. self.data = data0
  31. self.labels = self.labels0()
  32. else:
  33. raise AttributeError("Expected data, data0 or data1 to be a numpy.array")
  34. def shuffleWith(self, shuffleFn):
  35. """
  36. Shuffles the points in the classes 0 and 1 with the given function
  37. (numpy.array -> numpy.array). After that the *data* array will be regenerated.
  38. """
  39. if self.data0 is not None:
  40. self.data0 = shuffleFn(self.data0)
  41. if self.data1 is not None:
  42. self.data1 = shuffleFn(self.data1)
  43. if self.data0 is None:
  44. self.data = self.data1
  45. elif self.data1 is None:
  46. self.data = self.data0
  47. else:
  48. self.data = np.concatenate((self.data1, self.data0))
  49. def labels0(self):
  50. """
  51. Returns a /numpy.array/ with labels for class0.
  52. """
  53. return np.zeros(self.size0)
  54. def labels1(self):
  55. """
  56. Returns a /numpy.array/ with labels for class1.
  57. """
  58. return np.zeros(self.size1) + 1
  59. class TrainTestData:
  60. """
  61. Stores data and labels for class 0 and class 1.
  62. *train* is a /DataSet/ containing the data for training.
  63. *test* is a /DataSet/ containing the data for testing.
  64. """
  65. def __init__(self, train, test):
  66. """
  67. Initializes a new instance for this class and stores the given data.
  68. """
  69. self.train = train
  70. self.test = test
  71. @classmethod
  72. def splitDataByFactor(cls, features0, features1, factor=0.9):
  73. """
  74. Creates a new instance of this class.
  75. The first (factor * 100%) percent of the points in the given classes are stored for
  76. training. The remaining points are stored for testing.
  77. *features0* and *features1* are /numpy.array/ instances containing the data for class 0
  78. and class 1.
  79. *factor* is a real number > 0 and < 1 for the spliting point.
  80. """
  81. if factor <= 0.0 or factor >= 1.0:
  82. raise AttributeError(f"Expected trainFactor to be between 0 and 1 but got {factor}.")
  83. # ----------------------------------------------------------------------------------------
  84. # Supporting function:
  85. def splitUpData(data):
  86. """
  87. Splits a given /numpy.array/ in two /numpy.array/.
  88. The first array contains (factor * 100%) percent of the data points.
  89. The second array contains the remaining data points.
  90. """
  91. size = len(data)
  92. trainSize = math.ceil(size * factor)
  93. trn = data[list(range(0, trainSize))]
  94. tst = data[list(range(trainSize, size))]
  95. return trn, tst
  96. # ----------------------------------------------------------------------------------------
  97. features_0_trn, features_0_tst = splitUpData(features0)
  98. features_1_trn, features_1_tst = splitUpData(features1)
  99. return cls(
  100. test=DataSet(data1=features_1_tst, data0=features_0_tst),
  101. train=DataSet(data1=features_1_trn, data0=features_0_trn)
  102. )
  103. @classmethod
  104. def splitDataToSlices(cls, bigData, numOfSlices=5):
  105. """
  106. Creates a list of new instance of this class. The list is returned as a generator.
  107. The given data is splitted in the given number of slices.
  108. *bigData* is an instance of /DataSet/ containing the data to split.
  109. *numOfSlices* is the number of generated slices.
  110. """
  111. numOfSlices = int(numOfSlices)
  112. if numOfSlices < 1:
  113. raise AttributeError(f"Expected numOfSlices to be positive but got {numOfSlices}")
  114. # ----------------------------------------------------------------------------------------
  115. # Supporting function:
  116. def arrayToSlices(data):
  117. """
  118. Takes a /numpy.array/ and splits it into *numOfSlices* slices.
  119. A list of the slices will be returned.
  120. """
  121. size = len(data)
  122. if size < numOfSlices:
  123. raise AttributeError(
  124. f"Expected data set to contain at least {numOfSlices} points"
  125. + f" but got {size} points."
  126. )
  127. sliceSize = (size // numOfSlices) + (0 if size % numOfSlices == 0 else 1)
  128. return [
  129. data[n * sliceSize : min(size, (n+1) * sliceSize)]
  130. for n in range(numOfSlices)
  131. ]
  132. # ----------------------------------------------------------------------------------------
  133. data0slices = arrayToSlices(bigData.data0)
  134. data1slices = arrayToSlices(bigData.data1)
  135. for n in range(numOfSlices):
  136. data0 = np.concatenate([data0slices[k] for k in range(numOfSlices) if n != k])
  137. data1 = np.concatenate([data1slices[k] for k in range(numOfSlices) if n != k])
  138. train = DataSet(data0=data0, data1=data1)
  139. test = DataSet(data0=data0slices[n], data1=data1slices[n])
  140. yield cls(train=train, test=test)