|
|
@@ -32,24 +32,89 @@ def create01Labels(totalSize, sizeFirstHalf):
|
|
|
labels.extend(repeat(np.array([0,1]), totalSize - sizeFirstHalf))
|
|
|
return np.array(labels)
|
|
|
|
|
|
+
|
|
|
+class GeneratorConfig:
|
|
|
+ def __init__(self, n_feat=None, neb=5, gen=None, neb_epochs=10, genLayerSizes=None, genAddNoise=True):
|
|
|
+ self.n_feat = n_feat
|
|
|
+ self.neb = neb
|
|
|
+ self.gen = gen
|
|
|
+ self.neb_epochs = neb_epochs
|
|
|
+ self.genAddNoise = genAddNoise
|
|
|
+ self.genLayerSizes = genLayerSizes
|
|
|
+
|
|
|
+ def isConfigMissing(self):
|
|
|
+ return any( x is None for x in
|
|
|
+ [ self.n_feat
|
|
|
+ , self.neb
|
|
|
+ , self.gen
|
|
|
+ , self.genAddNoise
|
|
|
+ , self.genLayerSizes
|
|
|
+ , self.neb_epochs
|
|
|
+ ])
|
|
|
+
|
|
|
+ def checkForValidConfig(self):
|
|
|
+ if self.isConfigMissing():
|
|
|
+            raise ValueError("Some configuration is missing.")
|
|
|
+
|
|
|
+ if self.neb > self.gen:
|
|
|
+ raise ValueError(f"Expected neb <= gen but got neb={self.neb} and gen={self.gen}.")
|
|
|
+
|
|
|
+ if sum(self.genLayerSizes) != self.gen:
|
|
|
+ raise ValueError(f"Expected the layer sizes to sum up to gen={self.gen}.")
|
|
|
+
|
|
|
+ return True
|
|
|
+
|
|
|
+ def fixMissingValuesByInputData(self, data):
|
|
|
+ config = GeneratorConfig()
|
|
|
+ config.neb = self.neb
|
|
|
+ config.gen = self.gen
|
|
|
+ config.genAddNoise = self.genAddNoise
|
|
|
+ config.genLayerSizes = self.genLayerSizes
|
|
|
+
|
|
|
+ if data is not None:
|
|
|
+ if config.n_feat is None:
|
|
|
+ config.n_feat = data.shape[1]
|
|
|
+
|
|
|
+ if config.neb is None:
|
|
|
+ config.neb = data.shape[0]
|
|
|
+ else:
|
|
|
+ config.neb = min(config.neb, data.shape[0])
|
|
|
+
|
|
|
+ if config.gen is None:
|
|
|
+ config.gen = config.neb
|
|
|
+
|
|
|
+ if config.genLayerSizes is None:
|
|
|
+ config.genLayerSizes = [config.gen]
|
|
|
+
|
|
|
+ return config
|
|
|
+
|
|
|
+ def nebShape(self, aboveSize=None):
|
|
|
+ if aboveSize is None:
|
|
|
+ return (self.neb, self.n_feat)
|
|
|
+ else:
|
|
|
+ return (aboveSize, self.neb, self.n_feat)
|
|
|
+
|
|
|
+ def genShape(self, aboveSize=None):
|
|
|
+ if aboveSize is None:
|
|
|
+ return (self.gen, self.n_feat)
|
|
|
+ else:
|
|
|
+ return (aboveSize, self.gen, self.n_feat)
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
class XConvGeN(GanBaseClass):
|
|
|
"""
|
|
|
This is the ConvGeN class. ConvGeN is a synthetic point generator for imbalanced datasets.
|
|
|
"""
|
|
|
- def __init__(self, n_feat, neb=5, gen=None, neb_epochs=10, fdc=None, maj_proximal=False, debug=False):
|
|
|
+ def __init__(self, config=None, fdc=None, debug=False):
|
|
|
self.isTrained = False
|
|
|
- self.n_feat = n_feat
|
|
|
- self.neb = neb
|
|
|
- self.nebInitial = neb
|
|
|
- self.genInitial = gen
|
|
|
- self.gen = gen if gen is not None else self.neb
|
|
|
- self.neb_epochs = neb_epochs
|
|
|
+ self.config = config
|
|
|
+ self.defaultConfig = config
|
|
|
self.loss_history = None
|
|
|
self.debug = debug
|
|
|
self.minSetSize = 0
|
|
|
self.conv_sample_generator = None
|
|
|
self.maj_min_discriminator = None
|
|
|
- self.maj_proximal = maj_proximal
|
|
|
self.cg = None
|
|
|
self.canPredict = True
|
|
|
self.fdc = fdc
|
|
|
@@ -59,8 +124,8 @@ class XConvGeN(GanBaseClass):
|
|
|
"Train", "BMB", "NbhSearch", "NBH", "GenSamples", "Fit", "FixType"
|
|
|
] }
|
|
|
|
|
|
- if self.neb is not None and self.gen is not None and self.neb > self.gen:
|
|
|
- raise ValueError(f"Expected neb <= gen but got neb={neb} and gen={gen}.")
|
|
|
+ if not self.config.isConfigMissing():
|
|
|
+ self.config.checkForValidConfig()
|
|
|
|
|
|
def reset(self, data):
|
|
|
"""
|
|
|
@@ -72,16 +137,8 @@ class XConvGeN(GanBaseClass):
|
|
|
"""
|
|
|
self.isTrained = False
|
|
|
|
|
|
- if data is not None:
|
|
|
- nMinoryPoints = data.shape[0]
|
|
|
- if self.nebInitial is None:
|
|
|
- self.neb = nMinoryPoints
|
|
|
- else:
|
|
|
- self.neb = min(self.nebInitial, nMinoryPoints)
|
|
|
- else:
|
|
|
- self.neb = self.nebInitial
|
|
|
-
|
|
|
- self.gen = self.genInitial if self.genInitial is not None else self.neb
|
|
|
+ self.config = self.defaultConfig.fixMissingValuesByInputData(data)
|
|
|
+ self.config.checkForValidConfig()
|
|
|
|
|
|
## instanciate generator network and visualize architecture
|
|
|
self.conv_sample_generator = self._conv_sample_gen()
|
|
|
@@ -94,7 +151,7 @@ class XConvGeN(GanBaseClass):
|
|
|
|
|
|
self.lastProgress = (-1,-1,-1)
|
|
|
if self.debug:
|
|
|
- print(f"neb={self.neb}, gen={self.gen}")
|
|
|
+ print(f"neb={self.config.neb}, gen={self.config.gen}")
|
|
|
|
|
|
print(self.conv_sample_generator.summary())
|
|
|
print('\n')
|
|
|
@@ -124,12 +181,9 @@ class XConvGeN(GanBaseClass):
|
|
|
if self.fdc is not None:
|
|
|
normalizedData = self.fdc.normalize(data)
|
|
|
|
|
|
- print(f"|N| = {normalizedData.shape}")
|
|
|
- print(f"|D| = {data.shape}")
|
|
|
-
|
|
|
self.timing["NbhSearch"].start()
|
|
|
# Precalculate neighborhoods
|
|
|
- self.nmbMin = NNSearch(self.neb).fit(haystack=normalizedData)
|
|
|
+ self.nmbMin = NNSearch(self.config.neb).fit(haystack=normalizedData)
|
|
|
self.nmbMin.basePoints = np.array([ [x.astype(np.float32) for x in p] for p in data])
|
|
|
self.timing["NbhSearch"].stop()
|
|
|
|
|
|
@@ -158,7 +212,7 @@ class XConvGeN(GanBaseClass):
|
|
|
|
|
|
## roughly claculate the upper bound of the synthetic samples to be generated from each neighbourhood
|
|
|
synth_num = (numOfSamples // self.minSetSize) + 1
|
|
|
- runs = (synth_num // self.gen) + 1
|
|
|
+ runs = (synth_num // self.config.gen) + 1
|
|
|
|
|
|
## Get a random list of all indices
|
|
|
indices = randomIndices(self.minSetSize)
|
|
|
@@ -184,7 +238,7 @@ class XConvGeN(GanBaseClass):
|
|
|
corrected = pairs.map(self.correct_feature_types())
|
|
|
|
|
|
## extract the exact number of synthetic samples needed to exactly balance the two classes
|
|
|
- r = np.concatenate(np.array(list(corrected.take(1 + (numOfSamples // self.gen)))), axis=0)[:numOfSamples]
|
|
|
+ r = np.concatenate(np.array(list(corrected.take(1 + (numOfSamples // self.config.gen)))), axis=0)[:numOfSamples]
|
|
|
|
|
|
return r
|
|
|
|
|
|
@@ -202,48 +256,59 @@ class XConvGeN(GanBaseClass):
|
|
|
# ###############################################################
|
|
|
|
|
|
# Creating the Network: Generator
|
|
|
- def _conv_sample_gen(self, layerSize=None):
|
|
|
+ def _conv_sample_gen(self):
|
|
|
"""
|
|
|
The generator network to generate synthetic samples from the convex space
|
|
|
of arbitrary minority neighbourhoods
|
|
|
"""
|
|
|
|
|
|
- if layerSize is None:
|
|
|
- layerSize = (self.gen // 2) + 1
|
|
|
+ n_feat = self.config.n_feat
|
|
|
+ neb = self.config.neb
|
|
|
+ gen = self.config.gen
|
|
|
+ genLayerSizes = self.config.genLayerSizes
|
|
|
|
|
|
## takes minority batch as input
|
|
|
- min_neb_batch = Input(shape=(self.neb, self.n_feat,))
|
|
|
+ min_neb_batch = Input(shape=(neb, n_feat))
|
|
|
|
|
|
## using 1-D convolution, feature dimension remains the same
|
|
|
- x = Conv1D(self.n_feat, 3, activation='relu', name="UnsharpenInput")(min_neb_batch)
|
|
|
+ x = Conv1D(n_feat, 3, activation='relu', name="UnsharpenInput")(min_neb_batch)
|
|
|
## flatten after convolution
|
|
|
x = Flatten(name="InputMatrixToVector")(x)
|
|
|
|
|
|
synth = []
|
|
|
n = 0
|
|
|
- while n < self.gen:
|
|
|
- w = min(layerSize, self.gen - n)
|
|
|
+ if sum(genLayerSizes) < gen:
|
|
|
+ genLayerSizes.append(gen)
|
|
|
+
|
|
|
+ for layerSize in genLayerSizes:
|
|
|
+ w = min(layerSize, gen - n)
|
|
|
if w <= 0:
|
|
|
break
|
|
|
n += w
|
|
|
|
|
|
## add dense layer to transform the vector to a convenient dimension
|
|
|
- y = Dense(self.neb * w, activation='relu', name=f"P{n}_dense")(x)
|
|
|
+ y = Dense(neb * w, activation='relu', name=f"P{n}_dense")(x)
|
|
|
|
|
|
## again, witching to 2-D tensor once we have the convenient shape
|
|
|
- y = Reshape((self.neb, w), name=f"P{n}_reshape")(y)
|
|
|
+ y = Reshape((neb, w), name=f"P{n}_reshape")(y)
|
|
|
+
|
|
|
## column wise sum
|
|
|
s = K.sum(y, axis=1)
|
|
|
+
|
|
|
## adding a small constant to always ensure the column sums are non zero.
|
|
|
## if this is not done then during initialization the sum can be zero.
|
|
|
s_non_zero = Lambda(lambda x: x + .000001, name=f"P{n}_make_non_zero")(s)
|
|
|
+
|
|
|
## reprocals of the approximated column sum
|
|
|
sinv = tf.math.reciprocal(s_non_zero, name=f"P{n}_invert")
|
|
|
+
|
|
|
## At this step we ensure that column sum is 1 for every row in x.
|
|
|
## That means, each column is set of convex co-efficient
|
|
|
y = Multiply(name=f"P{n}_normalize")([sinv, y])
|
|
|
+
|
|
|
## Now we transpose the matrix. So each row is now a set of convex coefficients
|
|
|
aff = tf.transpose(y[0], name=f"P{n}_transpose")
|
|
|
+
|
|
|
## We now do matrix multiplication of the affine combinations with the original
|
|
|
## minority batch taken as input. This generates a convex transformation
|
|
|
## of the input minority batch
|
|
|
@@ -252,18 +317,19 @@ class XConvGeN(GanBaseClass):
|
|
|
|
|
|
synth = tf.concat(synth, axis=1, name="collect_planes")
|
|
|
|
|
|
- nOut = self.gen * self.n_feat
|
|
|
+ nOut = gen * n_feat
|
|
|
|
|
|
- noiseGenerator = Sequential([
|
|
|
- InputLayer(input_shape=(self.gen, self.n_feat)),
|
|
|
- Flatten(),
|
|
|
- Dense(tfp.layers.IndependentNormal.params_size(nOut)),
|
|
|
- tfp.layers.IndependentNormal(nOut)
|
|
|
- ], name="RandomNoise")
|
|
|
+ if self.config.genAddNoise:
|
|
|
+ noiseGenerator = Sequential([
|
|
|
+ InputLayer(input_shape=(gen, n_feat)),
|
|
|
+ Flatten(),
|
|
|
+ Dense(tfp.layers.IndependentNormal.params_size(nOut)),
|
|
|
+ tfp.layers.IndependentNormal(nOut)
|
|
|
+ ], name="RandomNoise")
|
|
|
|
|
|
- noise = noiseGenerator(synth)
|
|
|
- noise = Reshape((self.gen, self.n_feat), name="ReshapeNoise")(noise)
|
|
|
- synth = Add(name="AddNoise")([synth, noise])
|
|
|
+ noise = noiseGenerator(synth)
|
|
|
+ noise = Reshape((gen, n_feat), name="ReshapeNoise")(noise)
|
|
|
+ synth = Add(name="AddNoise")([synth, noise])
|
|
|
|
|
|
## finally we compile the generator with an arbitrary minortiy neighbourhood batch
|
|
|
## as input and a covex space transformation of the same number of samples as output
|
|
|
@@ -286,7 +352,7 @@ class XConvGeN(GanBaseClass):
|
|
|
|
|
|
## takes as input synthetic sample generated as input stacked upon a batch of
|
|
|
## borderline majority samples
|
|
|
- samples = Input(shape=(self.n_feat,))
|
|
|
+ samples = Input(shape=(self.config.n_feat,))
|
|
|
|
|
|
## passed through two dense layers
|
|
|
y = Dense(250, activation='relu')(samples)
|
|
|
@@ -309,6 +375,11 @@ class XConvGeN(GanBaseClass):
|
|
|
conv_coeff_generator-> generator network instance
|
|
|
maj_min_discriminator -> discriminator network instance
|
|
|
"""
|
|
|
+
|
|
|
+ n_feat = self.config.n_feat
|
|
|
+ neb = self.config.neb
|
|
|
+ gen = self.config.gen
|
|
|
+
|
|
|
## by default the discriminator trainability is switched off.
|
|
|
## Thus training ConvGeN means training the generator network as per previously
|
|
|
## trained discriminator network.
|
|
|
@@ -319,7 +390,7 @@ class XConvGeN(GanBaseClass):
|
|
|
|
|
|
## input receives a neighbourhood minority batch
|
|
|
## and a proximal majority batch concatenated
|
|
|
- batch_data = Input(shape=(2, self.gen, self.n_feat,))
|
|
|
+ batch_data = Input(shape=(2, gen, n_feat))
|
|
|
# batch_data: (batchSize, 2, gen, n_feat)
|
|
|
|
|
|
## extract minority batch
|
|
|
@@ -329,23 +400,23 @@ class XConvGeN(GanBaseClass):
|
|
|
## extract majority batch
|
|
|
maj_batch = Lambda(lambda x: x[:, 1, :, :], name="SplitForDisc")(batch_data)
|
|
|
# maj_batch: (batchSize, gen, n_feat)
|
|
|
- maj_batch = tf.reshape(maj_batch, (-1, self.n_feat), name="ReshapeForDisc")
|
|
|
+ maj_batch = tf.reshape(maj_batch, (-1, n_feat), name="ReshapeForDisc")
|
|
|
# maj_batch: (batchSize * gen, n_feat)
|
|
|
|
|
|
## pass minority batch into generator to obtain convex space transformation
|
|
|
## (synthetic samples) of the minority neighbourhood input batch
|
|
|
conv_samples = generator(min_batch)
|
|
|
# conv_batch: (batchSize, gen, n_feat)
|
|
|
- conv_samples = tf.reshape(conv_samples, (-1, self.n_feat), name="ReshapeGenOutput")
|
|
|
+ conv_samples = tf.reshape(conv_samples, (-1, n_feat), name="ReshapeGenOutput")
|
|
|
# conv_batch: (batchSize * gen, n_feat)
|
|
|
|
|
|
## pass samples into the discriminator to know its decisions
|
|
|
conv_samples = discriminator(conv_samples)
|
|
|
- conv_samples = tf.reshape(conv_samples, (-1, self.gen, 2), name="ReshapeGenDiscOutput")
|
|
|
+ conv_samples = tf.reshape(conv_samples, (-1, gen, 2), name="ReshapeGenDiscOutput")
|
|
|
# conv_batch: (batchSize * gen, 2)
|
|
|
|
|
|
maj_batch = discriminator(maj_batch)
|
|
|
- maj_batch = tf.reshape(maj_batch, (-1, self.gen, 2), name="ReshapeMajDiscOutput")
|
|
|
+ maj_batch = tf.reshape(maj_batch, (-1, gen, 2), name="ReshapeMajDiscOutput")
|
|
|
# conv_batch: (batchSize * gen, 2)
|
|
|
|
|
|
## concatenate the decisions
|
|
|
@@ -361,6 +432,11 @@ class XConvGeN(GanBaseClass):
|
|
|
|
|
|
# Training
|
|
|
def _rough_learning(self, data, discTrainCount, batchSize=32):
|
|
|
+
|
|
|
+ n_feat = self.config.n_feat
|
|
|
+ neb = self.config.neb
|
|
|
+ gen = self.config.gen
|
|
|
+
|
|
|
generator = self.conv_sample_generator
|
|
|
discriminator = self.maj_min_discriminator
|
|
|
convGeN = self.cg
|
|
|
@@ -368,8 +444,8 @@ class XConvGeN(GanBaseClass):
|
|
|
minSetSize = len(data)
|
|
|
|
|
|
## Create labels for one neighborhood training.
|
|
|
- nLabels = 2 * self.gen
|
|
|
- labels = np.array(create01Labels(nLabels, self.gen))
|
|
|
+ nLabels = 2 * gen
|
|
|
+ labels = np.array(create01Labels(nLabels, gen))
|
|
|
labelsGeN = np.array([labels])
|
|
|
|
|
|
def getNeighborhoods():
|
|
|
@@ -401,11 +477,11 @@ class XConvGeN(GanBaseClass):
|
|
|
for x in labels:
|
|
|
yield x
|
|
|
|
|
|
- padd = np.zeros((self.gen - self.neb, self.n_feat))
|
|
|
+ padd = np.zeros((gen - neb, n_feat))
|
|
|
discTrainCount = 1 + max(0, discTrainCount)
|
|
|
|
|
|
- for neb_epoch_count in range(self.neb_epochs):
|
|
|
- self.progressBar([(neb_epoch_count + 1) / self.neb_epochs, 0.5, 0.5])
|
|
|
+ for neb_epoch_count in range(self.config.neb_epochs):
|
|
|
+ self.progressBar([(neb_epoch_count + 1) / self.config.neb_epochs, 0.5, 0.5])
|
|
|
|
|
|
## Training of the discriminator.
|
|
|
#
|
|
|
@@ -428,7 +504,7 @@ class XConvGeN(GanBaseClass):
|
|
|
b = tf.data.Dataset.from_tensor_slices(labels).repeat()
|
|
|
|
|
|
# Zip data and matching labels together for training.
|
|
|
- samples = tf.data.Dataset.zip((a, b)).batch(batchSize * 2 * self.gen)
|
|
|
+ samples = tf.data.Dataset.zip((a, b)).batch(batchSize * 2 * gen)
|
|
|
|
|
|
# train the discriminator with the concatenated samples and the one-hot encoded labels
|
|
|
self.timing["Fit"].start()
|
|
|
@@ -479,7 +555,7 @@ class XConvGeN(GanBaseClass):
|
|
|
## min_idxs -> indices of points in minority class
|
|
|
## gen -> convex combinations generated from each neighbourhood
|
|
|
self.timing["BMB"].start()
|
|
|
- indices = randomIndices(self.minSetSize, outputSize=self.gen, indicesToIgnore=min_idxs)
|
|
|
+ indices = randomIndices(self.minSetSize, outputSize=self.config.gen, indicesToIgnore=min_idxs)
|
|
|
r = self.nmbMin.basePoints[indices]
|
|
|
self.timing["BMB"].stop()
|
|
|
return r
|
|
|
@@ -488,7 +564,7 @@ class XConvGeN(GanBaseClass):
|
|
|
def retrainDiscriminitor(self, data, labels):
|
|
|
self.maj_min_discriminator.trainable = True
|
|
|
labels = np.array([ [x, 1 - x] for x in labels])
|
|
|
- self.maj_min_discriminator.fit(x=data, y=labels, batch_size=20, epochs=self.neb_epochs)
|
|
|
+ self.maj_min_discriminator.fit(x=data, y=labels, batch_size=20, epochs=self.config.neb_epochs)
|
|
|
self.maj_min_discriminator.trainable = False
|
|
|
|
|
|
def progressBar(self, x):
|
|
|
@@ -527,8 +603,8 @@ class XConvGeN(GanBaseClass):
|
|
|
if len(columns) == 0:
|
|
|
return voidFunction
|
|
|
|
|
|
- neb = self.neb
|
|
|
- n_feat = self.n_feat
|
|
|
+ neb = self.config.neb
|
|
|
+ n_feat = self.config.n_feat
|
|
|
nn = tf.constant([(1.0 if x in columns else 0.0) for x in range(n_feat)])
|
|
|
if n_feat is None:
|
|
|
print("ERRROR n_feat is None")
|