4 лет назад · 7bf46653fa
--- a/SpeedTest.ipynb
+++ b/SpeedTest.ipynb
--- a/library/NNSearch.py
+++ b/library/NNSearch.py
@@ -0,0 +1,164 @@
 
															+import math
														
 
															+
														
 
															+
														
 
															+def dist(x,y):
														
 
															+    return math.sqrt(sum(map(lambda z: (z[0] - z[1])**2, zip(x, y))))
														
 
															+
														
 
															+def maxby(data, fn, startValue=0.0):
														
 
															+    m = startValue
														
 
															+    for v in data:
														
 
															+        m = max(m, fn(v))
														
 
															+    return m
														
 
															+
														
 
															+
														
 
															+class MaxHeap:
														
 
															+    def __init__(self, maxSize=None, isGreaterThan=None, smalestValue=0.0):
														
 
															+        self.heap = []
														
 
															+        self.size = 0
														
 
															+        self.maxSize = maxSize
														
 
															+        self.isGreaterThan = isGreaterThan if isGreaterThan is not None else (lambda a, b: a > b)
														
 
															+        self.smalestValue = smalestValue
														
 
															+
														
 
															+    def insert(self, v):
														
 
															+        if self.maxSize is not None and self.size >= self.maxSize:
														
 
															+            self.replaceMax(v)
														
 
															+            return
														
 
															+
														
 
															+        pos = self.size
														
 
															+        self.size += 1
														
 
															+        self.heap.append(v)
														
 
															+        while pos > 0:
														
 
															+            w = self.heap[pos // 2]
														
 
															+            if not self.isGreaterThan(v, w):
														
 
															+                break
														
 
															+            self.heap[pos] = w
														
 
															+            pos = pos // 2
														
 
															+            self.heap[pos] = v
														
 
															+
														
 
															+
														
 
															+    def childPos(self, pos):
														
 
															+        c = (pos + 1) * 2
														
 
															+        return (c - 1, c)
														
 
															+
														
 
															+
														
 
															+    def removeMax(self):
														
 
															+        if self.heap == []:
														
 
															+            self.size = 0
														
 
															+            return
														
 
															+        
														
 
															+        self.heap[0] = self.heap[-1]
														
 
															+        self.heap = self.heap[:-1]
														
 
															+        self.size -= 1
														
 
															+
														
 
															+        x = self.heap[0]
														
 
															+        pos = 0
														
 
															+        size = self.size
														
 
															+
														
 
															+        while pos < size:
														
 
															+            (left, right) = self.childPos(pos)
														
 
															+
														
 
															+            if left >= size:
														
 
															+                break
														
 
															+
														
 
															+            y = self.heap[left]
														
 
															+            if right >= size:
														
 
															+                if self.isGreaterThan(y, x):
														
 
															+                    self.heap[pos] = y
														
 
															+                    self.heap[left] = x
														
 
															+                break
														
 
															+
														
 
															+            z = self.heap[right]
														
 
															+            (best, v) = (left, y) if self.isGreaterThan(y, z) else (right, z)
														
 
															+
														
 
															+            if not self.isGreaterThan(v, x):
														
 
															+                break
														
 
															+
														
 
															+            self.heap[pos] = v
														
 
															+            self.heap[best] = x
														
 
															+            pos = best
														
 
															+
														
 
															+
														
 
															+    def replaceMax(self, x):
														
 
															+        if self.heap == []:
														
 
															+            self.heap = [x]
														
 
															+            self.size = 1
														
 
															+            return
														
 
															+        
														
 
															+
														
 
															+        if self.isGreaterThan(x, self.heap[0]):
														
 
															+            return
														
 
															+
														
 
															+        self.heap[0] = x
														
 
															+        pos = 0
														
 
															+        size = len(self.heap)
														
 
															+
														
 
															+        while pos < size:
														
 
															+            (left, right) = self.childPos(pos)
														
 
															+
														
 
															+            if left >= size:
														
 
															+                break
														
 
															+
														
 
															+            y = self.heap[left]
														
 
															+            if right >= size:
														
 
															+                if self.isGreaterThan(y, x):
														
 
															+                    self.heap[pos] = y
														
 
															+                    self.heap[left] = x
														
 
															+                break
														
 
															+
														
 
															+            z = self.heap[right]
														
 
															+            (best, v) = (left, y) if self.isGreaterThan(y, z) else (right, z)
														
 
															+
														
 
															+            if not self.isGreaterThan(v, x):
														
 
															+                break
														
 
															+
														
 
															+            self.heap[pos] = v
														
 
															+            self.heap[best] = x
														
 
															+            pos = best
														
 
															+
														
 
															+    def getMax(self):
														
 
															+        if self.heap == []:
														
 
															+            return self.smalestValue
														
 
															+        return self.heap[0]
														
 
															+
														
 
															+
														
 
															+    def toArray(self, mapFn=None):
														
 
															+        if mapFn is None:
														
 
															+            return self.heap.copy()
														
 
															+        else:
														
 
															+            return [mapFn(x) for x in self.heap]
														
 
															+
														
 
															+
														
 
															+    def length(self):
														
 
															+        return self.size
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+class NNSearch:
														
 
															+    def __init__(self, nebSize=5):
														
 
															+        self.nebSize = nebSize
														
 
															+        self.neighbourhoods = []
														
 
															+
														
 
															+    def fit(self, X, nebSize=None):
														
 
															+        if nebSize == None:
														
 
															+            nebSize = self.nebSize
														
 
															+
														
 
															+        isGreaterThan = lambda x, y: x[1] > y[1]
														
 
															+        self.neighbourhoods = [MaxHeap(nebSize, isGreaterThan, (None, 0.0)) for _i in range(len(X))]
														
 
															+
														
 
															+        for (i, x) in enumerate(X):
														
 
															+            nbh = self.neighbourhoods[i]
														
 
															+            nbh.insert((i, 0.0))
														
 
															+
														
 
															+            for (j, y) in enumerate(X[i+1:]):
														
 
															+                j += i + 1
														
 
															+                d = dist(x,y)
														
 
															+                nbh.insert((j,d))
														
 
															+                self.neighbourhoods[j].insert((i,d))
														
 
															+
														
 
															+            self.neighbourhoods[i] = nbh.toArray(lambda v: v[0])
														
 
															+
														
 
															+
														
 
															+    def neighbourhoodOfItem(self, i):
														
 
															+        return self.neighbourhoods[i]
														
--- a/library/analysis.py
+++ b/library/analysis.py
@@ -179,7 +179,7 @@ def runSpeedTestForConvGan(datasetName, ganGenerator):
 
															     print(f"Total Time: {d[0]}")
														
 
															     print(f"Preparation Time: {d[1]}")
														
 
															     print(f"Test Time: {d[2]}")
														
 
															-    return d
														
 
															+    return d, gan
														
 
															 testSets = [
														
 
															     "folding_abalone_17_vs_7_8_9_10",
														
--- a/library/convGAN.py
+++ b/library/convGAN.py
@@ -221,6 +221,8 @@ class ConvGAN(GanBaseClass):
 
															         ## and a proximal majority batch concatenated
														
 
															         batch_data = Input(shape=(self.n_feat,))
														
 
															+        ##- print(f"GAN: 0..{self.neb}/{self.gen}..")
														
 
															+
														
 
															         ## extract minority batch
														
 
															         min_batch = Lambda(lambda x: x[:self.neb])(batch_data)
														
@@ -233,9 +235,11 @@ class ConvGAN(GanBaseClass):
 
															         ## concatenate the synthetic samples with the majority samples
														
 
															         new_samples = tf.concat([conv_samples, maj_batch],axis=0)
														
 
															+        ##- new_samples = tf.concat([conv_samples, conv_samples, conv_samples, conv_samples],axis=0)
														
 
															         ## pass the concatenated vector into the discriminator to know its decisions
														
 
															         output = discriminator(new_samples)
														
 
															+        ##- output = Lambda(lambda x: x[:2 * self.gen])(output)
														
 
															         ## note that, the discriminator will not be traied but will make decisions based
														
 
															         ## on its previous training while using this function
														
@@ -299,6 +303,7 @@ class ConvGAN(GanBaseClass):
 
															             ## use the GAN to make the generator learn on the decisions
														
 
															             ## made by the previous discriminator training
														
 
															+            ##- print(f"concat sample shape: {concat_sample.shape}/{labels.shape}")
														
 
															             gan_loss_history = GAN.fit(concat_sample, y=labels, verbose=0)
														
 
															             ## store the loss for the step
														
--- a/library/convGAN2.py
+++ b/library/convGAN2.py
@@ -25,6 +25,11 @@ import tensorflow as tf
 
															 from tensorflow.keras.optimizers import Adam
														
 
															 from tensorflow.keras.layers import Lambda
														
 
															+import time
														
 
															+
														
 
															+from library.NNSearch import NNSearch
														
 
															+from library.timing import timing
														
 
															+
														
 
															 import warnings
														
 
															 warnings.filterwarnings("ignore")
														
@@ -55,6 +60,11 @@ class ConvGAN2(GanBaseClass):
 
															         self.conv_sample_generator = None
														
 
															         self.maj_min_discriminator = None
														
 
															         self.cg = None
														
 
															+        self.tNbhFit = 0.0
														
 
															+        self.tNbhSearch = 0.0
														
 
															+        self.nNbhFit = 0
														
 
															+        self.nNbhSearch = 0
														
 
															+        self.timing = { name: timing(name) for name in ["reset", "train", "create points", "NMB", "BMB", "_generate_data_for_min_point","predict"]}
														
 
															         if neb > gen:
														
 
															             raise ValueError(f"Expected neb <= gen but got neb={neb} and gen={gen}.")
														
@@ -63,6 +73,7 @@ class ConvGAN2(GanBaseClass):
 
															         """
														
 
															         Resets the trained GAN to an random state.
														
 
															         """
														
 
															+        self.timing["reset"].start()
														
 
															         self.isTrained = False
														
 
															         ## instanciate generator network and visualize architecture
														
 
															         self.conv_sample_generator = self._conv_sample_gen()
														
@@ -72,6 +83,7 @@ class ConvGAN2(GanBaseClass):
 
															         ## instanciate network and visualize architecture
														
 
															         self.cg = self._convGAN(self.conv_sample_generator, self.maj_min_discriminator)
														
 
															+        self.timing["reset"].stop()
														
 
															         if self.debug:
														
 
															             print(self.conv_sample_generator.summary())
														
@@ -92,12 +104,14 @@ class ConvGAN2(GanBaseClass):
 
															         *dataSet* is a instance of /library.dataset.DataSet/. It contains the training dataset.
														
 
															         We are only interested in the first *maxListSize* points in class 1.
														
 
															         """
														
 
															+        self.timing["train"].start()
														
 
															         if dataSet.data1.shape[0] <= 0:
														
 
															             raise AttributeError("Train: Expected data class 1 to contain at least one point.")
														
 
															         self.dataSet = dataSet
														
 
															         self._rough_learning(dataSet.data1, dataSet.data0)
														
 
															         self.isTrained = True
														
 
															+        self.timing["train"].stop()
														
 
															     def generateDataPoint(self):
														
 
															         """
														
@@ -112,6 +126,7 @@ class ConvGAN2(GanBaseClass):
 
															         *numOfSamples* is a integer > 0. It gives the number of new generated samples.
														
 
															         """
														
 
															+        self.timing["create points"].start()
														
 
															         if not self.isTrained:
														
 
															             raise ValueError("Try to generate data with untrained Re.")
														
@@ -122,12 +137,14 @@ class ConvGAN2(GanBaseClass):
 
															         ## generate synth_num synthetic samples from each minority neighbourhood
														
 
															         synth_set=[]
														
 
															+        nmb = self._NMB_prepare(data_min)
														
 
															         for i in range(len(data_min)):
														
 
															-            synth_set.extend(self._generate_data_for_min_point(data_min, i, synth_num))
														
 
															+            synth_set.extend(self._generate_data_for_min_point(nmb, i, synth_num))
														
 
															-        synth_set = synth_set[:numOfSamples] ## extract the exact number of synthetic samples needed to exactly balance the two classes
														
 
															+        synth_set = np.array(synth_set[:numOfSamples]) ## extract the exact number of synthetic samples needed to exactly balance the two classes
														
 
															+        self.timing["create points"].stop()
														
 
															-        return np.array(synth_set)
														
 
															+        return synth_set
														
 
															     # ###############################################################
														
 
															     # Hidden internal functions
														
@@ -249,7 +266,7 @@ class ConvGAN2(GanBaseClass):
 
															         return model
														
 
															     # Create synthetic points
														
 
															-    def _generate_data_for_min_point(self, data_min, index, synth_num):
														
 
															+    def _generate_data_for_min_point(self, nmb, index, synth_num):
														
 
															         """
														
 
															         generate synth_num synthetic points for a particular minoity sample
														
 
															         synth_num -> required number of data points that can be generated from a neighbourhood
														
@@ -258,13 +275,19 @@ class ConvGAN2(GanBaseClass):
 
															         index -> index of the minority sample in a training data whose neighbourhood we want to obtain
														
 
															         """
														
 
															+        self.timing["_generate_data_for_min_point"].start()
														
 
															         runs = int(synth_num / self.neb) + 1
														
 
															         synth_set = []
														
 
															         for _run in range(runs):
														
 
															-            batch = self._NMB_guided(data_min, index)
														
 
															+            batch = self._NMB_guided(nmb, index)
														
 
															+            self.timing["predict"].start()
														
 
															             synth_batch = self.conv_sample_generator.predict(batch)
														
 
															-            for x in synth_batch:
														
 
															-                synth_set.append(x)
														
 
															+            self.timing["predict"].stop()
														
 
															+            synth_set.extend(synth_batch)
														
 
															+            #for x in synth_batch:
														
 
															+            #    synth_set.append(x)
														
 
															+
														
 
															+        self.timing["_generate_data_for_min_point"].stop()
														
 
															         return synth_set[:synth_num]
														
@@ -281,9 +304,10 @@ class ConvGAN2(GanBaseClass):
 
															         labels = tf.convert_to_tensor(create01Labels(2 * self.gen, self.gen))
														
 
															+        nmb = self._NMB_prepare(data_min)
														
 
															         for step in range(self.neb_epochs * len(data_min)):
														
 
															             ## generate minority neighbourhood batch for every minority class sampls by index
														
 
															-            min_batch = self._NMB_guided(data_min, min_idx)
														
 
															+            min_batch = self._NMB_guided(nmb, min_idx)
														
 
															             min_idx = min_idx + 1
														
 
															             ## generate random proximal majority batch
														
 
															             maj_batch = self._BMB(data_min, data_maj)
														
@@ -345,20 +369,26 @@ class ConvGAN2(GanBaseClass):
 
															         ## neb -> oversampling neighbourhood
														
 
															         ## gen -> convex combinations generated from each neighbourhood
														
 
															-        #neigh = NearestNeighbors(self.neb)
														
 
															-        #neigh.fit(data_maj)
														
 
															-        # bmbi = [
														
 
															-        #     neigh.kneighbors([data_min[i]], self.neb, return_distance=False)
														
 
															-        #     for i in range(len(data_min))
														
 
															-        #     ]
														
 
															-        # bmbi = np.unique(np.array(bmbi).flatten())
														
 
															-        # bmbi = shuffle(bmbi)
														
 
															-        return tf.convert_to_tensor(
														
 
															+        self.timing["BMB"].start()
														
 
															+        result = tf.convert_to_tensor(
														
 
															             data_maj[np.random.randint(len(data_maj), size=self.gen)]
														
 
															             )
														
 
															+        self.timing["BMB"].stop()
														
 
															+        return result
														
 
															+
														
 
															+    def _NMB_prepare(self, data_min):
														
 
															+        self.timing["NMB"].start()
														
 
															+        t = time.time()
														
 
															+        neigh = NNSearch(self.neb)
														
 
															+        #neigh = NearestNeighbors(self.neb)
														
 
															+        neigh.fit(data_min)
														
 
															+        self.tNbhFit += (time.time() - t)
														
 
															+        self.nNbhFit += 1
														
 
															+        self.timing["NMB"].stop()
														
 
															+        return (data_min, neigh)
														
 
															-    def _NMB_guided(self, data_min, index):
														
 
															+    def _NMB_guided(self, nmb, index):
														
 
															         ## generate a minority neighbourhood batch for a particular minority sample
														
 
															         ## we need this for minority data generation
														
@@ -366,13 +396,18 @@ class ConvGAN2(GanBaseClass):
 
															         ## index -> index of the minority sample in a training data whose neighbourhood we want to obtain
														
 
															         ## data_min -> minority class data
														
 
															         ## neb -> oversampling neighbourhood
														
 
															-
														
 
															-        neigh = NearestNeighbors(self.neb)
														
 
															-        neigh.fit(data_min)
														
 
															-        nmbi = neigh.kneighbors([data_min[index]], self.neb, return_distance=False)
														
 
															+        self.timing["NMB"].start()
														
 
															+        (data_min, neigh) = nmb
														
 
															+
														
 
															+        t = time.time()
														
 
															+        #nmbi = neigh.kneighbors([data_min[index]], self.neb, return_distance=False)
														
 
															+        nmbi = np.array([neigh.neighbourhoodOfItem(index)])
														
 
															+        self.tNbhSearch += (time.time() - t)
														
 
															+        self.nNbhSearch += 1
														
 
															         nmbi = shuffle(nmbi)
														
 
															         nmb = data_min[nmbi]
														
 
															         nmb = tf.convert_to_tensor(nmb[0])
														
 
															+        self.timing["NMB"].stop()
														
 
															         return nmb
														
--- a/library/timing.py
+++ b/library/timing.py
@@ -0,0 +1,23 @@
 
															+
														
 
															+
														
 
															+import time
														
 
															+
														
 
															+
														
 
															+class timing:
														
 
															+    def __init__(self, name="?"):
														
 
															+        self.name = name
														
 
															+        self.duration = 0.0
														
 
															+        self.startTime = None
														
 
															+        self.runCount = 0
														
 
															+
														
 
															+    def start(self):
														
 
															+        self.startTime = time.time()
														
 
															+
														
 
															+    def stop(self):
														
 
															+        if self.startTime is not None:
														
 
															+            self.duration += time.time() - self.startTime
														
 
															+            self.runCount += 1
														
 
															+        self.startTime = None
														
 
															+
														
 
															+    def __str__(self):
														
 
															+        return f"{self.name}: #{self.runCount} {self.duration:.4f}s"