Kristian Schultz 4 rokov pred
rodič
commit
31f274beed
1 zmenil súbory, kde vykonal 52 pridanie a 53 odobranie
  1. 52 53
      library/convGAN.py

+ 52 - 53
library/convGAN.py

@@ -28,6 +28,16 @@ from tensorflow.keras.layers import Lambda
 import warnings
 warnings.filterwarnings("ignore")
 
+
+
def repeat(x, times):
    """Return a list containing *x* repeated *times* times.

    Note: the list holds *times* references to the same object *x*,
    matching the behavior of the original comprehension.
    """
    return [x] * times
+
def create01Labels(totalSize, sizeFirstHalf):
    """Build a one-hot label array of *totalSize* rows.

    The first *sizeFirstHalf* rows are ``[1, 0]`` (minority/positive class)
    and the remaining ``totalSize - sizeFirstHalf`` rows are ``[0, 1]``
    (majority/negative class), matching the hand-written loops this
    helper replaces.

    Parameters
    ----------
    totalSize : int
        Total number of label rows to produce.
    sizeFirstHalf : int
        Number of leading ``[1, 0]`` rows; must satisfy
        ``0 <= sizeFirstHalf <= totalSize``.

    Returns
    -------
    np.ndarray of shape (totalSize, 2)
    """
    # BUG FIX: the second segment must contain totalSize - sizeFirstHalf
    # rows, not totalSize rows — otherwise the result has
    # sizeFirstHalf + totalSize labels and no longer lines up with the
    # data it is paired with at the call sites.
    labels = [np.array([1, 0])] * sizeFirstHalf
    labels += [np.array([0, 1])] * (totalSize - sizeFirstHalf)
    return np.array(labels)
+
 class ConvGAN(GanBaseClass):
     """
     This is a toy example of a GAN.
@@ -247,19 +257,11 @@ class ConvGAN(GanBaseClass):
         generator = self.conv_sample_generator
         discriminator = self.maj_min_discriminator
         GAN = self.cg
-        loss_history=[] ## this is for stroring the loss for every run
+        loss_history = [] ## this is for storing the loss for every run
         min_idx = 0
         neb_epoch_count = 1
 
-        labels = []
-        for i in range(2 * self.gen):
-            if i < self.gen:
-                labels.append(np.array([1,0]))
-            else:
-                labels.append(np.array([0,1]))
-        labels = np.array(labels)
-        labels = tf.convert_to_tensor(labels)
-
+        labels = tf.convert_to_tensor(create01Labels(2 * self.gen, self.gen))
 
         for step in range(neb_epochs * len(data_min)):
         ## generate minority neighbourhood batch for every minority class sample by index
@@ -365,18 +367,20 @@ class ConvGAN(GanBaseClass):
 
 
 def rough_learning_predictions(discriminator,test_data_numpy,test_labels_numpy):
-
-    ## after the first phase of training the discriminator can be used for classification
-    ## it already learns to differentiate the convex minority points with majority points during the first training phase
-    y_pred_2d=discriminator.predict(tf.convert_to_tensor(test_data_numpy))
+    """
+    after the first phase of training the discriminator can be used for classification
+    it already learns to differentiate the convex minority points with majority points
+    during the first training phase
+    """
+    y_pred_2d = discriminator.predict(tf.convert_to_tensor(test_data_numpy))
     ## discretisation of the labels
-    y_pred=np.digitize(y_pred_2d[:,0], [.5])
+    y_pred = np.digitize(y_pred_2d[:,0], [.5])
     ## prediction shows a model with good recall and less precision
-    c=confusion_matrix(test_labels_numpy, y_pred)
-    f=f1_score(test_labels_numpy, y_pred)
-    pr=precision_score(test_labels_numpy, y_pred)
-    rc=recall_score(test_labels_numpy, y_pred)
-    k=cohen_kappa_score(test_labels_numpy, y_pred)
+    c = confusion_matrix(test_labels_numpy, y_pred)
+    f = f1_score(test_labels_numpy, y_pred)
+    pr = precision_score(test_labels_numpy, y_pred)
+    rc = recall_score(test_labels_numpy, y_pred)
+    k = cohen_kappa_score(test_labels_numpy, y_pred)
     print('Rough learning confusion matrix:', c)
     print('Rough learning f1 score', f)
     print('Rough learning precision score', pr)
@@ -386,38 +390,35 @@ def rough_learning_predictions(discriminator,test_data_numpy,test_labels_numpy):
 
 
 
-
 def generate_synthetic_data(gan, data_min, data_maj):
-    ## roughly claculate the upper bound of the synthetic samples to be generated from each neighbourhood
-    synth_num=((len(data_maj)-len(data_min))//len(data_min))+1
+    ## roughly calculate the upper bound of the synthetic samples
+    ## to be generated from each neighbourhood
+    synth_num = ((len(data_maj) - len(data_min)) // len(data_min)) + 1
 
     ## generate synth_num synthetic samples from each minority neighbourhood
     synth_set = gan.generateData(synth_num)
 
-    ovs_min_class=np.concatenate((data_min,synth_set),axis=0)
-    ovs_training_dataset=np.concatenate((ovs_min_class,data_maj),axis=0)
-    ovs_pca_labels=np.concatenate((np.zeros(len(data_min)),np.zeros(len(synth_set))+1,np.zeros(len(data_maj))+2))
-    # TODO ovs_training_labels=np.concatenate((np.zeros(len(ovs_min_class))+1,np.zeros(len(data_maj))+0))
-    ovs_training_labels_oh=[]
-    for i in range(len(ovs_training_dataset)):
-        if i<len(ovs_min_class):
-            ovs_training_labels_oh.append(np.array([1,0]))
-        else:
-            ovs_training_labels_oh.append(np.array([0,1]))
-    ovs_training_labels_oh=np.array(ovs_training_labels_oh)
-    ovs_training_labels_oh=tf.convert_to_tensor(ovs_training_labels_oh)
-
+    ovs_min_class = np.concatenate((data_min,synth_set), axis=0)
+    ovs_training_dataset = np.concatenate((ovs_min_class,data_maj), axis=0)
+    ovs_pca_labels = np.concatenate((
+        np.zeros(len(data_min)),
+        np.zeros(len(synth_set)) + 1,
+        np.zeros(len(data_maj)) + 2
+        ))
+    
+    ovs_training_labels_oh = create01Labels(len(ovs_training_dataset), len(ovs_min_class))
+    ovs_training_labels_oh = tf.convert_to_tensor(ovs_training_labels_oh)
 
     ## PCA visualization of the synthetic data
-    ## observe how the minority samples from convex space have optimal variance and avoids overlap with the majority
+    ## observe how the minority samples from convex space have optimal variance
+    ## and avoids overlap with the majority
     pca = PCA(n_components=2)
     pca.fit(ovs_training_dataset)
-    data_pca= pca.transform(ovs_training_dataset)
+    data_pca = pca.transform(ovs_training_dataset)
 
     ## plot PCA
     plt.rcParams["figure.figsize"] = (12,12)
 
-    # TODO colors=['r', 'b', 'g']
     plt.xticks(fontsize=20)
     plt.yticks(fontsize=20)
     plt.xlabel('PCA1',fontsize=25)
@@ -439,11 +440,11 @@ def final_learning(discriminator, ovs_training_dataset, ovs_training_labels_oh,
     print('\n')
     ## second phase training of the discriminator with balanced data
 
-    history_second_learning=discriminator.fit(x=ovs_training_dataset,y=ovs_training_labels_oh, batch_size=20, epochs=num_epochs)
+    history_second_learning = discriminator.fit(x=ovs_training_dataset, y=ovs_training_labels_oh, batch_size=20, epochs=num_epochs)
 
     ## loss of the second phase learning smoothly decreases
     ## this is because now the data is fixed and diverse convex combinations are no longer fed into the discriminator at every training step
-    run_range=range(1,num_epochs+1)
+    run_range = range(1, num_epochs + 1)
     plt.rcParams["figure.figsize"] = (16,10)
     plt.xticks(fontsize=20)
     plt.yticks(fontsize=20)
@@ -458,22 +459,22 @@ def final_learning(discriminator, ovs_training_dataset, ovs_training_labels_oh,
     ## the recall decreases but the precision improves
     print('\n')
 
-    y_pred_2d=discriminator.predict(tf.convert_to_tensor(test_data_numpy))
-    y_pred=np.digitize(y_pred_2d[:,0], [.5])
-    c=confusion_matrix(test_labels_numpy, y_pred)
-    f=f1_score(test_labels_numpy, y_pred)
-    pr=precision_score(test_labels_numpy, y_pred)
-    rc=recall_score(test_labels_numpy, y_pred)
-    k=cohen_kappa_score(test_labels_numpy, y_pred)
+    y_pred_2d = discriminator.predict(tf.convert_to_tensor(test_data_numpy))
+    y_pred = np.digitize(y_pred_2d[:,0], [.5])
+    c = confusion_matrix(test_labels_numpy, y_pred)
+    f = f1_score(test_labels_numpy, y_pred)
+    pr = precision_score(test_labels_numpy, y_pred)
+    rc = recall_score(test_labels_numpy, y_pred)
+    k = cohen_kappa_score(test_labels_numpy, y_pred)
     print('Final learning confusion matrix:', c)
     print('Final learning f1 score', f)
     print('Final learning precision score', pr)
     print('Final learning recall score', rc)
     print('Final learning kappa score', k)
-    return c,f,pr,rc,k
+    return c, f, pr, rc, k
 
 
-def convGAN_train_end_to_end(training_data,training_labels,test_data,test_labels, neb, gen, neb_epochs,epochs_retrain_disc):
+def convGAN_train_end_to_end(training_data, training_labels, test_data, test_labels, neb, gen, neb_epochs, epochs_retrain_disc):
 
     ##minority class
     data_min=training_data[np.where(training_labels == 1)[0]]
@@ -516,7 +517,7 @@ def convGAN_train_end_to_end(training_data,training_labels,test_data,test_labels
     print('\n')
 
     ## final training results
-    c,f,pr,rc,k=final_learning(gan.maj_min_discriminator, ovs_training_dataset, ovs_training_labels_oh, test_data, test_labels, epochs_retrain_disc)
+    c,f,pr,rc,k = final_learning(gan.maj_min_discriminator, ovs_training_dataset, ovs_training_labels_oh, test_data, test_labels, epochs_retrain_disc)
 
     return ((c_r,f_r,pr_r,rc_r,k_r),(c,f,pr,rc,k))
 
@@ -551,8 +552,6 @@ def runTest():
     neb=gen=5 ##neb=gen required
     neb_epochs=10
     epochs_retrain_disc=50
-    # TODO n_feat=len(features_x[1]) ## number of features
-
 
     ## Training
     np.random.seed(42)