|
|
@@ -28,6 +28,16 @@ from tensorflow.keras.layers import Lambda
|
|
|
import warnings
|
|
|
warnings.filterwarnings("ignore")
|
|
|
|
|
|
+
|
|
|
+
|
|
|
+def repeat(x, times):
|
|
|
+ return [x for _i in range(times)]
|
|
|
+
|
|
|
+def create01Labels(totalSize, sizeFirstHalf):
|
|
|
+ labels = repeat(np.array([1,0]), sizeFirstHalf)
|
|
|
+    labels.extend(repeat(np.array([0,1]), totalSize - sizeFirstHalf))
|
|
|
+ return np.array(labels)
|
|
|
+
|
|
|
class ConvGAN(GanBaseClass):
|
|
|
"""
|
|
|
This is a toy example of a GAN.
|
|
|
@@ -247,19 +257,11 @@ class ConvGAN(GanBaseClass):
|
|
|
generator = self.conv_sample_generator
|
|
|
discriminator = self.maj_min_discriminator
|
|
|
GAN = self.cg
|
|
|
- loss_history=[] ## this is for stroring the loss for every run
|
|
|
+    loss_history = [] ## this is for storing the loss for every run
|
|
|
min_idx = 0
|
|
|
neb_epoch_count = 1
|
|
|
|
|
|
- labels = []
|
|
|
- for i in range(2 * self.gen):
|
|
|
- if i < self.gen:
|
|
|
- labels.append(np.array([1,0]))
|
|
|
- else:
|
|
|
- labels.append(np.array([0,1]))
|
|
|
- labels = np.array(labels)
|
|
|
- labels = tf.convert_to_tensor(labels)
|
|
|
-
|
|
|
+ labels = tf.convert_to_tensor(create01Labels(2 * self.gen, self.gen))
|
|
|
|
|
|
for step in range(neb_epochs * len(data_min)):
|
|
|
## generate minority neighbourhood batch for every minority class sampls by index
|
|
|
@@ -365,18 +367,20 @@ class ConvGAN(GanBaseClass):
|
|
|
|
|
|
|
|
|
def rough_learning_predictions(discriminator,test_data_numpy,test_labels_numpy):
|
|
|
-
|
|
|
- ## after the first phase of training the discriminator can be used for classification
|
|
|
- ## it already learns to differentiate the convex minority points with majority points during the first training phase
|
|
|
- y_pred_2d=discriminator.predict(tf.convert_to_tensor(test_data_numpy))
|
|
|
+ """
|
|
|
+ after the first phase of training the discriminator can be used for classification
|
|
|
+    it already learns to differentiate the convex minority points from majority points
|
|
|
+ during the first training phase
|
|
|
+ """
|
|
|
+ y_pred_2d = discriminator.predict(tf.convert_to_tensor(test_data_numpy))
|
|
|
## discretisation of the labels
|
|
|
- y_pred=np.digitize(y_pred_2d[:,0], [.5])
|
|
|
+ y_pred = np.digitize(y_pred_2d[:,0], [.5])
|
|
|
## prediction shows a model with good recall and less precision
|
|
|
- c=confusion_matrix(test_labels_numpy, y_pred)
|
|
|
- f=f1_score(test_labels_numpy, y_pred)
|
|
|
- pr=precision_score(test_labels_numpy, y_pred)
|
|
|
- rc=recall_score(test_labels_numpy, y_pred)
|
|
|
- k=cohen_kappa_score(test_labels_numpy, y_pred)
|
|
|
+ c = confusion_matrix(test_labels_numpy, y_pred)
|
|
|
+ f = f1_score(test_labels_numpy, y_pred)
|
|
|
+ pr = precision_score(test_labels_numpy, y_pred)
|
|
|
+ rc = recall_score(test_labels_numpy, y_pred)
|
|
|
+ k = cohen_kappa_score(test_labels_numpy, y_pred)
|
|
|
print('Rough learning confusion matrix:', c)
|
|
|
print('Rough learning f1 score', f)
|
|
|
print('Rough learning precision score', pr)
|
|
|
@@ -386,38 +390,35 @@ def rough_learning_predictions(discriminator,test_data_numpy,test_labels_numpy):
|
|
|
|
|
|
|
|
|
|
|
|
-
|
|
|
def generate_synthetic_data(gan, data_min, data_maj):
|
|
|
- ## roughly claculate the upper bound of the synthetic samples to be generated from each neighbourhood
|
|
|
- synth_num=((len(data_maj)-len(data_min))//len(data_min))+1
|
|
|
+    ## roughly calculate the upper bound of the synthetic samples
|
|
|
+ ## to be generated from each neighbourhood
|
|
|
+ synth_num = ((len(data_maj) - len(data_min)) // len(data_min)) + 1
|
|
|
|
|
|
## generate synth_num synthetic samples from each minority neighbourhood
|
|
|
synth_set = gan.generateData(synth_num)
|
|
|
|
|
|
- ovs_min_class=np.concatenate((data_min,synth_set),axis=0)
|
|
|
- ovs_training_dataset=np.concatenate((ovs_min_class,data_maj),axis=0)
|
|
|
- ovs_pca_labels=np.concatenate((np.zeros(len(data_min)),np.zeros(len(synth_set))+1,np.zeros(len(data_maj))+2))
|
|
|
- # TODO ovs_training_labels=np.concatenate((np.zeros(len(ovs_min_class))+1,np.zeros(len(data_maj))+0))
|
|
|
- ovs_training_labels_oh=[]
|
|
|
- for i in range(len(ovs_training_dataset)):
|
|
|
- if i<len(ovs_min_class):
|
|
|
- ovs_training_labels_oh.append(np.array([1,0]))
|
|
|
- else:
|
|
|
- ovs_training_labels_oh.append(np.array([0,1]))
|
|
|
- ovs_training_labels_oh=np.array(ovs_training_labels_oh)
|
|
|
- ovs_training_labels_oh=tf.convert_to_tensor(ovs_training_labels_oh)
|
|
|
-
|
|
|
+ ovs_min_class = np.concatenate((data_min,synth_set), axis=0)
|
|
|
+ ovs_training_dataset = np.concatenate((ovs_min_class,data_maj), axis=0)
|
|
|
+ ovs_pca_labels = np.concatenate((
|
|
|
+ np.zeros(len(data_min)),
|
|
|
+ np.zeros(len(synth_set)) + 1,
|
|
|
+ np.zeros(len(data_maj)) + 2
|
|
|
+ ))
|
|
|
+
|
|
|
+ ovs_training_labels_oh = create01Labels(len(ovs_training_dataset), len(ovs_min_class))
|
|
|
+ ovs_training_labels_oh = tf.convert_to_tensor(ovs_training_labels_oh)
|
|
|
|
|
|
## PCA visualization of the synthetic sata
|
|
|
- ## observe how the minority samples from convex space have optimal variance and avoids overlap with the majority
|
|
|
+ ## observe how the minority samples from convex space have optimal variance
|
|
|
+    ## and avoid overlap with the majority
|
|
|
pca = PCA(n_components=2)
|
|
|
pca.fit(ovs_training_dataset)
|
|
|
- data_pca= pca.transform(ovs_training_dataset)
|
|
|
+ data_pca = pca.transform(ovs_training_dataset)
|
|
|
|
|
|
## plot PCA
|
|
|
plt.rcParams["figure.figsize"] = (12,12)
|
|
|
|
|
|
- # TODO colors=['r', 'b', 'g']
|
|
|
plt.xticks(fontsize=20)
|
|
|
plt.yticks(fontsize=20)
|
|
|
plt.xlabel('PCA1',fontsize=25)
|
|
|
@@ -439,11 +440,11 @@ def final_learning(discriminator, ovs_training_dataset, ovs_training_labels_oh,
|
|
|
print('\n')
|
|
|
## second phase training of the discriminator with balanced data
|
|
|
|
|
|
- history_second_learning=discriminator.fit(x=ovs_training_dataset,y=ovs_training_labels_oh, batch_size=20, epochs=num_epochs)
|
|
|
+ history_second_learning = discriminator.fit(x=ovs_training_dataset, y=ovs_training_labels_oh, batch_size=20, epochs=num_epochs)
|
|
|
|
|
|
## loss of the second phase learning smoothly decreses
|
|
|
## this is because now the data is fixed and diverse convex combinations are no longer fed into the discriminator at every training step
|
|
|
- run_range=range(1,num_epochs+1)
|
|
|
+ run_range = range(1, num_epochs + 1)
|
|
|
plt.rcParams["figure.figsize"] = (16,10)
|
|
|
plt.xticks(fontsize=20)
|
|
|
plt.yticks(fontsize=20)
|
|
|
@@ -458,22 +459,22 @@ def final_learning(discriminator, ovs_training_dataset, ovs_training_labels_oh,
|
|
|
## the recall decreases but the precision improves
|
|
|
print('\n')
|
|
|
|
|
|
- y_pred_2d=discriminator.predict(tf.convert_to_tensor(test_data_numpy))
|
|
|
- y_pred=np.digitize(y_pred_2d[:,0], [.5])
|
|
|
- c=confusion_matrix(test_labels_numpy, y_pred)
|
|
|
- f=f1_score(test_labels_numpy, y_pred)
|
|
|
- pr=precision_score(test_labels_numpy, y_pred)
|
|
|
- rc=recall_score(test_labels_numpy, y_pred)
|
|
|
- k=cohen_kappa_score(test_labels_numpy, y_pred)
|
|
|
+ y_pred_2d = discriminator.predict(tf.convert_to_tensor(test_data_numpy))
|
|
|
+ y_pred = np.digitize(y_pred_2d[:,0], [.5])
|
|
|
+ c = confusion_matrix(test_labels_numpy, y_pred)
|
|
|
+ f = f1_score(test_labels_numpy, y_pred)
|
|
|
+ pr = precision_score(test_labels_numpy, y_pred)
|
|
|
+ rc = recall_score(test_labels_numpy, y_pred)
|
|
|
+ k = cohen_kappa_score(test_labels_numpy, y_pred)
|
|
|
print('Final learning confusion matrix:', c)
|
|
|
print('Final learning f1 score', f)
|
|
|
print('Final learning precision score', pr)
|
|
|
print('Final learning recall score', rc)
|
|
|
print('Final learning kappa score', k)
|
|
|
- return c,f,pr,rc,k
|
|
|
+ return c, f, pr, rc, k
|
|
|
|
|
|
|
|
|
-def convGAN_train_end_to_end(training_data,training_labels,test_data,test_labels, neb, gen, neb_epochs,epochs_retrain_disc):
|
|
|
+def convGAN_train_end_to_end(training_data, training_labels, test_data, test_labels, neb, gen, neb_epochs, epochs_retrain_disc):
|
|
|
|
|
|
##minority class
|
|
|
data_min=training_data[np.where(training_labels == 1)[0]]
|
|
|
@@ -516,7 +517,7 @@ def convGAN_train_end_to_end(training_data,training_labels,test_data,test_labels
|
|
|
print('\n')
|
|
|
|
|
|
## final training results
|
|
|
- c,f,pr,rc,k=final_learning(gan.maj_min_discriminator, ovs_training_dataset, ovs_training_labels_oh, test_data, test_labels, epochs_retrain_disc)
|
|
|
+ c,f,pr,rc,k = final_learning(gan.maj_min_discriminator, ovs_training_dataset, ovs_training_labels_oh, test_data, test_labels, epochs_retrain_disc)
|
|
|
|
|
|
return ((c_r,f_r,pr_r,rc_r,k_r),(c,f,pr,rc,k))
|
|
|
|
|
|
@@ -551,8 +552,6 @@ def runTest():
|
|
|
neb=gen=5 ##neb=gen required
|
|
|
neb_epochs=10
|
|
|
epochs_retrain_disc=50
|
|
|
- # TODO n_feat=len(features_x[1]) ## number of features
|
|
|
-
|
|
|
|
|
|
## Training
|
|
|
np.random.seed(42)
|