Selaa lähdekoodia

added visualization

Kristian Schultz 3 vuotta sitten
vanhempi
commit
d345bd09b0
5 muutettua tiedostoa jossa 10425 lisäystä ja 10152 poistoa
  1. 10132 10131
      agglo_5dim_2NN_v3.ipynb
  2. 22 0
      fdc/clustering.py
  3. 6 21
      fdc/fdc.py
  4. 111 0
      fdc/hypothesisTesting.py
  5. 154 0
      fdc/visualize.py

Tiedoston diff-näkymää rajattu, sillä se on liian suuri
+ 10132 - 10131
agglo_5dim_2NN_v3.ipynb


+ 22 - 0
fdc/clustering.py

@@ -0,0 +1,22 @@
+import numpy as np
+from sklearn.cluster import AgglomerativeClustering
+from fdc.visualize import plotCluster
+
def aglo_clustering(number_of_clusters, affinity, linkage
                    , five_d_embedding, two_d_embedding
                    , visual=False):
    """Cluster the 5-D embedding agglomeratively and attach labels to the 2-D one.

    Parameters
    ----------
    number_of_clusters : int
        Target number of clusters (``n_clusters`` of AgglomerativeClustering).
    affinity : str or callable
        Distance metric, forwarded to sklearn unchanged.
    linkage : str
        Linkage criterion ('ward', 'complete', 'average' or 'single').
    five_d_embedding : array-like
        The data that is actually clustered.
    two_d_embedding : pandas.DataFrame
        2-D embedding used for visualisation. NOTE: mutated in place —
        a 'Cluster' column holding the labels is added.
    visual : bool, optional
        When True, scatter-plot the 2-D embedding coloured by cluster.

    Returns
    -------
    tuple[list, numpy.ndarray]
        The cluster label per sample and the per-cluster sample counts.
    """
    # Seed the global NumPy RNG for reproducibility of any downstream
    # randomness (AgglomerativeClustering itself is deterministic).
    np.random.seed(42)
    ag_cluster = AgglomerativeClustering(
        n_clusters=number_of_clusters
        , affinity=affinity
        , linkage=linkage
        )
    clusters = ag_cluster.fit_predict(five_d_embedding)
    # Only the counts are used; the unique label values themselves are not.
    _, counts = np.unique(clusters, return_counts=True)
    two_d_embedding['Cluster'] = clusters

    if visual:
        plotCluster(two_d_embedding, clusterName="Cluster", xName="UMAP_0", yName="UMAP_1", stroke=3)

    return two_d_embedding.Cluster.to_list(), counts

+ 6 - 21
fdc/fdc.py

@@ -5,8 +5,7 @@ import umap.umap_ as umap
 
 # --[ Known to be used but can we avoid it? ]----
 import pandas as pd
-import seaborn as sns
-import matplotlib.pyplot as plt
+from fdc.visualize import plotMapping
 
 
 def value(v, defaultValue):
@@ -15,20 +14,6 @@ def value(v, defaultValue):
     else:
         return v
 
-def draw2dMapping(data):
-    colors_set1 = ["lightcoral", "lightseagreen", "mediumorchid", "orange", "burlywood", "cornflowerblue", "plum", "yellowgreen"]
-    customPalette_set1 = sns.set_palette(sns.color_palette(colors_set1))
-
-    sns.lmplot(x="UMAP_0"
-        , y="UMAP_1"
-        , data=data
-        , fit_reg=False
-        , legend=False
-        , scatter_kws={"s": 3}
-        , palette=customPalette_set1)
-    plt.show()
-
-
 
 def feature_clustering(UMAP_neb, min_dist_UMAP, metric, data, visual=False):
     data_embedded = Clustering(metric, UMAP_neb, min_dist_UMAP).fit(data)
@@ -36,14 +21,14 @@ def feature_clustering(UMAP_neb, min_dist_UMAP, metric, data, visual=False):
     result = pd.DataFrame(data=data_embedded, columns=['UMAP_0', 'UMAP_1'])
     
     if visual:
-        draw2dMapping(result)
+        plotMapping(result)
 
     return result
 
 
 
@jit(nopython=True)
def canberra_modified(a,b):
    # Modified Canberra-style distance between vectors a and b,
    # numba-compiled (nopython mode) for speed.
    #
    # For each coordinate let d = |a_i - b_i|.  The loop variable is
    # x = d + 1 (always >= 1), so |1 - x| = d and 1 + |x| = d + 2:
    # each term equals d / (d + 2), which lies in [0, 1).  The result
    # is the square root of the summed terms.  Unlike plain Canberra
    # this is bounded per coordinate and well-defined when
    # a_i == b_i == 0.
    return np.sqrt(np.sum(np.array([np.abs(1.0 - x) / (1.0 + np.abs(x)) for x in (np.abs(a-b) + 1.0)])))
 
 
@@ -78,7 +63,7 @@ class FDC:
     def __init__(self, clustering_cont=None, clustering_ord=None, clustering_nom=None, drop_nominal=True, visual=False, with_2d_embedding=False, use_pandas_output=False):
         # used clusterings
         self.clustering_cont = clustering_cont or Clustering('euclidian', 30, 0.1)
-        self.clustering_ord = clustering_ord or Clustering(modified_can, 30, 0.1)
+        self.clustering_ord = clustering_ord or Clustering(canberra_modified, 30, 0.1)
         self.clustering_nom = clustering_nom or Clustering('hamming', 30, 0.1)
 
         # Control of data output
@@ -158,9 +143,9 @@ class FDC:
             # Show mapping if needed
             if visual:
                 if self.use_pandas_output:
-                    draw2dMapping(result_reduced)
+                    plotMapping(result_reduced)
                 else:
-                    draw2dMapping(pd.DataFrame(data=result_reduced, columns=['UMAP_0', 'UMAP_1']))
+                    plotMapping(pd.DataFrame(data=result_reduced, columns=['UMAP_0', 'UMAP_1']))
 
 
         # Return the results

+ 111 - 0
fdc/hypothesisTesting.py

@@ -0,0 +1,111 @@
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
from scipy.stats import ranksums
+
+
+
+
class MidpointNormalize(mpl.colors.Normalize):
    """Colormap normalisation that pins a chosen midpoint to 0.5.

    Values in [vmin, vmax] are mapped piecewise-linearly so that
    ``midpoint`` always lands in the centre of the colormap — useful for
    diverging colormaps such as the p-value heatmaps drawn by
    ``AnalysisIntern.p_map``.
    """

    def __init__(self, vmin, vmax, midpoint=0, clip=False):
        self.midpoint = midpoint
        super().__init__(vmin, vmax, clip)

    def __call__(self, value, clip=None):
        # The endpoints are squeezed towards 0.5 in proportion to how
        # asymmetric the interval [vmin, midpoint, vmax] is.
        lo = max(0, 0.5 * (1 - abs((self.midpoint - self.vmin) / (self.midpoint - self.vmax))))
        hi = min(1, 0.5 * (1 + abs((self.vmax - self.midpoint) / (self.midpoint - self.vmin))))
        anchors_in = [self.vmin, self.midpoint, self.vmax]
        anchors_out = [lo, 0.5, hi]
        return np.ma.masked_array(np.interp(value, anchors_in, anchors_out))
+
+
+
class AnalysisIntern:
    """Pairwise statistical comparison of clusters for single features.

    Holds a list of per-cluster DataFrames and computes square matrices
    of pairwise p-values (cluster vs. cluster) that can be rendered as
    annotated heatmaps.
    """

    def __init__(self, cluster_df_list, var_type="cont"):
        # var_type selects the test: 'cont' -> two-sample t-test,
        # anything else ('ord'/'nom') -> Wilcoxon rank-sum test.
        self.var_type = var_type
        self.cluster_df_list = cluster_df_list

    def p_val(self, clustera, clusterb, feature):
        """Return the p-value comparing `feature` between two clusters."""
        if self.var_type == 'cont':
            return stats.ttest_ind(
                np.array(clustera[feature])
                , np.array(clusterb[feature])
                ).pvalue
        else:
            return ranksums(
                np.array(clustera[feature])
                , np.array(clusterb[feature])
                ).pvalue

    def feature_p_val(self, feature):
        """Square matrix of pairwise p-values over all clusters."""
        return np.array([
            np.array([self.p_val(ci, cj, feature) for cj in self.cluster_df_list])
            for ci in self.cluster_df_list
            ])

    def p_map(self, feature):
        """Draw the pairwise p-value matrix for `feature` as a heatmap."""
        heatmap, ax = plt.subplots(figsize=(8, 8), dpi=600)
        norm = MidpointNormalize(vmin=0, vmax=1, midpoint=0.5)

        p_values = self.feature_p_val(feature)

        im = ax.imshow(p_values, cmap='coolwarm', norm=norm)

        # FIX: tick labels were hard-coded to C1..C9 regardless of how
        # many clusters exist; derive them from the actual count and pin
        # the tick positions so the labels align with the matrix cells.
        num_clusters = len(self.cluster_df_list)
        labels = ['C%d' % (i + 1) for i in range(num_clusters)]
        ax.set_xticks(range(num_clusters))
        ax.set_yticks(range(num_clusters))
        ax.set_xticklabels(labels)
        ax.set_yticklabels(labels)

        # Annotate every cell with its p-value.
        for y in range(num_clusters):
            for x in range(num_clusters):
                plt.text(x, y
                    , '%.2f' % p_values[y, x]
                    , horizontalalignment='center'
                    , verticalalignment='center'
                    , fontsize=8
                    )

        cbar = heatmap.colorbar(im)
        cbar.ax.set_ylabel('p-value')
        plt.title(feature.upper(), fontsize=16)
        print('\n')
        plt.show()
+
+
+
+
class Analysis:
    """Run per-feature pairwise cluster comparisons over a clustering result.

    Parameters
    ----------
    cont_features, ord_features, nom_features : list[str]
        Feature names grouped by variable type; the type selects the
        statistical test used by AnalysisIntern.
    clusters_dbscan_FDC : sequence
        Cluster label per row of the data passed to ``run``.
    values_dbscan_FDC : sequence
        The distinct cluster labels, in the order the per-cluster frames
        should be extracted.
    """

    def __init__(self
                , cont_features, ord_features, nom_features
                , clusters_dbscan_FDC, values_dbscan_FDC):
        self.cont_features = cont_features
        self.ord_features = ord_features
        self.nom_features = nom_features
        self.clusters_dbscan_FDC = clusters_dbscan_FDC
        self.values_dbscan_FDC = values_dbscan_FDC

    def run(self, data):
        """Split `data` by cluster and draw a p-value heatmap per feature.

        NOTE: mutates `data` in place by adding a 'Clusters' column.
        """
        data['Clusters'] = np.array(self.clusters_dbscan_FDC)

        cluster_df_list = []
        for cluster in self.values_dbscan_FDC:
            cluster_df = data.loc[data['Clusters'] == cluster].drop(columns=['Clusters'])
            cluster_df.columns = list(data.columns)[:-1]
            cluster_df_list.append(cluster_df)

        # The first extracted cluster is discarded — presumably the DBSCAN
        # noise cluster (label -1); TODO confirm against the caller.
        cluster_df_list = cluster_df_list[1:]

        a = AnalysisIntern(cluster_df_list, "cont")
        for feature in self.cont_features:
            a.p_map(feature)

        a = AnalysisIntern(cluster_df_list, "ord")
        for feature in self.ord_features:
            a.p_map(feature)

        # FIX: nominal features previously reused the "ord" instance.
        # Behaviour is identical (both take the rank-sum branch in
        # AnalysisIntern.p_val) but the intent is now explicit.
        a = AnalysisIntern(cluster_df_list, "nom")
        for feature in self.nom_features:
            a.p_map(feature)
+

+ 154 - 0
fdc/visualize.py

@@ -0,0 +1,154 @@
+import seaborn as sns
+import matplotlib.pyplot as plt
+
def plotCluster(data, clusterName="cluster", xName="FDC_1", yName="FDC_2", stroke=20):
    """Scatter plot of a 2-D embedding, one colour per cluster.

    Args:
        data: DataFrame holding the coordinate columns and the cluster column.
        clusterName: name of the column with cluster labels (used as hue).
        xName, yName: names of the x/y coordinate columns.
        stroke: marker size for the scatter points.

    Shows the figure via ``plt.show()``; returns nothing.
    """
    palette_colors = [
        'lightcoral', 'cornflowerblue', 'orange', 'mediumorchid',
        'lightseagreen', 'olive', 'chocolate', 'steelblue',
        'paleturquoise', 'lightgreen', 'burlywood', 'lightsteelblue',
    ]
    cluster_palette = sns.set_palette(sns.color_palette(palette_colors))

    sns.lmplot(
        x=xName, y=yName, data=data,
        fit_reg=False, legend=True, hue=clusterName,
        scatter_kws={"s": stroke}, palette=cluster_palette,
    )
    plt.show()
+
+
+
def plotMapping(data, xName="UMAP_0", yName="UMAP_1"):
    """Plain scatter plot of a 2-D (UMAP) embedding, no hue grouping.

    Args:
        data: DataFrame containing the two coordinate columns.
        xName, yName: coordinate column names.

    Shows the figure via ``plt.show()``; returns nothing.
    """
    palette_colors = [
        "lightcoral", "lightseagreen", "mediumorchid", "orange",
        "burlywood", "cornflowerblue", "plum", "yellowgreen",
    ]
    mapping_palette = sns.set_palette(sns.color_palette(palette_colors))

    sns.lmplot(
        x=xName, y=yName, data=data,
        fit_reg=False, legend=False,
        scatter_kws={"s": 3}, palette=mapping_palette,
    )
    plt.show()
+
+
+
def vizx(feature_list, cluster_df_list, main_data, umap_data, cont_features, rev_dict=None):
    """Print per-cluster summaries and plot feature distributions.

    For each feature: prints per-cluster frequency tables (features with
    few distinct values additionally get a 3x3 grid of bar charts),
    prints mean/std/median per cluster for continuous features plus a
    bar chart with error bars, and scatter-plots categorical features
    over the FDC embedding.

    Parameters
    ----------
    feature_list : list[str]
        Features to visualise.
    cluster_df_list : list[pandas.DataFrame]
        One DataFrame per cluster.
    main_data : pandas.DataFrame
        Full data, used for value counts and for colouring the embedding.
    umap_data : pandas.DataFrame
        2-D embedding with 'FDC_1'/'FDC_2' columns. NOTE: mutated — a
        column per categorical feature is added for hue colouring.
    cont_features : list[str]
        Subset of features treated as continuous.
    rev_dict : dict or None, optional
        Optional {feature: {label: code}} mapping used to show original
        labels instead of numeric codes.  FIX: this was previously read
        from an undefined global (NameError at runtime); it is now an
        explicit, backward-compatible keyword parameter.
    """
    # Local import: this module does not import numpy at the top level.
    import numpy as np

    if rev_dict is None:
        rev_dict = {}

    vizlimit = 15  # max distinct values for the categorical treatment
    plt.rcParams["figure.figsize"] = (12, 6)
    col = sns.color_palette("Set2")
    rows = 3
    columns = 3

    for feature in feature_list:
        print('Feature name:', feature.upper())
        print('\n')

        # Reverse mapping code -> label for this feature, if provided.
        # FIX: `r` was only assigned inside one branch and could be
        # stale/unbound on later uses; compute it once per feature.
        r = None
        if feature in rev_dict:
            feat_keys = rev_dict[feature]
            r = dict(zip(feat_keys.values(), feat_keys.keys()))

        if len(main_data[feature].value_counts()) <= vizlimit:
            for cluster_counter, cluster in enumerate(cluster_df_list):
                print('Cluster ' + str(cluster_counter + 1) + ' frequency distribution')
                if r is not None:
                    print(cluster.replace({feature: r})[feature].value_counts())
                else:
                    print(cluster[feature].value_counts())
                print('\n')

            print('\n')
            print('\n')

            # Per-cluster (categories, counts) pairs for the bar grid.
            cluster_bar = []
            for cluster in cluster_df_list:
                if r is not None:
                    counts = cluster.replace({feature: r})[feature].value_counts()
                else:
                    counts = cluster[feature].value_counts().sort_index()
                cluster_bar.append([np.array(counts.index), np.array(counts)])

            figx, ax = plt.subplots(rows, columns)
            figx.set_size_inches(10.5, 28.5)
            c = 0
            for i in range(rows):
                for j in range(columns):
                    # FIX: bounds-check BEFORE indexing; the original
                    # indexed first (and used `>` instead of `>=`), so it
                    # crashed whenever there were fewer than 9 clusters.
                    if c >= len(cluster_bar):
                        break
                    ax[i, j].bar(cluster_bar[c][0], cluster_bar[c][1], color=col)
                    ax[i, j].tick_params(axis='x', which='major', labelsize=8, rotation=90)
                    ax[i, j].set_title('Cluster: ' + str(c + 1))
                    c += 1

        # Summary statistics for continuous features.
        means = []
        sds = []
        cluster_labels = []
        for cluster_counter, cluster in enumerate(cluster_df_list):
            if feature in cont_features:
                print('Cluster ' + str(cluster_counter + 1) + ' summary statistics')
                print('\n')
                cm = cluster[feature].mean()
                cs = cluster[feature].std()
                print('feature mean:', cm)
                print('feature standard deviation:', cs)
                print('feature median:', cluster[feature].median())
                print('\n')
                means.append(cm)
                sds.append(cs)
                cluster_labels.append('C' + str(cluster_counter + 1))

        means = np.array(means)
        sds = np.array(sds)
        cluster_labels = np.array(cluster_labels)

        print('\n')

        print('Distribution of feature across clusters')
        if feature in cont_features:
            fig, ax7 = plt.subplots()
            ax7.bar(cluster_labels, means, yerr=sds, color=sns.color_palette("Set3"))
            ax7.tick_params(axis='both', which='major', labelsize=10)
            plt.xlabel(feature, fontsize=15)
            plt.show()

        print('\n')
        print('\n')

        colors_set = ['lightgray', 'lightcoral', 'cornflowerblue', 'orange', 'mediumorchid'
            , 'lightseagreen', 'olive', 'chocolate', 'steelblue', 'paleturquoise', 'lightgreen'
            , 'burlywood','lightsteelblue']
        customPalette_set = sns.set_palette(sns.color_palette(colors_set))

        if feature not in cont_features:
            print('Feature distribution in UMAP embedding')
            if r is not None:
                umap_data[feature] = np.array(main_data.replace({feature: r})[feature])
            else:
                umap_data[feature] = np.array(main_data[feature])
            sns.lmplot(x="FDC_1", y="FDC_2",
                data=umap_data,
                fit_reg=False,
                legend=True,
                hue=feature,  # colour by category
                scatter_kws={"s": 20},
                palette=customPalette_set)
            plt.show()

        print('\n')
        print('\n')

Kaikkia tiedostoja ei voida näyttää, sillä liian monta tiedostoa muuttui tässä diffissä