3 lat temu · d345bd09b0
--- a/agglo_5dim_2NN_v3.ipynb
+++ b/agglo_5dim_2NN_v3.ipynb
--- a/fdc/clustering.py
+++ b/fdc/clustering.py
@@ -0,0 +1,22 @@
 
															+import numpy as np
														
 
															+from sklearn.cluster import AgglomerativeClustering
														
 
															+from fdc.visualize import plotCluster
														
 
															+
														
 
															+def aglo_clustering(number_of_clusters, affinity, linkage
														
 
															+                    , five_d_embedding, two_d_embedding
														
 
															+                    , visual=False):
														
 
															+    np.random.seed(42)
														
 
															+    ag_cluster = AgglomerativeClustering(
														
 
															+        n_clusters=number_of_clusters
														
 
															+        , affinity=affinity
														
 
															+        , linkage=linkage
														
 
															+        )
														
 
															+    clusters = ag_cluster.fit_predict(five_d_embedding)
														
 
															+    (values, counts) = np.unique(clusters, return_counts=True)
														
 
															+    two_d_embedding['Cluster'] = clusters
														
 
															+    
														
 
															+    if visual:
														
 
															+        plotCluster(two_d_embedding, clusterName="Cluster", xName="UMAP_0", yName="UMAP_1", stroke=3)
														
 
															+
														
 
															+    return two_d_embedding.Cluster.to_list(), counts
														
 
															+
														
--- a/fdc/fdc.py
+++ b/fdc/fdc.py
@@ -5,8 +5,7 @@ import umap.umap_ as umap
 
															 # --[ Known to be used but can we avoid it? ]----
														
 
															 import pandas as pd
														
 
															-import seaborn as sns
														
 
															-import matplotlib.pyplot as plt
														
 
															+from fdc.visualize import plotMapping
														
 
															 def value(v, defaultValue):
														
@@ -15,20 +14,6 @@ def value(v, defaultValue):
 
															     else:
														
 
															         return v
														
 
															-def draw2dMapping(data):
														
 
															-    colors_set1 = ["lightcoral", "lightseagreen", "mediumorchid", "orange", "burlywood", "cornflowerblue", "plum", "yellowgreen"]
														
 
															-    customPalette_set1 = sns.set_palette(sns.color_palette(colors_set1))
														
 
															-
														
 
															-    sns.lmplot(x="UMAP_0"
														
 
															-        , y="UMAP_1"
														
 
															-        , data=data
														
 
															-        , fit_reg=False
														
 
															-        , legend=False
														
 
															-        , scatter_kws={"s": 3}
														
 
															-        , palette=customPalette_set1)
														
 
															-    plt.show()
														
 
															-
														
 
															-
														
 
															 def feature_clustering(UMAP_neb, min_dist_UMAP, metric, data, visual=False):
														
 
															     data_embedded = Clustering(metric, UMAP_neb, min_dist_UMAP).fit(data)
														
@@ -36,14 +21,14 @@ def feature_clustering(UMAP_neb, min_dist_UMAP, metric, data, visual=False):
 
															     result = pd.DataFrame(data=data_embedded, columns=['UMAP_0', 'UMAP_1'])
														
 
															     if visual:
														
 
															-        draw2dMapping(result)
														
 
															+        plotMapping(result)
														
 
															     return result
														
 
															 @jit(nopython=True)
														
 
															-def modified_can(a,b):
														
 
															+def canberra_modified(a,b):
														
 
															     return np.sqrt(np.sum(np.array([np.abs(1.0 - x) / (1.0 + np.abs(x)) for x in (np.abs(a-b) + 1.0)])))
														
@@ -78,7 +63,7 @@ class FDC:
 
															     def __init__(self, clustering_cont=None, clustering_ord=None, clustering_nom=None, drop_nominal=True, visual=False, with_2d_embedding=False, use_pandas_output=False):
														
 
															         # used clusterings
														
 
															         self.clustering_cont = clustering_cont or Clustering('euclidian', 30, 0.1)
														
 
															-        self.clustering_ord = clustering_ord or Clustering(modified_can, 30, 0.1)
														
 
															+        self.clustering_ord = clustering_ord or Clustering(canberra_modified, 30, 0.1)
														
 
															         self.clustering_nom = clustering_nom or Clustering('hamming', 30, 0.1)
														
 
															         # Control of data output
														
@@ -158,9 +143,9 @@ class FDC:
 
															             # Show mapping if needed
														
 
															             if visual:
														
 
															                 if self.use_pandas_output:
														
 
															-                    draw2dMapping(result_reduced)
														
 
															+                    plotMapping(result_reduced)
														
 
															                 else:
														
 
															-                    draw2dMapping(pd.DataFrame(data=result_reduced, columns=['UMAP_0', 'UMAP_1']))
														
 
															+                    plotMapping(pd.DataFrame(data=result_reduced, columns=['UMAP_0', 'UMAP_1']))
														
 
															         # Return the results
														
--- a/fdc/hypothesisTesting.py
+++ b/fdc/hypothesisTesting.py
@@ -0,0 +1,111 @@
 
															+from scipy.stats import ranksums
														
 
															+import matplotlib as mpl
														
 
															+from scipy import stats
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+class MidpointNormalize(mpl.colors.Normalize):
														
 
															+    def __init__(self, vmin, vmax, midpoint=0, clip=False):
														
 
															+        self.midpoint = midpoint
														
 
															+        mpl.colors.Normalize.__init__(self, vmin, vmax, clip)
														
 
															+
														
 
															+
														
 
															+    def __call__(self, value, clip=None):
														
 
															+        normalized_min = max(0, 0.5 * (1 - abs((self.midpoint - self.vmin) / (self.midpoint - self.vmax))))
														
 
															+        normalized_max = min(1, 0.5 * (1 + abs((self.vmax - self.midpoint) / (self.midpoint - self.vmin))))
														
 
															+        normalized_mid = 0.5
														
 
															+        x = [self.vmin, self.midpoint, self.vmax]
														
 
															+        y = [normalized_min, normalized_mid, normalized_max]
														
 
															+        return np.ma.masked_array(np.interp(value, x, y))
														
 
															+
														
 
															+
														
 
															+
														
 
															+class AnalysisIntern:
														
 
															+    def __init__(self, cluster_df_list, var_type="cont"):
														
 
															+        self.var_type = var_type
														
 
															+        self.cluster_df_list  = cluster_df_list
														
 
															+
														
 
															+
														
 
															+    def p_val(self, clustera, clusterb, feature):
														
 
															+        if self.var_type == 'cont':
														
 
															+            return stats.ttest_ind(
														
 
															+                np.array(clustera[feature])
														
 
															+                , np.array(clusterb[feature])
														
 
															+                ).pvalue
														
 
															+        else:
														
 
															+            return ranksums(
														
 
															+                np.array(clustera[feature])
														
 
															+                , np.array(clusterb[feature])
														
 
															+                ).pvalue
														
 
															+
														
 
															+
														
 
															+    def feature_p_val(self, feature):
														
 
															+        return np.array([
														
 
															+            np.array([self.p_val(ci, cj, feature) for cj in self.cluster_df_list])
														
 
															+            for ci in self.cluster_df_list
														
 
															+            ])
														
 
															+
														
 
															+
														
 
															+    def p_map(self, feature):
														
 
															+        heatmap, ax = plt.subplots(figsize=(8, 8), dpi=600)
														
 
															+        norm = MidpointNormalize(vmin=0, vmax=1, midpoint=0.5)
														
 
															+
														
 
															+        p_values = self.feature_p_val(feature)
														
 
															+
														
 
															+        im = ax.imshow(p_values , cmap='coolwarm' , norm=norm)
														
 
															+        ax.set_xticklabels(['','C1','C2','C3','C4','C5','C6','C7','C8','C9'])
														
 
															+        ax.set_yticklabels(['','C1','C2','C3','C4','C5','C6','C7','C8','C9'])
														
 
															+
														
 
															+        num_clusters = len(self.cluster_df_list)
														
 
															+        for y in range(num_clusters):
														
 
															+            for x in range(num_clusters):
														
 
															+                plt.text(x , y
														
 
															+                    , '%.2f' % p_values[y, x]
														
 
															+                    , horizontalalignment='center'
														
 
															+                    , verticalalignment='center'
														
 
															+                    , fontsize=8
														
 
															+                    )
														
 
															+
														
 
															+        cbar = heatmap.colorbar(im)
														
 
															+        cbar.ax.set_ylabel('p-value')
														
 
															+        plt.title(feature.upper(), fontsize=16)
														
 
															+        print('\n')
														
 
															+        plt.show()
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+class Analysis:
														
 
															+    def __init__(self
														
 
															+                , cont_features, ord_features, nom_features
														
 
															+                , clusters_dbscan_FDC, values_dbscan_FDC):
														
 
															+        self.cont_features = cont_features
														
 
															+        self.ord_features = ord_features
														
 
															+        self.nom_features = nom_features
														
 
															+        self.clusters_dbscan_FDC = clusters_dbscan_FDC
														
 
															+        self.values_dbscan_FDC = values_dbscan_FDC
														
 
															+
														
 
															+
														
 
															+    def run(self, data):
														
 
															+        data['Clusters'] = np.array(self.clusters_dbscan_FDC)
														
 
															+
														
 
															+        cluster_df_list=[]
														
 
															+        for cluster in self.values_dbscan_FDC:
														
 
															+            cluster_df = data.loc[data['Clusters'] == cluster].drop(columns=['Clusters'])
														
 
															+            cluster_df.columns = list(data.columns)[:-1]
														
 
															+            cluster_df_list.append(cluster_df)
														
 
															+
														
 
															+        cluster_df_list = cluster_df_list[1:]
														
 
															+
														
 
															+        a = AnalysisIntern(cluster_df_list, "cont")
														
 
															+        for feature in self.cont_features:
														
 
															+            a.p_map(feature)
														
 
															+
														
 
															+        a = AnalysisIntern(cluster_df_list, "ord")
														
 
															+        for feature in self.ord_features:
														
 
															+            a.p_map(feature)
														
 
															+
														
 
															+        for feature in self.nom_features:
														
 
															+            a.p_map(feature)
														
 
															+
														
--- a/fdc/visualize.py
+++ b/fdc/visualize.py
@@ -0,0 +1,154 @@
 
															+import seaborn as sns
														
 
															+import matplotlib.pyplot as plt
														
 
															+
														
 
															+def plotCluster(data, clusterName="cluster", xName="FDC_1", yName="FDC_2", stroke=20):
														
 
															+    colors_set = [
														
 
															+        'lightcoral', 'cornflowerblue', 'orange','mediumorchid', 'lightseagreen'
														
 
															+        , 'olive', 'chocolate', 'steelblue', 'paleturquoise',  'lightgreen'
														
 
															+        , 'burlywood', 'lightsteelblue']
														
 
															+
														
 
															+    customPalette_set = sns.set_palette(sns.color_palette(colors_set))
														
 
															+
														
 
															+    sns.lmplot(
														
 
															+        x=xName
														
 
															+        , y=yName
														
 
															+        , data=data
														
 
															+        , fit_reg=False
														
 
															+        , legend=True
														
 
															+        , hue=clusterName
														
 
															+        , scatter_kws={"s": stroke}
														
 
															+        , palette=customPalette_set
														
 
															+        )
														
 
															+    plt.show()
														
 
															+
														
 
															+
														
 
															+
														
 
															+def plotMapping(data, xName="UMAP_0", yName="UMAP_1"):
														
 
															+    colors_set1 = [
														
 
															+        "lightcoral", "lightseagreen", "mediumorchid", "orange", "burlywood"
														
 
															+        , "cornflowerblue", "plum", "yellowgreen"]
														
 
															+
														
 
															+    customPalette_set1 = sns.set_palette(sns.color_palette(colors_set1))
														
 
															+
														
 
															+    sns.lmplot(x=xName
														
 
															+        , y=yName
														
 
															+        , data=data
														
 
															+        , fit_reg=False
														
 
															+        , legend=False
														
 
															+        , scatter_kws={"s": 3}
														
 
															+        , palette=customPalette_set1)
														
 
															+    plt.show()
														
 
															+
														
 
															+
														
 
															+
														
 
															+def vizx(feature_list, cluster_df_list, main_data, umap_data, cont_features):
														
 
															+    vizlimit = 15
														
 
															+    plt.rcParams["figure.figsize"] = (12, 6)
														
 
															+    
														
 
															+    col = sns.color_palette("Set2")
														
 
															+    
														
 
															+    rows = 3
														
 
															+    columns = 3
														
 
															+    
														
 
															+    for feature in feature_list:
														
 
															+        print('Feature name:', feature.upper())
														
 
															+        print('\n')
														
 
															+    
														
 
															+        if len(main_data[feature].value_counts()) <= vizlimit:
														
 
															+            for cluster_counter, cluster in enumerate(cluster_df_list):
														
 
															+                print('Cluster '+ str(cluster_counter + 1) + ' frequency distribution')
														
 
															+                if feature in list(rev_dict.keys()):
														
 
															+                    feat_keys=rev_dict[feature]
														
 
															+                    r = dict(zip(feat_keys.values(), feat_keys.keys()))
														
 
															+                    print(cluster.replace({feature:r})[feature].value_counts())
														
 
															+                else:
														
 
															+                    print(cluster[feature].value_counts())
														
 
															+                print('\n')
														
 
															+        
														
 
															+            print('\n')
														
 
															+            print('\n')
														
 
															+        
														
 
															+            cluster_bar = []
														
 
															+            for cluster in cluster_df_list:
														
 
															+                if feature in list(rev_dict.keys()):
														
 
															+                    y = np.array(cluster.replace({feature:r})[feature].value_counts())
														
 
															+                    x = np.array(cluster.replace({feature:r})[feature].value_counts().index)
														
 
															+                    cluster_bar.append([x,y])
														
 
															+                else:
														
 
															+                    y = np.array(cluster[feature].value_counts().sort_index())
														
 
															+                    x = np.array(cluster[feature].value_counts().sort_index().index)
														
 
															+                    cluster_bar.append([x,y])
														
 
															+                
														
 
															+            cluster_bar = np.array(cluster_bar)
														
 
															+        
														
 
															+            figx, ax = plt.subplots(rows, columns)
														
 
															+            figx.set_size_inches(10.5, 28.5)
														
 
															+            cluster_in_subplot_axis_dict = np.array([[0,0],[0,1],[0,2],[1,0],[1,1],[1,2],[2,0],[1,1],[2,2]])
														
 
															+            c = 0
														
 
															+            for i in range(rows):
														
 
															+                for j in range(columns):
														
 
															+                    ax[i,j].bar(cluster_bar[c,0], cluster_bar[c,1], color=col)
														
 
															+                    ax[i,j].tick_params(axis='x', which='major', labelsize=8, rotation=90)
														
 
															+                    ax[i,j].set_title('Cluster: ' + str(c + 1))
														
 
															+                    if c > len(cluster_df_list):
														
 
															+                        break
														
 
															+                    else:
														
 
															+                        c += 1
														
 
															+            
														
 
															+        means = []
														
 
															+        sds = []
														
 
															+        cluster_labels = []
														
 
															+        for cluster_counter, cluster in enumerate(cluster_df_list):
														
 
															+            if feature in cont_features:
														
 
															+                print('Cluster '+ str(cluster_counter + 1) + ' summary statistics')
														
 
															+                print('\n')
														
 
															+                cm = cluster[feature].mean()
														
 
															+                cs = cluster[feature].std()
														
 
															+                print('feature mean:', cm)
														
 
															+                print('feature standard deviation:', cs)
														
 
															+                print('feature median:', cluster[feature].median())
														
 
															+                print('\n')
														
 
															+                means.append(cm)
														
 
															+                sds.append(cs)
														
 
															+                cluster_labels.append('C' + str(cluster_counter + 1))
														
 
															+            
														
 
															+        means = np.array(means)
														
 
															+        sds = np.array(sds)
														
 
															+        cluster_labels = np.array(cluster_labels)
														
 
															+        
														
 
															+        print('\n')  
														
 
															+        
														
 
															+        print('Distribution of feature across clusters')
														
 
															+        if feature in cont_features:   
														
 
															+            fig, ax7 = plt.subplots()
														
 
															+            ax7.bar(cluster_labels, means, yerr=sds, color=sns.color_palette("Set3"))
														
 
															+            ax7.tick_params(axis='both', which='major', labelsize=10)
														
 
															+            plt.xlabel(feature, fontsize=15)
														
 
															+            plt.show()
														
 
															+        
														
 
															+        print('\n')
														
 
															+        print('\n')
														
 
															+        
														
 
															+        colors_set = ['lightgray', 'lightcoral', 'cornflowerblue', 'orange', 'mediumorchid'
														
 
															+            , 'lightseagreen', 'olive', 'chocolate', 'steelblue', 'paleturquoise', 'lightgreen'
														
 
															+            , 'burlywood','lightsteelblue']
														
 
															+        customPalette_set = sns.set_palette(sns.color_palette(colors_set))
														
 
															+        
														
 
															+        if feature not in cont_features:
														
 
															+            print('Feature distribution in UMAP embedding')
														
 
															+            if feature in list(rev_dict.keys()):
														
 
															+                umap_data[feature] = np.array(main_data.replace({feature:r})[feature])
														
 
															+            else:
														
 
															+                umap_data[feature] = np.array(main_data[feature])
														
 
															+            sns.lmplot(x="FDC_1", y="FDC_2",
														
 
															+                data=umap_data, 
														
 
															+                fit_reg=False, 
														
 
															+                legend=True,
														
 
															+                hue=feature, # color by cluster
														
 
															+                scatter_kws={"s": 20},
														
 
															+                palette=customPalette_set) # specify the point size
														
 
															+            plt.show()
														
 
															+        
														
 
															+        print('\n')
														
 
															+        print('\n')
														
 
															+