Selaa lähdekoodia

added visualization

Kristian Schultz 3 vuotta sitten
vanhempi
commit
d345bd09b0
5 muutettua tiedostoa jossa 10425 lisäystä ja 10152 poistoa
  1. 10132 10131
      agglo_5dim_2NN_v3.ipynb
  2. 22 0
      fdc/clustering.py
  3. 6 21
      fdc/fdc.py
  4. 111 0
      fdc/hypothesisTesting.py
  5. 154 0
      fdc/visualize.py

Tiedoston diff-näkymää rajattu, sillä se on liian suuri
+ 10132 - 10131
agglo_5dim_2NN_v3.ipynb


+ 22 - 0
fdc/clustering.py

@@ -0,0 +1,22 @@
+import numpy as np
+from sklearn.cluster import AgglomerativeClustering
+from fdc.visualize import plotCluster
+
def aglo_clustering(number_of_clusters, affinity, linkage
                    , five_d_embedding, two_d_embedding
                    , visual=False):
    """Cluster the 5-D embedding agglomeratively and attach labels to the 2-D one.

    Parameters
    ----------
    number_of_clusters : int
        Target number of clusters (``n_clusters`` of AgglomerativeClustering).
    affinity : str or callable
        Distance metric, forwarded to sklearn unchanged.
    linkage : str
        Linkage criterion ('ward', 'complete', 'average' or 'single').
    five_d_embedding : array-like
        The data that is actually clustered.
    two_d_embedding : pandas.DataFrame
        2-D embedding used for visualisation. NOTE: mutated in place —
        a 'Cluster' column holding the labels is added.
    visual : bool, optional
        When True, scatter-plot the 2-D embedding coloured by cluster.

    Returns
    -------
    tuple[list, numpy.ndarray]
        The cluster label per sample and the per-cluster sample counts.
    """
    # Seed the global NumPy RNG for reproducibility of any downstream
    # randomness (AgglomerativeClustering itself is deterministic).
    np.random.seed(42)
    ag_cluster = AgglomerativeClustering(
        n_clusters=number_of_clusters
        , affinity=affinity
        , linkage=linkage
        )
    clusters = ag_cluster.fit_predict(five_d_embedding)
    # Only the counts are used; the unique label values themselves are not.
    _, counts = np.unique(clusters, return_counts=True)
    two_d_embedding['Cluster'] = clusters

    if visual:
        plotCluster(two_d_embedding, clusterName="Cluster", xName="UMAP_0", yName="UMAP_1", stroke=3)

    return two_d_embedding.Cluster.to_list(), counts

+ 6 - 21
fdc/fdc.py

@@ -5,8 +5,7 @@ import umap.umap_ as umap
 
 # --[ Known to be used but can we avoid it? ]----
 import pandas as pd
-import seaborn as sns
-import matplotlib.pyplot as plt
+from fdc.visualize import plotMapping
 
 
 def value(v, defaultValue):
@@ -15,20 +14,6 @@ def value(v, defaultValue):
     else:
         return v
 
-def draw2dMapping(data):
-    colors_set1 = ["lightcoral", "lightseagreen", "mediumorchid", "orange", "burlywood", "cornflowerblue", "plum", "yellowgreen"]
-    customPalette_set1 = sns.set_palette(sns.color_palette(colors_set1))
-
-    sns.lmplot(x="UMAP_0"
-        , y="UMAP_1"
-        , data=data
-        , fit_reg=False
-        , legend=False
-        , scatter_kws={"s": 3}
-        , palette=customPalette_set1)
-    plt.show()
-
-
 
 def feature_clustering(UMAP_neb, min_dist_UMAP, metric, data, visual=False):
     data_embedded = Clustering(metric, UMAP_neb, min_dist_UMAP).fit(data)
@@ -36,14 +21,14 @@ def feature_clustering(UMAP_neb, min_dist_UMAP, metric, data, visual=False):
     result = pd.DataFrame(data=data_embedded, columns=['UMAP_0', 'UMAP_1'])
     
     if visual:
-        draw2dMapping(result)
+        plotMapping(result)
 
     return result
 
 
 
@jit(nopython=True)
def canberra_modified(a,b):
    # Modified Canberra-style distance between vectors a and b,
    # numba-compiled (nopython mode) for speed.
    #
    # For each coordinate let d = |a_i - b_i|.  The loop variable is
    # x = d + 1 (always >= 1), so |1 - x| = d and 1 + |x| = d + 2:
    # each term equals d / (d + 2), which lies in [0, 1).  The result
    # is the square root of the summed terms.  Unlike plain Canberra
    # this is bounded per coordinate and well-defined when
    # a_i == b_i == 0.
    return np.sqrt(np.sum(np.array([np.abs(1.0 - x) / (1.0 + np.abs(x)) for x in (np.abs(a-b) + 1.0)])))
 
 
@@ -78,7 +63,7 @@ class FDC:
     def __init__(self, clustering_cont=None, clustering_ord=None, clustering_nom=None, drop_nominal=True, visual=False, with_2d_embedding=False, use_pandas_output=False):
         # used clusterings
         self.clustering_cont = clustering_cont or Clustering('euclidian', 30, 0.1)
-        self.clustering_ord = clustering_ord or Clustering(modified_can, 30, 0.1)
+        self.clustering_ord = clustering_ord or Clustering(canberra_modified, 30, 0.1)
         self.clustering_nom = clustering_nom or Clustering('hamming', 30, 0.1)
 
         # Control of data output
@@ -158,9 +143,9 @@ class FDC:
             # Show mapping if needed
             if visual:
                 if self.use_pandas_output:
-                    draw2dMapping(result_reduced)
+                    plotMapping(result_reduced)
                 else:
-                    draw2dMapping(pd.DataFrame(data=result_reduced, columns=['UMAP_0', 'UMAP_1']))
+                    plotMapping(pd.DataFrame(data=result_reduced, columns=['UMAP_0', 'UMAP_1']))
 
 
         # Return the results

+ 111 - 0
fdc/hypothesisTesting.py

@@ -0,0 +1,111 @@
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
from scipy.stats import ranksums
+
+
+
+
class MidpointNormalize(mpl.colors.Normalize):
    """Colormap normalisation that pins a chosen midpoint to 0.5.

    Values in [vmin, vmax] are mapped piecewise-linearly so that
    ``midpoint`` always lands in the centre of the colormap — useful for
    diverging colormaps such as the p-value heatmaps drawn by
    ``AnalysisIntern.p_map``.
    """

    def __init__(self, vmin, vmax, midpoint=0, clip=False):
        self.midpoint = midpoint
        super().__init__(vmin, vmax, clip)

    def __call__(self, value, clip=None):
        # The endpoints are squeezed towards 0.5 in proportion to how
        # asymmetric the interval [vmin, midpoint, vmax] is.
        lo = max(0, 0.5 * (1 - abs((self.midpoint - self.vmin) / (self.midpoint - self.vmax))))
        hi = min(1, 0.5 * (1 + abs((self.vmax - self.midpoint) / (self.midpoint - self.vmin))))
        anchors_in = [self.vmin, self.midpoint, self.vmax]
        anchors_out = [lo, 0.5, hi]
        return np.ma.masked_array(np.interp(value, anchors_in, anchors_out))
+
+
+
class AnalysisIntern:
    """Pairwise statistical comparison of clusters for single features.

    Holds a list of per-cluster DataFrames and computes square matrices
    of pairwise p-values (cluster vs. cluster) that can be rendered as
    annotated heatmaps.
    """

    def __init__(self, cluster_df_list, var_type="cont"):
        # var_type selects the test: 'cont' -> two-sample t-test,
        # anything else ('ord'/'nom') -> Wilcoxon rank-sum test.
        self.var_type = var_type
        self.cluster_df_list = cluster_df_list

    def p_val(self, clustera, clusterb, feature):
        """Return the p-value comparing `feature` between two clusters."""
        if self.var_type == 'cont':
            return stats.ttest_ind(
                np.array(clustera[feature])
                , np.array(clusterb[feature])
                ).pvalue
        else:
            return ranksums(
                np.array(clustera[feature])
                , np.array(clusterb[feature])
                ).pvalue

    def feature_p_val(self, feature):
        """Square matrix of pairwise p-values over all clusters."""
        return np.array([
            np.array([self.p_val(ci, cj, feature) for cj in self.cluster_df_list])
            for ci in self.cluster_df_list
            ])

    def p_map(self, feature):
        """Draw the pairwise p-value matrix for `feature` as a heatmap."""
        heatmap, ax = plt.subplots(figsize=(8, 8), dpi=600)
        norm = MidpointNormalize(vmin=0, vmax=1, midpoint=0.5)

        p_values = self.feature_p_val(feature)

        im = ax.imshow(p_values, cmap='coolwarm', norm=norm)

        # FIX: tick labels were hard-coded to C1..C9 regardless of how
        # many clusters exist; derive them from the actual count and pin
        # the tick positions so the labels align with the matrix cells.
        num_clusters = len(self.cluster_df_list)
        labels = ['C%d' % (i + 1) for i in range(num_clusters)]
        ax.set_xticks(range(num_clusters))
        ax.set_yticks(range(num_clusters))
        ax.set_xticklabels(labels)
        ax.set_yticklabels(labels)

        # Annotate every cell with its p-value.
        for y in range(num_clusters):
            for x in range(num_clusters):
                plt.text(x, y
                    , '%.2f' % p_values[y, x]
                    , horizontalalignment='center'
                    , verticalalignment='center'
                    , fontsize=8
                    )

        cbar = heatmap.colorbar(im)
        cbar.ax.set_ylabel('p-value')
        plt.title(feature.upper(), fontsize=16)
        print('\n')
        plt.show()
+
+
+
+
class Analysis:
    """Run per-feature pairwise cluster comparisons over a clustering result.

    Parameters
    ----------
    cont_features, ord_features, nom_features : list[str]
        Feature names grouped by variable type; the type selects the
        statistical test used by AnalysisIntern.
    clusters_dbscan_FDC : sequence
        Cluster label per row of the data passed to ``run``.
    values_dbscan_FDC : sequence
        The distinct cluster labels, in the order the per-cluster frames
        should be extracted.
    """

    def __init__(self
                , cont_features, ord_features, nom_features
                , clusters_dbscan_FDC, values_dbscan_FDC):
        self.cont_features = cont_features
        self.ord_features = ord_features
        self.nom_features = nom_features
        self.clusters_dbscan_FDC = clusters_dbscan_FDC
        self.values_dbscan_FDC = values_dbscan_FDC

    def run(self, data):
        """Split `data` by cluster and draw a p-value heatmap per feature.

        NOTE: mutates `data` in place by adding a 'Clusters' column.
        """
        data['Clusters'] = np.array(self.clusters_dbscan_FDC)

        cluster_df_list = []
        for cluster in self.values_dbscan_FDC:
            cluster_df = data.loc[data['Clusters'] == cluster].drop(columns=['Clusters'])
            cluster_df.columns = list(data.columns)[:-1]
            cluster_df_list.append(cluster_df)

        # The first extracted cluster is discarded — presumably the DBSCAN
        # noise cluster (label -1); TODO confirm against the caller.
        cluster_df_list = cluster_df_list[1:]

        a = AnalysisIntern(cluster_df_list, "cont")
        for feature in self.cont_features:
            a.p_map(feature)

        a = AnalysisIntern(cluster_df_list, "ord")
        for feature in self.ord_features:
            a.p_map(feature)

        # FIX: nominal features previously reused the "ord" instance.
        # Behaviour is identical (both take the rank-sum branch in
        # AnalysisIntern.p_val) but the intent is now explicit.
        a = AnalysisIntern(cluster_df_list, "nom")
        for feature in self.nom_features:
            a.p_map(feature)
+

+ 154 - 0
fdc/visualize.py

@@ -0,0 +1,154 @@
+import seaborn as sns
+import matplotlib.pyplot as plt
+
def plotCluster(data, clusterName="cluster", xName="FDC_1", yName="FDC_2", stroke=20):
    """Scatter plot of a 2-D embedding, one colour per cluster.

    Args:
        data: DataFrame holding the coordinate columns and the cluster column.
        clusterName: name of the column with cluster labels (used as hue).
        xName, yName: names of the x/y coordinate columns.
        stroke: marker size for the scatter points.

    Shows the figure via ``plt.show()``; returns nothing.
    """
    palette_colors = [
        'lightcoral', 'cornflowerblue', 'orange', 'mediumorchid',
        'lightseagreen', 'olive', 'chocolate', 'steelblue',
        'paleturquoise', 'lightgreen', 'burlywood', 'lightsteelblue',
    ]
    cluster_palette = sns.set_palette(sns.color_palette(palette_colors))

    sns.lmplot(
        x=xName, y=yName, data=data,
        fit_reg=False, legend=True, hue=clusterName,
        scatter_kws={"s": stroke}, palette=cluster_palette,
    )
    plt.show()
+
+
+
def plotMapping(data, xName="UMAP_0", yName="UMAP_1"):
    """Plain scatter plot of a 2-D (UMAP) embedding, no hue grouping.

    Args:
        data: DataFrame containing the two coordinate columns.
        xName, yName: coordinate column names.

    Shows the figure via ``plt.show()``; returns nothing.
    """
    palette_colors = [
        "lightcoral", "lightseagreen", "mediumorchid", "orange",
        "burlywood", "cornflowerblue", "plum", "yellowgreen",
    ]
    mapping_palette = sns.set_palette(sns.color_palette(palette_colors))

    sns.lmplot(
        x=xName, y=yName, data=data,
        fit_reg=False, legend=False,
        scatter_kws={"s": 3}, palette=mapping_palette,
    )
    plt.show()
+
+
+
def vizx(feature_list, cluster_df_list, main_data, umap_data, cont_features, rev_dict=None):
    """Print per-cluster summaries and plot feature distributions.

    For each feature: prints per-cluster frequency tables (features with
    few distinct values additionally get a 3x3 grid of bar charts),
    prints mean/std/median per cluster for continuous features plus a
    bar chart with error bars, and scatter-plots categorical features
    over the FDC embedding.

    Parameters
    ----------
    feature_list : list[str]
        Features to visualise.
    cluster_df_list : list[pandas.DataFrame]
        One DataFrame per cluster.
    main_data : pandas.DataFrame
        Full data, used for value counts and for colouring the embedding.
    umap_data : pandas.DataFrame
        2-D embedding with 'FDC_1'/'FDC_2' columns. NOTE: mutated — a
        column per categorical feature is added for hue colouring.
    cont_features : list[str]
        Subset of features treated as continuous.
    rev_dict : dict or None, optional
        Optional {feature: {label: code}} mapping used to show original
        labels instead of numeric codes.  FIX: this was previously read
        from an undefined global (NameError at runtime); it is now an
        explicit, backward-compatible keyword parameter.
    """
    # Local import: this module does not import numpy at the top level.
    import numpy as np

    if rev_dict is None:
        rev_dict = {}

    vizlimit = 15  # max distinct values for the categorical treatment
    plt.rcParams["figure.figsize"] = (12, 6)
    col = sns.color_palette("Set2")
    rows = 3
    columns = 3

    for feature in feature_list:
        print('Feature name:', feature.upper())
        print('\n')

        # Reverse mapping code -> label for this feature, if provided.
        # FIX: `r` was only assigned inside one branch and could be
        # stale/unbound on later uses; compute it once per feature.
        r = None
        if feature in rev_dict:
            feat_keys = rev_dict[feature]
            r = dict(zip(feat_keys.values(), feat_keys.keys()))

        if len(main_data[feature].value_counts()) <= vizlimit:
            for cluster_counter, cluster in enumerate(cluster_df_list):
                print('Cluster ' + str(cluster_counter + 1) + ' frequency distribution')
                if r is not None:
                    print(cluster.replace({feature: r})[feature].value_counts())
                else:
                    print(cluster[feature].value_counts())
                print('\n')

            print('\n')
            print('\n')

            # Per-cluster (categories, counts) pairs for the bar grid.
            cluster_bar = []
            for cluster in cluster_df_list:
                if r is not None:
                    counts = cluster.replace({feature: r})[feature].value_counts()
                else:
                    counts = cluster[feature].value_counts().sort_index()
                cluster_bar.append([np.array(counts.index), np.array(counts)])

            figx, ax = plt.subplots(rows, columns)
            figx.set_size_inches(10.5, 28.5)
            c = 0
            for i in range(rows):
                for j in range(columns):
                    # FIX: bounds-check BEFORE indexing; the original
                    # indexed first (and used `>` instead of `>=`), so it
                    # crashed whenever there were fewer than 9 clusters.
                    if c >= len(cluster_bar):
                        break
                    ax[i, j].bar(cluster_bar[c][0], cluster_bar[c][1], color=col)
                    ax[i, j].tick_params(axis='x', which='major', labelsize=8, rotation=90)
                    ax[i, j].set_title('Cluster: ' + str(c + 1))
                    c += 1

        # Summary statistics for continuous features.
        means = []
        sds = []
        cluster_labels = []
        for cluster_counter, cluster in enumerate(cluster_df_list):
            if feature in cont_features:
                print('Cluster ' + str(cluster_counter + 1) + ' summary statistics')
                print('\n')
                cm = cluster[feature].mean()
                cs = cluster[feature].std()
                print('feature mean:', cm)
                print('feature standard deviation:', cs)
                print('feature median:', cluster[feature].median())
                print('\n')
                means.append(cm)
                sds.append(cs)
                cluster_labels.append('C' + str(cluster_counter + 1))

        means = np.array(means)
        sds = np.array(sds)
        cluster_labels = np.array(cluster_labels)

        print('\n')

        print('Distribution of feature across clusters')
        if feature in cont_features:
            fig, ax7 = plt.subplots()
            ax7.bar(cluster_labels, means, yerr=sds, color=sns.color_palette("Set3"))
            ax7.tick_params(axis='both', which='major', labelsize=10)
            plt.xlabel(feature, fontsize=15)
            plt.show()

        print('\n')
        print('\n')

        colors_set = ['lightgray', 'lightcoral', 'cornflowerblue', 'orange', 'mediumorchid'
            , 'lightseagreen', 'olive', 'chocolate', 'steelblue', 'paleturquoise', 'lightgreen'
            , 'burlywood','lightsteelblue']
        customPalette_set = sns.set_palette(sns.color_palette(colors_set))

        if feature not in cont_features:
            print('Feature distribution in UMAP embedding')
            if r is not None:
                umap_data[feature] = np.array(main_data.replace({feature: r})[feature])
            else:
                umap_data[feature] = np.array(main_data[feature])
            sns.lmplot(x="FDC_1", y="FDC_2",
                data=umap_data,
                fit_reg=False,
                legend=True,
                hue=feature,  # colour by category
                scatter_kws={"s": 20},
                palette=customPalette_set)
            plt.show()

        print('\n')
        print('\n')

Kaikkia tiedostoja ei voida näyttää, sillä liian monta tiedostoa muuttui tässä diffissä