Selaa lähdekoodia

Replaced hypothesis testing with the library version.

Kristian Schultz 3 vuotta sitten
vanhempi
commit
5e728dae59

Tiedoston diff-näkymää rajattu, sillä se on liian suuri
+ 515 - 515
Is your data fit for decision making using Machine Learning version 4.ipynb


+ 79 - 74
fdc/hypothesisTesting.py

@@ -1,111 +1,116 @@
 from scipy.stats import ranksums
 import matplotlib as mpl
+from matplotlib import pyplot as plt
 from scipy import stats
+import numpy as np
 
 
 
 
 class MidpointNormalize(mpl.colors.Normalize):
-    def __init__(self, vmin, vmax, midpoint=0, clip=False):
-        self.midpoint = midpoint
-        mpl.colors.Normalize.__init__(self, vmin, vmax, clip)
+  def __init__(self, vmin, vmax, midpoint=0, clip=False):
+    self.midpoint = midpoint
+    mpl.colors.Normalize.__init__(self, vmin, vmax, clip)
 
 
-    def __call__(self, value, clip=None):
-        normalized_min = max(0, 0.5 * (1 - abs((self.midpoint - self.vmin) / (self.midpoint - self.vmax))))
-        normalized_max = min(1, 0.5 * (1 + abs((self.vmax - self.midpoint) / (self.midpoint - self.vmin))))
-        normalized_mid = 0.5
-        x = [self.vmin, self.midpoint, self.vmax]
-        y = [normalized_min, normalized_mid, normalized_max]
-        return np.ma.masked_array(np.interp(value, x, y))
+  def __call__(self, value, clip=None):
+    normalized_min = max(0,
+        0.5 * (1 - abs((self.midpoint - self.vmin) / (self.midpoint - self.vmax))))
+    
+    normalized_max = min(1,
+        0.5 * (1 + abs((self.vmax - self.midpoint) / (self.midpoint - self.vmin))))
+    
+    normalized_mid = 0.5
+    
+    x = [self.vmin, self.midpoint, self.vmax]
+    y = [normalized_min, normalized_mid, normalized_max]
+    return np.ma.masked_array(np.interp(value, x, y))
 
 
 
 class AnalysisIntern:
-    def __init__(self, cluster_df_list, var_type="cont"):
-        self.var_type = var_type
-        self.cluster_df_list  = cluster_df_list
+  def __init__(self, cluster_df_list, var_type="cont"):
+    self.var_type = var_type
+    self.cluster_df_list  = cluster_df_list
 
 
-    def p_val(self, clustera, clusterb, feature):
-        if self.var_type == 'cont':
-            return stats.ttest_ind(
-                np.array(clustera[feature])
-                , np.array(clusterb[feature])
-                ).pvalue
-        else:
-            return ranksums(
-                np.array(clustera[feature])
-                , np.array(clusterb[feature])
-                ).pvalue
+  def p_val(self, clustera, clusterb, feature):
+    if self.var_type == 'cont':
+      return stats.ttest_ind(
+        np.array(clustera[feature]) , np.array(clusterb[feature])).pvalue
+    else:
+      return ranksums(
+        np.array(clustera[feature]) , np.array(clusterb[feature])).pvalue
 
 
-    def feature_p_val(self, feature):
-        return np.array([
-            np.array([self.p_val(ci, cj, feature) for cj in self.cluster_df_list])
-            for ci in self.cluster_df_list
-            ])
+  def feature_p_val(self, feature):
+    return np.array([
+      np.array([self.p_val(ci, cj, feature) for cj in self.cluster_df_list])
+      for ci in self.cluster_df_list
+      ])
 
 
-    def p_map(self, feature):
-        heatmap, ax = plt.subplots(figsize=(8, 8), dpi=600)
-        norm = MidpointNormalize(vmin=0, vmax=1, midpoint=0.5)
+  def p_map(self, feature):
+    heatmap, ax = plt.subplots(figsize=(8, 8), dpi=600)
+    norm = MidpointNormalize(vmin=0, vmax=1, midpoint=0.5)
 
-        p_values = self.feature_p_val(feature)
+    p_values = self.feature_p_val(feature)
 
-        im = ax.imshow(p_values , cmap='coolwarm' , norm=norm)
-        ax.set_xticklabels(['','C1','C2','C3','C4','C5','C6','C7','C8','C9'])
-        ax.set_yticklabels(['','C1','C2','C3','C4','C5','C6','C7','C8','C9'])
+    im = ax.imshow(p_values , cmap='coolwarm' , norm=norm)
+    ax.set_xticklabels(['','C1','C2','C3','C4','C5','C6','C7','C8','C9'])
+    ax.set_yticklabels(['','C1','C2','C3','C4','C5','C6','C7','C8','C9'])
 
-        num_clusters = len(self.cluster_df_list)
-        for y in range(num_clusters):
-            for x in range(num_clusters):
-                plt.text(x , y
-                    , '%.2f' % p_values[y, x]
-                    , horizontalalignment='center'
-                    , verticalalignment='center'
-                    , fontsize=8
-                    )
+    num_clusters = len(self.cluster_df_list)
+    for y in range(num_clusters):
+      for x in range(num_clusters):
+        plt.text(x , y
+          , '%.2f' % p_values[y, x]
+          , horizontalalignment='center'
+          , verticalalignment='center'
+          , fontsize=8
+          )
 
-        cbar = heatmap.colorbar(im)
-        cbar.ax.set_ylabel('p-value')
-        plt.title(feature.upper(), fontsize=16)
-        print('\n')
-        plt.show()
+    cbar = heatmap.colorbar(im)
+    cbar.ax.set_ylabel('p-value')
+    plt.title(feature.upper(), fontsize=16)
+    print('\n')
+    plt.show()
 
 
 
 
 class Analysis:
-    def __init__(self
-                , cont_features, ord_features, nom_features
-                , clusters_dbscan_FDC, values_dbscan_FDC):
-        self.cont_features = cont_features
-        self.ord_features = ord_features
-        self.nom_features = nom_features
-        self.clusters_dbscan_FDC = clusters_dbscan_FDC
-        self.values_dbscan_FDC = values_dbscan_FDC
+  def __init__(self
+               , cont_features, ord_features, nom_features
+               , clusters_dbscan_FDC, values_dbscan_FDC):
+    self.cont_features = cont_features
+    self.ord_features = ord_features
+    self.nom_features = nom_features
+    self.clusters_dbscan_FDC = clusters_dbscan_FDC
+    self.values_dbscan_FDC = values_dbscan_FDC
 
+    self.cluster_df_list = []
 
-    def run(self, data):
-        data['Clusters'] = np.array(self.clusters_dbscan_FDC)
 
-        cluster_df_list=[]
-        for cluster in self.values_dbscan_FDC:
-            cluster_df = data.loc[data['Clusters'] == cluster].drop(columns=['Clusters'])
-            cluster_df.columns = list(data.columns)[:-1]
-            cluster_df_list.append(cluster_df)
+  def run(self, data):
+    data['Clusters'] = np.array(self.clusters_dbscan_FDC)
 
-        cluster_df_list = cluster_df_list[1:]
+    cluster_df_list=[]
+    for cluster in self.values_dbscan_FDC:
+      cluster_df = data.loc[data['Clusters'] == cluster].drop(columns=['Clusters'])
+      cluster_df.columns = list(data.columns)[:-1]
+      cluster_df_list.append(cluster_df)
 
-        a = AnalysisIntern(cluster_df_list, "cont")
-        for feature in self.cont_features:
-            a.p_map(feature)
+    self.cluster_df_list = cluster_df_list[1:]
 
-        a = AnalysisIntern(cluster_df_list, "ord")
-        for feature in self.ord_features:
-            a.p_map(feature)
+    a = AnalysisIntern(self.cluster_df_list, "cont")
+    for feature in self.cont_features:
+      a.p_map(feature)
 
-        for feature in self.nom_features:
-            a.p_map(feature)
+    a = AnalysisIntern(cluster_df_list, "ord")
+    for feature in self.ord_features:
+      a.p_map(feature)
+
+    for feature in self.nom_features:
+      a.p_map(feature)
 

Kaikkia tiedostoja ei voida näyttää, sillä liian monta tiedostoa muuttui tässä diffissä