|
|
@@ -1,111 +1,116 @@
|
|
|
from scipy.stats import ranksums
|
|
|
import matplotlib as mpl
|
|
|
+from matplotlib import pyplot as plt
|
|
|
from scipy import stats
|
|
|
+import numpy as np
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class MidpointNormalize(mpl.colors.Normalize):
    """Normalize that pins *midpoint* to the centre (0.5) of the colormap.

    Useful with diverging colormaps when a particular data value (here a
    p-value of 0.5) should coincide with the colormap's neutral colour,
    even when [vmin, vmax] is not symmetric around it.
    """

    def __init__(self, vmin, vmax, midpoint=0, clip=False):
        """
        Parameters
        ----------
        vmin, vmax : float
            Data range limits. Must differ from *midpoint*, otherwise
            __call__ divides by zero.
        midpoint : float, optional
            Data value mapped to the colormap centre (default 0).
        clip : bool, optional
            Forwarded to matplotlib's Normalize.
        """
        self.midpoint = midpoint
        mpl.colors.Normalize.__init__(self, vmin, vmax, clip)

    def __call__(self, value, clip=None):
        """Map *value* piecewise-linearly so that midpoint -> 0.5.

        vmin -> normalized_min and vmax -> normalized_max, scaled so the
        shorter side of the range does not stretch the colormap.
        """
        normalized_min = max(
            0, 0.5 * (1 - abs((self.midpoint - self.vmin) / (self.midpoint - self.vmax))))
        normalized_max = min(
            1, 0.5 * (1 + abs((self.vmax - self.midpoint) / (self.midpoint - self.vmin))))
        normalized_mid = 0.5
        x = [self.vmin, self.midpoint, self.vmax]
        y = [normalized_min, normalized_mid, normalized_max]
        return np.ma.masked_array(np.interp(value, x, y))
|
|
|
|
|
|
|
|
|
|
|
|
class AnalysisIntern:
    """Pairwise statistical comparison of one feature across clusters.

    Given a list of per-cluster DataFrames, computes the matrix of
    pairwise p-values for a single feature and renders it as an
    annotated heatmap.
    """

    def __init__(self, cluster_df_list, var_type="cont"):
        """
        Parameters
        ----------
        cluster_df_list : list
            One DataFrame (or dict-like of sequences) per cluster; each
            must expose the features being compared via ``obj[feature]``.
        var_type : {"cont", "ord"}, optional
            "cont" uses the independent two-sample t-test; anything else
            falls back to the Wilcoxon rank-sum test.
        """
        self.var_type = var_type
        self.cluster_df_list = cluster_df_list

    def p_val(self, clustera, clusterb, feature):
        """Return the p-value comparing *feature* between two clusters."""
        sample_a = np.array(clustera[feature])
        sample_b = np.array(clusterb[feature])
        if self.var_type == 'cont':
            return stats.ttest_ind(sample_a, sample_b).pvalue
        return ranksums(sample_a, sample_b).pvalue

    def feature_p_val(self, feature):
        """Return the n x n matrix of pairwise p-values for *feature*."""
        return np.array([
            np.array([self.p_val(ci, cj, feature) for cj in self.cluster_df_list])
            for ci in self.cluster_df_list
        ])

    def p_map(self, feature):
        """Plot an annotated heatmap of pairwise p-values for *feature*."""
        heatmap, ax = plt.subplots(figsize=(8, 8), dpi=600)
        norm = MidpointNormalize(vmin=0, vmax=1, midpoint=0.5)

        p_values = self.feature_p_val(feature)

        im = ax.imshow(p_values, cmap='coolwarm', norm=norm)

        # BUGFIX: labels were hard-coded to C1..C9 (with a leading '')
        # regardless of the actual cluster count, and set_xticklabels was
        # called without set_xticks. Derive both from the data instead.
        num_clusters = len(self.cluster_df_list)
        labels = ['C%d' % (i + 1) for i in range(num_clusters)]
        ax.set_xticks(range(num_clusters))
        ax.set_yticks(range(num_clusters))
        ax.set_xticklabels(labels)
        ax.set_yticklabels(labels)

        # Annotate every cell with its p-value (row y vs column x).
        for y in range(num_clusters):
            for x in range(num_clusters):
                plt.text(x, y, '%.2f' % p_values[y, x],
                         horizontalalignment='center',
                         verticalalignment='center',
                         fontsize=8)

        cbar = heatmap.colorbar(im)
        cbar.ax.set_ylabel('p-value')
        plt.title(feature.upper(), fontsize=16)
        print('\n')
        plt.show()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class Analysis:
    """Driver that splits data by cluster label and plots p-value maps.

    Runs AnalysisIntern over the continuous, ordinal and nominal feature
    lists for every cluster produced by DBSCAN on the FDC embedding.
    """

    def __init__(self, cont_features, ord_features, nom_features,
                 clusters_dbscan_FDC, values_dbscan_FDC):
        """
        Parameters
        ----------
        cont_features, ord_features, nom_features : list of str
            Column names grouped by variable type.
        clusters_dbscan_FDC : sequence
            Per-row cluster label (one entry per row of the data later
            passed to run()).
        values_dbscan_FDC : sequence
            The distinct cluster labels; the first entry is treated as the
            DBSCAN noise label and is dropped before analysis.
        """
        self.cont_features = cont_features
        self.ord_features = ord_features
        self.nom_features = nom_features
        self.clusters_dbscan_FDC = clusters_dbscan_FDC
        self.values_dbscan_FDC = values_dbscan_FDC
        # Filled by run(); kept on the instance so callers can inspect it.
        self.cluster_df_list = []

    def run(self, data):
        """Split *data* by cluster and plot p-value heatmaps per feature.

        NOTE: mutates *data* by adding a 'Clusters' column.
        """
        data['Clusters'] = np.array(self.clusters_dbscan_FDC)

        cluster_df_list = []
        for cluster in self.values_dbscan_FDC:
            cluster_df = data.loc[data['Clusters'] == cluster].drop(columns=['Clusters'])
            cluster_df.columns = list(data.columns)[:-1]
            cluster_df_list.append(cluster_df)

        # Drop the first group (presumably the DBSCAN noise label — TODO
        # confirm values_dbscan_FDC ordering) before the analysis.
        self.cluster_df_list = cluster_df_list[1:]

        a = AnalysisIntern(self.cluster_df_list, "cont")
        for feature in self.cont_features:
            a.p_map(feature)

        # BUGFIX: the "ord" pass previously received the UNFILTERED
        # cluster_df_list (noise cluster included) while the "cont" pass
        # used the filtered list; use the filtered list consistently.
        a = AnalysisIntern(self.cluster_df_list, "ord")
        for feature in self.ord_features:
            a.p_map(feature)

        # Nominal features reuse the "ord" (rank-sum) comparison.
        for feature in self.nom_features:
            a.p_map(feature)
|
|
|
|