| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116 |
- from scipy.stats import ranksums
- import matplotlib as mpl
- from matplotlib import pyplot as plt
- from scipy import stats
- import numpy as np
class MidpointNormalize(mpl.colors.Normalize):
    """Normalisation that maps ``midpoint`` to 0.5 on the colour scale.

    Values are mapped by piecewise-linear interpolation through three
    anchor points: ``vmin``, ``midpoint`` and ``vmax``. The outer anchors
    are scaled by how far each bound lies from the midpoint and are
    clamped to the [0, 1] interval.

    Parameters
    ----------
    vmin, vmax :
        Lower and upper data bounds, forwarded to the base ``Normalize``.
    midpoint : optional
        Data value that is mapped to 0.5 (default 0). It must differ from
        both ``vmin`` and ``vmax`` because each difference is used as a
        divisor in ``__call__``.
    clip : bool, optional
        Forwarded to the base ``Normalize`` constructor.
    """

    def __init__(self, vmin, vmax, midpoint=0, clip=False):
        # Store the midpoint first, then let the base class record the bounds.
        self.midpoint = midpoint
        super().__init__(vmin, vmax, clip)

    def __call__(self, value, clip=None):
        """Return ``value`` mapped onto the normalised scale as a masked array.

        The ``clip`` argument is accepted for interface compatibility but
        is not consulted here.
        """
        lo, mid, hi = self.vmin, self.midpoint, self.vmax

        # Lower anchor: shrinks towards 0 as the span below the midpoint
        # grows relative to the span above it; never goes below 0.
        below_ratio = abs((mid - lo) / (mid - hi))
        low_anchor = max(0, 0.5 * (1 - below_ratio))

        # Upper anchor: grows towards 1 as the span above the midpoint
        # grows relative to the span below it; never goes above 1.
        above_ratio = abs((hi - mid) / (mid - lo))
        high_anchor = min(1, 0.5 * (1 + above_ratio))

        mapped = np.interp(value, [lo, mid, hi], [low_anchor, 0.5, high_anchor])
        return np.ma.masked_array(mapped)
class AnalysisIntern:
    """Pairwise significance tests of a single feature across clusters.

    Parameters
    ----------
    cluster_df_list : sequence
        One table per cluster. Each entry is indexed by feature name
        (``cluster[feature]``) to obtain that cluster's sample values.
    var_type : str, optional
        ``"cont"`` selects the independent two-sample t-test
        (``scipy.stats.ttest_ind``); any other value selects the Wilcoxon
        rank-sum test (``scipy.stats.ranksums``).
    """

    def __init__(self, cluster_df_list, var_type="cont"):
        self.var_type = var_type
        self.cluster_df_list = cluster_df_list

    def p_val(self, clustera, clusterb, feature):
        """Return the p-value comparing ``feature`` between two clusters."""
        sample_a = np.array(clustera[feature])
        sample_b = np.array(clusterb[feature])
        if self.var_type == 'cont':
            return stats.ttest_ind(sample_a, sample_b).pvalue
        return ranksums(sample_a, sample_b).pvalue

    def feature_p_val(self, feature):
        """Return the matrix of pairwise p-values for ``feature``.

        Entry ``[i, j]`` compares cluster ``i`` with cluster ``j``, in the
        order of ``self.cluster_df_list``.
        """
        return np.array([
            [self.p_val(ci, cj, feature) for cj in self.cluster_df_list]
            for ci in self.cluster_df_list
        ])

    def p_map(self, feature):
        """Draw and show an annotated heatmap of pairwise p-values.

        Rows and columns are labelled ``C1 .. Cn`` in the order of
        ``self.cluster_df_list``; each cell is annotated with its p-value
        rounded to two decimals.
        """
        # Compute the statistics before creating a figure so that a failure
        # in the tests does not leave an empty figure behind.
        p_values = self.feature_p_val(feature)
        num_clusters = len(self.cluster_df_list)
        labels = ['C%d' % (idx + 1) for idx in range(num_clusters)]

        heatmap, ax = plt.subplots(figsize=(8, 8), dpi=600)
        norm = MidpointNormalize(vmin=0, vmax=1, midpoint=0.5)
        im = ax.imshow(p_values, cmap='coolwarm', norm=norm)

        # Pin one tick per cell before labelling. Setting tick labels
        # without fixing tick positions leaves the label/cell alignment to
        # the automatic locator, and a fixed nine-label list is only
        # correct for exactly nine clusters.
        positions = np.arange(num_clusters)
        ax.set_xticks(positions)
        ax.set_xticklabels(labels)
        ax.set_yticks(positions)
        ax.set_yticklabels(labels)

        for y in range(num_clusters):
            for x in range(num_clusters):
                ax.text(
                    x, y, '%.2f' % p_values[y, x],
                    horizontalalignment='center',
                    verticalalignment='center',
                    fontsize=8,
                )

        cbar = heatmap.colorbar(im)
        cbar.ax.set_ylabel('p-value')
        ax.set_title(feature.upper(), fontsize=16)
        print('\n')
        plt.show()
class Analysis:
    """Split a data set by cluster label and plot per-feature p-value maps.

    Parameters
    ----------
    cont_features, ord_features, nom_features :
        Iterables of feature (column) names. Continuous features are
        compared with the ``"cont"`` test of ``AnalysisIntern``; ordinal
        and nominal features with the ``"ord"`` test.
    clusters_dbscan_FDC :
        Cluster label for every row of the data passed to ``run``.
    values_dbscan_FDC :
        The distinct cluster labels to split on. The first entry is
        excluded from the analysis (presumably the DBSCAN noise label;
        verify against the caller).
    """

    def __init__(self,
                 cont_features, ord_features, nom_features,
                 clusters_dbscan_FDC, values_dbscan_FDC):
        self.cont_features = cont_features
        self.ord_features = ord_features
        self.nom_features = nom_features
        self.clusters_dbscan_FDC = clusters_dbscan_FDC
        self.values_dbscan_FDC = values_dbscan_FDC
        self.cluster_df_list = []

    def run(self, data):
        """Plot a pairwise p-value heatmap for every configured feature.

        Note: ``data`` is modified in place; a ``'Clusters'`` column
        holding the cluster labels is added (or overwritten).
        """
        data['Clusters'] = np.array(self.clusters_dbscan_FDC)

        # One frame per cluster label, without the helper column. ``drop``
        # already leaves the remaining columns correctly named, so no
        # positional re-labelling is done (it would mislabel columns when
        # 'Clusters' is not the last column of ``data``).
        cluster_df_list = [
            data.loc[data['Clusters'] == cluster].drop(columns=['Clusters'])
            for cluster in self.values_dbscan_FDC
        ]

        # Skip the first label's frame for every analysis below.
        self.cluster_df_list = cluster_df_list[1:]

        continuous = AnalysisIntern(self.cluster_df_list, "cont")
        for feature in self.cont_features:
            continuous.p_map(feature)

        # Use the same cluster list as the continuous analysis so that all
        # heatmaps compare the same set of clusters.
        ranked = AnalysisIntern(self.cluster_df_list, "ord")
        for feature in self.ord_features:
            ranked.p_map(feature)
        for feature in self.nom_features:
            ranked.p_map(feature)
|