# hypothesisTesting.py
  1. from scipy.stats import ranksums
  2. import matplotlib as mpl
  3. from matplotlib import pyplot as plt
  4. from scipy import stats
  5. import numpy as np
  6. class MidpointNormalize(mpl.colors.Normalize):
  7. def __init__(self, vmin, vmax, midpoint=0, clip=False):
  8. self.midpoint = midpoint
  9. mpl.colors.Normalize.__init__(self, vmin, vmax, clip)
  10. def __call__(self, value, clip=None):
  11. normalized_min = max(0,
  12. 0.5 * (1 - abs((self.midpoint - self.vmin) / (self.midpoint - self.vmax))))
  13. normalized_max = min(1,
  14. 0.5 * (1 + abs((self.vmax - self.midpoint) / (self.midpoint - self.vmin))))
  15. normalized_mid = 0.5
  16. x = [self.vmin, self.midpoint, self.vmax]
  17. y = [normalized_min, normalized_mid, normalized_max]
  18. return np.ma.masked_array(np.interp(value, x, y))
  19. class AnalysisIntern:
  20. def __init__(self, cluster_df_list, var_type="cont"):
  21. self.var_type = var_type
  22. self.cluster_df_list = cluster_df_list
  23. def p_val(self, clustera, clusterb, feature):
  24. if self.var_type == 'cont':
  25. return stats.ttest_ind(
  26. np.array(clustera[feature]) , np.array(clusterb[feature])).pvalue
  27. else:
  28. return ranksums(
  29. np.array(clustera[feature]) , np.array(clusterb[feature])).pvalue
  30. def feature_p_val(self, feature):
  31. return np.array([
  32. np.array([self.p_val(ci, cj, feature) for cj in self.cluster_df_list])
  33. for ci in self.cluster_df_list
  34. ])
  35. def p_map(self, feature):
  36. heatmap, ax = plt.subplots(figsize=(8, 8), dpi=600)
  37. norm = MidpointNormalize(vmin=0, vmax=1, midpoint=0.5)
  38. p_values = self.feature_p_val(feature)
  39. im = ax.imshow(p_values , cmap='coolwarm' , norm=norm)
  40. ax.set_xticklabels(['','C1','C2','C3','C4','C5','C6','C7','C8','C9'])
  41. ax.set_yticklabels(['','C1','C2','C3','C4','C5','C6','C7','C8','C9'])
  42. num_clusters = len(self.cluster_df_list)
  43. for y in range(num_clusters):
  44. for x in range(num_clusters):
  45. plt.text(x , y
  46. , '%.2f' % p_values[y, x]
  47. , horizontalalignment='center'
  48. , verticalalignment='center'
  49. , fontsize=8
  50. )
  51. cbar = heatmap.colorbar(im)
  52. cbar.ax.set_ylabel('p-value')
  53. plt.title(feature.upper(), fontsize=16)
  54. print('\n')
  55. plt.show()
  56. class Analysis:
  57. def __init__(self
  58. , cont_features, ord_features, nom_features
  59. , clusters_dbscan_FDC, values_dbscan_FDC):
  60. self.cont_features = cont_features
  61. self.ord_features = ord_features
  62. self.nom_features = nom_features
  63. self.clusters_dbscan_FDC = clusters_dbscan_FDC
  64. self.values_dbscan_FDC = values_dbscan_FDC
  65. self.cluster_df_list = []
  66. def run(self, data):
  67. data['Clusters'] = np.array(self.clusters_dbscan_FDC)
  68. cluster_df_list=[]
  69. for cluster in self.values_dbscan_FDC:
  70. cluster_df = data.loc[data['Clusters'] == cluster].drop(columns=['Clusters'])
  71. cluster_df.columns = list(data.columns)[:-1]
  72. cluster_df_list.append(cluster_df)
  73. self.cluster_df_list = cluster_df_list[1:]
  74. a = AnalysisIntern(self.cluster_df_list, "cont")
  75. for feature in self.cont_features:
  76. a.p_map(feature)
  77. a = AnalysisIntern(cluster_df_list, "ord")
  78. for feature in self.ord_features:
  79. a.p_map(feature)
  80. for feature in self.nom_features:
  81. a.p_map(feature)