# hypothesisTesting.py
#
# Pairwise hypothesis testing between DBSCAN clusters: computes per-feature
# p-value matrices (t-test for continuous features, rank-sum for ordinal or
# nominal features) and renders them as annotated heatmaps.
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
from scipy.stats import ranksums
  4. class MidpointNormalize(mpl.colors.Normalize):
  5. def __init__(self, vmin, vmax, midpoint=0, clip=False):
  6. self.midpoint = midpoint
  7. mpl.colors.Normalize.__init__(self, vmin, vmax, clip)
  8. def __call__(self, value, clip=None):
  9. normalized_min = max(0, 0.5 * (1 - abs((self.midpoint - self.vmin) / (self.midpoint - self.vmax))))
  10. normalized_max = min(1, 0.5 * (1 + abs((self.vmax - self.midpoint) / (self.midpoint - self.vmin))))
  11. normalized_mid = 0.5
  12. x = [self.vmin, self.midpoint, self.vmax]
  13. y = [normalized_min, normalized_mid, normalized_max]
  14. return np.ma.masked_array(np.interp(value, x, y))
  15. class AnalysisIntern:
  16. def __init__(self, cluster_df_list, var_type="cont"):
  17. self.var_type = var_type
  18. self.cluster_df_list = cluster_df_list
  19. def p_val(self, clustera, clusterb, feature):
  20. if self.var_type == 'cont':
  21. return stats.ttest_ind(
  22. np.array(clustera[feature])
  23. , np.array(clusterb[feature])
  24. ).pvalue
  25. else:
  26. return ranksums(
  27. np.array(clustera[feature])
  28. , np.array(clusterb[feature])
  29. ).pvalue
  30. def feature_p_val(self, feature):
  31. return np.array([
  32. np.array([self.p_val(ci, cj, feature) for cj in self.cluster_df_list])
  33. for ci in self.cluster_df_list
  34. ])
  35. def p_map(self, feature):
  36. heatmap, ax = plt.subplots(figsize=(8, 8), dpi=600)
  37. norm = MidpointNormalize(vmin=0, vmax=1, midpoint=0.5)
  38. p_values = self.feature_p_val(feature)
  39. im = ax.imshow(p_values , cmap='coolwarm' , norm=norm)
  40. ax.set_xticklabels(['','C1','C2','C3','C4','C5','C6','C7','C8','C9'])
  41. ax.set_yticklabels(['','C1','C2','C3','C4','C5','C6','C7','C8','C9'])
  42. num_clusters = len(self.cluster_df_list)
  43. for y in range(num_clusters):
  44. for x in range(num_clusters):
  45. plt.text(x , y
  46. , '%.2f' % p_values[y, x]
  47. , horizontalalignment='center'
  48. , verticalalignment='center'
  49. , fontsize=8
  50. )
  51. cbar = heatmap.colorbar(im)
  52. cbar.ax.set_ylabel('p-value')
  53. plt.title(feature.upper(), fontsize=16)
  54. print('\n')
  55. plt.show()
  56. class Analysis:
  57. def __init__(self
  58. , cont_features, ord_features, nom_features
  59. , clusters_dbscan_FDC, values_dbscan_FDC):
  60. self.cont_features = cont_features
  61. self.ord_features = ord_features
  62. self.nom_features = nom_features
  63. self.clusters_dbscan_FDC = clusters_dbscan_FDC
  64. self.values_dbscan_FDC = values_dbscan_FDC
  65. def run(self, data):
  66. data['Clusters'] = np.array(self.clusters_dbscan_FDC)
  67. cluster_df_list=[]
  68. for cluster in self.values_dbscan_FDC:
  69. cluster_df = data.loc[data['Clusters'] == cluster].drop(columns=['Clusters'])
  70. cluster_df.columns = list(data.columns)[:-1]
  71. cluster_df_list.append(cluster_df)
  72. cluster_df_list = cluster_df_list[1:]
  73. a = AnalysisIntern(cluster_df_list, "cont")
  74. for feature in self.cont_features:
  75. a.p_map(feature)
  76. a = AnalysisIntern(cluster_df_list, "ord")
  77. for feature in self.ord_features:
  78. a.p_map(feature)
  79. for feature in self.nom_features:
  80. a.p_map(feature)