Преглед на файлове

Calculated set similarity.

Kristian Schultz преди 3 години
родител
ревизия
7b6971f93d
променени са 1 файла, в които са добавени 149 реда и са изтрити 0 реда
  1. 149 0
      calcSimilarity.py

+ 149 - 0
calcSimilarity.py

@@ -0,0 +1,149 @@
+import json
+import math
+from library.analysis import testSets, generators
+
+
def loadDataset(name):
    """Load a JSON document from disk.

    Args:
        name: Path of the JSON file to read.

    Returns:
        The decoded JSON content, exactly as produced by ``json.load``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # JSON text is UTF-8 encoded; do not depend on the locale's default.
    with open(name, encoding="utf-8") as f:
        return json.load(f)
+
+
+
def dist(x, y):
    """Return the Euclidean distance between the points ``x`` and ``y``.

    Coordinates are paired with ``zip``, so if the two sequences differ in
    length the surplus coordinates of the longer one are ignored.
    """
    squaredSum = 0.0
    for left, right in zip(x, y):
        delta = left - right
        squaredSum += delta * delta
    return math.sqrt(squaredSum)
+
def distSet(s1, s2, compareSame=False):
    """Return the largest nearest-neighbour distance from ``s1`` to ``s2``.

    For every point of ``s1`` the distance to its closest point in ``s2`` is
    determined; the maximum of these values is returned.

    Args:
        s1: Iterable of points for which the nearest neighbour is searched.
        s2: Collection of candidate neighbours (iterated once per point of
            ``s1``).
        compareSame: If true, distances that are not greater than zero are
            skipped, so a point is never matched with an identical point.

    Returns:
        The maximum nearest-neighbour distance, or ``None`` when no distance
        was taken into account at all.
    """
    farthest = None
    for point in s1:
        nearest = min(
            (
                d
                for d in (dist(point, other) for other in s2)
                if not compareSame or d > 0
            ),
            default=None,
        )
        if nearest is None:
            # No admissible neighbour for this point; it does not contribute.
            continue
        if farthest is None or nearest > farthest:
            farthest = nearest
    return farthest
+
+
def calcDistancesOfSlice(path, stepNr, sliceNr):
    """Compute the set distances for one step/slice result file.

    The file ``{path}Step{stepNr}_Slice{sliceNr}.json`` is loaded and its
    ``minority``, ``majority`` and ``synthetic`` point lists are compared
    against the minority points.

    Args:
        path: Prefix of the file name (it is concatenated as is, so a
            directory must end with a path separator).
        stepNr: Step number used in the file name.
        sliceNr: Slice number used in the file name.

    Returns:
        A tuple ``(d_min, d_maj, d_syn)`` where ``d_min`` is the distance of
        the minority set to itself (ignoring zero distances) and ``d_maj`` /
        ``d_syn`` are the minority-to-majority / minority-to-synthetic
        distances divided by ``d_min``.

    Raises:
        ValueError: If one of the distances cannot be computed because the
            corresponding point set is empty or the minority set has no two
            distinct points.
    """
    fileName = f"{path}Step{stepNr}_Slice{sliceNr}.json"
    data = loadDataset(fileName)

    data_min = data['minority']
    data_maj = data['majority']
    data_syn = data['synthetic']
    # Drop the reference to the loaded document; only the three lists are
    # needed from here on.
    data = None

    d_min = distSet(data_min, data_min, True)
    if d_min is None:
        # distSet found no pair of minority points with a distance > 0, so
        # there is nothing to normalise with.
        raise ValueError(
            f"{fileName}: minority set needs at least two distinct points"
        )

    d_maj = distSet(data_min, data_maj)
    if d_maj is None:
        raise ValueError(f"{fileName}: majority set is empty")

    d_syn = distSet(data_min, data_syn)
    if d_syn is None:
        raise ValueError(f"{fileName}: synthetic set is empty")

    return (d_min, d_maj / d_min, d_syn / d_min)
+
+
class Stat:
    """Running minimum, maximum and mean of the values passed to ``add``."""

    def __init__(self, name="?"):
        """Create an empty accumulator labelled ``name``."""
        self.mi = None   # smallest value seen so far, None while empty
        self.mx = None   # largest value seen so far, None while empty
        self.s = 0.0     # sum of all added values
        self.n = 0.0     # number of added values (stored as a float)
        self.name = name

    def add(self, x):
        """Include the value ``x`` in the statistic."""
        if self.n == 0.0:
            # First value: it is minimum, maximum and sum at the same time.
            self.mi = x
            self.mx = x
            self.s = x
            self.n = 1.0
        else:
            self.mi = min(self.mi, x)
            self.mx = max(self.mx, x)
            self.s += x
            self.n += 1.0

    def _avg(self):
        """Return the mean of the added values, or None if there are none."""
        if self.n == 0.0:
            return None
        return self.s / self.n

    def __str__(self):
        """Format as ``name [min .. avg .. max]``; ``name [n/a]`` if empty."""
        if self.n == 0.0:
            # Nothing was added yet: min/max are None and there is no mean.
            return f"{self.name} [n/a]"
        return f"{self.name} [{self.mi:.3f} .. {self.s / self.n:.3f} .. {self.mx:.3f}]"

    def value(self):
        """Return the statistic as a dict with keys ``min``, ``max``, ``avg``.

        All three entries are ``None`` while no value has been added.
        """
        return {
            "min": self.mi,
            "max": self.mx,
            "avg": self._avg()
            }
+
+
+
class StatTriple:
    """Three ``Stat`` accumulators: minority, majority and synthetic."""

    def __init__(self, title=""):
        """Create the three accumulators, each named ``title`` plus a suffix."""
        self.s_min, self.s_maj, self.s_syn = (
            Stat(title + suffix)
            for suffix in (" minority ", " majority ", " synthetic")
        )

    def _members(self):
        """Return the accumulators in the order minority, majority, synthetic."""
        return (self.s_min, self.s_maj, self.s_syn)

    def add(self, d):
        """Add ``d[0]``, ``d[1]`` and ``d[2]`` to the respective accumulator."""
        for index, stat in enumerate(self._members()):
            stat.add(d[index])

    def print(self):
        """Print the three accumulators, one per line."""
        for stat in self._members():
            print(stat)

    def value(self):
        """Return the ``value()`` of each accumulator keyed by its role."""
        labels = ("minority", "majority_factor", "synthetic_factor")
        return {
            label: stat.value()
            for label, stat in zip(labels, self._members())
        }
+
+
def calcStatistic(path, title="", stepNrs=(1, 2, 3, 4, 5), sliceNrs=(1, 2, 3, 4, 5)):
    """Collect the distance statistics over all step/slice files of a run.

    For every combination of step and slice number the distances of the
    corresponding file are computed with ``calcDistancesOfSlice`` and added
    to a ``StatTriple``. The resulting statistics are printed.

    Args:
        path: File name prefix handed to ``calcDistancesOfSlice``.
        title: Label used as prefix for the printed statistic names.
        stepNrs: Step numbers to evaluate (default: 1 to 5).
        sliceNrs: Slice numbers to evaluate for each step (default: 1 to 5).

    Returns:
        The dict produced by ``StatTriple.value()``.
    """
    s_triple = StatTriple(title)

    for stepNr in stepNrs:
        for sliceNr in sliceNrs:
            s_triple.add(calcDistancesOfSlice(path, stepNr, sliceNr))

    s_triple.print()
    return s_triple.value()
+
# Results per generator and per test set; every generator additionally gets
# an "Average" entry aggregated over the per-test-set averages.
statistic = {}

for generatorName in generators.keys():
    perSet = {}
    statistic[generatorName] = perSet
    sAverage = StatTriple("Average")
    print(f"--------[ {generatorName} ]--------")
    for setName in testSets:
        setResult = calcStatistic(f"data_result/{generatorName}/{setName}/", setName)
        perSet[setName] = setResult
        # Feed the averages of this test set into the generator-wide summary.
        sAverage.add(tuple(
            setResult[key]["avg"]
            for key in ("minority", "majority_factor", "synthetic_factor")
        ))

    sAverage.print()
    perSet["Average"] = sAverage.value()
    print()


print("--------[ JSON ]--------")

# Persist the complete result table next to the input data.
with open("data_result/similarity.json", "w") as f:
    f.write(json.dumps(statistic))

print("--------[ summary ]--------")

# One line per generator: its mean synthetic factor over all test sets.
for generatorName in generators.keys():
    summaryValue = statistic[generatorName]['Average']['synthetic_factor']['avg']
    print(f"{generatorName:32s}: {summaryValue}")


print("--------[ done ]--------")