Jelajahi Sumber

Added time measurement

Kristian Schultz 3 tahun lalu
induk
melakukan
081194e358

File diff ditekan karena terlalu besar
+ 479 - 451
Is your data fit for decision making using Machine Learning version 4.ipynb


+ 15 - 0
fdc/fdc.py

@@ -3,6 +3,8 @@ import numpy as np
 from numba import jit
 import umap.umap_ as umap
 
+from fdc.tools import Timing
+
 # --[ Known to be used but can we avoid it? ]----
 import pandas as pd
 from fdc.visualize import plotMapping
@@ -107,6 +109,8 @@ class FDC:
                   visual=None
                   ):
 
+        timing = Timing("FDC.normalize")
+
         # Take instance value if parameter was not given.
         visual = value(visual, self.visual)
         with_2d_embedding = value(with_2d_embedding, self.with_2d_embedding)
@@ -115,6 +119,8 @@ class FDC:
         np.random.seed(42)
         concat_column_names = []
         concat_lists = []
+        
+        timing.step("init")
 
         # Reducing features into 2dim or 1dim
         actions = [
@@ -129,12 +135,14 @@ class FDC:
                 concat_lists.append(emb)
                 for n in range(emb.shape[1]):
                     concat_column_names.append(f"{name}_UMAP_{n}")
+            timing.step(f"clustering {name}")
 
         # Merge results
         if concat_lists == []:
             raise ValueError("Expected at least one non empty column list.") 
 
         result_concat = np.concatenate(concat_lists, axis=1)
+        timing.step("concat")
 
         # Create 2d embedding from 5d embedding
         if with_2d_embedding or visual:
@@ -145,10 +153,13 @@ class FDC:
                 , metric='euclidean'
                 , random_state=42
                 ).fit_transform(result_concat)
+
+            timing.step("umap 5 -> 2")
         
             if self.use_pandas_output:
                 result_reduced = pd.DataFrame(
                     data=result_reduced, columns=['UMAP_0', 'UMAP_1'])
+                timing.step("array -> DataFrame")
 
             # Show mapping if needed
             if visual:
@@ -157,11 +168,15 @@ class FDC:
                 else:
                     plotMapping(pd.DataFrame(
                         data=result_reduced, columns=['UMAP_0', 'UMAP_1']))
+                timing.step("plotting")
 
         # Transform to pandas DataFrame if needed.
         if self.use_pandas_output:
             result_concat = pd.DataFrame(
                 data=result_concat, columns=concat_column_names)
+            timing.step("array -> DataFrame")
+
+        timing.step("total")
 
         if with_2d_embedding:
             #returns both 5D and 2D embeddings

+ 12 - 0
fdc/missingValues.py

@@ -58,25 +58,37 @@ def create_distance_matrix(dense_data):
 
 
 def fix_missing_values(data, limit=4):  # Keep rows complete in the "dense" columns, then fill NaNs in the other columns.
+    timing = tools.Timing("fix_missing_values")  # NOTE(review): the import of `tools` is not visible in this hunk; confirm it exists.
+
     dense_data_pool = list(data.isna().sum().index[data.isna().sum() < limit])  # Names of columns with fewer than `limit` missing values.
+    timing.step("dense_data_pool")
+
     dense_data = data[dense_data_pool].dropna()  # Dense columns only, restricted to rows with no NaN in any of them.
+    timing.step("dense_data")
+
     data = data.loc[np.array(dense_data.index)]  # Keep all columns, but only the rows that survived the dropna above.
+    timing.step("data.loc")
     
     distance_matrix = create_distance_matrix(dense_data)  # Built from the dense part only.
+    timing.step("distance_matrix")
 
     missing_value_list = [ x  # Every column that is not in the dense pool; these are the ones to impute.
         for x in list(data.columns)
         if x not in dense_data_pool
         ]
+    timing.step("missing_value_list")
 
     total_impute = create_total_impute(  # Indexed below as [column position][n-th missing row of that column].
         data, distance_matrix, missing_value_list)
+    timing.step("total_impute")
 
     for f, value in enumerate(missing_value_list):  # `value` is a column name, `f` its position in missing_value_list.
         missing_value_indices = data[data[value].isnull()].index.tolist()  # Row labels where this column is null.
         for i, value_index in enumerate(missing_value_indices):
             data.at[value_index, value] = total_impute[f][i]  # Write the i-th imputed value for this column into its i-th null cell.
 
+    timing.step("update data")
+
     return data  # The row-filtered frame with the imputed values written in.
 
 

+ 21 - 0
fdc/tools.py

@@ -1,3 +1,5 @@
+import time
+
 
 def count(testFn, items):
     s = 0
@@ -6,3 +8,22 @@ def count(testFn, items):
             s += 1
     return s
 
+
+
+class Timing:  # Prints per-step and cumulative durations under a common label.
+    def __init__(self, name="Duration"):  # name: label that prefixes every printed line.
+        self.name = name
+        self.tStart = time.process_time()  # Reference point for the cumulative duration. NOTE(review): process_time() is process CPU time and excludes sleep; confirm wall-clock (time.perf_counter) is not what is intended.
+        self.tStepStart = self.tStart  # Reference point for the current step; advanced by each step() call.
+
+    def step(self, message=""):  # Print "<step> / <total>s" and return the total since construction.
+        now = time.process_time()
+        duration = now - self.tStart  # Time since this Timing object was created.
+        durationStep = now - self.tStepStart  # Time since the previous step() call, or since creation on the first call.
+        self.tStepStart = now  # The next step is measured from here.
+
+        if message == "":  # Without a message only the label is printed.
+            print(f"{self.name}: {durationStep:0.5f} / {duration:0.3f}s")
+        else:  # With a message it is appended to the label in parentheses.
+            print(f"{self.name} ({message}): {durationStep:0.5f} / {duration:0.3f}s")
+        return duration  # Cumulative duration, not the step duration.

Beberapa file tidak ditampilkan karena terlalu banyak file yang berubah dalam diff ini