Jelajahi Sumber

Speed up for distance matrix calculation.

Kristian Schultz 3 tahun lalu
induk
melakukan
93664357ea

File diff ditekan karena terlalu besar
+ 524 - 524
Is your data fit for decision making using Machine Learning version 4.ipynb


+ 34 - 1
fdc/missingValues.py

@@ -44,7 +44,7 @@ def create_total_impute(data, distance_matrix, missing_value_list):
         ]
         ]
 
 
 
 
-def create_distance_matrix(dense_data):
+def create_distance_matrix_old(dense_data):
     dense_data_index = np.array(dense_data.index)
     dense_data_index = np.array(dense_data.index)
     dense_data = np.array(dense_data)
     dense_data = np.array(dense_data)
 
 
@@ -56,6 +56,39 @@ def create_distance_matrix(dense_data):
         ])
         ])
 
 
 
 
def create_distance_matrix(dense_data):
    """Return, for every row, the index labels of all rows ordered by distance.

    The Euclidean distance between every pair of rows is computed once and
    mirrored, since the distance matrix is symmetric and its diagonal is zero.

    Args:
        dense_data: Object exposing ``.index`` and convertible with
            ``np.array`` (presumably a pandas DataFrame of numeric rows
            without missing values -- confirm against the caller).

    Returns:
        ``np.ndarray`` whose row ``n`` holds the labels from
        ``dense_data.index`` sorted by ascending Euclidean distance to row
        ``n``. The order of equally distant rows is whatever ``np.argsort``
        yields, so with duplicate rows a row's own label is not guaranteed
        to come first.
    """
    dense_data_index = np.array(dense_data.index)
    dense_data = np.array(dense_data)
    size = len(dense_data)

    # The diagonal stays 0.0: a row has no distance to itself.
    distances = np.zeros((size, size), dtype=float)

    # Distance evaluation is the expensive part, so visit each unordered pair
    # exactly once (n * (n - 1) / 2 evaluations) and mirror the value.
    for nx, x in enumerate(dense_data):
        for ny, y in enumerate(dense_data[nx + 1:], start=nx + 1):
            pair_distance = distance.euclidean(x, y)
            distances[nx, ny] = pair_distance
            distances[ny, nx] = pair_distance

    # Translate each row of distances into index labels, nearest first.
    return np.array([dense_data_index[np.argsort(row)] for row in distances])
+
+
+
 
 
 def fix_missing_values(data, limit=4):
 def fix_missing_values(data, limit=4):
     timing = tools.Timing("fix_missing_values")
     timing = tools.Timing("fix_missing_values")

Beberapa file tidak ditampilkan karena terlalu banyak file yang berubah dalam diff ini