Преглед изворни кода

Speed up for missing values part including infinite loop fix.

Kristian Schultz пре 3 година
родитељ
комит
7f04937c14

Разлика између датотека није приказана због своје велике величине
+ 5 - 0
Is your data fit for decision making using Machine Learning version 4.ipynb


+ 82 - 0
fdc/missingValues.py

@@ -0,0 +1,82 @@
+import math
+import numpy as np
+from scipy.spatial import distance
+import fdc.tools as tools
+
+
def create_total_impute(data, distance_matrix, missing_value_list,
                        n_neighbors=6):
    """Compute nearest-neighbour imputation values for incomplete columns.

    For every column in ``missing_value_list`` and every row of ``data``
    whose value in that column is missing, the row's neighbours are walked
    in the order given by ``distance_matrix`` and the first ``n_neighbors``
    non-missing values of that column are collected. The imputed value is
    the most frequent value when all collected values are whole numbers
    (ties resolve to the smallest value), otherwise their mean.

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing the columns to impute; its index labels must be
        the labels used in ``distance_matrix``.
    distance_matrix : numpy.ndarray
        2-D array of index labels. Column 0 of each row identifies the row
        the neighbour list belongs to; the remaining entries are the other
        labels ordered from nearest to farthest.
    missing_value_list : list
        Column labels of ``data`` to impute.
    n_neighbors : int, optional
        Maximum number of donor values collected per missing cell
        (default 6, the value that used to be hard-coded).

    Returns
    -------
    list of list of numpy.ndarray
        One inner list per column of ``missing_value_list``; each holds one
        0-d array per missing cell, in the order the missing rows appear in
        ``data``. A cell for which no donor value exists yields NaN.

    Raises
    ------
    IndexError
        If a row with a missing value has no entry in column 0 of
        ``distance_matrix``.
    """

    def create_value_list(column, index):
        # Locate the neighbour list that belongs to this row label.
        matches = np.where(distance_matrix[:, 0] == index)[0]
        if len(matches) == 0:
            raise IndexError(
                "index {!r} not found in first column of "
                "distance_matrix".format(index))

        value_list = []
        for neb_index in distance_matrix[matches[0]][1:]:
            if len(value_list) >= n_neighbors:
                break
            # Scalar access; avoids building a one-row frame per neighbour
            # and calling float() on a Series.
            value = float(column.at[neb_index])
            if not math.isnan(value):
                value_list.append(value)

        return np.array(value_list, dtype=float)

    def feature_impute_master(f):
        column = data[f]
        missing_value_indices = data[column.isnull()].index.tolist()
        return [
            create_value_list(column, index)
            for index in missing_value_indices
            ]

    def imputed_value(row):
        if row.size == 0:
            # No donor available: leave the cell missing instead of failing.
            return np.array(np.nan)

        all_whole = bool(
            np.all(np.isfinite(row)) and np.all(row == np.trunc(row)))
        if all_whole:
            # Mode via unique/counts also works for negative integers;
            # argmax returns the first maximum, i.e. the smallest value
            # among ties.
            values, counts = np.unique(row, return_counts=True)
            return np.array(int(values[counts.argmax()]))

        return np.array(np.mean(row))

    total_impute_master = [
        feature_impute_master(f)
        for f in missing_value_list
        ]

    return [
        [imputed_value(row) for row in plane]
        for plane in total_impute_master
        ]
+
+
def create_distance_matrix(dense_data):
    """Return, for every row, all index labels ordered by Euclidean distance.

    Parameters
    ----------
    dense_data : pandas.DataFrame
        Numeric table without missing values.

    Returns
    -------
    numpy.ndarray
        Square 2-D array of index labels. Row ``i`` starts with the label
        of row ``i`` itself, followed by the labels of all other rows from
        nearest to farthest; equally distant rows keep their original
        relative order. An empty 1-D array is returned when ``dense_data``
        has no rows.
    """
    dense_data_index = np.array(dense_data.index)
    dense_values = np.asarray(dense_data, dtype=float)

    n_rows = len(dense_data_index)
    if n_rows == 0:
        return np.array([])

    if dense_values.shape[1] == 0:
        # No feature columns: every pair of rows is at distance zero.
        pairwise = np.zeros((n_rows, n_rows))
    else:
        # One vectorised call instead of n * n Python-level distance calls.
        pairwise = distance.cdist(
            dense_values, dense_values, metric="euclidean")

    # Distances are non-negative, so -1 on the diagonal guarantees that
    # each row sorts itself first even when duplicate rows exist.
    np.fill_diagonal(pairwise, -1.0)
    order = np.argsort(pairwise, axis=1, kind="stable")

    return dense_data_index[order]
+
+
+
def fix_missing_values(data, limit=4):
    """Impute missing values from the nearest rows in the dense columns.

    Columns with fewer than ``limit`` missing values are treated as dense.
    Rows that have a missing value in any dense column are dropped; the
    remaining rows are compared by Euclidean distance over the dense
    columns, and every missing cell in the other columns is filled with a
    value derived from its nearest neighbours.

    Parameters
    ----------
    data : pandas.DataFrame
        Input table. It is not modified; a filtered copy is returned.
    limit : int, optional
        A column counts as dense when its number of missing values is
        strictly below this threshold (default 4).

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` restricted to the rows that are complete in the
        dense columns, with the remaining columns imputed.
    """
    nan_counts = data.isna().sum()
    dense_data_pool = list(nan_counts.index[nan_counts < limit])
    dense_data = data[dense_data_pool].dropna()

    # Explicit copy: the cell writes below must not target a slice of the
    # caller's frame.
    data = data.loc[np.array(dense_data.index)].copy()

    distance_matrix = create_distance_matrix(dense_data)

    dense_columns = set(dense_data_pool)
    missing_value_list = [
        column
        for column in data.columns
        if column not in dense_columns
        ]

    total_impute = create_total_impute(
        data, distance_matrix, missing_value_list)

    for column, imputed_values in zip(missing_value_list, total_impute):
        # Same row order as the one used when the values were computed.
        missing_value_indices = data[data[column].isnull()].index.tolist()
        for value_index, value in zip(missing_value_indices, imputed_values):
            # Imputed values may be 0-d arrays; store plain scalars.
            data.at[value_index, column] = np.asarray(value).item()

    return data
+
+

Неке датотеке нису приказане због велике количине промена