From a13baf354934e8e2ebc7232cf21a33b1ca2a45b7 Mon Sep 17 00:00:00 2001
From: Tom Selier
Date: Sat, 14 Oct 2023 10:20:12 +0200
Subject: [PATCH] model tech

---
 .../decision_tree/decision_tree.py           | 44 +++++++++++++------
 1 file changed, 30 insertions(+), 14 deletions(-)

diff --git a/src/experiments/decision_tree/decision_tree.py b/src/experiments/decision_tree/decision_tree.py
index 80bd4c6..8df1901 100644
--- a/src/experiments/decision_tree/decision_tree.py
+++ b/src/experiments/decision_tree/decision_tree.py
@@ -49,7 +49,6 @@ i = 0
 with open(SIFT_PATH, 'r') as file:
     reader = csv.reader(file, delimiter= ',')
     matrix = list(reader)
-    data = [[] for x in range(len(matrix)-1)]
 
     for row in matrix[1:]:
         ## append data to lists
@@ -62,9 +61,9 @@ with open(SIFT_PATH, 'r') as file:
 normalized = preprocessing.normalize(data, axis=0, norm='max')
 norm = list(normalized.tolist())
 
-steps = np.linspace(1, 50, 2, dtype=np.int64)
+steps = np.linspace(0, 9, 10, dtype=np.int64)
 # steps = np.linspace(1, 100, 10, dtype=np.int64)
-# steps = np.linspace(0, 0.2, 11, dtype=np.float64)
+# steps = np.linspace(0, 1, 11, dtype=np.float64)
 accuracy = []
 precision = []
 recall = []
@@ -74,12 +73,19 @@ phi = []
 for step in steps:
     actual = []
     predicted = []
+    # weights = {}
+    # for idx, element in enumerate(Tree):
+    #     # print(idx, element)
+    #     weights[idx] = 0.1
+    # weights[5] = 1
 
     for i in range(len(norm)):
         temp_data = norm.pop(i)
         temp_label = labels.pop(i)
 
         # model = tree.DecisionTreeClassifier(
+        #     # class_weight=weights,
+        #     class_weight=None,
         #     min_samples_leaf=2,
         #     max_depth=None,     # < 5 is worse, None good too
         #     random_state=False, # No change
@@ -88,22 +94,24 @@ for step in steps:
         #     ccp_alpha=0         # Pruning: Keep this 0
         #     )
         # model = ensemble.RandomForestClassifier(
+        #     n_estimators=20,    # higher is better, but slower (def: 100)
         #     criterion='gini',   # gini best
         #     )
-        model = ensemble.ExtraTreesClassifier(
-            )
-        # model = neighbors.KNeighborsClassifier(
-        #     algorithm='auto',
-        #     leaf_size=step,
-        #     n_neighbors=1,
-        #     n_jobs=-1
+        # model = ensemble.ExtraTreesClassifier(
         #     )
+        model = neighbors.KNeighborsClassifier(
+            algorithm='auto',
+            leaf_size=2,
+            n_neighbors=1,
+            n_jobs=-1
+            )
         # model = ensemble.BaggingClassifier(
         #     )
         # model = svm.SVC(decision_function_shape='ovr'
         #     )
         model = model.fit(norm, labels)
         result = model.predict([temp_data])
+        # features = model.feature_importances_
         del model
 
         norm.append(temp_data)
@@ -120,6 +128,13 @@ for step in steps:
     print(step)
 
 
+# Feature importance
+# plt.bar(matrix[0][1:], features)
+# fig, ax = plt.subplots()
+# ax.set_title("Feature Importance")
+# ax.barh(matrix[0][1:], features)
+# plt.show()
+
 # Scores
 # https://www.evidentlyai.com/classification-metrics/multi-class-metrics
 # For all: higher is better
@@ -127,17 +142,17 @@ fig, axs = plt.subplots(2, 2)
 fig.set_size_inches(12.5, 10)
 
 axs[0, 0].plot(steps, accuracy)
-axs[0, 0].set_title("Accuracy")
+axs[0, 0].set_title(r"Accuracy $\mu$: %f" % np.mean(accuracy))
 axs[0, 0].grid()
 axs[0, 0].set_ylim(0, 1)
 
 axs[0, 1].plot(steps, precision)
-axs[0, 1].set_title("Precision")
+axs[0, 1].set_title(r"Precision $\mu$: %f" % np.mean(precision))
 axs[0, 1].grid()
 axs[0, 1].set_ylim(0, 1)
 
 axs[1, 0].plot(steps, recall)
-axs[1, 0].set_title("Recall")
+axs[1, 0].set_title(r"Recall $\mu$: %f" % np.mean(recall))
 axs[1, 0].grid()
 axs[1, 0].set_ylim(0, 1)
 
@@ -148,6 +163,7 @@ axs[1, 1].set_title("ROC AUC")
 axs[1, 1].legend()
 axs[1, 1].grid()
 axs[1, 1].set_ylim(0, 1)
+plt.show()
 
 # Confusion matrix
 c_matrix = metrics.confusion_matrix(actual, predicted)
@@ -160,7 +176,7 @@ plt.show(block=False)
 # 0 random prediction
 # -1 opposite prediction
 plt.plot(steps, phi)
-plt.title("Matthews Correlation Coefficient")
+plt.title(r"Matthews Correlation Coefficient $\mu$: %f" % np.mean(phi))
 plt.grid()
 plt.ylim(-1, 1)
 plt.show()
\ No newline at end of file
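
The per-sample pop/fit/predict/append loop this patch edits is a hand-rolled
leave-one-out cross-validation. A minimal sketch of the same evaluation using
scikit-learn's built-in LeaveOneOut splitter, assuming `norm` (feature rows)
and `labels` (classes) as built earlier in the script:

    # Sketch: leave-one-out CV equivalent to the manual pop/append loop.
    # Assumes `norm` and `labels` from the script above.
    import numpy as np
    from sklearn import metrics, neighbors
    from sklearn.model_selection import LeaveOneOut, cross_val_predict

    X, y = np.asarray(norm), np.asarray(labels)
    model = neighbors.KNeighborsClassifier(
        algorithm='auto',
        leaf_size=2,
        n_neighbors=1,
        n_jobs=-1,
        )
    # One held-out prediction per sample, one fold per row.
    predicted = cross_val_predict(model, X, y, cv=LeaveOneOut())
    print("accuracy:", metrics.accuracy_score(y, predicted))
    print("MCC:", metrics.matthews_corrcoef(y, predicted))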
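
One caveat on the commented-out feature-importance block added here:
feature_importances_ exists on the tree ensembles but not on
KNeighborsClassifier, so that plot only works with a tree model re-enabled.
A sketch under that assumption, with matrix[0][1:] being the CSV header row
of feature names:

    # Sketch: the feature-importance plot from the commented block; requires
    # a tree-based model (KNeighborsClassifier has no feature_importances_).
    import matplotlib.pyplot as plt
    from sklearn import ensemble

    model = ensemble.ExtraTreesClassifier().fit(norm, labels)
    fig, ax = plt.subplots()
    ax.set_title("Feature Importance")
    ax.barh(matrix[0][1:], model.feature_importances_)
    plt.show()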