From c6c9b50e9d7c4d494c672889dedab79bd69cb3c8 Mon Sep 17 00:00:00 2001
From: Tom Selier <tbj.selier@gmail.com>
Date: Fri, 13 Oct 2023 21:54:50 +0200
Subject: [PATCH] Add Matthews correlation coefficient (MCC) to decision-tree and k-NN experiments

---
 .../decision_tree/decision_tree.py            | 116 ++++++++++++------
 src/experiments/knn/knn.py                    |   5 +-
 2 files changed, 79 insertions(+), 42 deletions(-)

diff --git a/src/experiments/decision_tree/decision_tree.py b/src/experiments/decision_tree/decision_tree.py
index 969dd4b..80bd4c6 100644
--- a/src/experiments/decision_tree/decision_tree.py
+++ b/src/experiments/decision_tree/decision_tree.py
@@ -2,15 +2,17 @@ from enum import Enum
 from sklearn import tree
 from sklearn import metrics
 from sklearn import preprocessing
-import sklearn
+from sklearn import neighbors
+from sklearn import ensemble
+from sklearn import svm
 from matplotlib import pyplot as plt
 import pandas as pd
 import numpy as np
 import random
 import csv
 
-SIFT_PATH = "..\\algorithms\\data\\sift.csv"
-# SIFT_PATH = "C:\\Users\\Tom\\Desktop\\Files\\Repositories\\EV5_Beeldherk_Bomen\datacsv\\result-2023-10-13T14.46.23.csv"
+# SIFT_PATH = "..\\algorithms\\data\\sift.csv"
+SIFT_PATH = "C:\\Users\\Tom\\Desktop\\Files\\Repositories\\EV5_Beeldherk_Bomen\\datacsv\\result-2023-10-13T14.46.23.csv"
 
 class Tree(Enum):
     ACCASIA = 0
@@ -22,11 +24,7 @@ class Tree(Enum):
     LINDE = 6
     PLATAAN = 7
 
-# [[tree1_data],[tree2_data]]
-# [tree1_label, tree2_label]
-
 def roc_auc_score_multiclass(actual_class, pred_class, average = "macro"):
-    
     #creating a set of all the unique classes using the actual class list
     unique_class = set(actual_class)
     roc_auc_dict = {}
@@ -45,10 +43,8 @@ def roc_auc_score_multiclass(actual_class, pred_class, average = "macro"):
 
     return roc_auc_dict
 
-
 labels = []
 i = 0
-done = False
 
 with open(SIFT_PATH, 'r') as file:
     reader = csv.reader(file, delimiter= ',')
@@ -66,33 +62,51 @@ with open(SIFT_PATH, 'r') as file:
     normalized = preprocessing.normalize(data, axis=0, norm='max')
     norm = list(normalized.tolist())
 
-steps = np.linspace(2, 20, 10, dtype=np.int64)
+steps = np.linspace(1, 50, 2, dtype=np.int64)
+# steps = np.linspace(1, 100, 10, dtype=np.int64)
+# steps = np.linspace(0, 0.2, 11, dtype=np.float64)
 accuracy = []
 precision = []
 recall = []
 roc = []
+phi = []
 
 for step in steps:
     actual = []
     predicted = []
 
-    for i in range(100):
-        test_index = random.randint(1, 101)
-        temp_data = data.pop(test_index)
-        temp_label = labels.pop(test_index)
-        del dec_tree
+    for i in range(len(norm)):
+        temp_data = norm.pop(0)  # pop head; re-appended below, so every sample is held out once
+        temp_label = labels.pop(0)  # keep labels in lockstep with norm
 
-        dec_tree = tree.DecisionTreeClassifier(
-            min_samples_leaf=2,
-            max_depth=None,
-            random_state=False,
-            criterion='gini', 
-            splitter='best')
-        dec_tree = dec_tree.fit(data, labels)
-        result = dec_tree.predict([matrix[test_index][1:]])
+        # model = tree.DecisionTreeClassifier(
+        #     min_samples_leaf=2,
+        #     max_depth=None, # < 5 is worse, None good too
+        #     random_state=False, # No change
+        #     criterion='gini', # MCC + 0.1
+        #     splitter='best',
+        #     ccp_alpha=0 # Pruning: Keep this 0
+        # )
+        # model = ensemble.RandomForestClassifier(
+        #     criterion='gini', # gini best
+        # )
+        model = ensemble.ExtraTreesClassifier(
+        )
+        # model = neighbors.KNeighborsClassifier(
+        #     algorithm='auto',
+        #     leaf_size=step,
+        #     n_neighbors=1,
+        #     n_jobs=-1
+        # )
+        # model = ensemble.BaggingClassifier(
+        # )
+        # model = svm.SVC(decision_function_shape='ovr'
+        # )
+        model = model.fit(norm, labels)
+        result = model.predict([temp_data])
+        del model
 
-        # normalized_list.append(temp_data)
-        data.append(temp_data)
+        norm.append(temp_data)
         labels.append(temp_label)
 
         actual.append(temp_label)
@@ -102,29 +116,51 @@ for step in steps:
     precision.append(metrics.precision_score(actual, predicted, average='macro'))
     recall.append(metrics.recall_score(actual, predicted, average='macro'))
     roc.append(roc_auc_score_multiclass(actual, predicted))
+    phi.append(metrics.matthews_corrcoef(actual, predicted))
 
     print(step)
 
 # Scores
 # https://www.evidentlyai.com/classification-metrics/multi-class-metrics
-plt.plot(accuracy)
-plt.title("Accuracy")
-plt.show()
-plt.plot(precision)
-plt.title("Precision")
-plt.show()
-plt.plot(recall)
-plt.title("Recall")
-plt.show()
+# For all: higher is better
+fig, axs = plt.subplots(2, 2)
+fig.set_size_inches(12.5, 10)
+
+axs[0, 0].plot(steps, accuracy)
+axs[0, 0].set_title("Accuracy")
+axs[0, 0].grid()
+axs[0, 0].set_ylim(0, 1)
+
+axs[0, 1].plot(steps, precision)
+axs[0, 1].set_title("Precision")
+axs[0, 1].grid()
+axs[0, 1].set_ylim(0, 1)
+
+axs[1, 0].plot(steps, recall)
+axs[1, 0].set_title("Recall")
+axs[1, 0].grid()
+axs[1, 0].set_ylim(0, 1)
+
 df = pd.DataFrame(roc)
-plt.figure()
-for i in range(7):
-    plt.plot(df[i], label=Tree(i).name)
-plt.legend()
-plt.show()
+for member in Tree:  # one ROC AUC curve per class; no hard-coded class count
+    axs[1, 1].plot(steps, df[member.value], label=member.name)
+axs[1, 1].set_title("ROC AUC")
+axs[1, 1].legend()
+axs[1, 1].grid()
+axs[1, 1].set_ylim(0, 1)
 
 # Confusion matrix
 c_matrix = metrics.confusion_matrix(actual, predicted)
 cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix=c_matrix)
 cm_display.plot()
-plt.show(block=False)
\ No newline at end of file
+plt.show(block=False)
+
+# MCC: 1 = perfect prediction, 0 = random prediction, -1 = opposite prediction.
+# Use a dedicated figure: the confusion-matrix axes are still current here,
+# so a bare plt.plot() would draw on top of the matrix.
+plt.figure()
+plt.plot(steps, phi)
+plt.title("Matthews Correlation Coefficient")
+plt.grid()
+plt.ylim(-1, 1)
+plt.show()
\ No newline at end of file
diff --git a/src/experiments/knn/knn.py b/src/experiments/knn/knn.py
index d06d20d..b9a0b36 100644
--- a/src/experiments/knn/knn.py
+++ b/src/experiments/knn/knn.py
@@ -6,7 +6,7 @@ import csv
 from sklearn.preprocessing import MinMaxScaler
 from enum import Enum
 import random
-from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
+from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, matthews_corrcoef
 
 class Tree(Enum):
     ACCASIA = 0
@@ -109,4 +109,5 @@ print("Accuracy score", accuracy_score(tag_true, tag_predict))
 print("Precision score (macro)", precision_score(tag_true, tag_predict, average='macro'))
 print("Precision score (micro)", precision_score(tag_true, tag_predict, average='micro'))
 print("Recall score (macro)", recall_score(tag_true, tag_predict, average='macro'))
-print("Recall score (micro)", recall_score(tag_true, tag_predict, average='micro'))
\ No newline at end of file
+print("Recall score (micro)", recall_score(tag_true, tag_predict, average='micro'))
+print("MCC", matthews_corrcoef(tag_true, tag_predict))
\ No newline at end of file