From 9bdd9c14f5ac5daeb2c63e7a884e515c38705ce1 Mon Sep 17 00:00:00 2001
From: Tom Selier
Date: Fri, 13 Oct 2023 20:10:06 +0200
Subject: [PATCH] decision tree sucks

---
 .../decision_tree/decision_tree.py | 107 ++++++++++++++----
 1 file changed, 82 insertions(+), 25 deletions(-)

diff --git a/src/experiments/decision_tree/decision_tree.py b/src/experiments/decision_tree/decision_tree.py
index fa928ea..969dd4b 100644
--- a/src/experiments/decision_tree/decision_tree.py
+++ b/src/experiments/decision_tree/decision_tree.py
@@ -1,15 +1,16 @@
+from enum import Enum
 from sklearn import tree
 from sklearn import metrics
 from sklearn import preprocessing
-from sklearn.ensemble import RandomForestClassifier
-# from ...helpers.treenum import Tree
-from enum import Enum
-import csv
-import random
+import sklearn
 from matplotlib import pyplot as plt
+import pandas as pd
 import numpy as np
+import random
+import csv
 
 SIFT_PATH = "..\\algorithms\\data\\sift.csv"
+# SIFT_PATH = "C:\\Users\\Tom\\Desktop\\Files\\Repositories\\EV5_Beeldherk_Bomen\datacsv\\result-2023-10-13T14.46.23.csv"
 
 class Tree(Enum):
     ACCASIA = 0
@@ -24,6 +25,27 @@ class Tree(Enum):
 # [[tree1_data],[tree2_data]]
 # [tree1_label, tree2_label]
 
+def roc_auc_score_multiclass(actual_class, pred_class, average = "macro"):
+
+    #creating a set of all the unique classes using the actual class list
+    unique_class = set(actual_class)
+    roc_auc_dict = {}
+    for per_class in unique_class:
+
+        #creating a list of all the classes except the current class
+        other_class = [x for x in unique_class if x != per_class]
+
+        #marking the current class as 1 and all other classes as 0
+        new_actual_class = [0 if x in other_class else 1 for x in actual_class]
+        new_pred_class = [0 if x in other_class else 1 for x in pred_class]
+
+        #using the sklearn metrics method to calculate the roc_auc_score
+        roc_auc = metrics.roc_auc_score(new_actual_class, new_pred_class, average = average)
+        roc_auc_dict[per_class] = roc_auc
+
+    return roc_auc_dict
+
+
 labels = []
 i = 0
 done = False
@@ -44,30 +66,65 @@ with open(SIFT_PATH, 'r') as file:
 normalized = preprocessing.normalize(data, axis=0, norm='max')
 norm = list(normalized.tolist())
 
-actual = []
-predicted = []
-for i in range(75):
-    test_index = random.randint(1, 101)
-    temp_data = data.pop(test_index)
-    temp_label = labels.pop(test_index)
+steps = np.linspace(2, 20, 10, dtype=np.int64)
+accuracy = []
+precision = []
+recall = []
+roc = []
 
-    # dec_tree = tree.DecisionTreeClassifier(
-    #     criterion='entropy',
-    #     splitter='best')
-    dec_tree = RandomForestClassifier(max_depth=None)
-    dec_tree = dec_tree.fit(data, labels)
-    result = dec_tree.predict([matrix[test_index][1:]])
+for step in steps:
+    actual = []
+    predicted = []
 
-    # normalized_list.append(temp_data)
-    data.append(temp_data)
-    labels.append(temp_label)
+    for i in range(100):
+        test_index = random.randint(1, 101)
+        temp_data = data.pop(test_index)
+        temp_label = labels.pop(test_index)
+        del dec_tree
 
-    actual.append(temp_label)
-    predicted.append(result[0])
+        dec_tree = tree.DecisionTreeClassifier(
+            min_samples_leaf=2,
+            max_depth=None,
+            random_state=False,
+            criterion='gini',
+            splitter='best')
+        dec_tree = dec_tree.fit(data, labels)
+        result = dec_tree.predict([matrix[test_index][1:]])
 
+        # normalized_list.append(temp_data)
+        data.append(temp_data)
+        labels.append(temp_label)
+
+        actual.append(temp_label)
+        predicted.append(result[0])
+
+    accuracy.append(metrics.accuracy_score(actual, predicted))
+    precision.append(metrics.precision_score(actual, predicted, 
+average='macro'))
+    recall.append(metrics.recall_score(actual, predicted, average='macro'))
+    roc.append(roc_auc_score_multiclass(actual, predicted))
+
+    print(step)
+
+# Scores
+# https://www.evidentlyai.com/classification-metrics/multi-class-metrics
+plt.plot(accuracy)
+plt.title("Accuracy")
+plt.show()
+plt.plot(precision)
+plt.title("Precision")
+plt.show()
+plt.plot(recall)
+plt.title("Recall")
+plt.show()
+df = pd.DataFrame(roc)
+plt.figure()
+for i in range(7):
+    plt.plot(df[i], label=Tree(i).name)
+plt.legend()
+plt.show()
 
+# Confusion matrix
 c_matrix = metrics.confusion_matrix(actual, predicted)
 cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix=c_matrix)
 cm_display.plot()
-plt.show(block=False)
-# print("Testdata: \t" + Tree[matrix[test_index][0].upper()].name)
-# print("Predicted: \t" + Tree(result[0]).name)
\ No newline at end of file
+plt.show(block=False)
\ No newline at end of file
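
Review notes (below the patch, not part of it):

The parameter sweep never takes effect: `del dec_tree` runs before the name
`dec_tree` has ever been bound, so the first inner iteration raises a
NameError, and `step` is never passed to the classifier, so all ten sweep
rounds fit an identical tree with the hard-coded min_samples_leaf=2. A
minimal sketch of the loop with both issues addressed, assuming `step` was
meant to drive min_samples_leaf (also note random_state=False is just seed
0 in disguise; random_state=0 says the same thing plainly):

    for step in steps:
        actual = []
        predicted = []

        for i in range(100):
            test_index = random.randint(1, 101)
            temp_data = data.pop(test_index)
            temp_label = labels.pop(test_index)

            # Rebinding the name replaces last round's tree, so no
            # `del dec_tree` is needed and nothing is unbound on the
            # first pass.
            dec_tree = tree.DecisionTreeClassifier(
                min_samples_leaf=int(step),  # wire the swept value in
                random_state=0,
                criterion='gini',
                splitter='best')
            dec_tree = dec_tree.fit(data, labels)
            # Predict the held-out row directly. Assuming data was built
            # row-aligned with matrix, matrix[test_index][1:] only matches
            # the popped row on the first iteration; every pop/append
            # cycle rotates data, so its indices drift away from matrix's.
            result = dec_tree.predict([temp_data])

            data.append(temp_data)
            labels.append(temp_label)
            actual.append(temp_label)
            predicted.append(result[0])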
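
The hand-rolled `roc_auc_score_multiclass` feeds hard 0/1 predictions into
`metrics.roc_auc_score`, so each per-class "curve" collapses to a single
threshold point rather than a true ranking-based AUC. sklearn can compute
one-vs-rest multiclass AUC directly from class probabilities; a sketch,
with `X_test`/`y_test` as hypothetical held-out arrays (the patch keeps
train and test interleaved in one list):

    # One-vs-rest AUC from probabilities rather than hard labels.
    proba = dec_tree.predict_proba(X_test)   # shape (n_samples, n_classes)
    auc = metrics.roc_auc_score(
        y_test, proba,
        multi_class='ovr',  # each class scored against the rest
        average='macro')    # multiclass AUC supports 'macro'/'weighted'

The helper does remain useful for the per-class `Tree(i).name` plot, since
the built-in only returns the averaged score for the multiclass case.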
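
More broadly, the pop/fit/append loop re-implements resampling by hand:
`random.randint(1, 101)` hard-codes the index range, never draws index 0,
and can test the same position repeatedly. sklearn's model_selection
utilities do this bookkeeping; a sketch, assuming `data` and `labels` are
the full lists built from the CSV:

    from sklearn.model_selection import cross_val_score

    # 10-fold cross-validation: every sample is held out exactly once,
    # and the tree is refit on the remaining folds each time.
    clf = tree.DecisionTreeClassifier(min_samples_leaf=2)
    scores = cross_val_score(clf, data, labels, cv=10, scoring='accuracy')
    print(scores.mean(), scores.std())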