Tom Selier 2023-10-20 11:09:38 +02:00
parent 3f384138b2
commit 8850f957ae
3 changed files with 151 additions and 104 deletions

View File

@@ -26,8 +26,12 @@ for element in data[1:]:
    element.append(idx)
    print(element[1], new_element[1])
for element in data:
    print(len(element))
with open(OUTPUT_DIR + "combined.csv", 'w') as file:
    for element in data:
        for idx in element:
        for idx in element[:-1]:
            file.write(str(idx) + ',')
        file.write(str(element[-1]))
        file.write('\n')
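The nested loop above writes one comma-separated row per element while avoiding a trailing comma. A minimal sketch of the same output using the standard library's csv.writer (assuming, as above, that data is a list of rows and OUTPUT_DIR is defined earlier in this script):

import csv

# csv.writer inserts the delimiters and line endings itself,
# so no manual trailing-comma handling is needed (sketch, not part of this commit)
with open(OUTPUT_DIR + "combined.csv", 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerows(data)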

View File

@@ -10,9 +10,9 @@ import pandas as pd
import numpy as np
import random
import csv
import plots
# SIFT_PATH = "..\\algorithms\\data\\sift.csv"
SIFT_PATH = "C:\\Users\\Tom\\Desktop\\Files\\Repositories\\EV5_Beeldherk_Bomen\datacsv\\result-2023-10-13T14.46.23.csv"
PATH = "C:\\Users\\Tom\\Desktop\\Files\\Repositories\\EV5_Beeldherk_Bomen\\dataset\\csv\\combined.csv"
class Tree(Enum):
    ACCASIA = 0
@@ -24,36 +24,24 @@ class Tree(Enum):
    LINDE = 6
    PLATAAN = 7
def roc_auc_score_multiclass(actual_class, pred_class, average="macro"):
    # creating a set of all the unique classes using the actual class list
    unique_class = set(actual_class)
    roc_auc_dict = {}
    for per_class in unique_class:
        # creating a list of all the classes except the current class
        other_class = [x for x in unique_class if x != per_class]
        # marking the current class as 1 and all other classes as 0
        new_actual_class = [0 if x in other_class else 1 for x in actual_class]
        new_pred_class = [0 if x in other_class else 1 for x in pred_class]
        # using the sklearn metrics method to calculate the roc_auc_score
        roc_auc = metrics.roc_auc_score(new_actual_class, new_pred_class, average=average)
        roc_auc_dict[per_class] = roc_auc
    return roc_auc_dict
labels = []
actual_list = []
predicted_list = []
i = 0
with open(SIFT_PATH, 'r') as file:
with open(PATH, 'r') as file:
    reader = csv.reader(file, delimiter=',')
    matrix = list(reader)
    data = [[] for x in range(len(matrix)-1)]
    # Load all but the headers
    for row in matrix[1:]:
        ## append data to lists
        labels.append(Tree[row[0].upper()].value)
        for element in row[1:]:
        # append all but ID and tree
        for element in row[2:]:
            data[i].append(float(element))
        i += 1
@@ -61,30 +49,17 @@ with open(SIFT_PATH, 'r') as file:
normalized = preprocessing.normalize(data, axis=0, norm='max')
norm = normalized.tolist()  # tolist() already returns a plain list
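preprocessing.normalize with axis=0 and norm='max' divides every feature column by its maximum absolute value, so each feature lands in [-1, 1]. A rough NumPy equivalent, for illustration only (assuming data is the numeric matrix built above):

# rough equivalent of preprocessing.normalize(data, axis=0, norm='max')
arr = np.asarray(data, dtype=np.float64)
col_max = np.abs(arr).max(axis=0)   # per-column maximum absolute value
col_max[col_max == 0] = 1.0         # guard against all-zero columns
manual_norm = (arr / col_max).tolist()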
steps = np.linspace(0, 9, 10, dtype=np.int64)
# steps = np.linspace(1, 100, 10, dtype=np.int64)
# steps = np.linspace(0, 1, 11, dtype=np.float64)
accuracy = []
precision = []
recall = []
roc = []
phi = []
steps = np.linspace(0.1, 1.0, 10, dtype=np.float64)
for step in steps:
    actual = []
    predicted = []
    # weights = {}
    # for idx, element in enumerate(Tree):
    #     # print(idx, element)
    #     weights[idx] = 0.1
    # weights[5] = 1
    for i in range(len(norm)):
        temp_data = norm.pop(i)
        temp_label = labels.pop(i)
        # model = tree.DecisionTreeClassifier(
        #     # class_weight=weights,
        #     class_weight=None,
        #     min_samples_leaf=2,
        #     max_depth=None,  # < 5 is worse, None good too
@@ -98,21 +73,23 @@ for step in steps:
        #     criterion='gini', # gini best
        # )
        # model = ensemble.ExtraTreesClassifier(
        #     n_estimators=150  # higher is better, but slower (def: 100)
        # )
        model = neighbors.KNeighborsClassifier(
            algorithm='auto',
            leaf_size=2,
            n_neighbors=1,
            n_jobs=-1
        # model = neighbors.KNeighborsClassifier(
        #     algorithm='auto',
        #     leaf_size=2,
        #     n_neighbors=step,
        # )
        model = ensemble.BaggingClassifier(
            n_estimators=5,
            max_samples=.5,
            max_features=.5,
            bootstrap=False
        )
        # model = ensemble.BaggingClassifier(
        # )
        # model = svm.SVC(decision_function_shape='ovr'
        # )
        model = model.fit(norm, labels)
        result = model.predict([temp_data])
        # features = model.feature_importances_
        # note: no `del model` here -- deleting it each pass would make the
        # hasattr(model, "feature_importances_") check after the loop raise a NameError
        norm.append(temp_data)
        labels.append(temp_label)
@@ -120,63 +97,12 @@ for step in steps:
        actual.append(temp_label)
        predicted.append(result[0])
    accuracy.append(metrics.accuracy_score(actual, predicted))
    precision.append(metrics.precision_score(actual, predicted, average='macro'))
    recall.append(metrics.recall_score(actual, predicted, average='macro'))
    roc.append(roc_auc_score_multiclass(actual, predicted))
    phi.append(metrics.matthews_corrcoef(actual, predicted))
    actual_list.append(actual)
    predicted_list.append(predicted)
    print(step)
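The pop/fit/predict/append pattern above is hand-rolled leave-one-out cross-validation: each sample is held out once, the model is fit on the remainder, and the held-out sample is predicted. A hedged sketch of the same evaluation with scikit-learn's built-ins (assuming norm and labels are the lists built earlier and the sklearn modules imported at the top of this script):

from sklearn.model_selection import LeaveOneOut, cross_val_predict

# one prediction per sample, each from a model fit on all other samples
loo_model = ensemble.BaggingClassifier(n_estimators=5, max_samples=.5,
                                       max_features=.5, bootstrap=False)
loo_pred = cross_val_predict(loo_model, norm, labels, cv=LeaveOneOut(), n_jobs=-1)
print(metrics.accuracy_score(labels, loo_pred))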
# Feature importance
# plt.bar(matrix[0][1:], features)
# fig, ax = plt.subplots()
# ax.set_title("Feature Importance")
# ax.barh(matrix[0][1:], features)
# plt.show()
# Scores
# https://www.evidentlyai.com/classification-metrics/multi-class-metrics
# For all: higher is better
fig, axs = plt.subplots(2, 2)
fig.set_size_inches(12.5, 10)
axs[0, 0].plot(steps, accuracy)
axs[0, 0].set_title("Accuracy: $\mu$: %f"%np.mean(accuracy))
axs[0, 0].grid()
axs[0, 0].set_ylim(0, 1)
axs[0, 1].plot(steps, precision)
axs[0, 1].set_title("Precision $\mu$: %f"%np.mean(precision))
axs[0, 1].grid()
axs[0, 1].set_ylim(0, 1)
axs[1, 0].plot(steps, recall)
axs[1, 0].set_title("Recall $\mu$: %f"%np.mean(recall))
axs[1, 0].grid()
axs[1, 0].set_ylim(0, 1)
df = pd.DataFrame(roc)
for i in range(8):
    axs[1, 1].plot(steps, df[i], label=Tree(i).name)
axs[1, 1].set_title("ROC AUC")
axs[1, 1].legend()
axs[1, 1].grid()
axs[1, 1].set_ylim(0, 1)
plt.show()
# Confusion matrix
c_matrix = metrics.confusion_matrix(actual, predicted)
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix=c_matrix)
cm_display.plot()
plt.show(block=False)
# MCC
# 1 perfect prediction
# 0 random prediction
# -1 opposite prediction
plt.plot(steps, phi)
plt.title("Matthews Correlation Coefficient $\mu$: %f"%np.mean(phi))
plt.grid()
plt.ylim(-1, 1)
plt.show()
plots.plotMetrics(actual_list, predicted_list)
plots.plotConfusion(actual_list[0], predicted_list[0])
if hasattr(model, "feature_importances_"):
    plots.plotFeatures(matrix[0][2:], model.feature_importances_)

View File

@@ -0,0 +1,117 @@
from sklearn import metrics
from matplotlib import pyplot as plt
from numpy import linspace
from enum import Enum
import pandas as pd
class Tree(Enum):
    ACCASIA = 0
    BERK = 1
    EIK = 2
    ELS = 3
    ESDOORN = 4
    ES = 5
    LINDE = 6
    PLATAAN = 7

def roc_auc_score_multiclass(actual_class, pred_class, average="macro"):
    # creating a set of all the unique classes using the actual class list
    unique_class = set(actual_class)
    roc_auc_dict = {}
    for per_class in unique_class:
        # creating a list of all the classes except the current class
        other_class = [x for x in unique_class if x != per_class]
        # marking the current class as 1 and all other classes as 0
        new_actual_class = [0 if x in other_class else 1 for x in actual_class]
        new_pred_class = [0 if x in other_class else 1 for x in pred_class]
        # using the sklearn metrics method to calculate the roc_auc_score
        roc_auc = metrics.roc_auc_score(new_actual_class, new_pred_class, average=average)
        roc_auc_dict[per_class] = roc_auc
    return roc_auc_dict
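roc_auc_score_multiclass computes a one-vs-rest ROC AUC per class by binarizing the labels: the current class becomes 1, everything else 0. A small example with made-up labels for three classes:

# hypothetical labels, three classes
actual    = [0, 0, 1, 1, 2, 2]
predicted = [0, 1, 1, 1, 2, 0]
print(roc_auc_score_multiclass(actual, predicted))
# -> {0: ..., 1: ..., 2: ...}, one one-vs-rest AUC per class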
def plotMetrics(true_list, predict_list, stepsize=1) -> None:
    '''
    Plots accuracy, precision, recall, per-class ROC AUC and MCC
    for a series of classification runs.

    Arguments:
        true_list ([[]]): list of lists with the true labels, one list per run
        predict_list ([[]]): list of lists with the predicted labels
        stepsize (int): x-axis step between consecutive runs (default: 1)
    '''
    # Source:
    # https://www.evidentlyai.com/classification-metrics/multi-class-metrics

    ## Load data ##
    accuracy = []
    precision = []
    recall = []
    roc = []
    mcc = []
    # one x-axis point per run; stepsize only stretches the axis, so the
    # number of points always matches the number of metric values
    steps = linspace(0, (len(true_list) - 1) * stepsize, len(true_list))
    for true, predict in zip(true_list, predict_list):
        assert len(true) == len(predict)
        accuracy.append(metrics.accuracy_score(true, predict))
        precision.append(metrics.precision_score(true, predict, average="macro"))
        recall.append(metrics.recall_score(true, predict, average="macro"))
        roc.append(roc_auc_score_multiclass(true, predict))
        mcc.append(metrics.matthews_corrcoef(true, predict))
    ## Plots ##
    fig, axs = plt.subplots(3, 2)

    # Accuracy
    axs[0, 0].plot(steps, accuracy)
    axs[0, 0].set_title("Accuracy")
    axs[0, 0].set_ylim(0, 1)
    axs[0, 0].grid()

    # Precision
    axs[0, 1].plot(steps, precision)
    axs[0, 1].set_title("Precision")
    axs[0, 1].set_ylim(0, 1)
    axs[0, 1].grid()

    # Recall
    axs[1, 0].plot(steps, recall)
    axs[1, 0].set_title("Recall")
    axs[1, 0].set_ylim(0, 1)
    axs[1, 0].grid()

    # ROC
    df = pd.DataFrame(roc)
    for i in range(8):
        axs[1, 1].plot(steps, df[i], label=Tree(i).name)
    axs[1, 1].set_title("ROC AUC")
    axs[1, 1].legend()
    axs[1, 1].grid()
    axs[1, 1].set_ylim(0, 1)

    # MCC
    axs[2, 0].plot(steps, mcc)
    axs[2, 0].set_title("MCC")
    axs[2, 0].grid()
    axs[2, 0].set_ylim(-1, 1)

    plt.show()
    return
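A minimal usage sketch: each inner list is one run (for example, one leave-one-out pass), and plotMetrics draws one point per run. The made-up runs below cover all eight classes, since the ROC panel indexes Tree classes 0 through 7:

# two hypothetical runs of true and predicted labels
true_runs    = [list(range(8)), list(range(8))]
predict_runs = [[0, 1, 2, 3, 4, 5, 6, 6], [0, 1, 2, 3, 4, 5, 5, 7]]
plotMetrics(true_runs, predict_runs)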
def plotConfusion(actual, predicted) -> None:
    '''
    Plots a confusion matrix for one run of true vs. predicted labels.
    '''
    matrix = metrics.confusion_matrix(actual, predicted)
    plot = metrics.ConfusionMatrixDisplay(confusion_matrix=matrix)
    plot.plot()
    plt.show()
    return
def plotFeatures(names, features) -> None:
    '''
    Plots a horizontal bar chart of feature importances.
    '''
    fig, ax = plt.subplots()
    ax.set_title("Feature Importance")
    ax.barh(names, features)
    plt.show()
    return
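plotFeatures only pairs names with importances, so it works with any estimator exposing feature_importances_. A hedged sketch with a small ExtraTreesClassifier on made-up data (the feature names here are hypothetical):

from sklearn import ensemble

# toy data: 10 samples, 3 named features
X = [[i, i % 3, i * 0.5] for i in range(10)]
y = [0, 1] * 5
model = ensemble.ExtraTreesClassifier(n_estimators=10).fit(X, y)
plotFeatures(["hue", "area", "perimeter"], model.feature_importances_)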