changes

2023-10-20 11:09:38 +02:00 · 2023-10-20 11:09:38 +02:00 · 8850f957ae
commit 8850f957ae
parent 3f384138b2
3 changed files with 151 additions and 104 deletions
--- a/src/experiments/csv_merger.py
+++ b/src/experiments/csv_merger.py
@ -26,8 +26,12 @@ for element in data[1:]:
                element.append(idx)
            print(element[1], new_element[1])
 for element in data:
    print(len(element))
 with open(OUTPUT_DIR + "combined.csv", 'w') as file:
    for element in data:
-        for idx in element:
+        for idx in element[:-1]:
            file.write(str(idx) + ',')
        file.write(str(element[-1]))
        file.write('\n')
--- a/src/experiments/decision_tree/decision_tree.py
+++ b/src/experiments/decision_tree/decision_tree.py
@ -10,9 +10,9 @@ import pandas as pd
 import numpy as np
 import random
 import csv
 import plots
-# SIFT_PATH = "..\\algorithms\\data\\sift.csv"
+PATH = "C:\\Users\\Tom\\Desktop\\Files\\Repositories\\EV5_Beeldherk_Bomen\\dataset\\csv\\combined.csv"
 SIFT_PATH = "C:\\Users\\Tom\\Desktop\\Files\\Repositories\\EV5_Beeldherk_Bomen\datacsv\\result-2023-10-13T14.46.23.csv"
 class Tree(Enum):
    ACCASIA = 0
@ -24,36 +24,24 @@ class Tree(Enum):
    LINDE = 6
    PLATAAN = 7
 def roc_auc_score_multiclass(actual_class, pred_class, average = "macro"):
    """Compute a one-vs-rest ROC AUC score for every class in ``actual_class``.

    For each distinct label in ``actual_class`` both label sequences are
    binarised (1 for that label, 0 for anything else) and handed to
    ``metrics.roc_auc_score``.

    Arguments:
    actual_class: sequence of true labels
    pred_class: sequence of predicted labels, aligned with ``actual_class``
    average: forwarded unchanged to ``metrics.roc_auc_score`` (def: "macro")

    Returns:
    dict mapping each label found in ``actual_class`` to its score
    """
    roc_auc_dict = {}
    for per_class in set(actual_class):
        # Compare against the current class directly. The previous
        # "not one of the other actual classes" test counted a predicted label
        # that never occurs in actual_class as a hit for *every* class.
        new_actual_class = [1 if x == per_class else 0 for x in actual_class]
        new_pred_class = [1 if x == per_class else 0 for x in pred_class]
        roc_auc_dict[per_class] = metrics.roc_auc_score(
            new_actual_class, new_pred_class, average=average
        )
    return roc_auc_dict
 labels = []
 actual_list = []
 predicted_list = []
 i = 0
-with open(SIFT_PATH, 'r') as file:
+with open(PATH, 'r') as file:
    reader = csv.reader(file, delimiter= ',')
    matrix = list(reader)
    data = [[] for x in range(len(matrix)-1)]
    # Load all but the headers
    for row in matrix[1:]:
        ## append data to lists
        labels.append(Tree[row[0].upper()].value)
-        for element in row[1:]:
+        
        # append all but ID and tree
        for element in row[2:]:
            data[i].append(float(element))
        i += 1
@ -61,30 +49,17 @@ with open(SIFT_PATH, 'r') as file:
    normalized = preprocessing.normalize(data, axis=0, norm='max')
    norm = list(normalized.tolist())
-steps = np.linspace(0, 9, 10, dtype=np.int64)
+steps = np.linspace(0.1, 1.0, 10, dtype=np.float64)
 # steps = np.linspace(1, 100, 10, dtype=np.int64)
 # steps = np.linspace(0, 1, 11, dtype=np.float64)
 accuracy = []
 precision = []
 recall = []
 roc = []
 phi = []
 for step in steps:
    actual = []
    predicted = []
    # weights = {}
    # for idx, element in enumerate(Tree):
    #     # print(idx, element)
    #     weights[idx] = 0.1
    # weights[5] = 1
    for i in range(len(norm)):
        temp_data = norm.pop(i)
        temp_label = labels.pop(i)
        # model = tree.DecisionTreeClassifier(
        #     # class_weight=weights,
        #     class_weight=None,
        #     min_samples_leaf=2,
        #     max_depth=None, # < 5 is worse, None good too
@ -98,21 +73,23 @@ for step in steps:
        #     criterion='gini', # gini best
        # )
        # model = ensemble.ExtraTreesClassifier(
        #     n_estimators=150 # higher is better, but slower (def: 100)
        # )
-        model = neighbors.KNeighborsClassifier(
+        # model = neighbors.KNeighborsClassifier(
-            algorithm='auto',
+        #     algorithm='auto',
-            leaf_size=2,
+        #     leaf_size=2,
-            n_neighbors=1,
+        #     n_neighbors=step,
-            n_jobs=-1
+        # )
        model = ensemble.BaggingClassifier(
            n_estimators=5,
            max_samples=.5,
            max_features=.5,
            bootstrap=False
        )
        # model = ensemble.BaggingClassifier(
        # )
        # model = svm.SVC(decision_function_shape='ovr'
        # )
        model = model.fit(norm, labels)
        result = model.predict([temp_data])
        # features = model.feature_importances_
        del model
        norm.append(temp_data)
        labels.append(temp_label)
@ -120,63 +97,12 @@ for step in steps:
        actual.append(temp_label)
        predicted.append(result[0])
-    accuracy.append(metrics.accuracy_score(actual, predicted))
+    actual_list.append(actual)
-    precision.append(metrics.precision_score(actual, predicted, average='macro'))
+    predicted_list.append(predicted)
    recall.append(metrics.recall_score(actual, predicted, average='macro'))
    roc.append(roc_auc_score_multiclass(actual, predicted))
    phi.append(metrics.matthews_corrcoef(actual, predicted))
    print(step)
-# Feature importance
+plots.plotMetrics(actual_list, predicted_list)
-# plt.bar(matrix[0][1:], features)
+plots.plotConfusion(actual_list[0], predicted_list[0])
-# fig, ax = plt.subplots()
+if (hasattr(model, "feature_importances_")):
-# ax.set_title("Feature Importance")
+    plots.plotFeatures(matrix[0][2:], model.feature_importances_)
 # ax.barh(matrix[0][1:], features)
 # plt.show()
 # Scores
 # https://www.evidentlyai.com/classification-metrics/multi-class-metrics
 # For all: higher is better
 fig, axs = plt.subplots(2, 2)
 fig.set_size_inches(12.5, 10)
 axs[0, 0].plot(steps, accuracy)
 axs[0, 0].set_title("Accuracy: $\mu$: %f"%np.mean(accuracy))
 axs[0, 0].grid()
 axs[0, 0].set_ylim(0, 1)
 axs[0, 1].plot(steps, precision)
 axs[0, 1].set_title("Precision $\mu$: %f"%np.mean(precision))
 axs[0, 1].grid()
 axs[0, 1].set_ylim(0, 1)
 axs[1, 0].plot(steps, recall)
 axs[1, 0].set_title("Recall $\mu$: %f"%np.mean(recall))
 axs[1, 0].grid()
 axs[1, 0].set_ylim(0, 1)
 df = pd.DataFrame(roc)
 for i in range(8):
    axs[1, 1].plot(steps, df[i], label=Tree(i).name)
 axs[1, 1].set_title("ROC AUC")
 axs[1, 1].legend()
 axs[1, 1].grid()
 axs[1, 1].set_ylim(0, 1)
 plt.show()
 # Confusion matrix
 c_matrix = metrics.confusion_matrix(actual, predicted)
 cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix=c_matrix)
 cm_display.plot()
 plt.show(block=False)
 # MCC
 # 1 perfect prediction
 # 0 random prediction
 # -1 opposite prediction
 plt.plot(steps, phi)
 plt.title("Matthews Correlation Coefficient $\mu$: %f"%np.mean(phi))
 plt.grid()
 plt.ylim(-1, 1)
 plt.show()
--- a/src/experiments/decision_tree/plots.py
+++ b/src/experiments/decision_tree/plots.py
@ -0,0 +1,117 @@
 from sklearn import metrics
 from matplotlib import pyplot as plt
 from numpy import linspace
 from enum import Enum
 import pandas as pd
 class Tree(Enum):
    '''Class labels used by the plots, numbered 0 through 7 in declaration order.'''
    ACCASIA, BERK, EIK, ELS, ESDOORN, ES, LINDE, PLATAAN = range(8)
 def roc_auc_score_multiclass(actual_class, pred_class, average = "macro"):
    """Compute a one-vs-rest ROC AUC score for every class in ``actual_class``.

    For each distinct label in ``actual_class`` both label sequences are
    binarised (1 for that label, 0 for anything else) and handed to
    ``metrics.roc_auc_score``.

    Arguments:
    actual_class: sequence of true labels
    pred_class: sequence of predicted labels, aligned with ``actual_class``
    average: forwarded unchanged to ``metrics.roc_auc_score`` (def: "macro")

    Returns:
    dict mapping each label found in ``actual_class`` to its score
    """
    roc_auc_dict = {}
    for per_class in set(actual_class):
        # Compare against the current class directly. The previous
        # "not one of the other actual classes" test counted a predicted label
        # that never occurs in actual_class as a hit for *every* class.
        new_actual_class = [1 if x == per_class else 0 for x in actual_class]
        new_pred_class = [1 if x == per_class else 0 for x in pred_class]
        roc_auc_dict[per_class] = metrics.roc_auc_score(
            new_actual_class, new_pred_class, average=average
        )
    return roc_auc_dict
 def plotMetrics(true_list, predict_list, stepsize = 1) -> None:
    '''
    Plots accuracy, precision, recall, per-class ROC AUC and MCC, with one
    data point per (true, predicted) pair of lists.

    Arguments:
    true_list ([[]]): List of lists with true tags
    predict_list ([[]]): List of lists with predicted tags
    stepsize (int): Distance between consecutive x-axis positions (def: 1)
    '''
    # Source
    # https://www.evidentlyai.com/classification-metrics/multi-class-metrics

    ## Load data ##
    accuracy = []
    precision = []
    recall = []
    roc = []
    mcc = []
    for true, predict in zip(true_list, predict_list):
        assert len(true) == len(predict)
        accuracy.append(metrics.accuracy_score(true, predict))
        precision.append(metrics.precision_score(true, predict, average="macro"))
        recall.append(metrics.recall_score(true, predict, average="macro"))
        roc.append(roc_auc_score_multiclass(true, predict))
        mcc.append(metrics.matthews_corrcoef(true, predict))

    # One x position per evaluated pair: 0, stepsize, 2*stepsize, ...
    # The point count must equal the number of metric values; deriving it from
    # len(true_list)/stepsize made every plot call fail for stepsize != 1.
    points = len(accuracy)
    steps = linspace(0, (points - 1) * stepsize, points)

    ## Plots ##
    fig, axs = plt.subplots(3, 2)

    # Accuracy
    axs[0, 0].plot(steps, accuracy)
    axs[0, 0].set_title("Accuracy")
    axs[0, 0].set_ylim(0, 1)
    axs[0, 0].grid()

    # Precision
    axs[0, 1].plot(steps, precision)
    axs[0, 1].set_title("Precision")
    axs[0, 1].set_ylim(0, 1)
    axs[0, 1].grid()

    # Recall
    axs[1, 0].plot(steps, recall)
    axs[1, 0].set_title("Recall")
    axs[1, 0].set_ylim(0, 1)
    axs[1, 0].grid()

    # ROC
    df = pd.DataFrame(roc)
    # Draw one line per Tree member that has a score column. Indexing a fixed
    # range(8) raised KeyError when a class never occurred in the true tags.
    for member in Tree:
        if member.value in df.columns:
            axs[1, 1].plot(steps, df[member.value], label=member.name)
    axs[1, 1].set_title("ROC AUC")
    axs[1, 1].legend()
    axs[1, 1].grid()
    axs[1, 1].set_ylim(0, 1)

    # MCC
    axs[2, 0].plot(steps, mcc)
    axs[2, 0].set_title("MCC")
    axs[2, 0].grid()
    axs[2, 0].set_ylim(-1, 1)

    plt.show()
 def plotConfusion(actual, predicted) -> None:
    '''
    Shows the confusion matrix for one set of predictions.

    Arguments:
    actual ([]): List with true tags
    predicted ([]): List with predicted tags
    '''
    display = metrics.ConfusionMatrixDisplay(
        confusion_matrix=metrics.confusion_matrix(actual, predicted)
    )
    display.plot()
    plt.show()
 def plotFeatures(names, features) -> None:
    '''
    Shows a horizontal bar chart titled "Feature Importance".

    Arguments:
    names: Labels of the bars
    features: Lengths of the bars, one per name
    '''
    _, axis = plt.subplots()
    axis.barh(names, features)
    axis.set_title("Feature Importance")
    plt.show()