From 8850f957ae8e70a8dec403678427009bc0613b23 Mon Sep 17 00:00:00 2001 From: Tom Selier Date: Fri, 20 Oct 2023 11:09:38 +0200 Subject: [PATCH] changes --- src/experiments/csv_merger.py | 6 +- .../decision_tree/decision_tree.py | 132 ++++-------------- src/experiments/decision_tree/plots.py | 117 ++++++++++++++++ 3 files changed, 151 insertions(+), 104 deletions(-) create mode 100644 src/experiments/decision_tree/plots.py diff --git a/src/experiments/csv_merger.py b/src/experiments/csv_merger.py index c8a8ce2..c21012d 100644 --- a/src/experiments/csv_merger.py +++ b/src/experiments/csv_merger.py @@ -26,8 +26,12 @@ for element in data[1:]: element.append(idx) print(element[1], new_element[1]) +for element in data: + print(len(element)) + with open(OUTPUT_DIR + "combined.csv", 'w') as file: for element in data: - for idx in element: + for idx in element[:-1]: file.write(str(idx) + ',') + file.write(str(element[-1])) file.write('\n') \ No newline at end of file diff --git a/src/experiments/decision_tree/decision_tree.py b/src/experiments/decision_tree/decision_tree.py index 8df1901..30653c6 100644 --- a/src/experiments/decision_tree/decision_tree.py +++ b/src/experiments/decision_tree/decision_tree.py @@ -10,9 +10,9 @@ import pandas as pd import numpy as np import random import csv +import plots -# SIFT_PATH = "..\\algorithms\\data\\sift.csv" -SIFT_PATH = "C:\\Users\\Tom\\Desktop\\Files\\Repositories\\EV5_Beeldherk_Bomen\datacsv\\result-2023-10-13T14.46.23.csv" +PATH = "C:\\Users\\Tom\\Desktop\\Files\\Repositories\\EV5_Beeldherk_Bomen\\dataset\\csv\\combined.csv" class Tree(Enum): ACCASIA = 0 @@ -24,36 +24,24 @@ class Tree(Enum): LINDE = 6 PLATAAN = 7 -def roc_auc_score_multiclass(actual_class, pred_class, average = "macro"): - #creating a set of all the unique classes using the actual class list - unique_class = set(actual_class) - roc_auc_dict = {} - for per_class in unique_class: - - #creating a list of all the classes except the current class - other_class = [x for x in unique_class if x != per_class] - - #marking the current class as 1 and all other classes as 0 - new_actual_class = [0 if x in other_class else 1 for x in actual_class] - new_pred_class = [0 if x in other_class else 1 for x in pred_class] - - #using the sklearn metrics method to calculate the roc_auc_score - roc_auc = metrics.roc_auc_score(new_actual_class, new_pred_class, average = average) - roc_auc_dict[per_class] = roc_auc - - return roc_auc_dict - labels = [] +actual_list = [] +predicted_list = [] i = 0 -with open(SIFT_PATH, 'r') as file: +with open(PATH, 'r') as file: reader = csv.reader(file, delimiter= ',') matrix = list(reader) data = [[] for x in range(len(matrix)-1)] + + # Load all but the headers for row in matrix[1:]: + ## append data to lists labels.append(Tree[row[0].upper()].value) - for element in row[1:]: + + # append all but ID and tree + for element in row[2:]: data[i].append(float(element)) i += 1 @@ -61,30 +49,17 @@ with open(SIFT_PATH, 'r') as file: normalized = preprocessing.normalize(data, axis=0, norm='max') norm = list(normalized.tolist()) -steps = np.linspace(0, 9, 10, dtype=np.int64) -# steps = np.linspace(1, 100, 10, dtype=np.int64) -# steps = np.linspace(0, 1, 11, dtype=np.float64) -accuracy = [] -precision = [] -recall = [] -roc = [] -phi = [] +steps = np.linspace(0.1, 1.0, 10, dtype=np.float64) for step in steps: actual = [] predicted = [] - # weights = {} - # for idx, element in enumerate(Tree): - # # print(idx, element) - # weights[idx] = 0.1 - # weights[5] = 1 for i in range(len(norm)): temp_data = norm.pop(i) temp_label = labels.pop(i) # model = tree.DecisionTreeClassifier( - # # class_weight=weights, # class_weight=None, # min_samples_leaf=2, # max_depth=None, # < 5 is worse, None good too @@ -98,21 +73,23 @@ for step in steps: # criterion='gini', # gini best # ) # model = ensemble.ExtraTreesClassifier( + # n_estimators=150 # higher is better, but slower (def: 100) # ) - model = neighbors.KNeighborsClassifier( - algorithm='auto', - leaf_size=2, - n_neighbors=1, - n_jobs=-1 + # model = neighbors.KNeighborsClassifier( + # algorithm='auto', + # leaf_size=2, + # n_neighbors=step, + # ) + model = ensemble.BaggingClassifier( + n_estimators=5, + max_samples=.5, + max_features=.5, + bootstrap=False ) - # model = ensemble.BaggingClassifier( - # ) # model = svm.SVC(decision_function_shape='ovr' # ) model = model.fit(norm, labels) result = model.predict([temp_data]) - # features = model.feature_importances_ - del model norm.append(temp_data) labels.append(temp_label) @@ -120,63 +97,12 @@ for step in steps: actual.append(temp_label) predicted.append(result[0]) - accuracy.append(metrics.accuracy_score(actual, predicted)) - precision.append(metrics.precision_score(actual, predicted, average='macro')) - recall.append(metrics.recall_score(actual, predicted, average='macro')) - roc.append(roc_auc_score_multiclass(actual, predicted)) - phi.append(metrics.matthews_corrcoef(actual, predicted)) + actual_list.append(actual) + predicted_list.append(predicted) print(step) -# Feature importance -# plt.bar(matrix[0][1:], features) -# fig, ax = plt.subplots() -# ax.set_title("Feature Importance") -# ax.barh(matrix[0][1:], features) -# plt.show() - -# Scores -# https://www.evidentlyai.com/classification-metrics/multi-class-metrics -# For all: higher is better -fig, axs = plt.subplots(2, 2) -fig.set_size_inches(12.5, 10) - -axs[0, 0].plot(steps, accuracy) -axs[0, 0].set_title("Accuracy: $\mu$: %f"%np.mean(accuracy)) -axs[0, 0].grid() -axs[0, 0].set_ylim(0, 1) - -axs[0, 1].plot(steps, precision) -axs[0, 1].set_title("Precision $\mu$: %f"%np.mean(precision)) -axs[0, 1].grid() -axs[0, 1].set_ylim(0, 1) - -axs[1, 0].plot(steps, recall) -axs[1, 0].set_title("Recall $\mu$: %f"%np.mean(recall)) -axs[1, 0].grid() -axs[1, 0].set_ylim(0, 1) - -df = pd.DataFrame(roc) -for i in range(8): - axs[1, 1].plot(steps, df[i], label=Tree(i).name) -axs[1, 1].set_title("ROC AUC") -axs[1, 1].legend() -axs[1, 1].grid() -axs[1, 1].set_ylim(0, 1) -plt.show() - -# Confusion matrix -c_matrix = metrics.confusion_matrix(actual, predicted) -cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix=c_matrix) -cm_display.plot() -plt.show(block=False) - -# MCC -# 1 perfect prediction -# 0 random prediction -# -1 opposite prediction -plt.plot(steps, phi) -plt.title("Matthews Correlation Coefficient $\mu$: %f"%np.mean(phi)) -plt.grid() -plt.ylim(-1, 1) -plt.show() \ No newline at end of file +plots.plotMetrics(actual_list, predicted_list) +plots.plotConfusion(actual_list[0], predicted_list[0]) +if (hasattr(model, "feature_importances_")): + plots.plotFeatures(matrix[0][2:], model.feature_importances_) \ No newline at end of file diff --git a/src/experiments/decision_tree/plots.py b/src/experiments/decision_tree/plots.py new file mode 100644 index 0000000..8659423 --- /dev/null +++ b/src/experiments/decision_tree/plots.py @@ -0,0 +1,117 @@ +from sklearn import metrics +from matplotlib import pyplot as plt +from numpy import linspace +from enum import Enum + +import pandas as pd + +class Tree(Enum): + ACCASIA = 0 + BERK = 1 + EIK = 2 + ELS = 3 + ESDOORN = 4 + ES = 5 + LINDE = 6 + PLATAAN = 7 + +def roc_auc_score_multiclass(actual_class, pred_class, average = "macro"): + #creating a set of all the unique classes using the actual class list + unique_class = set(actual_class) + roc_auc_dict = {} + for per_class in unique_class: + + #creating a list of all the classes except the current class + other_class = [x for x in unique_class if x != per_class] + + #marking the current class as 1 and all other classes as 0 + new_actual_class = [0 if x in other_class else 1 for x in actual_class] + new_pred_class = [0 if x in other_class else 1 for x in pred_class] + + #using the sklearn metrics method to calculate the roc_auc_score + roc_auc = metrics.roc_auc_score(new_actual_class, new_pred_class, average = average) + roc_auc_dict[per_class] = roc_auc + + return roc_auc_dict + +def plotMetrics(true_list, predict_list, stepsize = 1) -> None: + ''' + Creates fancy plots for model metrics. + + Arguments: + true_list ([[]]): List of lists with true tags + predict_list ([[]]): List of lists with predicted tags + stepsize (int): Defines x-axis step of the graphs (def: 1) + ''' + # Source + # https://www.evidentlyai.com/classification-metrics/multi-class-metrics + + ## Load data ## + accuracy = [] + precision = [] + recall = [] + roc = [] + mcc = [] + steps = linspace(0, len(true_list)-1, int(len(true_list)/stepsize)) + + for true, predict in zip(true_list, predict_list): + assert len(true) == len(predict) + accuracy.append(metrics.accuracy_score(true, predict)) + precision.append(metrics.precision_score(true, predict, average="macro")) + recall.append(metrics.recall_score(true, predict, average="macro")) + roc.append(roc_auc_score_multiclass(true, predict)) + mcc.append(metrics.matthews_corrcoef(true, predict)) + + ## Plots ## + fig, axs = plt.subplots(3, 2) + + # Accuracy + axs[0, 0].plot(steps, accuracy) + axs[0, 0].set_title("Accuracy") + axs[0, 0].set_ylim(0, 1) + axs[0, 0].grid() + + # Precision + axs[0, 1].plot(steps, precision) + axs[0, 1].set_title("Precision") + axs[0, 1].set_ylim(0, 1) + axs[0, 1].grid() + + # Recall + axs[1, 0].plot(steps, recall) + axs[1, 0].set_title("Recall") + axs[1, 0].set_ylim(0, 1) + axs[1, 0].grid() + + # ROC + df = pd.DataFrame(roc) + for i in range(8): + axs[1, 1].plot(steps, df[i], label=Tree(i).name) + axs[1, 1].set_title("ROC AUC") + axs[1, 1].legend() + axs[1, 1].grid() + axs[1, 1].set_ylim(0, 1) + + # MCC + axs[2, 0].plot(steps, mcc) + axs[2, 0].set_title("MCC") + axs[2, 0].grid() + axs[2, 0].set_ylim(-1, 1) + + plt.show() + return + +def plotConfusion(actual, predicted) -> None: + matrix = metrics.confusion_matrix(actual, predicted) + plot = metrics.ConfusionMatrixDisplay(confusion_matrix=matrix) + plot.plot() + + plt.show() + return + +def plotFeatures(names, features) -> None: + fig, ax = plt.subplots() + ax.set_title("Feature Importance") + ax.barh(names, features) + plt.show() + pass \ No newline at end of file