From c6c9b50e9d7c4d494c672889dedab79bd69cb3c8 Mon Sep 17 00:00:00 2001 From: Tom Selier Date: Fri, 13 Oct 2023 21:54:50 +0200 Subject: [PATCH] added MCC --- .../decision_tree/decision_tree.py | 116 ++++++++++++------ src/experiments/knn/knn.py | 5 +- 2 files changed, 79 insertions(+), 42 deletions(-) diff --git a/src/experiments/decision_tree/decision_tree.py b/src/experiments/decision_tree/decision_tree.py index 969dd4b..80bd4c6 100644 --- a/src/experiments/decision_tree/decision_tree.py +++ b/src/experiments/decision_tree/decision_tree.py @@ -2,15 +2,17 @@ from enum import Enum from sklearn import tree from sklearn import metrics from sklearn import preprocessing -import sklearn +from sklearn import neighbors +from sklearn import ensemble +from sklearn import svm from matplotlib import pyplot as plt import pandas as pd import numpy as np import random import csv -SIFT_PATH = "..\\algorithms\\data\\sift.csv" -# SIFT_PATH = "C:\\Users\\Tom\\Desktop\\Files\\Repositories\\EV5_Beeldherk_Bomen\datacsv\\result-2023-10-13T14.46.23.csv" +# SIFT_PATH = "..\\algorithms\\data\\sift.csv" +SIFT_PATH = "C:\\Users\\Tom\\Desktop\\Files\\Repositories\\EV5_Beeldherk_Bomen\datacsv\\result-2023-10-13T14.46.23.csv" class Tree(Enum): ACCASIA = 0 @@ -22,11 +24,7 @@ class Tree(Enum): LINDE = 6 PLATAAN = 7 -# [[tree1_data],[tree2_data]] -# [tree1_label, tree2_label] - def roc_auc_score_multiclass(actual_class, pred_class, average = "macro"): - #creating a set of all the unique classes using the actual class list unique_class = set(actual_class) roc_auc_dict = {} @@ -45,10 +43,8 @@ def roc_auc_score_multiclass(actual_class, pred_class, average = "macro"): return roc_auc_dict - labels = [] i = 0 -done = False with open(SIFT_PATH, 'r') as file: reader = csv.reader(file, delimiter= ',') @@ -66,33 +62,51 @@ with open(SIFT_PATH, 'r') as file: normalized = preprocessing.normalize(data, axis=0, norm='max') norm = list(normalized.tolist()) -steps = np.linspace(2, 20, 10, dtype=np.int64) +steps = np.linspace(1, 50, 2, dtype=np.int64) +# steps = np.linspace(1, 100, 10, dtype=np.int64) +# steps = np.linspace(0, 0.2, 11, dtype=np.float64) accuracy = [] precision = [] recall = [] roc = [] +phi = [] for step in steps: actual = [] predicted = [] - for i in range(100): - test_index = random.randint(1, 101) - temp_data = data.pop(test_index) - temp_label = labels.pop(test_index) - del dec_tree + for i in range(len(norm)): + temp_data = norm.pop(i) + temp_label = labels.pop(i) - dec_tree = tree.DecisionTreeClassifier( - min_samples_leaf=2, - max_depth=None, - random_state=False, - criterion='gini', - splitter='best') - dec_tree = dec_tree.fit(data, labels) - result = dec_tree.predict([matrix[test_index][1:]]) + # model = tree.DecisionTreeClassifier( + # min_samples_leaf=2, + # max_depth=None, # < 5 is worse, None good too + # random_state=False, # No change + # criterion='gini', # MCC + 0.1 + # splitter='best', + # ccp_alpha=0 # Pruning: Keep this 0 + # ) + # model = ensemble.RandomForestClassifier( + # criterion='gini', # gini best + # ) + model = ensemble.ExtraTreesClassifier( + ) + # model = neighbors.KNeighborsClassifier( + # algorithm='auto', + # leaf_size=step, + # n_neighbors=1, + # n_jobs=-1 + # ) + # model = ensemble.BaggingClassifier( + # ) + # model = svm.SVC(decision_function_shape='ovr' + # ) + model = model.fit(norm, labels) + result = model.predict([temp_data]) + del model - # normalized_list.append(temp_data) - data.append(temp_data) + norm.append(temp_data) labels.append(temp_label) actual.append(temp_label) @@ -102,29 +116,51 @@ for step in steps: precision.append(metrics.precision_score(actual, predicted, average='macro')) recall.append(metrics.recall_score(actual, predicted, average='macro')) roc.append(roc_auc_score_multiclass(actual, predicted)) + phi.append(metrics.matthews_corrcoef(actual, predicted)) print(step) # Scores # https://www.evidentlyai.com/classification-metrics/multi-class-metrics -plt.plot(accuracy) -plt.title("Accuracy") -plt.show() -plt.plot(precision) -plt.title("Precision") -plt.show() -plt.plot(recall) -plt.title("Recall") -plt.show() +# For all: higher is better +fig, axs = plt.subplots(2, 2) +fig.set_size_inches(12.5, 10) + +axs[0, 0].plot(steps, accuracy) +axs[0, 0].set_title("Accuracy") +axs[0, 0].grid() +axs[0, 0].set_ylim(0, 1) + +axs[0, 1].plot(steps, precision) +axs[0, 1].set_title("Precision") +axs[0, 1].grid() +axs[0, 1].set_ylim(0, 1) + +axs[1, 0].plot(steps, recall) +axs[1, 0].set_title("Recall") +axs[1, 0].grid() +axs[1, 0].set_ylim(0, 1) + df = pd.DataFrame(roc) -plt.figure() -for i in range(7): - plt.plot(df[i], label=Tree(i).name) -plt.legend() -plt.show() +for i in range(8): + axs[1, 1].plot(steps, df[i], label=Tree(i).name) +axs[1, 1].set_title("ROC AUC") +axs[1, 1].legend() +axs[1, 1].grid() +axs[1, 1].set_ylim(0, 1) # Confusion matrix c_matrix = metrics.confusion_matrix(actual, predicted) cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix=c_matrix) cm_display.plot() -plt.show(block=False) \ No newline at end of file +plt.show(block=False) + +# MCC +# 1 perfect prediction +# 0 random prediction +# -1 opposite prediction +plt.plot(steps, phi) +plt.title("Matthews Correlation Coefficient") +plt.grid() +plt.ylim(-1, 1) +plt.show() \ No newline at end of file diff --git a/src/experiments/knn/knn.py b/src/experiments/knn/knn.py index d06d20d..b9a0b36 100644 --- a/src/experiments/knn/knn.py +++ b/src/experiments/knn/knn.py @@ -6,7 +6,7 @@ import csv from sklearn.preprocessing import MinMaxScaler from enum import Enum import random -from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score +from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, matthews_corrcoef class Tree(Enum): ACCASIA = 0 @@ -109,4 +109,5 @@ print("Accuracy score", accuracy_score(tag_true, tag_predict)) print("Precision score (macro)", precision_score(tag_true, tag_predict, average='macro')) print("Precision score (micro)", precision_score(tag_true, tag_predict, average='micro')) print("Recall score (macro)", recall_score(tag_true, tag_predict, average='macro')) -print("Recall score (micro)", recall_score(tag_true, tag_predict, average='micro')) \ No newline at end of file +print("Recall score (micro)", recall_score(tag_true, tag_predict, average='micro')) +print("MCC", matthews_corrcoef(tag_true, tag_predict)) \ No newline at end of file