Model classifier

ModelClassifierMixin

Parent class (Mixin) for classifier models

Source code in template_vision/models_training/classifiers/model_classifier.py
class ModelClassifierMixin:
    '''Parent class (Mixin) for classifier models'''

    # Not implemented:
    # -> predict: To be implemented by the class using this mixin

    def __init__(self, level_save: str = 'HIGH', **kwargs) -> None:
        '''Initialization of the class

        Kwargs:
            level_save (str): Level of saving
                LOW: stats + configurations + logger keras - /!\\ The model can't be reused /!\\ -
                MEDIUM: LOW + hdf5 + pkl + plots
                HIGH: MEDIUM + predictions
        Raises:
            ValueError: If the object level_save is not a valid option (['LOW', 'MEDIUM', 'HIGH'])
        '''
        super().__init__(level_save=level_save, **kwargs)  # forwards level_save & all unused arguments

        if level_save not in ['LOW', 'MEDIUM', 'HIGH']:
            raise ValueError(f"The object level_save ({level_save}) is not a valid option (['LOW', 'MEDIUM', 'HIGH'])")

        # Get logger
        self.logger = logging.getLogger(__name__)

        # Model type
        self.model_type = 'classifier'

        # Classes list to use (set on fit)
        self.list_classes = None
        self.dict_classes = None

        # Other options
        self.level_save = level_save

    @utils.trained_needed
    def predict_with_proba(self, df_test: pd.DataFrame) -> Tuple[np.ndarray, np.ndarray]:
        '''Predictions on test set with probabilities

        Args:
            df_test (pd.DataFrame): DataFrame to be predicted, with column file_path
        Returns:
            (np.ndarray): Predicted classes, shape = [n_samples]
            (np.ndarray): Predicted probabilities, shape = [n_samples, n_classes]
        '''
        # Process
        predicted_proba = self.predict(df_test, return_proba=True)
        predicted_class = self.get_classes_from_proba(predicted_proba)
        return predicted_class, predicted_proba

    @utils.trained_needed
    def get_predict_position(self, df_test: pd.DataFrame, y_true) -> np.ndarray:
        '''Gets the order of predictions of y_true.
        Positions start at 1 (not 0)

        Args:
            df_test (pd.DataFrame): DataFrame to be predicted, with column file_path
            y_true (?): Array-like, shape = [n_samples] - Classes
        Returns:
            (?): Array, shape = [n_samples]
        '''
        # Process
        # Cast as pd.Series
        y_true = pd.Series(y_true)
        # Get predicted probabilities
        predicted_proba = self.predict(df_test, return_proba=True)
        # Get position
        order = predicted_proba.argsort()
        ranks = len(self.list_classes) - order.argsort()
        df_probas = pd.DataFrame(ranks, columns=self.list_classes)
        predict_positions = np.array([df_probas.loc[i, cl] if cl in df_probas.columns else -1 for i, cl in enumerate(y_true)])
        return predict_positions

    def get_classes_from_proba(self, predicted_proba: np.ndarray) -> np.ndarray:
        '''Gets the classes from probabilities

        Args:
            predicted_proba (np.ndarray): The probabilities predicted by the model, shape = [n_samples, n_classes]
        Returns:
            predicted_class (np.ndarray): Shape = [n_samples]
        '''
        predicted_class = np.vectorize(lambda x: self.dict_classes[x])(predicted_proba.argmax(axis=-1))
        return predicted_class

    def get_top_n_from_proba(self, predicted_proba: np.ndarray, n: int = 5) -> Tuple[list, list]:
        '''Gets the Top n predictions from probabilities

        Args:
            predicted_proba (np.ndarray): Predicted probabilities, shape = [n_samples, n_classes]
        Kwargs:
            n (int): Number of classes to return
        Raises:
            ValueError: If the number of classes to return is greater than the number of classes of the model
        Returns:
            top_n (list): Top n predicted classes
            top_n_proba (list): Top n probabilities (corresponding to the top_n list of classes)
        '''
        if self.list_classes is not None and n > len(self.list_classes):
            raise ValueError("The number of classes to return is greater than the number of classes of the model")
        # Process
        idx = predicted_proba.argsort()[:, -n:][:, ::-1]
        top_n_proba = list(np.take_along_axis(predicted_proba, idx, axis=1))
        top_n = list(np.vectorize(lambda x: self.dict_classes[x])(idx))
        return top_n, top_n_proba

    def inverse_transform(self, y: Union[list, np.ndarray]) -> Union[list, tuple]:
        '''Gets a list of classes from the predictions (mainly useful for multi-labels)

        Args:
            y (list | np.ndarray): Array-like, shape = [n_samples, n_classes], arrays of 0s and 1s
        Returns:
            (?): List of classes
        '''
        return list(y) if type(y) == np.ndarray else y

    def get_and_save_metrics(self, y_true, y_pred, list_files_x: Union[list, None] = None,
                             type_data: str = '') -> pd.DataFrame:
        '''Gets and saves the metrics of a model

        Args:
            y_true (?): Array-like [n_samples, 1] if classifier
                # If classifier, class of each image
                # If object detector, list of list of bboxes per image
                    bbox format : {'class': ..., 'x1': ..., 'y1': ..., 'x2': ..., 'y2': ...}
            y_pred (?): Array-like [n_samples, 1] if classifier
                # If classifier, class of each image
                # If object detector, list of list of bboxes per image
                    bbox format : {'class': ..., 'x1': ..., 'y1': ..., 'x2': ..., 'y2': ...}
        Kwargs:
            list_files_x (list): Input images file paths
            type_data (str): Type of dataset (validation, test, ...)
        Returns:
            pd.DataFrame: The dataframe containing statistics
        '''
        # Cast to np.array
        y_true = np.array(y_true)
        y_pred = np.array(y_pred)

        # Check shapes
        if len(y_true.shape) == 2 and y_true.shape[1] == 1:
            y_true = np.ravel(y_true)
        if len(y_pred.shape) == 2 and y_pred.shape[1] == 1:
            y_pred = np.ravel(y_pred)

        # Save a prediction file if wanted
        if self.level_save == 'HIGH':
            # Inverse transform
            y_true_df = list(self.inverse_transform(y_true))
            y_pred_df = list(self.inverse_transform(y_pred))

            # Concat in a dataframe
            df = pd.DataFrame({'y_true': y_true_df, 'y_pred': y_pred_df})
            # Add the file_path column if possible
            if list_files_x is not None:
                df['file_path'] = list_files_x
            # Add a matched column
            df['matched'] = (df['y_true'] == df['y_pred']).astype(int)

            #  Save predictions
            file_path = os.path.join(self.model_dir, f"predictions{'_' + type_data if len(type_data) > 0 else ''}.csv")
            df.sort_values('matched', ascending=True).to_csv(file_path, sep=';', index=None, encoding='utf-8')

        # Gets global f1 score / acc_tot / trues / falses / precision / recall / support
        f1_weighted = f1_score(y_true, y_pred, average='weighted', zero_division=0)
        trues = np.sum(y_true == y_pred)
        falses = np.sum(y_true != y_pred)
        acc_tot = accuracy_score(y_true, y_pred)
        precision_weighted = precision_score(y_true, y_pred, average='weighted', zero_division=0)
        recall_weighted = recall_score(y_true, y_pred, average='weighted', zero_division=0)
        labels_tmp, counts_tmp = np.unique(y_true, return_counts=True)
        support = [0.] * len(self.list_classes) + [1.0]
        for i, cl in enumerate(self.list_classes):
            if cl in labels_tmp:
                idx_tmp = list(labels_tmp).index(cl)
                support[i] = counts_tmp[idx_tmp] / y_pred.shape[0]

        # Global Statistics
        self.logger.info('-- * * * * * * * * * * * * * * --')
        self.logger.info(f"Statistics f1-score{' ' + type_data if len(type_data) > 0 else ''}")
        self.logger.info('--------------------------------')
        self.logger.info(f"Total accuracy : {round(acc_tot * 100, 2)}% \t Trues: {trues} \t Falses: {falses}")
        self.logger.info(f"F1-score (weighted) : {round(f1_weighted, 5)}")
        self.logger.info(f"Precision (weighted) : {round(precision_weighted, 5)}")
        self.logger.info(f"Recall (weighted) : {round(recall_weighted, 5)}")
        self.logger.info('--------------------------------')

        # Metrics file
        dict_df_stats = {}

        # Add metrics
        labels = self.list_classes
        # Plot confusion matrices if level_save > LOW
        if self.level_save in ['MEDIUM', 'HIGH']:
            if len(labels) > 50:
                self.logger.warning(
                    f"Warning, there are {len(labels)} categories to plot in the confusion matrix.\n"
                    "Heavy chances of slowness/display bugs/crashes...\n"
                    "SKIP the plots"
                )
            else:
                # Global statistics
                c_mat = confusion_matrix(y_true, y_pred, labels=labels)
                self._plot_confusion_matrix(c_mat, labels, type_data=type_data, normalized=False)
                self._plot_confusion_matrix(c_mat, labels, type_data=type_data, normalized=True)

        # Get statistics per class
        for i, label in enumerate(labels):
            label_str = str(label)  # Fix : If label is an int, can cause some problems (e.g. only zeroes in the confusion matrix)
            none_class = 'None' if label_str != 'None' else 'others'  # Check that the class is not already 'None'
            y_true_tmp = [label_str if _ == label else none_class for _ in y_true]
            y_pred_tmp = [label_str if _ == label else none_class for _ in y_pred]
            c_mat_tmp = confusion_matrix(y_true_tmp, y_pred_tmp, labels=[none_class, label_str])
            dict_df_stats[i] = self._update_info_from_c_mat(c_mat_tmp, label, log_info=False)

        # Add global statistics
        dict_df_stats[i+1] = {
            'Label': 'All',
            'F1-Score': f1_weighted,
            'Accuracy': acc_tot,
            'Precision': precision_weighted,
            'Recall': recall_weighted,
            'Trues': trues,
            'Falses': falses,
            'True positive': None,
            'True negative': None,
            'False positive': None,
            'False negative': None,
            'Condition positive': None,
            'Condition negative': None,
            'Predicted positive': None,
            'Predicted negative': None,
        }
        df_stats = pd.DataFrame.from_dict(dict_df_stats, orient='index')

        # Add support
        df_stats['Support'] = support

        # Save .csv
        file_path = os.path.join(self.model_dir, f"f1{'_' + type_data if len(type_data) > 0 else ''}@{f1_weighted}.csv")
        df_stats.to_csv(file_path, sep=';', index=False, encoding='utf-8')

        # Save accuracy
        acc_path = os.path.join(self.model_dir, f"acc{'_' + type_data if len(type_data) > 0 else ''}@{round(acc_tot, 5)}")
        with open(acc_path, 'w'):
            pass

        return df_stats

    def get_metrics_simple_monolabel(self, y_true, y_pred) -> pd.DataFrame:
        '''Gets metrics on mono-label predictions
        Same as the method get_and_save_metrics but without all the fluff (save, etc.)

        Args:
            y_true (?): Array-like, shape = [n_samples,]
            y_pred (?): Array-like, shape = [n_samples,]
        Returns:
            pd.DataFrame: The dataframe containing statistics
        '''
        # Cast to np.array
        y_true = np.array(y_true)
        y_pred = np.array(y_pred)

        # Check shapes
        if len(y_true.shape) == 2 and y_true.shape[1] == 1:
            y_true = np.ravel(y_true)
        if len(y_pred.shape) == 2 and y_pred.shape[1] == 1:
            y_pred = np.ravel(y_pred)

        # Gets global f1 score / acc_tot / trues / falses / precision / recall / support
        f1_weighted = f1_score(y_true, y_pred, average='weighted', zero_division=0)
        trues = np.sum(y_true == y_pred)
        falses = np.sum(y_true != y_pred)
        acc_tot = accuracy_score(y_true, y_pred)
        precision_weighted = precision_score(y_true, y_pred, average='weighted', zero_division=0)
        recall_weighted = recall_score(y_true, y_pred, average='weighted', zero_division=0)
        labels_tmp, counts_tmp = np.unique(y_true, return_counts=True)
        support = [0.] * len(self.list_classes) + [1.0]
        for i, cl in enumerate(self.list_classes):
            if cl in labels_tmp:
                idx_tmp = list(labels_tmp).index(cl)
                support[i] = counts_tmp[idx_tmp] / y_pred.shape[0]

        # DataFrame metrics
        dict_df_stats = {}

        # Get statistics per class
        labels = self.list_classes
        for i, label in enumerate(labels):
            label_str = str(label)  # Fix : If label is an int, can cause some problems (e.g. only zeroes in the confusion matrix)
            none_class = 'None' if label_str != 'None' else 'others'  # Check that the class is not already 'None'
            y_true_tmp = [label_str if _ == label else none_class for _ in y_true]
            y_pred_tmp = [label_str if _ == label else none_class for _ in y_pred]
            c_mat_tmp = confusion_matrix(y_true_tmp, y_pred_tmp, labels=[none_class, label_str])
            dict_df_stats[i] = self._update_info_from_c_mat(c_mat_tmp, label, log_info=False)

        # Add global statistics
        dict_df_stats[i+1] = {
            'Label': 'All',
            'F1-Score': f1_weighted,
            'Accuracy': acc_tot,
            'Precision': precision_weighted,
            'Recall': recall_weighted,
            'Trues': trues,
            'Falses': falses,
            'True positive': None,
            'True negative': None,
            'False positive': None,
            'False negative': None,
            'Condition positive': None,
            'Condition negative': None,
            'Predicted positive': None,
            'Predicted negative': None,
        }
        df_stats = pd.DataFrame.from_dict(dict_df_stats, orient='index')

        # Add support
        df_stats['Support'] = support

        # Return dataframe
        return df_stats

    def _update_info_from_c_mat(self, c_mat: np.ndarray, label: str, log_info: bool = True) -> dict:
        '''Updates a dataframe for the method get_and_save_metrics, given a confusion matrix

        Args:
            c_mat (np.ndarray): Confusion matrix
            label (str): Label to use
        Kwargs:
            log_info (bool): If the statistics must be logged
        Returns:
            dict: Dictionary with the information for the update of the dataframe
        '''
        # Extract all needed info from c_mat
        true_negative = c_mat[0][0]
        true_positive = c_mat[1][1]
        false_negative = c_mat[1][0]
        false_positive = c_mat[0][1]
        condition_positive = false_negative + true_positive
        condition_negative = false_positive + true_negative
        predicted_positive = false_positive + true_positive
        predicted_negative = false_negative + true_negative
        trues_cat = true_negative + true_positive
        falses_cat = false_negative + false_positive
        accuracy = (true_negative + true_positive) / (true_negative + true_positive + false_negative + false_positive)
        precision = 0 if predicted_positive == 0 else true_positive / predicted_positive
        recall = 0 if condition_positive == 0 else true_positive / condition_positive
        f1 = 0 if precision + recall == 0 else 2 * precision * recall / (precision + recall)

        # Display some info
        if log_info:
            self.logger.info(
                f"F1-score: {round(f1, 5)}  \t Precision: {round(100 * precision, 2)}% \t"
                f"Recall: {round(100 * recall, 2)}% \t Trues: {trues_cat} \t Falses: {falses_cat} \t\t --- {label} "
            )

        # Return result
        return {
            'Label': f'{label}',
            'F1-Score': f1,
            'Accuracy': accuracy,
            'Precision': precision,
            'Recall': recall,
            'Trues': trues_cat,
            'Falses': falses_cat,
            'True positive': true_positive,
            'True negative': true_negative,
            'False positive': false_positive,
            'False negative': false_negative,
            'Condition positive': condition_positive,
            'Condition negative': condition_negative,
            'Predicted positive': predicted_positive,
            'Predicted negative': predicted_negative,
        }

    def _plot_confusion_matrix(self, c_mat: np.ndarray, labels: list, type_data: str = '',
                               normalized: bool = False, subdir: Union[str, None] = None) -> None:
        '''Plots a confusion matrix

        Args:
            c_mat (np.ndarray): Confusion matrix
            labels (list): Labels to plot
        Kwargs:
            type_data (str): Type of dataset (validation, test, ...)
            normalized (bool): If the confusion matrix should be normalized
            subdir (str): Sub-directory for writing the plot
        '''

        # Get title
        if normalized:
            title = f"Normalized confusion matrix{' - ' + type_data if len(type_data) > 0 else ''}"
        else:
            title = f"Confusion matrix, without normalization{' - ' + type_data if len(type_data) > 0 else ''}"

        # Init. plot
        width = round(10 + 0.5 * len(c_mat))
        height = round(4 / 5 * width)
        fig, ax = plt.subplots(figsize=(width, height))

        # Plot
        if normalized:
            c_mat = c_mat.astype('float') / c_mat.sum(axis=1)[:, np.newaxis]
            sns.heatmap(c_mat, annot=True, fmt=".2f", cmap=plt.cm.Blues, ax=ax) # type: ignore
        else:
            sns.heatmap(c_mat, annot=True, fmt="d", cmap=plt.cm.Blues, ax=ax) # type: ignore

        # labels, title and ticks
        ax.set_xlabel('Predicted classes', fontsize=height * 2)
        ax.set_ylabel('Real classes', fontsize=height * 2)
        ax.set_title(title, fontsize=width * 2)
        ax.xaxis.set_ticklabels(labels)
        ax.yaxis.set_ticklabels(labels)
        plt.setp(ax.get_xticklabels(), rotation=30, horizontalalignment='right')
        plt.setp(ax.get_yticklabels(), rotation=30, horizontalalignment='right')
        plt.tight_layout()

        # Save
        plots_path = os.path.join(self.model_dir, 'plots')
        if subdir is not None:  # Add subdir
            plots_path = os.path.join(plots_path, subdir)
        file_name = f"{type_data + '_' if len(type_data) > 0 else ''}confusion_matrix{'_normalized' if normalized else ''}.png"
        if not os.path.exists(plots_path):
            os.makedirs(plots_path)
        plt.savefig(os.path.join(plots_path, file_name))

        # Close figures
        plt.close('all')

    def save(self, json_data: Union[dict, None] = None) -> None:
        '''Saves the model

        Kwargs:
            json_data (dict): Additional configurations to be saved
        '''
        # Save model
        if json_data is None:
            json_data = {}

        json_data['list_classes'] = self.list_classes
        json_data['dict_classes'] = self.dict_classes

        # Save
        super().save(json_data=json_data)
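
Example — combining the mixin with a base model class (illustrative sketch only). The mixin expects to sit next to a base model class that provides at least model_dir, a save() method and whatever utils.trained_needed checks; the DummyBase and DummyClassifier classes below, as well as the trained flag, are assumptions made for the sake of the example and are not part of this module.

import tempfile

import numpy as np
import pandas as pd

from template_vision.models_training.classifiers.model_classifier import ModelClassifierMixin


class DummyBase:
    '''Hypothetical stand-in for the real base model class'''
    def __init__(self, level_save: str = 'HIGH', **kwargs) -> None:
        self.model_dir = tempfile.mkdtemp(prefix='dummy_model_')  # where metrics / plots get written
        self.trained = True  # assumption: utils.trained_needed checks this flag

    def save(self, json_data: dict = None) -> None:
        print('Would persist configuration:', json_data)


class DummyClassifier(ModelClassifierMixin, DummyBase):
    '''Minimal classifier: predict() is the only piece the mixin leaves abstract'''
    def predict(self, df_test: pd.DataFrame, return_proba: bool = False) -> np.ndarray:
        proba = np.tile([0.2, 0.5, 0.3], (len(df_test), 1))  # constant probabilities, demo only
        return proba if return_proba else self.get_classes_from_proba(proba)


model = DummyClassifier(level_save='HIGH')
model.list_classes = ['bird', 'cat', 'dog']            # normally set during fit
model.dict_classes = {0: 'bird', 1: 'cat', 2: 'dog'}   # column index -> class label

df_test = pd.DataFrame({'file_path': ['img_1.png', 'img_2.png']})
predicted_class, predicted_proba = model.predict_with_proba(df_test)
# predicted_class -> array(['cat', 'cat'], ...), predicted_proba -> shape (2, 3)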

__init__(level_save='HIGH', **kwargs)

Initialization of the class

Kwargs:

    level_save (str): Level of saving
        LOW: stats + configurations + logger keras - /!\ The model can't be reused /!\ -
        MEDIUM: LOW + hdf5 + pkl + plots
        HIGH: MEDIUM + predictions

Raises:

    ValueError: If the object level_save is not a valid option (['LOW', 'MEDIUM', 'HIGH'])

Source code in template_vision/models_training/classifiers/model_classifier.py
def __init__(self, level_save: str = 'HIGH', **kwargs) -> None:
    '''Initialization of the class

    Kwargs:
        level_save (str): Level of saving
            LOW: stats + configurations + logger keras - /!\\ The model can't be reused /!\\ -
            MEDIUM: LOW + hdf5 + pkl + plots
            HIGH: MEDIUM + predictions
    Raises:
        ValueError: If the object level_save is not a valid option (['LOW', 'MEDIUM', 'HIGH'])
    '''
    super().__init__(level_save=level_save, **kwargs)  # forwards level_save & all unused arguments

    if level_save not in ['LOW', 'MEDIUM', 'HIGH']:
        raise ValueError(f"The object level_save ({level_save}) is not a valid option (['LOW', 'MEDIUM', 'HIGH'])")

    # Get logger
    self.logger = logging.getLogger(__name__)

    # Model type
    self.model_type = 'classifier'

    # Classes list to use (set on fit)
    self.list_classes = None
    self.dict_classes = None

    # Other options
    self.level_save = level_save
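
For instance, with the hypothetical DummyClassifier sketched above, the constructor forwards level_save to the base class and then validates it:

model = DummyClassifier(level_save='MEDIUM')   # OK: model files + plots, but no prediction file
DummyClassifier(level_save='FULL')             # raises ValueError (not in ['LOW', 'MEDIUM', 'HIGH'])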

get_and_save_metrics(y_true, y_pred, list_files_x=None, type_data='')

Gets and saves the metrics of a model

Parameters:

    y_true (?): Array-like [n_samples, 1] if classifier (required)
        If classifier, class of each image
        If object detector, list of list of bboxes per image
            bbox format: {'class': ..., 'x1': ..., 'y1': ..., 'x2': ..., 'y2': ...}
    y_pred (?): Array-like [n_samples, 1] if classifier (required)
        If classifier, class of each image
        If object detector, list of list of bboxes per image
            bbox format: {'class': ..., 'x1': ..., 'y1': ..., 'x2': ..., 'y2': ...}

Kwargs:

    list_files_x (list): Input images file paths
    type_data (str): Type of dataset (validation, test, ...)

Returns:

    pd.DataFrame: The dataframe containing statistics

Source code in template_vision/models_training/classifiers/model_classifier.py
def get_and_save_metrics(self, y_true, y_pred, list_files_x: Union[list, None] = None,
                         type_data: str = '') -> pd.DataFrame:
    '''Gets and saves the metrics of a model

    Args:
        y_true (?): Array-like [n_samples, 1] if classifier
            # If classifier, class of each image
            # If object detector, list of list of bboxes per image
                bbox format : {'class': ..., 'x1': ..., 'y1': ..., 'x2': ..., 'y2': ...}
        y_pred (?): Array-like [n_samples, 1] if classifier
            # If classifier, class of each image
            # If object detector, list of list of bboxes per image
                bbox format : {'class': ..., 'x1': ..., 'y1': ..., 'x2': ..., 'y2': ...}
    Kwargs:
        list_files_x (list): Input images file paths
        type_data (str): Type of dataset (validation, test, ...)
    Returns:
        pd.DataFrame: The dataframe containing statistics
    '''
    # Cast to np.array
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)

    # Check shapes
    if len(y_true.shape) == 2 and y_true.shape[1] == 1:
        y_true = np.ravel(y_true)
    if len(y_pred.shape) == 2 and y_pred.shape[1] == 1:
        y_pred = np.ravel(y_pred)

    # Save a prediction file if wanted
    if self.level_save == 'HIGH':
        # Inverse transform
        y_true_df = list(self.inverse_transform(y_true))
        y_pred_df = list(self.inverse_transform(y_pred))

        # Concat in a dataframe
        df = pd.DataFrame({'y_true': y_true_df, 'y_pred': y_pred_df})
        # Add the file_path column if possible
        if list_files_x is not None:
            df['file_path'] = list_files_x
        # Add a matched column
        df['matched'] = (df['y_true'] == df['y_pred']).astype(int)

        #  Save predictions
        file_path = os.path.join(self.model_dir, f"predictions{'_' + type_data if len(type_data) > 0 else ''}.csv")
        df.sort_values('matched', ascending=True).to_csv(file_path, sep=';', index=None, encoding='utf-8')

    # Gets global f1 score / acc_tot / trues / falses / precision / recall / support
    f1_weighted = f1_score(y_true, y_pred, average='weighted', zero_division=0)
    trues = np.sum(y_true == y_pred)
    falses = np.sum(y_true != y_pred)
    acc_tot = accuracy_score(y_true, y_pred)
    precision_weighted = precision_score(y_true, y_pred, average='weighted', zero_division=0)
    recall_weighted = recall_score(y_true, y_pred, average='weighted', zero_division=0)
    labels_tmp, counts_tmp = np.unique(y_true, return_counts=True)
    support = [0.] * len(self.list_classes) + [1.0]
    for i, cl in enumerate(self.list_classes):
        if cl in labels_tmp:
            idx_tmp = list(labels_tmp).index(cl)
            support[i] = counts_tmp[idx_tmp] / y_pred.shape[0]

    # Global Statistics
    self.logger.info('-- * * * * * * * * * * * * * * --')
    self.logger.info(f"Statistics f1-score{' ' + type_data if len(type_data) > 0 else ''}")
    self.logger.info('--------------------------------')
    self.logger.info(f"Total accuracy : {round(acc_tot * 100, 2)}% \t Trues: {trues} \t Falses: {falses}")
    self.logger.info(f"F1-score (weighted) : {round(f1_weighted, 5)}")
    self.logger.info(f"Precision (weighted) : {round(precision_weighted, 5)}")
    self.logger.info(f"Recall (weighted) : {round(recall_weighted, 5)}")
    self.logger.info('--------------------------------')

    # Metrics file
    dict_df_stats = {}

    # Add metrics
    labels = self.list_classes
    # Plot confusion matrices if level_save > LOW
    if self.level_save in ['MEDIUM', 'HIGH']:
        if len(labels) > 50:
            self.logger.warning(
                f"Warning, there are {len(labels)} categories to plot in the confusion matrix.\n"
                "Heavy chances of slowness/display bugs/crashes...\n"
                "SKIP the plots"
            )
        else:
            # Global statistics
            c_mat = confusion_matrix(y_true, y_pred, labels=labels)
            self._plot_confusion_matrix(c_mat, labels, type_data=type_data, normalized=False)
            self._plot_confusion_matrix(c_mat, labels, type_data=type_data, normalized=True)

    # Get statistics per class
    for i, label in enumerate(labels):
        label_str = str(label)  # Fix : If label is an int, can cause some problems (e.g. only zeroes in the confusion matrix)
        none_class = 'None' if label_str != 'None' else 'others'  # Check that the class is not already 'None'
        y_true_tmp = [label_str if _ == label else none_class for _ in y_true]
        y_pred_tmp = [label_str if _ == label else none_class for _ in y_pred]
        c_mat_tmp = confusion_matrix(y_true_tmp, y_pred_tmp, labels=[none_class, label_str])
        dict_df_stats[i] = self._update_info_from_c_mat(c_mat_tmp, label, log_info=False)

    # Add global statistics
    dict_df_stats[i+1] = {
        'Label': 'All',
        'F1-Score': f1_weighted,
        'Accuracy': acc_tot,
        'Precision': precision_weighted,
        'Recall': recall_weighted,
        'Trues': trues,
        'Falses': falses,
        'True positive': None,
        'True negative': None,
        'False positive': None,
        'False negative': None,
        'Condition positive': None,
        'Condition negative': None,
        'Predicted positive': None,
        'Predicted negative': None,
    }
    df_stats = pd.DataFrame.from_dict(dict_df_stats, orient='index')

    # Add support
    df_stats['Support'] = support

    # Save .csv
    file_path = os.path.join(self.model_dir, f"f1{'_' + type_data if len(type_data) > 0 else ''}@{f1_weighted}.csv")
    df_stats.to_csv(file_path, sep=';', index=False, encoding='utf-8')

    # Save accuracy
    acc_path = os.path.join(self.model_dir, f"acc{'_' + type_data if len(type_data) > 0 else ''}@{round(acc_tot, 5)}")
    with open(acc_path, 'w'):
        pass

    return df_stats
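
A hedged usage sketch, reusing the hypothetical model from the first example; the file names follow the f-strings in the code above:

y_true = ['cat', 'dog', 'dog', 'bird']
y_pred = ['cat', 'dog', 'cat', 'bird']
df_stats = model.get_and_save_metrics(
    y_true, y_pred,
    list_files_x=['a.png', 'b.png', 'c.png', 'd.png'],
    type_data='test',
)
# With level_save='HIGH', the following are written under model.model_dir:
#   predictions_test.csv, f1_test@<f1_weighted>.csv, an empty acc_test@<accuracy> marker file,
#   and confusion matrix plots in <model_dir>/plots.
# df_stats holds one row per class plus an 'All' row, with a 'Support' column appended.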

get_classes_from_proba(predicted_proba)

Gets the classes from probabilities

Parameters:

    predicted_proba (np.ndarray): The probabilities predicted by the model, shape = [n_samples, n_classes] (required)

Returns:

    predicted_class (np.ndarray): Shape = [n_samples]

Source code in template_vision/models_training/classifiers/model_classifier.py
def get_classes_from_proba(self, predicted_proba: np.ndarray) -> np.ndarray:
    '''Gets the classes from probabilities

    Args:
        predicted_proba (np.ndarray): The probabilities predicted by the model, shape = [n_samples, n_classes]
    Returns:
        predicted_class (np.ndarray): Shape = [n_samples]
    '''
    predicted_class = np.vectorize(lambda x: self.dict_classes[x])(predicted_proba.argmax(axis=-1))
    return predicted_class
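
The mapping below is illustrative; dict_classes is normally built during fit and maps the column index of the probability array to the class label:

import numpy as np

dict_classes = {0: 'bird', 1: 'cat', 2: 'dog'}   # hypothetical mapping
predicted_proba = np.array([[0.2, 0.7, 0.1],
                            [0.1, 0.3, 0.6]])
predicted_class = np.vectorize(lambda x: dict_classes[x])(predicted_proba.argmax(axis=-1))
# predicted_class -> array(['cat', 'dog'], ...)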

get_metrics_simple_monolabel(y_true, y_pred)

Gets metrics on mono-label predictions Same as the method get_and_save_metrics but without all the fluff (save, etc.)

Parameters:

    y_true (?): Array-like, shape = [n_samples,] (required)
    y_pred (?): Array-like, shape = [n_samples,] (required)

Returns:

    pd.DataFrame: The dataframe containing statistics

Source code in template_vision/models_training/classifiers/model_classifier.py
def get_metrics_simple_monolabel(self, y_true, y_pred) -> pd.DataFrame:
    '''Gets metrics on mono-label predictions
    Same as the method get_and_save_metrics but without all the fluff (save, etc.)

    Args:
        y_true (?): Array-like, shape = [n_samples,]
        y_pred (?): Array-like, shape = [n_samples,]
    Returns:
        pd.DataFrame: The dataframe containing statistics
    '''
    # Cast to np.array
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)

    # Check shapes
    if len(y_true.shape) == 2 and y_true.shape[1] == 1:
        y_true = np.ravel(y_true)
    if len(y_pred.shape) == 2 and y_pred.shape[1] == 1:
        y_pred = np.ravel(y_pred)

    # Gets global f1 score / acc_tot / trues / falses / precision / recall / support
    f1_weighted = f1_score(y_true, y_pred, average='weighted', zero_division=0)
    trues = np.sum(y_true == y_pred)
    falses = np.sum(y_true != y_pred)
    acc_tot = accuracy_score(y_true, y_pred)
    precision_weighted = precision_score(y_true, y_pred, average='weighted', zero_division=0)
    recall_weighted = recall_score(y_true, y_pred, average='weighted', zero_division=0)
    labels_tmp, counts_tmp = np.unique(y_true, return_counts=True)
    support = [0.] * len(self.list_classes) + [1.0]
    for i, cl in enumerate(self.list_classes):
        if cl in labels_tmp:
            idx_tmp = list(labels_tmp).index(cl)
            support[i] = counts_tmp[idx_tmp] / y_pred.shape[0]

    # DataFrame metrics
    dict_df_stats = {}

    # Get statistics per class
    labels = self.list_classes
    for i, label in enumerate(labels):
        label_str = str(label)  # Fix : If label is an int, can cause some problems (e.g. only zeroes in the confusion matrix)
        none_class = 'None' if label_str != 'None' else 'others'  # Check that the class is not already 'None'
        y_true_tmp = [label_str if _ == label else none_class for _ in y_true]
        y_pred_tmp = [label_str if _ == label else none_class for _ in y_pred]
        c_mat_tmp = confusion_matrix(y_true_tmp, y_pred_tmp, labels=[none_class, label_str])
        dict_df_stats[i] = self._update_info_from_c_mat(c_mat_tmp, label, log_info=False)

    # Add global statistics
    dict_df_stats[i+1] = {
        'Label': 'All',
        'F1-Score': f1_weighted,
        'Accuracy': acc_tot,
        'Precision': precision_weighted,
        'Recall': recall_weighted,
        'Trues': trues,
        'Falses': falses,
        'True positive': None,
        'True negative': None,
        'False positive': None,
        'False negative': None,
        'Condition positive': None,
        'Condition negative': None,
        'Predicted positive': None,
        'Predicted negative': None,
    }
    df_stats = pd.DataFrame.from_dict(dict_df_stats, orient='index')

    # Add support
    df_stats['Support'] = support

    # Return dataframe
    return df_stats
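
Illustrative call, reusing the hypothetical model from the first example; nothing is written to disk here, the per-class and global statistics are simply returned:

y_true = ['cat', 'dog', 'dog', 'bird']
y_pred = ['cat', 'dog', 'cat', 'bird']
df_stats = model.get_metrics_simple_monolabel(y_true, y_pred)
print(df_stats[['Label', 'F1-Score', 'Precision', 'Recall', 'Support']])
# One row per class ('bird', 'cat', 'dog') plus a final 'All' row holding the weighted metrics.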

get_predict_position(df_test, y_true)

Gets the order of predictions of y_true. Positions start at 1 (not 0)

Parameters:

    df_test (pd.DataFrame): DataFrame to be predicted, with column file_path (required)
    y_true (?): Array-like, shape = [n_samples] - Classes (required)

Returns:

    (?): Array, shape = [n_samples]

Source code in template_vision/models_training/classifiers/model_classifier.py
@utils.trained_needed
def get_predict_position(self, df_test: pd.DataFrame, y_true) -> np.ndarray:
    '''Gets the order of predictions of y_true.
    Positions start at 1 (not 0)

    Args:
        df_test (pd.DataFrame): DataFrame to be predicted, with column file_path
        y_true (?): Array-like, shape = [n_samples] - Classes
    Returns:
        (?): Array, shape = [n_samples]
    '''
    # Process
    # Cast as pd.Series
    y_true = pd.Series(y_true)
    # Get predicted probabilities
    predicted_proba = self.predict(df_test, return_proba=True)
    # Get position
    order = predicted_proba.argsort()
    ranks = len(self.list_classes) - order.argsort()
    df_probas = pd.DataFrame(ranks, columns=self.list_classes)
    predict_positions = np.array([df_probas.loc[i, cl] if cl in df_probas.columns else -1 for i, cl in enumerate(y_true)])
    return predict_positions
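
The rank computation can be illustrated on a single sample (class names are hypothetical):

import numpy as np
import pandas as pd

list_classes = ['bird', 'cat', 'dog']
predicted_proba = np.array([[0.2, 0.7, 0.1]])    # one sample
order = predicted_proba.argsort()
ranks = len(list_classes) - order.argsort()      # 1 = most probable class
df_probas = pd.DataFrame(ranks, columns=list_classes)
# df_probas -> bird: 2, cat: 1, dog: 3
# If y_true for this sample is 'cat', its predicted position is 1.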

get_top_n_from_proba(predicted_proba, n=5)

Gets the Top n predictions from probabilities

Parameters:

    predicted_proba (np.ndarray): Predicted probabilities, shape = [n_samples, n_classes] (required)

Kwargs:

    n (int): Number of classes to return

Raises:

    ValueError: If the number of classes to return is greater than the number of classes of the model

Returns:

    top_n (list): Top n predicted classes
    top_n_proba (list): Top n probabilities (corresponding to the top_n list of classes)

Source code in template_vision/models_training/classifiers/model_classifier.py
def get_top_n_from_proba(self, predicted_proba: np.ndarray, n: int = 5) -> Tuple[list, list]:
    '''Gets the Top n predictions from probabilities

    Args:
        predicted_proba (np.ndarray): Predicted probabilities, shape = [n_samples, n_classes]
    Kwargs:
        n (int): Number of classes to return
    Raises:
        ValueError: If the number of classes to return is greater than the number of classes of the model
    Returns:
        top_n (list): Top n predicted classes
        top_n_proba (list): Top n probabilities (corresponding to the top_n list of classes)
    '''
    if self.list_classes is not None and n > len(self.list_classes):
        raise ValueError("The number of classes to return is greater than the number of classes of the model")
    # Process
    idx = predicted_proba.argsort()[:, -n:][:, ::-1]
    top_n_proba = list(np.take_along_axis(predicted_proba, idx, axis=1))
    top_n = list(np.vectorize(lambda x: self.dict_classes[x])(idx))
    return top_n, top_n_proba
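
Illustrative standalone run of the same logic (the class mapping is hypothetical):

import numpy as np

dict_classes = {0: 'bird', 1: 'cat', 2: 'dog'}
predicted_proba = np.array([[0.2, 0.7, 0.1]])
n = 2
idx = predicted_proba.argsort()[:, -n:][:, ::-1]                       # indices of the n largest probabilities, descending
top_n_proba = list(np.take_along_axis(predicted_proba, idx, axis=1))   # [array([0.7, 0.2])]
top_n = list(np.vectorize(lambda x: dict_classes[x])(idx))             # [array(['cat', 'bird'], ...)]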

inverse_transform(y)

Gets a list of classes from the predictions (mainly useful for multi-labels)

Parameters:

    y (list | np.ndarray): Array-like, shape = [n_samples, n_classes], arrays of 0s and 1s (required)

Returns:

    (?): List of classes

Source code in template_vision/models_training/classifiers/model_classifier.py
def inverse_transform(self, y: Union[list, np.ndarray]) -> Union[list, tuple]:
    '''Gets a list of classes from the predictions (mainly useful for multi-labels)

    Args:
        y (list | np.ndarray): Array-like, shape = [n_samples, n_classes], arrays of 0s and 1s
    Returns:
        (?): List of classes
    '''
    return list(y) if type(y) == np.ndarray else y
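
Quick illustration, reusing the hypothetical model from the first example: the method only turns a numpy array into a plain list and leaves other inputs untouched:

import numpy as np

model.inverse_transform(np.array(['cat', 'dog']))   # -> a plain Python list ['cat', 'dog']
model.inverse_transform(['cat', 'dog'])             # -> returned unchanged (already a list)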

predict_with_proba(df_test)

Predictions on test set with probabilities

Parameters:

    df_test (pd.DataFrame): DataFrame to be predicted, with column file_path (required)

Returns:

    (np.ndarray): Predicted classes, shape = [n_samples]
    (np.ndarray): Predicted probabilities, shape = [n_samples, n_classes]

Source code in template_vision/models_training/classifiers/model_classifier.py
@utils.trained_needed
def predict_with_proba(self, df_test: pd.DataFrame) -> Tuple[np.ndarray, np.ndarray]:
    '''Predictions on test set with probabilities

    Args:
        df_test (pd.DataFrame): DataFrame to be predicted, with column file_path
    Returns:
        (np.ndarray): Predicted classes, shape = [n_samples]
        (np.ndarray): Predicted probabilities, shape = [n_samples, n_classes]
    '''
    # Process
    predicted_proba = self.predict(df_test, return_proba=True)
    predicted_class = self.get_classes_from_proba(predicted_proba)
    return predicted_class, predicted_proba
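
Illustrative call with the hypothetical DummyClassifier from the first example:

df_test = pd.DataFrame({'file_path': ['img_1.png', 'img_2.png']})
predicted_class, predicted_proba = model.predict_with_proba(df_test)
# predicted_class: shape (n_samples,), one label per image
# predicted_proba: shape (n_samples, n_classes), one probability per class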

save(json_data=None)

Saves the model

Kwargs:

json_data (dict): Additional configurations to be saved

Source code in template_vision/models_training/classifiers/model_classifier.py
def save(self, json_data: Union[dict, None] = None) -> None:
    '''Saves the model

    Kwargs:
        json_data (dict): Additional configurations to be saved
    '''
    # Save model
    if json_data is None:
        json_data = {}

    json_data['list_classes'] = self.list_classes
    json_data['dict_classes'] = self.dict_classes

    # Save
    super().save(json_data=json_data)
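
Illustrative call, reusing the hypothetical model from the first example; whatever is passed in json_data is forwarded to the parent save() enriched with the class mappings:

model.save(json_data={'additional_info': 'illustrative extra field'})
# The parent save() receives json_data with 'list_classes' and 'dict_classes' added,
# e.g. {'additional_info': ..., 'list_classes': ['bird', 'cat', 'dog'], 'dict_classes': {0: 'bird', ...}}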