Model class

ModelClass

Parent class for the models

Source code in template_num/models_training/model_class.py
class ModelClass:
    '''Parent class for the models'''

    _default_name = 'none'
    # Variable annotation : https://www.python.org/dev/peps/pep-0526/
    # Solves lots of typing errors, cf mypy
    multi_label: Union[bool, None]
    list_classes: Union[list, None]
    dict_classes: Union[dict, None]

    # Not implemented :
    # -> fit
    # -> predict
    # -> predict_proba
    # -> inverse_transform
    # -> get_and_save_metrics

    def __init__(self, model_dir: Union[str, None] = None, model_name: Union[str, None] = None,
                 x_col: Union[list, None] = None, y_col: Union[str, int, list, None] = None, random_seed: Union[int, None] = None,
                 preprocess_pipeline: Union[ColumnTransformer, None] = None, level_save: str = 'HIGH', **kwargs) -> None:
        '''Initialization of the parent class.

        Kwargs:
            model_dir (str): Folder where to save the model
                If None, creates a directory based on the model's name and the date (most common usage)
            model_name (str): The name of the model
            x_col (list): Names of the columns used for the training - x
            y_col (str or int or list if multi-labels): Name of the model's target column(s) - y
            random_seed (int): Seed to use for packages randomness
            preprocess_pipeline (ColumnTransformer): The pipeline used for preprocessing. If None -> no preprocessing !
            level_save (str): Level of saving
                LOW: stats + configurations + logger keras - /!\\ The model can't be reused /!\\ -
                MEDIUM: LOW + hdf5 + pkl + plots
                HIGH: MEDIUM + predictions
        Raises:
            ValueError: If the object level_save is not a valid option (['LOW', 'MEDIUM', 'HIGH'])
            NotADirectoryError: If a provided model directory is not a directory (i.e. it's a file)
        '''
        if level_save not in ['LOW', 'MEDIUM', 'HIGH']:
            raise ValueError(f"The object level_save ({level_save}) is not a valid option (['LOW', 'MEDIUM', 'HIGH'])")

        # Get logger
        self.logger = logging.getLogger(__name__)

        # Model type -> 'classifier' or 'regressor' depending on the model
        self.model_type = None

        # Model name
        self.model_name = self._default_name if model_name is None else model_name

        # Names of the columns used
        self.x_col = x_col
        self.y_col = y_col

        # Random seed
        self.random_seed = random_seed

        # Can be None if reloading a model
        if x_col is None:
            self.logger.warning("Warning, the attribute x_col is not given! The model might not work as intended.")
        if y_col is None:
            self.logger.warning("Warning, the attribute y_col is not given! The model might not work as intended.")

        # Model folder
        if model_dir is None:
            self.model_dir = self._get_new_model_dir()
        else:
            if not os.path.exists(model_dir):
                os.makedirs(model_dir)
            if not os.path.isdir(model_dir):
                raise NotADirectoryError(f"{model_dir} is not a valid directory")
            self.model_dir = os.path.abspath(model_dir)

        # Preprocessing pipeline
        self.preprocess_pipeline = preprocess_pipeline
        if self.preprocess_pipeline is not None:
            try:
                check_is_fitted(self.preprocess_pipeline)
            except NotFittedError as e:
                self.logger.error("The preprocessing pipeline hasn't been fitted !")
                self.logger.error(repr(e))
                raise NotFittedError()
            # We get the associated columns (this also checks that the pipeline has been fitted)
            self.columns_in, self.mandatory_columns = utils_models.get_columns_pipeline(self.preprocess_pipeline)
        else:
            # We can't define a default "no_preprocess" pipeline here since it would need to be fitted
            # So we take care of that during the first fit
            self.logger.warning("Warning, no preprocessing pipeline given !")
            self.columns_in, self.mandatory_columns = None, None

        # Other options
        self.level_save = level_save

        # is trained ?
        self.trained = False
        self.nb_fit = 0

        # Configuration dict. to be logged. Set on save.
        self.json_dict: Dict[Any, Any] = {}

    def fit(self, x_train, y_train, **kwargs) -> None:
        '''Trains the model

        Args:
            x_train (?): Array-like, shape = [n_samples, n_features]
            y_train (?): Array-like, shape = [n_samples, n_targets]
        '''
        raise NotImplementedError("'fit' needs to be overridden")

    def predict(self, x_test: pd.DataFrame, **kwargs) -> np.ndarray:
        '''Predictions on the test set

        Args:
            x_test (pd.DataFrame): DataFrame with the test data to be predicted
        Returns:
            (np.ndarray): Array, shape = [n_samples, n_classes]
        '''
        raise NotImplementedError("'predict' needs to be overridden")

    def predict_proba(self, x_test: pd.DataFrame, **kwargs) -> np.ndarray:
        '''Predicts probabilities on the test dataset

        Args:
            x_test (pd.DataFrame): DataFrame with the test data to be predicted
        Returns:
            (np.ndarray): Array, shape = [n_samples, n_classes]
        '''
        raise NotImplementedError("'predict_proba' needs to be overridden")

    def inverse_transform(self, y: Union[list, np.ndarray]) -> Union[list, tuple]:
        '''Gets the final format of prediction
            - Classification : classes from predictions
            - Regression : values (identity function)

        Args:
            y (list | np.ndarray): Array-like, shape = [n_samples,]
                   OR 1D array shape = [n_classes] (only one prediction)
        Returns:
            (?): Array, shape = [n_samples, ?]
        '''
        raise NotImplementedError("'inverse_transform' needs to be overridden")

    def get_and_save_metrics(self, y_true, y_pred, df_x: Union[pd.DataFrame, None] = None,
                             series_to_add: Union[List[pd.Series], None] = None,
                             type_data: str = '') -> pd.DataFrame:
        '''Gets and saves the metrics of a model

        Args:
            y_true (?): Array-like, shape = [n_samples, n_targets]
            y_pred (?): Array-like, shape = [n_samples, n_targets]
        Kwargs:
            df_x (pd.DataFrame or None): Input DataFrame used for the prediction
            series_to_add (list): List of pd.Series to add to the dataframe
            type_data (str): Type of dataset (validation, test, ...)
        Returns:
            pd.DataFrame: The dataframe containing the statistics
        '''
        raise NotImplementedError("'get_and_save_metrics' needs to be overridden")

    def save(self, json_data: Union[dict, None] = None) -> None:
        '''Saves the model

        Kwargs:
            json_data (dict): Additional configurations to be saved
        '''

        # Manage paths
        pkl_path = os.path.join(self.model_dir, f"{self.model_name}.pkl")
        preprocess_pipeline_path = os.path.join(self.model_dir, "preprocess_pipeline.pkl")
        conf_path = os.path.join(self.model_dir, "configurations.json")

        # Save the model & preprocessing pipeline if level_save > 'LOW'
        if self.level_save in ['MEDIUM', 'HIGH']:
            with open(pkl_path, 'wb') as f:
                pickle.dump(self, f)
            # Useful for reload_from_standalone; otherwise it is already saved as a class attribute
            with open(preprocess_pipeline_path, 'wb') as f:
                pickle.dump(self.preprocess_pipeline, f)

        # Saving JSON configuration
        json_dict = {
            'maintainers': 'Agence DataServices',
            'gabarit_version': '1.3.4.dev0+local',
            'date': datetime.now().strftime("%d/%m/%Y - %H:%M:%S"),  # Not the same as the folder's name
            'package_version': utils.get_package_version(),
            'model_name': self.model_name,
            'model_dir': self.model_dir,
            'model_type': self.model_type,
            'trained': self.trained,
            'nb_fit': self.nb_fit,
            'x_col': self.x_col,
            'y_col': self.y_col,
            'columns_in': self.columns_in,
            'mandatory_columns': self.mandatory_columns,
            'random_seed': self.random_seed,
            'level_save': self.level_save,
            'librairie': None,
        }
        # Merge json_data if not None
        if json_data is not None:
            # Priority given to json_data !
            json_dict = {**json_dict, **json_data}

        # Add conf to attributes
        self.json_dict = json_dict

        # Save conf
        with open(conf_path, 'w', encoding='utf-8') as json_file:
            json.dump(json_dict, json_file, indent=4, cls=utils.NpEncoder)

        # Now, save a properties file for the model upload
        self._save_upload_properties(json_dict)

    def _save_upload_properties(self, json_dict: Union[dict, None] = None) -> None:
        '''Prepares a configuration file for a future export (e.g on an artifactory)

        Kwargs:
            json_dict: Configurations to save
        '''
        if json_dict is None:
            json_dict = {}

        # Manage paths
        properties_path = os.path.join(self.model_dir, "properties.json")
        vanilla_model_upload_instructions = os.path.join(utils.get_ressources_path(), 'model_upload_instructions.md')
        specific_model_upload_instructions = os.path.join(self.model_dir, "model_upload_instructions.md")

        # First, we define a list of "allowed" properties
        allowed_properties = ["maintainers", "gabarit_version", "date", "package_version", "model_name", "list_classes",
                              "librairie", "fit_time"]
        # Now we filter these properties
        final_dict = {k: v for k, v in json_dict.items() if k in allowed_properties}
        # Save
        with open(properties_path, 'w', encoding='utf-8') as f:
            json.dump(final_dict, f, indent=4, cls=utils.NpEncoder)

        # Add instructions to upload a model to a storage solution (e.g. Artifactory)
        with open(vanilla_model_upload_instructions, 'r', encoding='utf-8') as f:
            content = f.read()
        # TODO: to be improved
        new_content = content.replace('model_dir_path_identifier', os.path.abspath(self.model_dir))
        with open(specific_model_upload_instructions, 'w', encoding='utf-8') as f:
            f.write(new_content)

    def _get_new_model_dir(self) -> str:
        '''Gets a folder where to save the model

        Returns:
            str: Path to the folder
        '''
        models_dir = utils.get_models_path()
        subfolder = os.path.join(models_dir, self.model_name)
        folder_name = datetime.now().strftime(f"{self.model_name}_%Y_%m_%d-%H_%M_%S")
        model_dir = os.path.join(subfolder, folder_name)
        if os.path.isdir(model_dir):
            time.sleep(1)  # Wait 1 second so that the 'date' changes...
            return self._get_new_model_dir()  # Get new directory name
        else:
            os.makedirs(model_dir)
        return model_dir

    def _check_input_format(self, x_input: Union[pd.DataFrame, np.ndarray], y_input: Union[pd.DataFrame, pd.Series, np.ndarray, None] = None,
                            fit_function: bool = False) -> Tuple[Union[pd.DataFrame, np.ndarray], Union[pd.DataFrame, pd.Series, np.ndarray, None]]:
        '''Checks the inputs of a function. We check the number of columns and the ordering.

        Strategy :
            - If fit function, set preprocessing pipeline, columns_in, mandatory_columns, x_col (if not set), y_col (if not set) with input data
            - Then, for both x & y
                - If input data has a column attribute
                    - If we can find all needed columns, reorder the dataset using only the needed columns (so it works if we have more columns)
                    - Else, raise an error if lengths do not match (otherwise log a warning)
                - Else, raise an error if lengths do not match (otherwise log a warning)

        We also set the pipeline to a passthrough pipeline if None

        Args:
            x_input (pd.DataFrame, np.ndarray): Array-like, shape = [n_samples, n_features]
        Kwargs:
            y_input (pd.DataFrame, pd.Series, np.ndarray): Array-like, shape = [n_samples, n_target]
                Mandatory if fit_function
            fit_function (bool): If it is a fit function
        Raises:
            AttributeError: If fit_function == True, but y_input is None
            ValueError: If one of the inputs doesn't have the right number of columns
        Returns:
            (pd.DataFrame, np.ndarray): x_input, may be reordered if needed
            (pd.DataFrame, pd.Series, np.ndarray): y_input, may be reordered if needed
        '''
        # Getting some info first
        x_input_shape = x_input.shape[-1] if len(x_input.shape) > 1 else 1
        if y_input is not None:
            y_input_shape = y_input.shape[-1] if len(y_input.shape) > 1 else 1
        else:
            y_input_shape = 0  # not used

        # Manage fit_function = True
        if fit_function:
            if y_input is None:
                raise AttributeError("The argument y_input is mandatory if fit_function == True")
            # Set x_col if not set yet
            if self.x_col is None:
                self.logger.warning("Warning, the attribute x_col was not given when creating the model")
                self.logger.warning("We set it now with the input data of the fit function")
                if hasattr(x_input, 'columns'):
                    # TODO : tmp mypy fix https://github.com/python/mypy/pull/13544
                    self.x_col = list(x_input.columns)  # type: ignore
                else:
                    self.x_col = [_ for _ in range(x_input_shape)]
            # Same thing for y_col
            if self.y_col is None:
                self.logger.warning("Warning, the attribute y_col was not given when creating the model")
                self.logger.warning("We set it now with the input data of the fit function")
                if hasattr(y_input, 'columns'):
                    # TODO : tmp mypy fix https://github.com/python/mypy/pull/13544
                    self.y_col = list(y_input.columns)  # type: ignore
                else:
                    self.y_col = [_ for _ in range(y_input_shape)]
                # If there is only one element, we get rid of the list
                if y_input_shape == 1:
                    self.y_col = self.y_col[0]
            # If pipeline, columns_in or mandatory_columns is None, sets it
            if self.preprocess_pipeline is None:  # ie no pipeline given when initializing the class
                preprocess_str = "no_preprocess"
                preprocess_pipeline = preprocess.get_pipeline(preprocess_str)  # Warning, the pipeline must be fitted
                preprocess_pipeline.fit(x_input)  # We fit the pipeline to set the necessary columns for the pipeline
                # Set attributes
                self.preprocess_pipeline = preprocess_pipeline
                self.columns_in, self.mandatory_columns = utils_models.get_columns_pipeline(self.preprocess_pipeline)

        # Checking x_input
        if self.x_col is None:
            self.logger.warning("Can't check the input format (x) because x_col is not set...")
        else:
            x_col_len = len(self.x_col)
            # We check the presence of the columns
            if hasattr(x_input, 'columns'):
                can_reorder = True
                for col in self.x_col:
                    # TODO : tmp mypy fix https://github.com/python/mypy/pull/13544
                    if col not in x_input.columns:  # type: ignore
                        can_reorder = False
                        self.logger.warning(f"The column {col} is missing from the input (x)")
                # If we can't reorder :
                # 1. Exact number of columns : we write a warning message and continue with columns renamed
                # 2. Not the correct number of column : raise an error
                if not can_reorder:
                    if x_input_shape != x_col_len:
                        raise ValueError(f"Input data (x) is not in the right format ({x_input_shape} != {x_col_len})")
                    self.logger.warning("The names of the columns (x) do not match. The process continues since there is the right number of columns")
                    x_input = x_input.copy()  # needs a copy as we will change the column names
                    x_input.columns = self.x_col  # type: ignore
                # If we can reorder :
                # 1. Same number of inputs but not the same order -> we just reorder
                # 2. More columns ? -> we just take the needed subset + log a warning message
                else:
                    # TODO : tmp mypy fix https://github.com/python/mypy/pull/13544
                    if list(x_input.columns) != self.x_col:  # type: ignore
                        if x_input_shape == x_col_len:
                            self.logger.warning("The input columns (x) are not in the right order -> automatic reordering !")
                        else:
                            self.logger.warning("More columns in input (x) than needed, but we can find the needed columns -> only considering the needed columns")
                        x_input = x_input[self.x_col]
            else:
                if x_input_shape != len(self.x_col):
                    raise ValueError(f"Input data (x) is not in the right format ({x_input_shape} != {x_col_len})")
                self.logger.warning("The input (x) does not have the 'columns' attribute -> can't check the ordering of the columns")

        # Checking y_input
        if y_input is not None:
            if self.y_col is None:
                self.logger.warning("Can't check the input format (y) because y_col is not set...")
            else:
                # Checking y_input format
                y_col_len = len(self.y_col) if type(self.y_col) == list else 1
                # We check the presence of the columns
                if hasattr(y_input, 'columns'):
                    can_reorder = True
                    target_cols = self.y_col if type(self.y_col) == list else [self.y_col]
                    for col in target_cols:
                        # TODO : tmp mypy fix https://github.com/python/mypy/pull/13544
                        if col not in y_input.columns:  # type: ignore
                            can_reorder = False
                            self.logger.warning(f"The column {col} is missing from the input (y)")
                    # If we can't reorder :
                    # 1. Exact number of columns : we write a warning message and continue with columns renamed
                    # 2. Not the correct number of column : raise an error
                    if not can_reorder:
                        if y_input_shape != y_col_len:
                            raise ValueError(f"Input data (y) is not in the right format ({y_input_shape} != {y_col_len})")
                        self.logger.warning("The names of the columns (y) do not match. The process continues since there is the right number of columns")
                        y_input = y_input.copy()  # needs a copy as we will change the column names
                        y_input.columns = self.y_col  # type: ignore
                    # If we can reorder :
                    # 1. Same number of inputs but not the same order -> we just reorder
                    # 2. More columns ? -> we just take the needed subset + log a warning message
                    else:
                        # TODO : tmp mypy fix https://github.com/python/mypy/pull/13544
                        if list(y_input.columns) != target_cols:  # type: ignore
                            if y_input_shape == y_col_len:
                                self.logger.warning("The input columns (y) are not in the right order -> automatic reordering !")
                            else:
                                self.logger.warning("More columns in input (y) than needed, but we can find the needed columns -> only considering the needed columns")
                            y_input = y_input[target_cols]
                else:
                    if y_input_shape != y_col_len:
                        raise ValueError(f"Input data (y) is not in the right format ({y_input_shape} != {y_col_len})")
                    self.logger.warning("The input (y) does not have the 'columns' attribute -> can't check the ordering of the columns")

        # Return
        return x_input, y_input

    def display_if_gpu_activated(self) -> None:
        '''Displays if a GPU is being used'''
        if self._is_gpu_activated():
            self.logger.info("GPU activated")

    def _is_gpu_activated(self) -> bool:
        '''Checks if we use a GPU

        Returns:
            bool: whether GPU is available or not
        '''
        # By default, no GPU
        return False

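The class itself leaves fit, predict, predict_proba, inverse_transform and get_and_save_metrics unimplemented. Below is a minimal sketch of what a concrete subclass could look like; ModelDummyClassifier and its "always predict the first class" behaviour are purely illustrative assumptions, not part of the package:

import numpy as np
import pandas as pd

from template_num.models_training.model_class import ModelClass


class ModelDummyClassifier(ModelClass):
    '''Hypothetical subclass that always predicts the first class seen during fit'''

    _default_name = 'model_dummy_classifier'

    def __init__(self, **kwargs) -> None:
        super().__init__(**kwargs)
        self.model_type = 'classifier'  # 'classifier' or 'regressor'
        self.list_classes = None

    def fit(self, x_train, y_train, **kwargs) -> None:
        # _check_input_format also sets x_col / y_col and a passthrough pipeline if missing
        x_train, y_train = self._check_input_format(x_train, y_train, fit_function=True)
        self.list_classes = sorted(set(np.array(y_train).ravel().tolist()))
        self.trained = True
        self.nb_fit += 1

    def predict_proba(self, x_test: pd.DataFrame, **kwargs) -> np.ndarray:
        x_test, _ = self._check_input_format(x_test)
        probas = np.zeros((len(x_test), len(self.list_classes)))
        probas[:, 0] = 1.0  # all probability mass on the first class
        return probas

    def predict(self, x_test: pd.DataFrame, **kwargs) -> np.ndarray:
        return self.predict_proba(x_test, **kwargs).argmax(axis=1)

    def inverse_transform(self, y) -> list:
        # Map class indices back to class labels
        return [self.list_classes[int(i)] for i in np.array(y).ravel()]
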
__init__(model_dir=None, model_name=None, x_col=None, y_col=None, random_seed=None, preprocess_pipeline=None, level_save='HIGH', **kwargs)

Initialization of the parent class.

Kwargs:

    model_dir (str): Folder where to save the model. If None, creates a directory based on the model's name and the date (most common usage)
    model_name (str): The name of the model
    x_col (list): Names of the columns used for the training - x
    y_col (str or int or list if multi-labels): Name of the model's target column(s) - y
    random_seed (int): Seed to use for packages randomness
    preprocess_pipeline (ColumnTransformer): The pipeline used for preprocessing. If None -> no preprocessing!
    level_save (str): Level of saving
        LOW: stats + configurations + logger keras - /!\ The model can't be reused /!\ -
        MEDIUM: LOW + hdf5 + pkl + plots
        HIGH: MEDIUM + predictions

Raises:

    ValueError: If the object level_save is not a valid option (['LOW', 'MEDIUM', 'HIGH'])
    NotADirectoryError: If a provided model directory is not a directory (i.e. it's a file)

Source code in template_num/models_training/model_class.py
def __init__(self, model_dir: Union[str, None] = None, model_name: Union[str, None] = None,
             x_col: Union[list, None] = None, y_col: Union[str, int, list, None] = None, random_seed: Union[int, None] = None,
             preprocess_pipeline: Union[ColumnTransformer, None] = None, level_save: str = 'HIGH', **kwargs) -> None:
    '''Initialization of the parent class.

    Kwargs:
        model_dir (str): Folder where to save the model
            If None, creates a directory based on the model's name and the date (most common usage)
        model_name (str): The name of the model
        x_col (list): Names of the columns used for the training - x
        y_col (str or int or list if multi-labels): Name of the model's target column(s) - y
        random_seed (int): Seed to use for packages randomness
        preprocess_pipeline (ColumnTransformer): The pipeline used for preprocessing. If None -> no preprocessing !
        level_save (str): Level of saving
            LOW: stats + configurations + logger keras - /!\\ The model can't be reused /!\\ -
            MEDIUM: LOW + hdf5 + pkl + plots
            HIGH: MEDIUM + predictions
    Raises:
        ValueError: If the object level_save is not a valid option (['LOW', 'MEDIUM', 'HIGH'])
        NotADirectoryError: If a provided model directory is not a directory (i.e. it's a file)
    '''
    if level_save not in ['LOW', 'MEDIUM', 'HIGH']:
        raise ValueError(f"The object level_save ({level_save}) is not a valid option (['LOW', 'MEDIUM', 'HIGH'])")

    # Get logger
    self.logger = logging.getLogger(__name__)

    # Model type -> 'classifier' or 'regressor' depending on the model
    self.model_type = None

    # Model name
    self.model_name = self._default_name if model_name is None else model_name

    # Names of the columns used
    self.x_col = x_col
    self.y_col = y_col

    # Random seed
    self.random_seed = random_seed

    # Can be None if reloading a model
    if x_col is None:
        self.logger.warning("Warning, the attribute x_col is not given! The model might not work as intended.")
    if y_col is None:
        self.logger.warning("Warning, the attribute y_col is not given! The model might not work as intended.")

    # Model folder
    if model_dir is None:
        self.model_dir = self._get_new_model_dir()
    else:
        if not os.path.exists(model_dir):
            os.makedirs(model_dir)
        if not os.path.isdir(model_dir):
            raise NotADirectoryError(f"{model_dir} is not a valid directory")
        self.model_dir = os.path.abspath(model_dir)

    # Preprocessing pipeline
    self.preprocess_pipeline = preprocess_pipeline
    if self.preprocess_pipeline is not None:
        try:
            check_is_fitted(self.preprocess_pipeline)
        except NotFittedError as e:
            self.logger.error("The preprocessing pipeline hasn't been fitted !")
            self.logger.error(repr(e))
            raise NotFittedError()
        # We get the associated columns (this also checks that the pipeline has been fitted)
        self.columns_in, self.mandatory_columns = utils_models.get_columns_pipeline(self.preprocess_pipeline)
    else:
        # We can't define a default "no_preprocess" pipeline here since it would need to be fitted
        # So we take care of that during the first fit
        self.logger.warning("Warning, no preprocessing pipeline given !")
        self.columns_in, self.mandatory_columns = None, None

    # Other options
    self.level_save = level_save

    # is trained ?
    self.trained = False
    self.nb_fit = 0

    # Configuration dict. to be logged. Set on save.
    self.json_dict: Dict[Any, Any] = {}

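A hedged usage sketch of the constructor; the column names and the models/demo_model path are illustrative, and in practice one instantiates a concrete subclass rather than ModelClass itself. Note that a provided pipeline must already be fitted, since check_is_fitted is called on it:

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

from template_num.models_training.model_class import ModelClass

df_train = pd.DataFrame({'col_1': [1.0, 2.0, 3.0],
                         'col_2': [4.0, 5.0, 6.0],
                         'target': [0, 1, 0]})

# The pipeline must be fitted before being handed to the constructor
pipeline = ColumnTransformer([('scaler', StandardScaler(), ['col_1', 'col_2'])])
pipeline.fit(df_train)

model = ModelClass(model_dir='models/demo_model',  # created if it does not exist
                   model_name='demo_model',
                   x_col=['col_1', 'col_2'], y_col='target',
                   preprocess_pipeline=pipeline, level_save='HIGH')
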
display_if_gpu_activated()

Displays if a GPU is being used

Source code in template_num/models_training/model_class.py
def display_if_gpu_activated(self) -> None:
    '''Displays if a GPU is being used'''
    if self._is_gpu_activated():
        self.logger.info("GPU activated")

fit(x_train, y_train, **kwargs)

Trains the model

Parameters:

    x_train (?): Array-like, shape = [n_samples, n_features] (required)
    y_train (?): Array-like, shape = [n_samples, n_targets] (required)
Source code in template_num/models_training/model_class.py
def fit(self, x_train, y_train, **kwargs) -> None:
    '''Trains the model

    Args:
        x_train (?): Array-like, shape = [n_samples, n_features]
        y_train (?): Array-like, shape = [n_samples, n_targets]
    '''
    raise NotImplementedError("'fit' needs to be overridden")

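fit is abstract here; as a usage illustration, the hypothetical ModelDummyClassifier sketched earlier fills in x_col / y_col and the bookkeeping attributes through _check_input_format:

import pandas as pd

# Hypothetical usage of the ModelDummyClassifier sketched above
model = ModelDummyClassifier()  # model_dir is derived from the name and the date
df = pd.DataFrame({'col_1': [1.0, 2.0], 'col_2': [3.0, 4.0], 'target': ['a', 'b']})
model.fit(df[['col_1', 'col_2']], df[['target']])
print(model.trained, model.nb_fit)  # True 1
print(model.x_col, model.y_col)     # ['col_1', 'col_2'] 'target' (set during fit when not given)
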
get_and_save_metrics(y_true, y_pred, df_x=None, series_to_add=None, type_data='')

Gets and saves the metrics of a model

Parameters:

    y_true (?): Array-like, shape = [n_samples, n_targets] (required)
    y_pred (?): Array-like, shape = [n_samples, n_targets] (required)

Kwargs:

    df_x (pd.DataFrame or None): Input DataFrame used for the prediction
    series_to_add (list): List of pd.Series to add to the dataframe
    type_data (str): Type of dataset (validation, test, ...)

Returns:

    pd.DataFrame: The dataframe containing the statistics

Source code in template_num/models_training/model_class.py
def get_and_save_metrics(self, y_true, y_pred, df_x: Union[pd.DataFrame, None] = None,
                         series_to_add: Union[List[pd.Series], None] = None,
                         type_data: str = '') -> pd.DataFrame:
    '''Gets and saves the metrics of a model

    Args:
        y_true (?): Array-like, shape = [n_samples, n_targets]
        y_pred (?): Array-like, shape = [n_samples, n_targets]
    Kwargs:
        df_x (pd.DataFrame or None): Input DataFrame used for the prediction
        series_to_add (list): List of pd.Series to add to the dataframe
        type_data (str): Type of dataset (validation, test, ...)
    Returns:
        pd.DataFrame: The dataframe containing the statistics
    '''
    raise NotImplementedError("'get_and_save_metrics' needs to be overridden")

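What a concrete override of this method computes is model-specific. A minimal sketch, assuming a classifier and using scikit-learn's accuracy_score; the function name and the stats file name are assumptions for illustration:

import os

import pandas as pd
from sklearn.metrics import accuracy_score

def get_and_save_metrics_sketch(model, y_true, y_pred, type_data: str = '') -> pd.DataFrame:
    '''Hypothetical override: compute one metric and persist it in model_dir'''
    df_stats = pd.DataFrame([{'type_data': type_data,
                              'accuracy': accuracy_score(y_true, y_pred)}])
    df_stats.to_csv(os.path.join(model.model_dir, f"stats_{type_data}.csv"), index=False)
    return df_stats
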
inverse_transform(y)

Gets the final format of prediction:

    - Classification: classes from predictions
    - Regression: values (identity function)

Parameters:

    y (list | np.ndarray): Array-like, shape = [n_samples,] OR 1D array, shape = [n_classes] (only one prediction) (required)

Returns:

    (?): Array, shape = [n_samples, ?]

Source code in template_num/models_training/model_class.py
def inverse_transform(self, y: Union[list, np.ndarray]) -> Union[list, tuple]:
    '''Gets the final format of prediction
        - Classification : classes from predictions
        - Regression : values (identity function)

    Args:
        y (list | np.ndarray): Array-like, shape = [n_samples,]
               OR 1D array shape = [n_classes] (only one prediction)
    Returns:
        (?): Array, shape = [n_samples, ?]
    '''
    raise NotImplementedError("'inverse_transform' needs to be overridden")

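As an illustration of the contract for a classifier (list_classes and the labels below are made up): a 2D array maps to one class per row, while a 1D array is treated as a single prediction:

import numpy as np

list_classes = ['cat', 'dog']  # hypothetical classifier classes
y_batch = np.array([[0.2, 0.8], [0.9, 0.1]])   # shape = [n_samples, n_classes]
print([list_classes[i] for i in y_batch.argmax(axis=1)])  # ['dog', 'cat']
y_single = np.array([0.9, 0.1])                # shape = [n_classes], one prediction
print(list_classes[int(y_single.argmax())])    # 'cat'
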
predict(x_test, **kwargs)

Predictions on the test set

Parameters:

    x_test (pd.DataFrame): DataFrame with the test data to be predicted (required)

Returns:

    (np.ndarray): Array, shape = [n_samples, n_classes]

Source code in template_num/models_training/model_class.py
def predict(self, x_test: pd.DataFrame, **kwargs) -> np.ndarray:
    '''Predictions on the test set

    Args:
        x_test (pd.DataFrame): DataFrame with the test data to be predicted
    Returns:
        (np.ndarray): Array, shape = [n_samples, n_classes]
    '''
    raise NotImplementedError("'predict' needs to be overridden")

predict_proba(x_test, **kwargs)

Predicts probabilities on the test dataset

Parameters:

    x_test (pd.DataFrame): DataFrame with the test data to be predicted (required)

Returns:

    (np.ndarray): Array, shape = [n_samples, n_classes]

Source code in template_num/models_training/model_class.py
def predict_proba(self, x_test: pd.DataFrame, **kwargs) -> np.ndarray:
    '''Predicts probabilities on the test dataset

    Args:
        x_test (pd.DataFrame): DataFrame with the test data to be predicted
    Returns:
        (np.ndarray): Array, shape = [n_samples, n_classes]
    '''
    raise NotImplementedError("'predict_proba' needs to be overridden")

save(json_data=None)

Saves the model

Kwargs:

    json_data (dict): Additional configurations to be saved

Source code in template_num/models_training/model_class.py
def save(self, json_data: Union[dict, None] = None) -> None:
    '''Saves the model

    Kwargs:
        json_data (dict): Additional configurations to be saved
    '''

    # Manage paths
    pkl_path = os.path.join(self.model_dir, f"{self.model_name}.pkl")
    preprocess_pipeline_path = os.path.join(self.model_dir, "preprocess_pipeline.pkl")
    conf_path = os.path.join(self.model_dir, "configurations.json")

    # Save the model & preprocessing pipeline if level_save > 'LOW'
    if self.level_save in ['MEDIUM', 'HIGH']:
        with open(pkl_path, 'wb') as f:
            pickle.dump(self, f)
        # Useful for reload_from_standalone; otherwise it is already saved as a class attribute
        with open(preprocess_pipeline_path, 'wb') as f:
            pickle.dump(self.preprocess_pipeline, f)

    # Saving JSON configuration
    json_dict = {
        'maintainers': 'Agence DataServices',
        'gabarit_version': '1.3.4.dev0+local',
        'date': datetime.now().strftime("%d/%m/%Y - %H:%M:%S"),  # Not the same as the folder's name
        'package_version': utils.get_package_version(),
        'model_name': self.model_name,
        'model_dir': self.model_dir,
        'model_type': self.model_type,
        'trained': self.trained,
        'nb_fit': self.nb_fit,
        'x_col': self.x_col,
        'y_col': self.y_col,
        'columns_in': self.columns_in,
        'mandatory_columns': self.mandatory_columns,
        'random_seed': self.random_seed,
        'level_save': self.level_save,
        'librairie': None,
    }
    # Merge json_data if not None
    if json_data is not None:
        # Priority given to json_data !
        json_dict = {**json_dict, **json_data}

    # Add conf to attributes
    self.json_dict = json_dict

    # Save conf
    with open(conf_path, 'w', encoding='utf-8') as json_file:
        json.dump(json_dict, json_file, indent=4, cls=utils.NpEncoder)

    # Now, save a properties file for the model upload
    self._save_upload_properties(json_dict)
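
A hedged usage sketch; the fit_time value is made up, but the key is one of the allowed properties kept in properties.json by _save_upload_properties:

import os

# `model` is assumed to be an instance saved with level_save='HIGH'
model.save(json_data={'fit_time': '42.0s'})  # keys in json_data override the defaults

print(sorted(os.listdir(model.model_dir)))
# Expected contents:
#   configurations.json           -> the full json_dict
#   properties.json               -> filtered subset of the allowed properties
#   model_upload_instructions.md  -> upload instructions with the model path substituted
#   <model_name>.pkl              -> the pickled model itself
#   preprocess_pipeline.pkl       -> the pickled preprocessing pipeline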