Model class

`ModelClass`

Parent class for the models

Source code in template_vision/models_training/model_class.py

class ModelClass:
    '''Parent class for the models

    Manages what is common to every model: its name, its output directory, its level of saving
    and the serialization of the model & of its configurations.
    The methods fit, predict, predict_proba, inverse_transform and get_and_save_metrics only raise
    NotImplementedError here and need to be overridden.
    '''

    _default_name = 'none'
    # Variable annotation : https://www.python.org/dev/peps/pep-0526/
    # Solves lots of typing errors, cf mypy
    list_classes: list
    dict_classes: dict

    # Not implemented :
    # -> fit
    # -> predict
    # -> predict_proba
    # -> inverse_transform
    # -> get_and_save_metrics

    def __init__(self, model_dir: Union[str, None] = None, model_name: Union[str, None] = None,
                 level_save: str = 'HIGH', **kwargs) -> None:
        '''Initialization of the parent class.

        Kwargs:
            model_dir (str): Folder where to save the model
                If None, creates a directory based on the model's name and the date (most common usage)
            model_name (str): The name of the model
            level_save (str): Level of saving
                LOW: stats + configurations + logger keras - /!\\ The model can't be reused /!\\ -
                MEDIUM: LOW + hdf5 + pkl + plots
                HIGH: MEDIUM + predictions
        Raises:
            ValueError: If the object level_save is not a valid option (['LOW', 'MEDIUM', 'HIGH'])
            NotADirectoryError: If a provided model directory is not a directory (i.e. it's a file)
        '''
        if level_save not in ['LOW', 'MEDIUM', 'HIGH']:
            raise ValueError(f"The object level_save ({level_save}) is not a valid option (['LOW', 'MEDIUM', 'HIGH'])")

        # Get logger
        self.logger = logging.getLogger(__name__)

        # Model type -> 'classifier' or 'object_detector' depending on the model
        self.model_type = None

        # Model name
        self.model_name = self._default_name if model_name is None else model_name

        # Model folder
        if model_dir is None:
            self.model_dir = self._get_new_model_dir()
        else:
            # exist_ok=True -> no failure if the directory already exists (or is created in the meantime)
            # FileExistsError is then only raised if the path exists but is not a directory
            try:
                os.makedirs(model_dir, exist_ok=True)
            except FileExistsError as e:
                raise NotADirectoryError(f"{model_dir} is not a valid directory") from e
            self.model_dir = os.path.abspath(model_dir)

        # Other options
        self.level_save = level_save

        # is trained ?
        self.trained = False
        self.nb_fit = 0

        # Configuration dict. to be logged. Set on save.
        self.json_dict: Dict[Any, Any] = {}

    def fit(self, df_train, **kwargs) -> dict:
        '''Trains the model

        Args:
            df_train (pd.DataFrame): Train dataset
                Must contain file_path & file_class columns if classifier
                Must contain file_path & bboxes columns if object detector
        Returns:
            dict: Fit arguments, to be used with transfer learning fine-tuning
        Raises:
            NotImplementedError: Always, this method needs to be overridden
        '''
        raise NotImplementedError("'fit' needs to be overridden")

    def predict(self, df_test: pd.DataFrame, **kwargs) -> Union[np.ndarray, list]:
        '''Predictions on test set

        Args:
            df_test (pd.DataFrame): DataFrame to be predicted, with column file_path
        Returns:
            (np.ndarray | list): Array, shape = [n_samples, n_classes] or List of n_samples elements
        Raises:
            NotImplementedError: Always, this method needs to be overridden
        '''
        raise NotImplementedError("'predict' needs to be overridden")

    def predict_proba(self, df_test: pd.DataFrame, **kwargs) -> np.ndarray:
        '''Predicts probabilities on the test dataset

        Args:
            df_test (pd.DataFrame): DataFrame to be predicted, with column file_path
        Returns:
            (np.ndarray): Array, shape = [n_samples, n_classes]
        Raises:
            NotImplementedError: Always, this method needs to be overridden
        '''
        raise NotImplementedError("'predict_proba' needs to be overridden")

    def inverse_transform(self, y: Union[list, np.ndarray]) -> Union[list, tuple]:
        '''Gets the final format of prediction
            - Classification : classes from predictions
            - Object detections : list of bboxes per image

        Args:
            y (list | np.ndarray): Array-like
        Returns:
            List of classes if classifier
            List of bboxes if object detector
        Raises:
            NotImplementedError: Always, this method needs to be overridden
        '''
        raise NotImplementedError("'inverse_transform' needs to be overridden")

    def get_and_save_metrics(self, y_true, y_pred, list_files_x: Union[list, None] = None,
                             type_data: str = '') -> pd.DataFrame:
        '''Gets and saves the metrics of a model

        Args:
            y_true (?): Array-like [n_samples, 1] if classifier
                # If classifier, class of each image
                # If object detector, list of list of bboxes per image
                    bbox format : {'class': ..., 'x1': ..., 'y1': ..., 'x2': ..., 'y2': ...}
            y_pred (?): Array-like [n_samples, 1] if classifier
                # If classifier, class of each image
                # If object detector, list of list of bboxes per image
                    bbox format : {'class': ..., 'x1': ..., 'y1': ..., 'x2': ..., 'y2': ...}
        Kwargs:
            list_files_x (list): Input images file paths
            type_data (str): Type of dataset (validation, test, ...)
        Returns:
            pd.DataFrame: The dataframe containing statistics
        Raises:
            NotImplementedError: Always, this method needs to be overridden
        '''
        raise NotImplementedError("'get_and_save_metrics' needs to be overridden")

    def save(self, json_data: Union[dict, None] = None) -> None:
        '''Saves the model

        Writes, in the model directory :
            - a pickle of the model (only if level_save is 'MEDIUM' or 'HIGH')
            - configurations.json
            - the upload properties & instructions (cf. _save_upload_properties)

        Kwargs:
            json_data (dict): Additional configurations to be saved
        '''

        # Manage paths
        pkl_path = os.path.join(self.model_dir, f"{self.model_name}.pkl")
        conf_path = os.path.join(self.model_dir, "configurations.json")

        # Save model & preprocessing pipeline if level_save > 'LOW'
        if self.level_save in ['MEDIUM', 'HIGH']:
            with open(pkl_path, 'wb') as f:
                pickle.dump(self, f)

        # Save configuration JSON
        json_dict = {
            'maintainers': 'Agence DataServices',
            'gabarit_version': '1.3.4.dev0+local',
            'date': datetime.now().strftime("%d/%m/%Y - %H:%M:%S"),  # Not the same as the folder's name
            'package_version': utils.get_package_version(),
            'model_name': self.model_name,
            'model_dir': self.model_dir,
            'model_type': self.model_type,
            'trained': self.trained,
            'nb_fit': self.nb_fit,
            'level_save': self.level_save,
            'librairie': None,
        }
        # Merge json_data if not None
        if json_data is not None:
            # Priority given to json_data !
            json_dict = {**json_dict, **json_data}

        # Add conf to attributes
        self.json_dict = json_dict

        # Save conf
        with open(conf_path, 'w', encoding='utf-8') as json_file:
            json.dump(json_dict, json_file, indent=4, cls=utils.NpEncoder)

        # Now, save a properties file for the model upload
        self._save_upload_properties(json_dict)

    def _save_upload_properties(self, json_dict: Union[dict, None] = None) -> None:
        '''Prepares a configuration file for a future export (e.g on an artifactory)

        Writes properties.json (a filtered version of the configurations) and a copy of the
        upload instructions where the model directory placeholder is replaced.

        Kwargs:
            json_dict: Configurations to save
        '''
        if json_dict is None:
            json_dict = {}

        # Manage paths
        properties_path = os.path.join(self.model_dir, "properties.json")
        vanilla_model_upload_instructions = os.path.join(utils.get_ressources_path(), 'model_upload_instructions.md')
        specific_model_upload_instructions = os.path.join(self.model_dir, "model_upload_instructions.md")

        # First, we define a list of "allowed" properties
        allowed_properties = ["maintainers", "gabarit_version", "date", "package_version", "model_name", "list_classes",
                              "librairie", "fit_time"]
        # Now we filter these properties
        final_dict = {k: v for k, v in json_dict.items() if k in allowed_properties}
        # Save
        with open(properties_path, 'w', encoding='utf-8') as f:
            json.dump(final_dict, f, indent=4, cls=utils.NpEncoder)

        # Add instructions to upload a model to a storage solution (e.g. Artifactory)
        with open(vanilla_model_upload_instructions, 'r', encoding='utf-8') as f:
            content = f.read()
        # TODO: to be improved
        new_content = content.replace('model_dir_path_identifier', os.path.abspath(self.model_dir))
        with open(specific_model_upload_instructions, 'w', encoding='utf-8') as f:
            f.write(new_content)

    def _get_new_model_dir(self) -> str:
        '''Gets a folder where to save the model

        The folder is created under <models path>/<model name>/ and is named after the model's name
        and the current date. If this folder already exists, waits one second and tries again.

        Returns:
            str: Path to the folder
        '''
        models_dir = utils.get_models_path()
        subfolder = os.path.join(models_dir, self.model_name)
        while True:
            # The model's name is kept out of the strftime format : a '%' in the name must not be
            # interpreted as a date directive
            timestamp = datetime.now().strftime("%Y_%m_%d-%H_%M_%S")
            model_dir = os.path.join(subfolder, f"{self.model_name}_{timestamp}")
            # Creating the directory IS the existence check (no gap between the check and the creation)
            try:
                os.makedirs(model_dir)
            except FileExistsError:
                time.sleep(1)  # Wait 1 second so that the 'date' changes...
                continue
            return model_dir

    def display_if_gpu_activated(self) -> None:
        '''Displays if a GPU is being used'''
        if self._is_gpu_activated():
            self.logger.info("GPU activated")

    def _is_gpu_activated(self) -> bool:
        '''Checks if we use a GPU

        Returns:
            bool: whether GPU is available or not
        '''
        # By default, no GPU
        return False

`__init__(model_dir=None, model_name=None, level_save='HIGH', **kwargs)`

Initialization of the parent class.

Kwargs

model_dir (str): Folder where to save the model. If None, creates a directory based on the model's name and the date (most common usage). model_name (str): The name of the model. level_save (str): Level of saving. LOW: stats + configurations + logger keras - /!\ The model can't be reused /!\ - ; MEDIUM: LOW + hdf5 + pkl + plots; HIGH: MEDIUM + predictions.

Raises: ValueError: If the object level_save is not a valid option (['LOW', 'MEDIUM', 'HIGH']) NotADirectoryError: If a provided model directory is not a directory (i.e. it's a file)

Source code in template_vision/models_training/model_class.py

def __init__(self, model_dir: Union[str, None] = None, model_name: Union[str, None] = None,
             level_save: str = 'HIGH', **kwargs) -> None:
    '''Initialization of the parent class.

    Kwargs:
        model_dir (str): Folder where to save the model
            If None, creates a directory based on the model's name and the date (most common usage)
        model_name (str): The name of the model
        level_save (str): Level of saving
            LOW: stats + configurations + logger keras - /!\\ The model can't be reused /!\\ -
            MEDIUM: LOW + hdf5 + pkl + plots
            HIGH: MEDIUM + predictions
    Raises:
        ValueError: If the object level_save is not a valid option (['LOW', 'MEDIUM', 'HIGH'])
        NotADirectoryError: If a provided model directory is not a directory (i.e. it's a file)
    '''
    if level_save not in ['LOW', 'MEDIUM', 'HIGH']:
        raise ValueError(f"The object level_save ({level_save}) is not a valid option (['LOW', 'MEDIUM', 'HIGH'])")

    # Get logger
    self.logger = logging.getLogger(__name__)

    # Model type -> 'classifier' or 'object_detector' depending on the model
    self.model_type = None

    # Model name
    self.model_name = self._default_name if model_name is None else model_name

    # Model folder
    if model_dir is None:
        self.model_dir = self._get_new_model_dir()
    else:
        # exist_ok=True -> no failure if the directory already exists (or is created in the meantime)
        # FileExistsError is then only raised if the path exists but is not a directory
        try:
            os.makedirs(model_dir, exist_ok=True)
        except FileExistsError as e:
            raise NotADirectoryError(f"{model_dir} is not a valid directory") from e
        self.model_dir = os.path.abspath(model_dir)

    # Other options
    self.level_save = level_save

    # is trained ?
    self.trained = False
    self.nb_fit = 0

    # Configuration dict. to be logged. Set on save.
    self.json_dict: Dict[Any, Any] = {}

`display_if_gpu_activated()`

Displays if a GPU is being used

Source code in template_vision/models_training/model_class.py

def display_if_gpu_activated(self) -> None:
    '''Logs an information message when a GPU is being used (nothing is logged otherwise)'''
    gpu_in_use = self._is_gpu_activated()
    if not gpu_in_use:
        return
    self.logger.info("GPU activated")

`fit(df_train, **kwargs)`

Trains the model

Parameters:

Name	Type	Description	Default
`df_train`	`DataFrame`	Train dataset Must contain file_path & file_class columns if classifier Must contain file_path & bboxes columns if object detector	required

Returns: dict: Fit arguments, to be used with transfer learning fine-tuning

Source code in template_vision/models_training/model_class.py

def fit(self, df_train, **kwargs) -> dict:
    '''Trains the model - placeholder, to be overridden

    Args:
        df_train (pd.DataFrame): Train dataset
            Classifier : file_path & file_class columns are required
            Object detector : file_path & bboxes columns are required
    Returns:
        dict: Fit arguments, to be used with transfer learning fine-tuning
    Raises:
        NotImplementedError: Always, as long as the method is not overridden
    '''
    error_message = "'fit' needs to be overridden"
    raise NotImplementedError(error_message)

`get_and_save_metrics(y_true, y_pred, list_files_x=None, type_data='')`

Gets and saves the metrics of a model

Parameters:

Name	Type	Description	Default
`y_true`	`?`	Array-like [n_samples, 1] if classifier If classifier, class of each image If object detector, list of list of bboxes per image `bbox format : {'class': ..., 'x1': ..., 'y1': ..., 'x2': ..., 'y2': ...}`	required
`y_pred`	`?`	Array-like [n_samples, 1] if classifier If classifier, class of each image If object detector, list of list of bboxes per image `bbox format : {'class': ..., 'x1': ..., 'y1': ..., 'x2': ..., 'y2': ...}`	required

Kwargs: list_files_x (list): Input images file paths type_data (str): Type of dataset (validation, test, ...) Returns: pd.DataFrame: The dataframe containing statistics

Source code in template_vision/models_training/model_class.py

def get_and_save_metrics(self, y_true, y_pred, list_files_x: Union[list, None] = None,
                         type_data: str = '') -> pd.DataFrame:
    '''Gets and saves the metrics of a model - placeholder, to be overridden

    Args:
        y_true (?): Ground truth - Array-like [n_samples, 1] if classifier
            # Classifier : class of each image
            # Object detector : list of list of bboxes per image
                bbox format : {'class': ..., 'x1': ..., 'y1': ..., 'x2': ..., 'y2': ...}
        y_pred (?): Predictions - Array-like [n_samples, 1] if classifier
            # Classifier : class of each image
            # Object detector : list of list of bboxes per image
                bbox format : {'class': ..., 'x1': ..., 'y1': ..., 'x2': ..., 'y2': ...}
    Kwargs:
        list_files_x (list): Input images file paths
        type_data (str): Type of dataset (validation, test, ...)
    Returns:
        pd.DataFrame: The dataframe containing statistics
    Raises:
        NotImplementedError: Always, as long as the method is not overridden
    '''
    error_message = "'get_and_save_metrics' needs to be overridden"
    raise NotImplementedError(error_message)

`inverse_transform(y)`

Gets the final format of prediction - Classification : classes from predictions - Object detections : list of bboxes per image

Parameters:

Name	Type	Description	Default
`y`	`list \| ndarray`	Array-like	required

Returns: List of classes if classifier List of bboxes if object detector

Source code in template_vision/models_training/model_class.py

def inverse_transform(self, y: Union[list, np.ndarray]) -> Union[list, tuple]:
    '''Gets the final format of prediction - placeholder, to be overridden
        - Classification : classes from predictions
        - Object detections : list of bboxes per image

    Args:
        y (list | np.ndarray): Array-like
    Returns:
        List of classes if classifier
        List of bboxes if object detector
    Raises:
        NotImplementedError: Always, as long as the method is not overridden
    '''
    error_message = "'inverse_transform' needs to be overridden"
    raise NotImplementedError(error_message)

`predict(df_test, **kwargs)`

Predictions on test set

Parameters:

Name	Type	Description	Default
`df_test`	`DataFrame`	DataFrame to be predicted, with column file_path	required

Returns: (np.ndarray | list): Array, shape = [n_samples, n_classes] or List of n_samples elements

Source code in template_vision/models_training/model_class.py

def predict(self, df_test: pd.DataFrame, **kwargs) -> Union[np.ndarray, list]:
    '''Predictions on test set - placeholder, to be overridden

    Args:
        df_test (pd.DataFrame): DataFrame to be predicted, with column file_path
    Returns:
        (np.ndarray | list): Array, shape = [n_samples, n_classes] or List of n_samples elements
    Raises:
        NotImplementedError: Always, as long as the method is not overridden
    '''
    error_message = "'predict' needs to be overridden"
    raise NotImplementedError(error_message)

`predict_proba(df_test, **kwargs)`

Predicts probabilities on the test dataset

Parameters:

Name	Type	Description	Default
`df_test`	`DataFrame`	DataFrame to be predicted, with column file_path	required

Returns: (np.ndarray): Array, shape = [n_samples, n_classes]

Source code in template_vision/models_training/model_class.py

def predict_proba(self, df_test: pd.DataFrame, **kwargs) -> np.ndarray:
    '''Predicts probabilities on the test dataset - placeholder, to be overridden

    Args:
        df_test (pd.DataFrame): DataFrame to be predicted, with column file_path
    Returns:
        (np.ndarray): Array, shape = [n_samples, n_classes]
    Raises:
        NotImplementedError: Always, as long as the method is not overridden
    '''
    error_message = "'predict_proba' needs to be overridden"
    raise NotImplementedError(error_message)

`save(json_data=None)`

Saves the model

Kwargs

json_data (dict): Additional configurations to be saved

Source code in template_vision/models_training/model_class.py

def save(self, json_data: Union[dict, None] = None) -> None:
    '''Saves the model

    Writes, in the model directory, a pickle of the model (level_save 'MEDIUM' or 'HIGH' only)
    and configurations.json, then delegates to _save_upload_properties.

    Kwargs:
        json_data (dict): Additional configurations to be saved
    '''
    # Output files, both located in the model directory
    model_pickle_path = os.path.join(self.model_dir, f"{self.model_name}.pkl")
    configurations_path = os.path.join(self.model_dir, "configurations.json")

    # The model object itself is not persisted with the 'LOW' level
    if self.level_save in ['MEDIUM', 'HIGH']:
        with open(model_pickle_path, 'wb') as pickle_file:
            pickle.dump(self, pickle_file)

    # Default configurations
    base_configurations = {
        'maintainers': 'Agence DataServices',
        'gabarit_version': '1.3.4.dev0+local',
        'date': datetime.now().strftime("%d/%m/%Y - %H:%M:%S"),  # Not the same as the folder's name
        'package_version': utils.get_package_version(),
        'model_name': self.model_name,
        'model_dir': self.model_dir,
        'model_type': self.model_type,
        'trained': self.trained,
        'nb_fit': self.nb_fit,
        'level_save': self.level_save,
        'librairie': None,
    }

    # Additional configurations, if any, override the default ones
    if json_data is None:
        configurations = base_configurations
    else:
        configurations = {**base_configurations, **json_data}

    # Keep the configurations on the instance
    self.json_dict = configurations

    # Write the configurations
    with open(configurations_path, 'w', encoding='utf-8') as configurations_file:
        json.dump(configurations, configurations_file, indent=4, cls=utils.NpEncoder)

    # Finally, write the properties file used for the model upload
    self._save_upload_properties(configurations)

Model class

ModelClass

__init__(model_dir=None, model_name=None, level_save='HIGH', **kwargs)

display_if_gpu_activated()

fit(df_train, **kwargs)

get_and_save_metrics(y_true, y_pred, list_files_x=None, type_data='')

If classifier, class of each image

If object detector, list of list of bboxes per image

If classifier, class of each image

If object detector, list of list of bboxes per image

inverse_transform(y)

predict(df_test, **kwargs)

predict_proba(df_test, **kwargs)

save(json_data=None)

`ModelClass`

`__init__(model_dir=None, model_name=None, level_save='HIGH', **kwargs)`

`display_if_gpu_activated()`

`fit(df_train, **kwargs)`

`get_and_save_metrics(y_true, y_pred, list_files_x=None, type_data='')`

`inverse_transform(y)`

`predict(df_test, **kwargs)`

`predict_proba(df_test, **kwargs)`

`save(json_data=None)`