Utils models

display_train_test_shape(df_train, df_test, df_shape=None)

Displays the size of a train/test split

Parameters:
    df_train (DataFrame): Train dataset. Required.
    df_test (DataFrame): Test dataset. Required.

Kwargs:
    df_shape (int): Size of the initial dataset

Raises:
    ValueError: If the object df_shape is not positive

Source code in template_vision/models_training/utils_models.py
def display_train_test_shape(df_train: pd.DataFrame, df_test: pd.DataFrame, df_shape: Union[int, None] = None) -> None:
    '''Displays the size of a train/test split

    Args:
        df_train (pd.DataFrame): Train dataset
        df_test (pd.DataFrame): Test dataset
    Kwargs:
        df_shape (int): Size of the initial dataset
    Raises:
        ValueError: If the object df_shape is not positive
    '''
    if df_shape is not None and df_shape < 1:
        raise ValueError("The object df_shape must be positive")

    # Process
    if df_shape is None:
        df_shape = df_train.shape[0] + df_test.shape[0]
    logger.info(f"There are {df_train.shape[0]} lines in the train dataset and {df_test.shape[0]} in the test dataset.")
    logger.info(f"{round(100 * df_train.shape[0] / df_shape, 2)}% of data are in the train set")
    logger.info(f"{round(100 * df_test.shape[0] / df_shape, 2)}% of data are in the test set")

load_model(model_dir, is_path=False)

Loads a model from a path

Parameters:
    model_dir (str): Name of the folder containing the model (e.g. model_autres_2019_11_07-13_43_19). Required.

Kwargs:
    is_path (bool): If True, model_dir is a folder path instead of a name (permits loading a model from elsewhere)

Returns:
    ?: Model
    dict: Model configurations

Source code in template_vision/models_training/utils_models.py
def load_model(model_dir: str, is_path: bool = False) -> Tuple[Any, dict]:
    '''Loads a model from a path

    Args:
        model_dir (str): Name of the folder containing the model (e.g. model_autres_2019_11_07-13_43_19)
    Kwargs:
        is_path (bool): If True, model_dir is a folder path instead of a name (permits loading a model from elsewhere)
    Returns:
        ?: Model
        dict: Model configurations
    '''
    # Find model path
    base_folder = None if is_path else utils.get_models_path()
    model_path = utils.find_folder_path(model_dir, base_folder)

    # Get configs
    configuration_path = os.path.join(model_path, 'configurations.json')
    with open(configuration_path, 'r', encoding='utf-8') as f:
        configs = json.load(f)
    # Can't set int as keys in json, so need to cast it after reloading
    # dict_classes keys are always ints
    if 'dict_classes' in configs.keys() and configs['dict_classes'] is not None:
        configs['dict_classes'] = {int(k): v for k, v in configs['dict_classes'].items()}

    # Load model
    pkl_path = os.path.join(model_path, f"{configs['model_name']}.pkl")
    with open(pkl_path, 'rb') as f:
        model = pickle.load(f)

    # Change model_dir if diff
    if model_path != model.model_dir:
        model.model_dir = model_path
        configs['model_dir'] = model_path

    # Load specifics
    hdf5_path = os.path.join(model_path, 'best.hdf5')

    # TODO: we should probably have a single function `load_self` and let the model manage its own reload
    # Check for keras model
    if os.path.exists(hdf5_path):
        # If a specific reload function has been defined (e.g. faster RCNN), we use it
        if hasattr(model, 'reload_models_from_hdf5'):
            model.reload_models_from_hdf5(hdf5_path)
        else:
            model.model = model.reload_model(hdf5_path)

    # Display if GPU is being used
    model.display_if_gpu_activated()

    # Return model & configs
    return model, configs
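
A hedged sketch of both loading modes (the folder name is the docstring's example; the full path is a placeholder):

from template_vision.models_training.utils_models import load_model

# By name, resolved under the default models directory
model, configs = load_model('model_autres_2019_11_07-13_43_19')

# By full path, from anywhere on disk
model, configs = load_model('/path/to/model_folder', is_path=True)

print(configs['model_name'])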

normal_split(df, test_size=0.25, seed=None)

Splits a DataFrame into train and test sets

Parameters:
    df (DataFrame): Dataframe containing the data. Required.

Kwargs:
    test_size (float): Proportion representing the size of the expected test set
    seed (int): Random seed

Raises:
    ValueError: If the object test_size is not between 0 and 1

Returns:
    DataFrame: Train dataframe
    DataFrame: Test dataframe

Source code in template_vision/models_training/utils_models.py
def normal_split(df: pd.DataFrame, test_size: float = 0.25, seed: Union[int, None] = None) -> Tuple[pd.DataFrame, pd.DataFrame]:
    '''Splits a DataFrame into train and test sets

    Args:
        df (pd.DataFrame): Dataframe containing the data
    Kwargs:
        test_size (float): Proportion representing the size of the expected test set
        seed (int): random seed
    Raises:
        ValueError: If the object test_size is not between 0 and 1
    Returns:
        DataFrame: Train dataframe
        DataFrame: Test dataframe
    '''
    if not 0 <= test_size <= 1:
        raise ValueError('The object test_size must be between 0 and 1')

    # Normal split
    logger.info("Normal split")
    df_train, df_test = train_test_split(df, test_size=test_size, random_state=seed)

    # Display
    display_train_test_shape(df_train, df_test, df_shape=df.shape[0])

    # Return
    return df_train, df_test
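
A usage sketch with a toy dataframe (column names are illustrative):

import pandas as pd

from template_vision.models_training.utils_models import normal_split

df = pd.DataFrame({'file_path': [f"img_{i}.png" for i in range(100)],
                   'class': ['cat'] * 50 + ['dog'] * 50})

# 80/20 split, reproducible via the seed; sizes are logged automatically
df_train, df_test = normal_split(df, test_size=0.2, seed=42)
assert df_train.shape[0] == 80 and df_test.shape[0] == 20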

predict(data_input, model, model_conf, return_proba=False, **kwargs)

Gets predictions of a model on images

Parameters:
    data_input (str | list<str> | np.ndarray | pd.DataFrame): New content to be predicted. Required.
        - str: abs. path to an image
        - list<str>: list of abs. paths to images
        - np.ndarray: an already loaded image
            Several images can be given at once if the array is 4-dim (i.e. (nb_images, width, height, channels))
        - pd.DataFrame: Dataframe with a column file_path (abs. paths to images)
    model (ModelClass): Model to use. Required.
    model_conf (dict): Model configurations. Required.

Kwargs:
    return_proba (bool): If probabilities must be returned instead of predictions

Raises:
    NotImplementedError: If the model is an object detection model
    FileNotFoundError: If the input file does not exist (input type == str)
    FileNotFoundError: If one of the input files does not exist (input type == list)
    ValueError: If the input image format is not compatible (input type == np.ndarray)
    ValueError: If the input array is not compatible (input type == np.ndarray)
    ValueError: If the input DataFrame does not contain a 'file_path' column (input type == pd.DataFrame)
    ValueError: If the input type is not a valid type option

Returns:
    List[str] or np.ndarray: predictions or probabilities
        - If return_proba -> np.ndarray
        - Else -> List[str]

Source code in template_vision/models_training/utils_models.py
def predict(data_input: Union[str, List[str], np.ndarray, pd.DataFrame], model, model_conf: dict,
            return_proba: bool = False, **kwargs) -> Union[List[str], np.ndarray]:
    '''Gets predictions of a model on images

    Args:
        data_input (str | list<str> | np.ndarray | pd.DataFrame): New content to be predicted
            - str: abs. path to an image
            - list<str>: list of abs. paths to images
            - np.ndarray: an already loaded image
                Several images can be given at once if the array is 4-dim (i.e. (nb_images, width, height, channels))
            - pd.DataFrame: Dataframe with a column file_path (abs. paths to images)
        model (ModelClass): Model to use
        model_conf (dict): Model configurations
    Kwargs:
        return_proba (bool): If probabilities must be returned instead of predictions
    Raises:
        NotImplementedError: If the model is an object detection model
        FileNotFoundError: If the input file does not exist (input type == str)
        FileNotFoundError: If one of the input files does not exist (input type == list)
        ValueError: If the input image format is not compatible (input type == np.ndarray)
        ValueError: If the input array is not compatible (input type == np.ndarray)
        ValueError: If the input DataFrame does not contain a 'file_path' column (input type == pd.DataFrame)
        ValueError: If the input type is not a valid type option
    Returns:
        List[str] or np.ndarray: predictions or probabilities
            - If return_proba -> np.ndarray
            - Else -> List[str]
    '''
    # TODO
    # TODO
    # TODO: Make this work with object_detector !!!
    # TODO
    # TODO
    if model.model_type == 'object_detector':
        raise NotImplementedError("`predict` is not yet implemented for object detection task")

    ##############################################
    # Retrieve data - PIL format (list)
    ##############################################

    # Type 1: absolute path
    if isinstance(data_input, str):
        if not os.path.exists(data_input):
            raise FileNotFoundError(f"The file {data_input} does not exist")
        images = [Image.open(data_input)]

    # Type 2: list of absolute paths
    elif isinstance(data_input, list):
        if not all([os.path.exists(_) for _ in data_input]):
            raise FileNotFoundError("At least one of the input paths does not exist")
        images = [Image.open(_) for _ in data_input]

    # Type 3: numpy array
    elif isinstance(data_input, np.ndarray):
        # If a single image (3 dims), expand to a 4-dim batch of one
        if len(data_input.shape) == 3:
            data_input = np.expand_dims(data_input, 0)
        # Consider input as image list
        if len(data_input.shape) == 4:
            images = []
            for i in range(data_input.shape[0]):
                np_image = data_input[i]
                # RGB
                if np_image.shape[-1] == 3:
                    images.append(Image.fromarray(np_image, 'RGB'))
                elif np_image.shape[-1] == 4:
                    images.append(Image.fromarray(np_image, 'RGBA'))
                else:
                    raise ValueError(f"Input image format ({np_image.shape}) is not compatible")
        else:
            raise ValueError(f"Input array format ({type(data_input)}) is not valid")

    # Type 4: pd.DataFrame
    elif isinstance(data_input, pd.DataFrame):
        if 'file_path' not in data_input.columns:
            raise ValueError("The input DataFrame does not contains a 'file_path' column (mandatory)")
        file_paths = list(data_input['file_path'].values)
        if not all([os.path.exists(_) for _ in file_paths]):
            raise FileNotFoundError("At least one of the input paths does not exist")
        images = [Image.open(_) for _ in file_paths]

    # No solution
    else:
        raise ValueError(f"Input type ({type(data_input)}) is not a valid type option.")

    ##############################################
    # Apply preprocessing
    ##############################################

    # Get preprocessor
    if 'preprocess_str' in model_conf.keys():
        preprocess_str = model_conf['preprocess_str']
    else:
        preprocess_str = "no_preprocess"
    preprocessor = preprocess.get_preprocessor(preprocess_str)

    # Preprocess
    images_preprocessed = preprocessor(images)

    ##############################################
    # Save all preprocessed images in a temporary directory
    ##############################################

    # We'll create a temporary folder to save preprocessed images
    with tempfile.TemporaryDirectory(dir=utils.get_data_path()) as tmp_folder:
        # Save images
        images_path = []
        for i, img in enumerate(images_preprocessed):
            img_path = os.path.join(tmp_folder, f"image_{i}.png")
            img.save(img_path, format='PNG')
            images_path.append(img_path)

        # Get predictions
        df = pd.DataFrame({'file_path': images_path})
        predictions, probas = model.predict_with_proba(df)

    # On exiting the context manager, all temporary data is deleted

    ##############################################
    # Return result
    ##############################################
    if return_proba:
        return probas
    else:
        return model.inverse_transform(predictions)
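
A hedged sketch of the supported input types (file paths are placeholders; the model folder name is the docstring's example):

import numpy as np

from template_vision.models_training.utils_models import load_model, predict

model, model_conf = load_model('model_autres_2019_11_07-13_43_19')

# Single path -> list with one predicted label
preds = predict('/path/to/image.png', model, model_conf)

# List of paths with return_proba=True -> np.ndarray of shape (nb_images, nb_classes)
probas = predict(['/path/to/img_1.png', '/path/to/img_2.png'],
                 model, model_conf, return_proba=True)

# Already loaded RGB image(s): (width, height, 3) or (nb_images, width, height, 3)
img = np.zeros((224, 224, 3), dtype=np.uint8)
preds = predict(img, model, model_conf)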

predict_with_proba(data_input, model, model_conf)

Gets a model's predictions along with their probabilities on a dataset

Parameters:
    data_input (str | list<str> | np.ndarray | pd.DataFrame): New content to be predicted. Required.
        - str: abs. path to an image
        - list<str>: list of abs. paths to images
        - np.ndarray: an already loaded image
            Several images can be given at once if the array is 4-dim (i.e. (nb_images, width, height, channels))
        - pd.DataFrame: Dataframe with a column file_path (abs. paths to images)
    model (ModelClass): Model to use. Required.
    model_conf (dict): Model configurations. Required.

Raises:
    NotImplementedError: If the model is an object detection model
    ValueError: If predict does not return an np.ndarray

Returns:
    Tuple[List[str], List[float]]: predictions, probabilities

Source code in template_vision/models_training/utils_models.py
def predict_with_proba(data_input: Union[str, List[str], np.ndarray, pd.DataFrame], model,
                       model_conf: dict) -> Tuple[List[str], List[float]]:
    '''Gets a model's predictions along with their probabilities on a dataset

    Args:
        data_input (str | list<str> | np.ndarray | pd.DataFrame): New content to be predicted
            - str: abs. path to an image
            - list<str>: list of abs. paths to images
            - np.ndarray: an already loaded image
                Several images can be given at once if the array is 4-dim (i.e. (nb_images, width, height, channels))
            - pd.DataFrame: Dataframe with a column file_path (abs. paths to images)
        model (ModelClass): Model to use
        model_conf (dict): Model configurations
    Raises:
        NotImplementedError: If the model is an object detection model
        ValueError: If predict does not return an np.ndarray
    Returns:
        Tuple[List[str], List[float]]: predictions, probabilities
    '''
    if model.model_type == 'object_detector':
        raise NotImplementedError("`predict_with_proba` is not yet implemented for object detection task")

    # Get probas
    probas = predict(data_input, model, model_conf, return_proba=True)

    # Check type
    if not isinstance(probas, np.ndarray):
        raise ValueError("Internal error - probas should be an np.ndarray.")

    # Get the most probable class for each element, then map back to the original labels
    predictions = model.get_classes_from_proba(probas)
    predictions = model.inverse_transform(predictions)
    max_probas = list(probas.max(axis=1))

    # Return
    return predictions, max_probas
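
A short sketch pairing each predicted label with its highest class probability (paths and folder name are placeholders):

from template_vision.models_training.utils_models import load_model, predict_with_proba

model, model_conf = load_model('model_autres_2019_11_07-13_43_19')
predictions, max_probas = predict_with_proba(['/path/to/img_1.png', '/path/to/img_2.png'],
                                             model, model_conf)
for pred, proba in zip(predictions, max_probas):
    print(f"{pred}: {proba:.2%}")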

remove_small_classes(df, col, min_rows=2)

Deletes the classes with too few elements

Parameters:
    df (DataFrame): Dataframe containing the data. Required.
    col (str | int): Column containing the classes. Required.

Kwargs:
    min_rows (int): Minimal number of rows a class must have to be kept (default: 2)

Raises:
    ValueError: If the object min_rows is not positive

Returns:
    pd.DataFrame: New dataset

Source code in template_vision/models_training/utils_models.py
def remove_small_classes(df: pd.DataFrame, col: Union[str, int], min_rows: int = 2) -> pd.DataFrame:
    '''Deletes the classes with too few elements

    Args:
        df (pd.DataFrame): Dataframe containing the data
        col (str | int): Column containing the classes
    Kwargs:
        min_rows (int): Minimal number of rows a class must have to be kept (default: 2)
    Raises:
        ValueError: If the object min_rows is not positive
    Returns:
        pd.DataFrame: New dataset
    '''
    if min_rows < 1:
        raise ValueError("The object min_rows must be positive")

    # Looking for classes with less than min_rows lines
    v_count = df[col].value_counts()
    classes_to_remove = list(v_count[v_count < min_rows].index.values)
    for cl in classes_to_remove:
        logger.warning(f"/!\\ /!\\ /!\\ Class {cl} has less than {min_rows} lines in the training set.")
        logger.warning("/!\\ /!\\ /!\\ This class is automatically removed from the dataset.")
    return df[~df[col].isin(classes_to_remove)]
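
A toy example (class names are illustrative):

import pandas as pd

from template_vision.models_training.utils_models import remove_small_classes

df = pd.DataFrame({'class': ['cat'] * 10 + ['dog'] * 5 + ['bird']})
df_filtered = remove_small_classes(df, 'class', min_rows=2)

# 'bird' has a single row, hence it is removed (with a warning logged)
assert sorted(df_filtered['class'].unique()) == ['cat', 'dog']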

stratified_split(df, col, test_size=0.25, seed=None)

Splits a DataFrame into train and test sets - Stratified strategy

Parameters:
    df (DataFrame): Dataframe containing the data. Required.
    col (str | int): Column on which to do the stratified split. Required.

Kwargs:
    test_size (float): Proportion representing the size of the expected test set
    seed (int): Random seed

Raises:
    ValueError: If the object test_size is not between 0 and 1

Returns:
    DataFrame: Train dataframe
    DataFrame: Test dataframe

Source code in template_vision/models_training/utils_models.py
def stratified_split(df: pd.DataFrame, col: Union[str, int], test_size: float = 0.25,
                     seed: Union[int, None] = None) -> Tuple[pd.DataFrame, pd.DataFrame]:
    '''Splits a DataFrame into train and test sets - Stratified strategy

    Args:
        df (pd.DataFrame): Dataframe containing the data
        col (str or int): column on which to do the stratified split
    Kwargs:
        test_size (float): Proportion representing the size of the expected test set
        seed (int): Random seed
    Raises:
        ValueError: If the object test_size is not between 0 and 1
    Returns:
        DataFrame: Train dataframe
        DataFrame: Test dataframe
    '''
    if not 0 <= test_size <= 1:
        raise ValueError('The object test_size must be between 0 and 1')

    # Stratified split
    logger.info("Stratified split")
    df = remove_small_classes(df, col, min_rows=math.ceil(1 / test_size))  # minimum number of rows per class needed for the split
    df_train, df_test = train_test_split(df, stratify=df[col], test_size=test_size, random_state=seed)

    # Display
    display_train_test_shape(df_train, df_test, df_shape=df.shape[0])

    # Return
    return df_train, df_test
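
Note that classes with fewer than ceil(1 / test_size) rows are removed before splitting (e.g. 4 rows when test_size=0.25). A usage sketch with a toy dataframe (column names are illustrative):

import pandas as pd

from template_vision.models_training.utils_models import stratified_split

df = pd.DataFrame({'file_path': [f"img_{i}.png" for i in range(100)],
                   'class': ['cat'] * 80 + ['dog'] * 20})
df_train, df_test = stratified_split(df, 'class', test_size=0.25, seed=42)

# Class proportions are preserved: ~80% 'cat' / 20% 'dog' in both splits
print(df_train['class'].value_counts(normalize=True))
print(df_test['class'].value_counts(normalize=True))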