Utils

`NpEncoder`

Bases: JSONEncoder

JSON encoder to manage numpy objects

Source code in template_nlp/utils.py

class NpEncoder(json.JSONEncoder):
    '''JSON encoder that also handles numpy-like objects and sets'''

    def default(self, obj) -> Any:
        '''Converts an object that the standard encoder does not handle

        Args:
            obj: Object to serialize
        Returns:
            Any: A representation of obj built from builtin types
        '''
        # Numpy-like objects are delegated to the dedicated conversion helper
        if is_ndarray_convertable(obj):
            return ndarray_to_builtin_object(obj)
        # Sets are serialized as lists
        if isinstance(obj, set):
            return list(obj)
        # Anything else is left to the parent implementation
        return super().default(obj)

`data_agnostic_str_to_list(function)`

Decorator to transform a string into a list of one element, and retrieve the first element of the function's return value. Idea: be able to do predict(my_string). Otherwise, we would have to do prediction = predict([my_string])[0]

Parameters:

Name	Type	Description	Default
`function`	`func`	Function to decorate	required

Returns: function: The decorated function

Source code in template_nlp/utils.py

def data_agnostic_str_to_list(function: Callable) -> Callable:
    '''Decorator to transform a string into a list of one element,
    and retrieve the first element of what the function returns.
    Idea: be able to do `predict(my_string)`
    Otherwise, we would have to do `prediction = predict([my_string])[0]`

    Any other input is passed to the decorated function untouched.

    Args:
        function (func): Function to decorate. It is called as a method,
            i.e. `function(self, x, *args, **kwargs)`
    Returns:
        function: The decorated function
    '''
    # Local import so that the decorator does not rely on a module-level import
    import functools

    # functools.wraps keeps the name & docstring of the decorated function
    @functools.wraps(function)
    def wrapper(self, x, *args, **kwargs):
        '''Wrapper'''
        # isinstance also catches str subclasses, which would
        # otherwise be iterated character by character
        if isinstance(x, str):
            # Cast str into a single element list, then cast the result back to a single element
            return function(self, [x], *args, **kwargs)[0]
        return function(self, x, *args, **kwargs)
    return wrapper

`display_shape(df)`

Displays the number of lines and columns of a table.

Parameters:

Name	Type	Description	Default
`df`	`DataFrame`	Table to parse	required

Source code in template_nlp/utils.py

def display_shape(df: pd.DataFrame) -> None:
    '''Logs the dimensions (lines & columns) of a table.

    Args:
        df (pd.DataFrame): Table whose shape is logged
    '''
    # Build the message first, then send it to the logger at INFO level
    message = f"Number of lines : {df.shape[0]}. Number of columns : {df.shape[1]}."
    logger.info(message)

`find_folder_path(folder_name, base_folder=None)`

Find a folder in a base folder and its subfolders. If base_folder is None, treats folder_name as a path and checks that it exists.

i.e., with the following structure : - C:/ - base_folder/ - folderA/ - folderB/ - folderC/ find_folder_path(folderA, C:/base_folder) == C:/base_folder/folderA find_folder_path(folderB, C:/base_folder) == C:/base_folder/folderA/folderB find_folder_path(C:/base_folder/folderC, None) == C:/base_folder/folderC find_folder_path(folderB, None) raises an error

Parameters:

Name	Type	Description	Default
`folder_name`	`str`	name of the folder to find. If base_folder is None, consider a path instead.	required

Kwargs: base_folder (str): path of the base folder. If None, consider folder_name as a path. Raises: FileNotFoundError: If we can't find folder_name in base_folder FileNotFoundError: If folder_name is not a valid path (case where base_folder is None) Returns: str: path to the wanted folder

Source code in template_nlp/utils.py

def find_folder_path(folder_name: str, base_folder: Union[str, None] = None) -> str:
    '''Locates a folder, either by searching a base folder or by checking a path.

    - If base_folder is given, base_folder and all its subfolders are walked,
      looking for a directory named folder_name.
      If several directories match, the last one met during the walk is returned.
    - If base_folder is None, folder_name is considered as a path which must exist.

    i.e., with the following structure :
    - C:/
        - base_folder/
            - folderA/
                - folderB/
            - folderC/
    find_folder_path(folderA, C:/base_folder) == C:/base_folder/folderA
    find_folder_path(folderB, C:/base_folder) == C:/base_folder/folderA/folderB
    find_folder_path(C:/base_folder/folderC, None) == C:/base_folder/folderC
    find_folder_path(folderB, None) raises an error

    Args:
        folder_name (str): name of the folder to find. If base_folder is None, consider a path instead.
    Kwargs:
        base_folder (str): path of the base folder. If None, consider folder_name as a path.
    Raises:
        FileNotFoundError: If we can't find folder_name in base_folder
        FileNotFoundError: If folder_name is not a valid path (case where base_folder is None)
    Returns:
        str: path to the wanted folder
    '''
    # Case 1 : no base folder, folder_name is a path
    if base_folder is None:
        folder_path = folder_name
        if os.path.exists(folder_path):
            return folder_path
        raise FileNotFoundError(f"Can't find folder {folder_path} (considered as a path)")

    # Case 2 : search base_folder & its subfolders, collecting every match in walk order
    matches = [
        os.path.join(root, subdir)
        for root, subdirs, _ in os.walk(base_folder)
        for subdir in subdirs
        if subdir == folder_name
    ]
    if not matches:
        raise FileNotFoundError(f"Can't find folder {folder_name} inside {base_folder} and its subfolders")
    # The last match wins
    return matches[-1]

`get_chunk_limits(x, chunksize=10000)`

Gets chunk limits from a pandas series or dataframe.

Parameters:

Name	Type	Description	Default
`x`	`Series or DataFrame`	Documents to consider	required

Kwargs: chunksize (int): The chunk size Raises: ValueError: If the chunk size is negative Returns: list: the chunk limits

Source code in template_nlp/utils.py

def get_chunk_limits(x: Union[pd.DataFrame, pd.Series], chunksize: int = 10000) -> List[Tuple[int, int]]:
    '''Gets chunk limits from a pandas series or dataframe.

    Each limit is a (start, end) pair of positions, start included and end excluded.
    A chunk size of 0, or a chunk size covering all the rows, gives a single chunk.

    Args:
        x (pd.Series or pd.DataFrame): Documents to consider
    Kwargs:
        chunksize (int): The chunk size
    Raises:
        ValueError: If the chunk size is negative
    Returns:
        list<tuple>: the chunk limits
    '''
    if chunksize < 0:
        raise ValueError('The object chunksize must not be negative.')
    nb_rows = x.shape[0]
    # Only one chunk needed
    if chunksize == 0 or chunksize >= nb_rows:
        return [(0, nb_rows)]
    # One chunk every `chunksize` rows, the last one being truncated to the number of rows
    return [(start, min(start + chunksize, nb_rows)) for start in range(0, nb_rows, chunksize)]

`get_data_path()`

Returns the path to the data folder

Returns:

Name	Type	Description
`str`	`str`	Path of the data folder

Source code in template_nlp/utils.py

def get_data_path() -> str:
    '''Returns the path to the data folder

    The folder is 'template_nlp-data'. It is located in DIR_PATH if DIR_PATH is set,
    otherwise two levels above this file. It is created if it does not exist yet
    (along with any missing parent folder).

    Returns:
        str: Path of the data folder
    '''
    if DIR_PATH is None:
        base_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
    else:
        base_dir = os.path.abspath(DIR_PATH)
    dir_path = os.path.join(base_dir, 'template_nlp-data')
    # exist_ok avoids the race between "check it exists" and "create it"
    # (e.g. several processes starting at the same time)
    os.makedirs(dir_path, exist_ok=True)
    return os.path.abspath(dir_path)

`get_models_path()`

Returns the path to the models folder

Returns:

Name	Type	Description
`str`	`str`	Path of the models folder

Source code in template_nlp/utils.py

def get_models_path() -> str:
    '''Returns the path to the models folder

    The folder is 'template_nlp-models'. It is located in DIR_PATH if DIR_PATH is set,
    otherwise two levels above this file. It is created if it does not exist yet
    (along with any missing parent folder).

    Returns:
        str: Path of the models folder
    '''
    if DIR_PATH is None:
        base_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
    else:
        base_dir = os.path.abspath(DIR_PATH)
    dir_path = os.path.join(base_dir, 'template_nlp-models')
    # exist_ok avoids the race between "check it exists" and "create it"
    # (e.g. several processes starting at the same time)
    os.makedirs(dir_path, exist_ok=True)
    return os.path.abspath(dir_path)

`get_new_column_name(column_list, wanted_name)`

Gets a new column name from a list of existing ones & a wanted name

If the wanted name does not exist, return it. Otherwise, get a new column name prefixed by the wanted name.

Parameters:

Name	Type	Description	Default
`column_list`	`list`	List of existing columns	required
`wanted_name`	`str`	Wanted name	required

Source code in template_nlp/utils.py

def get_new_column_name(column_list: list, wanted_name: str) -> str:
    '''Gets a column name that is not already used, based on a wanted name

    If the wanted name is free, it is returned as is.
    Otherwise a random 8-character suffix is appended (wanted_name_xxxxxxxx).

    Args:
        column_list (list): List of existing columns
        wanted_name (str): Wanted name
    Returns:
        str: A name that is not in column_list
    '''
    candidate = wanted_name
    # A second collision should not happen, but we still loop until the name is available (bad luck ?)
    while candidate in column_list:
        candidate = f'{candidate}_{str(uuid.uuid4())[:8]}'
    return candidate

`get_package_version()`

Returns the current version of the package

Returns:

Name	Type	Description
`str`	`str`	version of the package

Source code in template_nlp/utils.py

def get_package_version() -> str:
    '''Gets the version of the 'template_nlp' package from its installed metadata

    Returns:
        str: version of the package
    '''
    return importlib.metadata.version('template_nlp')

`get_ressources_path()`

Returns the path to the ressources folder

Returns:

Name	Type	Description
`str`	`str`	Path of the ressources folder

Source code in template_nlp/utils.py

def get_ressources_path() -> str:
    '''Returns the path to the ressources folder

    The folder is 'template_nlp-ressources', located two levels above this file.
    It is created if it does not exist yet.

    Returns:
        str: Path of the ressources folder
    '''
    base_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
    dir_path = os.path.join(base_dir, 'template_nlp-ressources')
    # exist_ok avoids the race between "check it exists" and "create it"
    # (e.g. several processes starting at the same time)
    os.makedirs(dir_path, exist_ok=True)
    return os.path.abspath(dir_path)

`get_transformers_path()`

Returns the path to the transformers folder

Returns:

Name	Type	Description
`str`	`str`	Path of the transformers folder

Source code in template_nlp/utils.py

def get_transformers_path() -> str:
    '''Returns the path to the transformers folder

    The folder is 'template_nlp-transformers'. It is located in DIR_PATH if DIR_PATH is set,
    otherwise two levels above this file. It is created if it does not exist yet
    (along with any missing parent folder).

    Returns:
        str: Path of the transformers folder
    '''
    if DIR_PATH is None:
        base_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
    else:
        base_dir = os.path.abspath(DIR_PATH)
    dir_path = os.path.join(base_dir, 'template_nlp-transformers')
    # exist_ok avoids the race between "check it exists" and "create it"
    # (e.g. several processes starting at the same time)
    os.makedirs(dir_path, exist_ok=True)
    return os.path.abspath(dir_path)

`is_ndarray_convertable(obj)`

Returns True if the object is convertible to a builtin type in the same way a np.ndarray is

Parameters:

Name	Type	Description	Default
`obj`	`Any`	an object to test	required

Returns: bool: True if the object is convertible to a list as a np.ndarray is

Source code in template_nlp/utils.py

def is_ndarray_convertable(obj: Any) -> bool:
    '''Checks whether an object can be converted to a builtin type the way a np.ndarray is

    The check is duck-typed: the object must expose `dtype`, `astype` and `tolist`.

    Args:
        obj (Any): an object to test
    Returns:
        bool: True if the object exposes the three attributes, False otherwise
    '''
    required_attributes = ("dtype", "astype", "tolist")
    return all(hasattr(obj, attribute) for attribute in required_attributes)

`ndarray_to_builtin_object(obj)`

Transform a numpy.ndarray like object to a builtin type like int, float or list

Parameters:

Name	Type	Description	Default
`obj`	`Any`	An object	required

Raises: ValueError: Raise a ValueError when obj is not ndarray convertable Returns: Any: The object converted to a builtin type like int, float or list

Source code in template_nlp/utils.py

def ndarray_to_builtin_object(obj: Any) -> Any:
    '''Converts a numpy.ndarray like object into builtin types (int, float, list, ...)

    Integer dtypes are cast to int, other numeric dtypes are cast to float,
    and anything else is converted with `tolist` only.

    Args:
        obj (Any): An object
    Raises:
        ValueError: Raise a ValueError when obj is not ndarray convertable
    Returns:
        Any: The object converted to a builtin type like int, float or list
    '''
    # Reject objects that can't be handled
    if not is_ndarray_convertable(obj):
        raise ValueError(f"{obj} is not ndarray convertable")

    # Pick the builtin type to cast to, based on the dtype
    dtype = obj.dtype
    if np.issubdtype(dtype, np.integer):
        target_type = int
    elif np.issubdtype(dtype, np.number):
        target_type = float
    else:
        # Not a numeric dtype : no cast needed
        return obj.tolist()
    return obj.astype(target_type).tolist()

`read_csv(file_path, sep=';', encoding='utf-8', dtype=str, **kwargs)`

Reads a .csv file and parses the first line.

Parameters:

Name	Type	Description	Default
`file_path`	`str`	Path to the .csv file containing the data	required

Kwargs: sep (str): Separator of the data file encoding (str): Encoding of the data file kwargs: Pandas' kwargs Raises: FileNotFoundError: If the file_path object does not point to an existing file Returns: pd.DataFrame: Data str: First line of the .csv (None if not beginning with #) and with no line break

Source code in template_nlp/utils.py

def read_csv(file_path: str, sep: str = ';', encoding: str = 'utf-8', dtype: type = str, **kwargs) -> Tuple[pd.DataFrame, Union[str, None]]:
    '''Loads a .csv file, handling an optional metadata first line.

    If the first line of the file starts with '#', it is considered as metadata :
    it is skipped when loading the data, and returned without its line break.
    Missing values of the loaded data are replaced by empty strings.

    Args:
        file_path (str): Path to the .csv file containing the data
    Kwargs:
        sep (str): Separator of the data file
        encoding (str): Encoding of the data file
        dtype (type): Type used to load the data
        kwargs: Pandas' kwargs
    Raises:
        FileNotFoundError: If the file_path object does not point to an existing file
    Returns:
        pd.DataFrame: Data
        str: First line of the .csv (None if not beginning with #) and with no line break
    '''
    if not os.path.isfile(file_path):
        raise FileNotFoundError(f"The file {file_path} does not exist")

    # Peek at the first line to know if there are some metadata
    with open(file_path, 'r', encoding=encoding) as f:
        first_line = f.readline()

    # No metadata : plain loading, nothing else to return
    if not first_line.startswith('#'):
        df = pd.read_csv(file_path, sep=sep, encoding=encoding, dtype=dtype, **kwargs).fillna('')
        return df, None

    # Metadata : the first line is not part of the data
    df = pd.read_csv(file_path, sep=sep, encoding=encoding, dtype=dtype, skiprows=1, **kwargs).fillna('')
    # Remove one trailing line break, then one trailing carriage return
    for terminator in ('\n', '\r'):
        if first_line.endswith(terminator):
            first_line = first_line[:-1]
    return df, first_line

`to_csv(df, file_path, first_line=None, sep=';', encoding='utf-8', **kwargs)`

Writes a .csv and manages the first line.

Parameters:

Name	Type	Description	Default
`df`	`DataFrame`	Data to write	required
`file_path`	`str`	Path to the file to create	required

Kwargs: first_line (str): First line to write (without line break which is done in this function) sep (str): Separator for the data file encoding (str): Encoding of the data file kwargs: pandas' kwargs

Source code in template_nlp/utils.py

def to_csv(df: pd.DataFrame, file_path: str, first_line: Union[str, None] = None, sep: str = ';',
           encoding: str = 'utf-8', **kwargs) -> None:
    '''Writes a .csv and manages the first line.

    The index of the dataframe is not written.

    Args:
        df (pd.DataFrame): Data to write
        file_path (str): Path to the file to create
    Kwargs:
        first_line (str): First line to write (without line break which is done in this function)
        sep (str): Separator for the data file
        encoding (str): Encoding of the data file
        kwargs: pandas' kwargs
    '''
    # newline='' disables the newline translation of the file object, as required by pandas
    # when it is given a file handle. Otherwise each line break written by pandas on Windows
    # ('\r\n') is translated into '\r\r\n'
    with open(file_path, 'w', encoding=encoding, newline='') as f:
        if first_line is not None:
            # We add the first line if metadata are present.
            # As the translation is disabled, we explicitly use the platform line break
            f.write(first_line + os.linesep)
        df.to_csv(f, sep=sep, encoding=encoding, index=False, **kwargs)

`trained_needed(function)`

Decorator to ensure that a model has been trained.

Parameters:

Name	Type	Description	Default
`function`	`func`	Function to decorate	required

Returns: function: The decorated function

Source code in template_nlp/utils.py

def trained_needed(function: Callable) -> Callable:
    '''Decorator to ensure that a model has been trained.

    The decorated function is called as a method, and the `trained` attribute
    of its instance is checked before each call.

    Args:
        function (func): Function to decorate
    Returns:
        function: The decorated function. It raises an AttributeError
            when called on an instance whose `trained` attribute is falsy
    '''
    # Local import so that the decorator does not rely on a module-level import
    import functools

    # functools.wraps keeps the name & docstring of the decorated function
    @functools.wraps(function)
    def wrapper(self, *args, **kwargs):
        '''Wrapper'''
        if not self.trained:
            raise AttributeError(f"The function {function.__name__} can't be called as long as the model hasn't been fitted")
        return function(self, *args, **kwargs)
    return wrapper