Skip to content

Preprocess

get_preprocessor(preprocess_str)

Gets a preprocessing (function) from its name

Parameters:

Name Type Description Default
preprocess_str str

Name of the preprocess

required

Raises: ValueError, if the name of the preprocess is not known. Returns: Callable, the function to be used for the preprocessing.

Source code in template_nlp/preprocessing/preprocess.py
def get_preprocessor(preprocess_str: str) -> Callable:
    '''Looks up a preprocessing function by its name

    Args:
        preprocess_str (str): Name of the preprocess
    Raises:
        ValueError: If the name of the preprocess is not known
    Returns:
        Callable: Function to be used for the preprocessing
    '''
    available = get_preprocessors_dict()
    # Known name -> hand back the registered function right away
    if preprocess_str in available:
        return available[preprocess_str]
    # Any other name is rejected
    raise ValueError(f"The preprocess {preprocess_str} is not known.")

get_preprocessors_dict()

Gets a dictionary of the available preprocessing functions

Returns:

Name Type Description
dict dict

Dictionary of preprocessing

Source code in template_nlp/preprocessing/preprocess.py
def get_preprocessors_dict() -> dict:
    '''Builds the mapping from preprocessing names to the matching functions

    Returns:
        dict: Preprocessing functions indexed by their name
    '''
    # - /!\ DO NOT DELETE 'no_preprocess' -> necessary for compatibility /!\ -
    available = {'no_preprocess': lambda x: x}
    # Example of a preprocessing ; further ones are registered the same way
    # (e.g. 'preprocess_P2' -> preprocess_sentence_P2, etc.)
    available['preprocess_P1'] = preprocess_sentence_P1
    return available

preprocess_sentence_P1(docs)

Applies "default" preprocess to a list of documents (text)

Parameters:

Name Type Description Default
docs Series

Documents to be preprocessed

required

Returns: pd.Series, the preprocessed documents.

Source code in template_nlp/preprocessing/preprocess.py
@wnf_utils.data_agnostic
@wnf_utils.regroup_data_series
def preprocess_sentence_P1(docs: pd.Series) -> pd.Series:
    '''Runs the "default" preprocessing pipeline on a list of documents (text)

    Args:
        docs (pd.Series): Documents to be preprocessed
    Returns:
        pd.Series: Preprocessed documents
    '''
    # Step names, in the order they are handed over to the pipeline
    steps = [
        'remove_non_string',
        'get_true_spaces',
        'remove_punct',
        'to_lower',
        'trim_string',
        'remove_leading_and_ending_spaces',
    ]
    return api.preprocess_pipeline(docs, pipeline=steps, chunksize=100000)