class ModelClass:
'''Parent class for the models'''
_default_name = 'none'
    # Variable annotations: https://www.python.org/dev/peps/pep-0526/
    # Solves lots of typing errors, cf. mypy
multi_label: Union[bool, None]
list_classes: Union[list, None]
dict_classes: Union[dict, None]
    # Not implemented:
# -> fit
# -> predict
# -> predict_proba
# -> inverse_transform
# -> get_and_save_metrics
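    # A concrete subclass is expected to override them - a minimal sketch (the class
    # name and the training logic are illustrative only):
    #
    #   class ModelDummy(ModelClass):
    #       def fit(self, x_train, y_train, **kwargs) -> None:
    #           x_train, y_train = self._check_input_format(x_train, y_train, fit_function=True)
    #           # ... actual training logic goes here ...
    #           self.trained = True
    #           self.nb_fit += 1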
def __init__(self, model_dir: Union[str, None] = None, model_name: Union[str, None] = None,
x_col: Union[list, None] = None, y_col: Union[str, int, list, None] = None, random_seed: Union[int, None] = None,
preprocess_pipeline: Union[ColumnTransformer, None] = None, level_save: str = 'HIGH', **kwargs) -> None:
'''Initialization of the parent class.
Kwargs:
model_dir (str): Folder where to save the model
If None, creates a directory based on the model's name and the date (most common usage)
model_name (str): The name of the model
x_col (list): Names of the columns used for the training - x
            y_col (str or int or list if multi-label): Name of the model's target column(s) - y
            random_seed (int): Seed to use for package randomness
            preprocess_pipeline (ColumnTransformer): The pipeline used for preprocessing. If None -> no preprocessing!
            level_save (str): Level of saving
                LOW: stats + configurations + keras logger - /!\\ The model can't be reused /!\\ -
MEDIUM: LOW + hdf5 + pkl + plots
HIGH: MEDIUM + predictions
Raises:
ValueError: If the object level_save is not a valid option (['LOW', 'MEDIUM', 'HIGH'])
NotADirectoryError: If a provided model directory is not a directory (i.e. it's a file)
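        Example:
            A minimal usage sketch (assumes a concrete subclass, here a hypothetical ModelDummy):
                model = ModelDummy(model_name='my_model', x_col=['col_1', 'col_2'], y_col='target')
                # model_dir is None -> a new dated folder is created under the models path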
'''
if level_save not in ['LOW', 'MEDIUM', 'HIGH']:
raise ValueError(f"The object level_save ({level_save}) is not a valid option (['LOW', 'MEDIUM', 'HIGH'])")
# Get logger
self.logger = logging.getLogger(__name__)
# Model type -> 'classifier' or 'regressor' depending on the model
self.model_type = None
# Model name
self.model_name = self._default_name if model_name is None else model_name
# Names of the columns used
self.x_col = x_col
self.y_col = y_col
# Random seed
self.random_seed = random_seed
# Can be None if reloading a model
if x_col is None:
self.logger.warning("Warning, the attribute x_col is not given! The model might not work as intended.")
if y_col is None:
self.logger.warning("Warning, the attribute y_col is not given! The model might not work as intended.")
# Model folder
if model_dir is None:
self.model_dir = self._get_new_model_dir()
else:
if not os.path.exists(model_dir):
os.makedirs(model_dir)
if not os.path.isdir(model_dir):
raise NotADirectoryError(f"{model_dir} is not a valid directory")
self.model_dir = os.path.abspath(model_dir)
# Preprocessing pipeline
self.preprocess_pipeline = preprocess_pipeline
if self.preprocess_pipeline is not None:
try:
check_is_fitted(self.preprocess_pipeline)
except NotFittedError as e:
                self.logger.error("The preprocessing pipeline hasn't been fitted!")
                self.logger.error(repr(e))
                raise NotFittedError("The preprocessing pipeline hasn't been fitted") from e
# We get the associated columns (and a check if there has been a fit is done)
self.columns_in, self.mandatory_columns = utils_models.get_columns_pipeline(self.preprocess_pipeline)
else:
            # We can't define a "no_preprocess" pipeline here since it would need to be fitted
            # So we take care of that at the first fit
            self.logger.warning("Warning, no preprocessing pipeline given!")
self.columns_in, self.mandatory_columns = None, None
# Other options
self.level_save = level_save
        # Is the model trained?
self.trained = False
self.nb_fit = 0
# Configuration dict. to be logged. Set on save.
self.json_dict: Dict[Any, Any] = {}
def fit(self, x_train, y_train, **kwargs) -> None:
'''Trains the model
Args:
x_train (?): Array-like, shape = [n_samples, n_features]
y_train (?): Array-like, shape = [n_samples, n_targets]
'''
raise NotImplementedError("'fit' needs to be overridden")
def predict(self, x_test: pd.DataFrame, **kwargs) -> np.ndarray:
'''Predictions on the test set
Args:
x_test (pd.DataFrame): DataFrame with the test data to be predicted
Returns:
(np.ndarray): Array, shape = [n_samples, n_classes]
'''
raise NotImplementedError("'predict' needs to be overridden")
def predict_proba(self, x_test: pd.DataFrame, **kwargs) -> np.ndarray:
'''Predicts probabilities on the test dataset
Args:
x_test (pd.DataFrame): DataFrame with the test data to be predicted
Returns:
(np.ndarray): Array, shape = [n_samples, n_classes]
'''
raise NotImplementedError("'predict_proba' needs to be overridden")
def inverse_transform(self, y: Union[list, np.ndarray]) -> Union[list, tuple]:
'''Gets the final format of prediction
- Classification : classes from predictions
- Regression : values (identity function)
Args:
y (list | np.ndarray): Array-like, shape = [n_samples,]
OR 1D array shape = [n_classes] (only one prediction)
Returns:
(?): Array, shape = [n_samples, ?]
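        Example:
            Expected behaviour sketch (the exact mapping depends on the subclass):
                - Classifier with list_classes = ['a', 'b']: a one-hot prediction [0, 1] maps back to 'b'
                - Regressor: values are returned unchanged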
'''
raise NotImplementedError("'inverse_transform' needs to be overridden")
def get_and_save_metrics(self, y_true, y_pred, df_x: Union[pd.DataFrame, None] = None,
series_to_add: Union[List[pd.Series], None] = None,
type_data: str = '') -> pd.DataFrame:
'''Gets and saves the metrics of a model
Args:
y_true (?): Array-like, shape = [n_samples, n_targets]
y_pred (?): Array-like, shape = [n_samples, n_targets]
Kwargs:
            df_x (pd.DataFrame or None): Input DataFrame used for the prediction
series_to_add (list): List of pd.Series to add to the dataframe
type_data (str): Type of dataset (validation, test, ...)
Returns:
pd.DataFrame: The dataframe containing the statistics
'''
raise NotImplementedError("'get_and_save_metrics' needs to be overridden")
def save(self, json_data: Union[dict, None] = None) -> None:
'''Saves the model
Kwargs:
json_data (dict): Additional configurations to be saved
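        Example:
            Override-priority sketch: entries in json_data take precedence over the default entries, e.g.
                model.save(json_data={'librairie': 'scikit-learn', 'fit_time': '42s'})
            writes 'librairie': 'scikit-learn' in configurations.json instead of the default None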
'''
# Manage paths
pkl_path = os.path.join(self.model_dir, f"{self.model_name}.pkl")
preprocess_pipeline_path = os.path.join(self.model_dir, "preprocess_pipeline.pkl")
conf_path = os.path.join(self.model_dir, "configurations.json")
# Save the model & preprocessing pipeline if level_save > 'LOW'
if self.level_save in ['MEDIUM', 'HIGH']:
with open(pkl_path, 'wb') as f:
pickle.dump(self, f)
            # Useful for reload_from_standalone; otherwise it is saved as a class attribute
with open(preprocess_pipeline_path, 'wb') as f:
pickle.dump(self.preprocess_pipeline, f)
# Saving JSON configuration
json_dict = {
'maintainers': 'Agence DataServices',
'gabarit_version': '1.3.4.dev0+local',
'date': datetime.now().strftime("%d/%m/%Y - %H:%M:%S"), # Not the same as the folder's name
'package_version': utils.get_package_version(),
'model_name': self.model_name,
'model_dir': self.model_dir,
'model_type': self.model_type,
'trained': self.trained,
'nb_fit': self.nb_fit,
'x_col': self.x_col,
'y_col': self.y_col,
'columns_in': self.columns_in,
'mandatory_columns': self.mandatory_columns,
'random_seed': self.random_seed,
'level_save': self.level_save,
'librairie': None,
}
# Merge json_data if not None
if json_data is not None:
            # Priority given to json_data!
json_dict = {**json_dict, **json_data}
# Add conf to attributes
self.json_dict = json_dict
# Save conf
with open(conf_path, 'w', encoding='utf-8') as json_file:
json.dump(json_dict, json_file, indent=4, cls=utils.NpEncoder)
# Now, save a properties file for the model upload
self._save_upload_properties(json_dict)
def _save_upload_properties(self, json_dict: Union[dict, None] = None) -> None:
        '''Prepares a configuration file for a future export (e.g. to an Artifactory repository)
Kwargs:
json_dict: Configurations to save
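        Example:
            Filtering sketch: with json_dict = {'model_name': 'my_model', 'model_dir': '/tmp/my_model'},
            only 'model_name' ends up in properties.json ('model_dir' is not an allowed property)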
'''
if json_dict is None:
json_dict = {}
# Manage paths
properties_path = os.path.join(self.model_dir, "properties.json")
vanilla_model_upload_instructions = os.path.join(utils.get_ressources_path(), 'model_upload_instructions.md')
specific_model_upload_instructions = os.path.join(self.model_dir, "model_upload_instructions.md")
# First, we define a list of "allowed" properties
allowed_properties = ["maintainers", "gabarit_version", "date", "package_version", "model_name", "list_classes",
"librairie", "fit_time"]
# Now we filter these properties
final_dict = {k: v for k, v in json_dict.items() if k in allowed_properties}
# Save
with open(properties_path, 'w', encoding='utf-8') as f:
json.dump(final_dict, f, indent=4, cls=utils.NpEncoder)
# Add instructions to upload a model to a storage solution (e.g. Artifactory)
with open(vanilla_model_upload_instructions, 'r', encoding='utf-8') as f:
content = f.read()
# TODO: to be improved
new_content = content.replace('model_dir_path_identifier', os.path.abspath(self.model_dir))
with open(specific_model_upload_instructions, 'w', encoding='utf-8') as f:
f.write(new_content)
def _get_new_model_dir(self) -> str:
        '''Gets a new folder in which to save the model
Returns:
str: Path to the folder
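        Example:
            For model_name = 'my_model', the returned path looks like
            <models_path>/my_model/my_model_2023_01_01-12_00_00 (creation date)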
'''
models_dir = utils.get_models_path()
subfolder = os.path.join(models_dir, self.model_name)
folder_name = datetime.now().strftime(f"{self.model_name}_%Y_%m_%d-%H_%M_%S")
model_dir = os.path.join(subfolder, folder_name)
if os.path.isdir(model_dir):
time.sleep(1) # Wait 1 second so that the 'date' changes...
return self._get_new_model_dir() # Get new directory name
else:
os.makedirs(model_dir)
return model_dir
def _check_input_format(self, x_input: Union[pd.DataFrame, np.ndarray], y_input: Union[pd.DataFrame, pd.Series, np.ndarray, None] = None,
fit_function: bool = False) -> Tuple[Union[pd.DataFrame, np.ndarray], Union[pd.DataFrame, pd.Series, np.ndarray, None]]:
'''Checks the inputs of a function. We check the number of columns and the ordering.
        Strategy:
            - If fit function, set preprocessing pipeline, columns_in, mandatory_columns, x_col (if not set), y_col (if not set) with input data
            - Then, for both x & y:
                - If the input data has a columns attribute:
                    - If we can find all the needed columns, reorder the dataset using only the needed columns (so it works if we have more columns)
                    - Else, raise an error if the lengths do not match (otherwise log a warning)
                - Else, raise an error if the lengths do not match (otherwise log a warning)
We also set the pipeline to a passthrough pipeline if None
Args:
x_input (pd.DataFrame, np.ndarray): Array-like, shape = [n_samples, n_features]
Kwargs:
y_input (pd.DataFrame, pd.Series, np.ndarray): Array-like, shape = [n_samples, n_target]
Mandatory if fit_function
fit_function (bool): If it is a fit function
Raises:
AttributeError: If fit_function == True, but y_input is None
            ValueError: If one of the inputs does not have the right number of columns
Returns:
(pd.DataFrame, np.ndarray): x_input, may be reordered if needed
(pd.DataFrame, pd.Series, np.ndarray): y_input, may be reordered if needed
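        Example:
            Reordering sketch: with x_col = ['a', 'b'] and an input DataFrame with columns ['b', 'a', 'c'],
            the returned x_input is the ['a', 'b'] subset, in that order (a warning is logged
            about the extra column)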
'''
# Getting some info first
x_input_shape = x_input.shape[-1] if len(x_input.shape) > 1 else 1
if y_input is not None:
y_input_shape = y_input.shape[-1] if len(y_input.shape) > 1 else 1
else:
y_input_shape = 0 # not used
# Manage fit_function = True
if fit_function:
if y_input is None:
raise AttributeError("The argument y_input is mandatory if fit_function == True")
# Set x_col if not set yet
if self.x_col is None:
self.logger.warning("Warning, the attribute x_col was not given when creating the model")
self.logger.warning("We set it now with the input data of the fit function")
if hasattr(x_input, 'columns'):
# TODO : tmp mypy fix https://github.com/python/mypy/pull/13544
self.x_col = list(x_input.columns) # type: ignore
else:
                    self.x_col = list(range(x_input_shape))
# Same thing for y_col
if self.y_col is None:
self.logger.warning("Warning, the attribute y_col was not given when creating the model")
self.logger.warning("We set it now with the input data of the fit function")
if hasattr(y_input, 'columns'):
# TODO : tmp mypy fix https://github.com/python/mypy/pull/13544
self.y_col = list(y_input.columns) # type: ignore
else:
                    self.y_col = list(range(y_input_shape))
# If there is only one element, we get rid of the list
if y_input_shape == 1:
self.y_col = self.y_col[0]
# If pipeline, columns_in or mandatory_columns is None, sets it
            if self.preprocess_pipeline is None:  # i.e. no pipeline given when initializing the class
preprocess_str = "no_preprocess"
preprocess_pipeline = preprocess.get_pipeline(preprocess_str) # Warning, the pipeline must be fitted
preprocess_pipeline.fit(x_input) # We fit the pipeline to set the necessary columns for the pipeline
# Set attributes
self.preprocess_pipeline = preprocess_pipeline
self.columns_in, self.mandatory_columns = utils_models.get_columns_pipeline(self.preprocess_pipeline)
# Checking x_input
if self.x_col is None:
self.logger.warning("Can't check the input format (x) because x_col is not set...")
else:
x_col_len = len(self.x_col)
# We check the presence of the columns
if hasattr(x_input, 'columns'):
can_reorder = True
for col in self.x_col:
# TODO : tmp mypy fix https://github.com/python/mypy/pull/13544
if col not in x_input.columns: # type: ignore
can_reorder = False
self.logger.warning(f"The column {col} is missing from the input (x)")
                # If we can't reorder:
                # 1. Exact number of columns: we log a warning message and continue with the columns renamed
                # 2. Wrong number of columns: raise an error
if not can_reorder:
if x_input_shape != x_col_len:
raise ValueError(f"Input data (x) is not in the right format ({x_input_shape} != {x_col_len})")
self.logger.warning("The names of the columns (x) do not match. The process continues since there is the right number of columns")
                    x_input = x_input.copy()  # needs a copy as we will change the column names
x_input.columns = self.x_col # type: ignore
                # If we can reorder:
                # 1. Same number of columns but not the same order -> we just reorder
                # 2. More columns? -> we just take the needed subset + log a warning message
else:
# TODO : tmp mypy fix https://github.com/python/mypy/pull/13544
if list(x_input.columns) != self.x_col: # type: ignore
if x_input_shape == x_col_len:
                            self.logger.warning("The input columns (x) are not in the right order -> automatic reordering!")
else:
self.logger.warning("More columns in input (x) than needed, but we can find the needed columns -> only considering the needed columns")
x_input = x_input[self.x_col]
else:
                if x_input_shape != x_col_len:
raise ValueError(f"Input data (x) is not in the right format ({x_input_shape} != {x_col_len})")
self.logger.warning("The input (x) does not have the 'columns' attribute -> can't check the ordering of the columns")
# Checking y_input
if y_input is not None:
if self.y_col is None:
self.logger.warning("Can't check the input format (y) because y_col is not set...")
else:
# Checking y_input format
                y_col_len = len(self.y_col) if isinstance(self.y_col, list) else 1
# We check the presence of the columns
if hasattr(y_input, 'columns'):
can_reorder = True
                    target_cols = self.y_col if isinstance(self.y_col, list) else [self.y_col]
for col in target_cols:
# TODO : tmp mypy fix https://github.com/python/mypy/pull/13544
if col not in y_input.columns: # type: ignore
can_reorder = False
self.logger.warning(f"The column {col} is missing from the input (y)")
                    # If we can't reorder:
                    # 1. Exact number of columns: we log a warning message and continue with the columns renamed
                    # 2. Wrong number of columns: raise an error
if not can_reorder:
if y_input_shape != y_col_len:
raise ValueError(f"Input data (y) is not in the right format ({y_input_shape} != {y_col_len})")
self.logger.warning("The names of the columns (y) do not match. The process continues since there is the right number of columns")
                        y_input = y_input.copy()  # needs a copy as we will change the column names
y_input.columns = self.y_col # type: ignore
                    # If we can reorder:
                    # 1. Same number of columns but not the same order -> we just reorder
                    # 2. More columns? -> we just take the needed subset + log a warning message
else:
# TODO : tmp mypy fix https://github.com/python/mypy/pull/13544
if list(y_input.columns) != target_cols: # type: ignore
if y_input_shape == y_col_len:
                                self.logger.warning("The input columns (y) are not in the right order -> automatic reordering!")
else:
self.logger.warning("More columns in input (y) than needed, but we can find the needed columns -> only considering the needed columns")
y_input = y_input[target_cols]
else:
if y_input_shape != y_col_len:
raise ValueError(f"Input data (y) is not in the right format ({y_input_shape} != {y_col_len})")
self.logger.warning("The input (y) does not have the 'columns' attribute -> can't check the ordering of the columns")
# Return
return x_input, y_input
def display_if_gpu_activated(self) -> None:
'''Displays if a GPU is being used'''
if self._is_gpu_activated():
self.logger.info("GPU activated")
def _is_gpu_activated(self) -> bool:
'''Checks if we use a GPU
Returns:
bool: whether GPU is available or not
'''
# By default, no GPU
return False
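    # Subclasses backed by a deep learning framework are expected to override
    # _is_gpu_activated - an illustrative sketch (assumes tensorflow is available):
    #
    #   def _is_gpu_activated(self) -> bool:
    #       import tensorflow as tf
    #       return len(tf.config.list_physical_devices('GPU')) > 0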