Skip to content

Model tfidf dense

ModelTfidfDense

Bases: ModelKeras

Model for predictions via TF-IDF vectorization followed by dense (fully connected) layers

Source code in template_nlp/models_training/models_tensorflow/model_tfidf_dense.py
class ModelTfidfDense(ModelKeras):
    '''Model for predictions via TF-IDF + Dense

    Pipeline: a TfidfVectorizer turns raw texts into (dense) TF-IDF features,
    which are fed to a small fully connected Keras network.
    '''

    _default_name = 'model_tfidf_dense'

    def __init__(self, tfidf_params: Union[dict, None] = None, **kwargs) -> None:
        '''Initialization of the class (see ModelClass & ModelKeras for more arguments).

        Kwargs:
            tfidf_params (dict) : Parameters for the tfidf
        '''
        # Init.
        super().__init__(**kwargs)

        # Get logger (must be done after super init)
        self.logger = logging.getLogger(__name__)

        # Instantiate the vectorizer now; it is fitted later, in _prepare_x_train
        if tfidf_params is None:
            tfidf_params = {}
        self.tfidf = TfidfVectorizer(**tfidf_params)

    def _prepare_x_train(self, x_train) -> np.ndarray:
        '''Prepares the input data for the model. Called when fitting the model

        Fits the TF-IDF vectorizer on x_train, then returns the transformed data.

        Args:
            x_train (?): Array-like, shape = [n_samples, n_features]
        Returns:
            (np.ndarray): Prepared data
        '''
        # Fit tfidf & return x transformed
        self.tfidf.fit(x_train)
        # Use .toarray() (not .todense()): it returns a true np.ndarray as per the
        # annotation, while .todense() returns the deprecated np.matrix type.
        # TODO: densification needed because tensorflow 2.3 does not support sparse data anymore
        return self.tfidf.transform(x_train).toarray()

    def _prepare_x_test(self, x_test) -> np.ndarray:
        '''Prepares the input data for the model. Called at prediction time.

        Uses the TF-IDF vectorizer fitted during training (no re-fit here).

        Args:
            x_test (?): Array-like, shape = [n_samples, n_features]
        Returns:
            (np.ndarray): Prepared data
        '''
        # Transform only - the tfidf has been fitted on the train set
        # TODO: densification needed because tensorflow 2.3 does not support sparse data anymore
        return self.tfidf.transform(x_test).toarray()

    def _hidden_block(self, inputs, units: int, random_state: np.random.RandomState, limit: int):
        '''Builds one Dense -> BatchNormalization -> ELU -> Dropout block.

        Args:
            inputs: Input Keras tensor
            units (int): Number of units of the Dense layer
            random_state (np.random.RandomState): Seed source for the initializers
            limit (int): Exclusive upper bound for the drawn seeds
        Returns:
            Output Keras tensor of the block
        '''
        x = Dense(units, activation=None, kernel_initializer=HeUniform(random_state.randint(limit)))(inputs)
        x = BatchNormalization(momentum=0.9)(x)
        x = ELU(alpha=1.0)(x)
        x = Dropout(0.5, seed=random_state.randint(limit))(x)
        return x

    def _get_model(self) -> Model:
        '''Gets a model structure - returns the instance model instead if already defined

        Returns:
            (Model): a Keras model
        '''
        # Return model if already set
        if self.model is not None:
            return self.model

        # Get input/output dimensions
        # Input dim = vocabulary size of the fitted tfidf
        input_dim = len(self.tfidf.get_feature_names_out())
        num_classes = len(self.list_classes)

        # Get random_state - used to seed each layer deterministically
        random_state = np.random.RandomState(self.random_seed)
        limit = int(1e9)

        # Process: stack of Dense/BatchNorm/ELU/Dropout blocks of decreasing width.
        # Seeds are drawn in the same order as before (HeUniform then Dropout, per block).
        tfidf_features = Input(shape=(input_dim,))
        x = tfidf_features
        for units in (128, 64, 32):
            x = self._hidden_block(x, units, random_state, limit)

        # Last layer - sigmoid for multi-labels (independent classes), softmax otherwise
        activation = 'sigmoid' if self.multi_label else 'softmax'
        out = Dense(num_classes, activation=activation, kernel_initializer=GlorotUniform(random_state.randint(limit)))(x)

        # Compile model
        model = Model(inputs=tfidf_features, outputs=[out])
        lr = self.keras_params.get('learning_rate', 0.002)
        decay = self.keras_params.get('decay', 0.0)
        self.logger.info(f"Learning rate: {lr}")
        self.logger.info(f"Decay: {decay}")
        # NOTE(review): `lr` / `decay` args are valid for the TF 2.3 target mentioned above,
        # but are removed in recent Keras (use `learning_rate` there) - confirm TF version.
        optimizer = Adam(lr=lr, decay=decay)
        loss = 'binary_crossentropy' if self.multi_label else 'categorical_crossentropy'  # utils_deep_keras.f1_loss also possible if multi-labels
        metrics: List[Union[str, Callable]] = ['accuracy'] if not self.multi_label else ['categorical_accuracy', utils_deep_keras.f1, utils_deep_keras.precision, utils_deep_keras.recall]
        model.compile(optimizer=optimizer, loss=loss, metrics=metrics)
        if self.logger.getEffectiveLevel() < logging.ERROR:
            model.summary()

        # Try to save model as png if level_save > 'LOW'
        if self.level_save in ['MEDIUM', 'HIGH']:
            self._save_model_png(model)

        # Return
        return model

    @tf.function(input_signature=(tf.TensorSpec(shape=(None, None), dtype=tf.float64, name='x'), ))
    def _serve(self, x: np.ndarray):
        '''Improves predict function using tf.function (cf. https://www.tensorflow.org/guide/function)
        Args:
            x (np.ndarray): input data
        Returns:
            tf.tensor: model's output
        '''
        # training=False: inference mode (disables dropout, freezes batch-norm statistics)
        return self.model(x, training=False)

    def save(self, json_data: Union[dict, None] = None) -> None:
        '''Saves the model

        Kwargs:
            json_data (dict): Additional configurations to be saved
        '''
        # Save configuration JSON
        if json_data is None:
            json_data = {}

        # Add tfidf params
        confs = self.tfidf.get_params()
        # Get rid of some non serializable conf (cast to str so the JSON dump works)
        for special_conf in ['dtype', 'base_estimator']:
            if special_conf in confs.keys():
                confs[special_conf] = str(confs[special_conf])
        json_data['tfidf_confs'] = confs

        # Save tfidf if not None & level_save > LOW
        if (self.tfidf is not None) and (self.level_save in ['MEDIUM', 'HIGH']):
            # Manage paths
            tfidf_path = os.path.join(self.model_dir, "tfidf_standalone.pkl")
            # Save as pickle
            with open(tfidf_path, 'wb') as f:
                pickle.dump(self.tfidf, f)

        # Save
        super().save(json_data=json_data)

    def _load_standalone_files(self, default_model_dir: Union[str, None] = None,  # type: ignore
                               tfidf_path: Union[str, None] = None, *args, **kwargs):
        '''Loads standalone files for a newly created model via _init_new_instance_from_configs

        Kwargs:
            default_model_dir (str): a path to look for default file paths
                                     If None, standalone files path should all be provided
            tfidf_path (str): Path to the TFIDF file
        Raises:
            ValueError: If the TFIDF file is not specified and can't be inferred
            FileNotFoundError: If the TFIDF file does not exist
        '''
        # Check if we are able to get all needed paths
        if default_model_dir is None and tfidf_path is None:
            raise ValueError("The TFIDF file is not specified and can't be inferred")

        # Call parent
        super()._load_standalone_files(default_model_dir=default_model_dir, **kwargs)

        # Retrieve file paths - fall back to the default location inside default_model_dir
        if tfidf_path is None:
            tfidf_path = os.path.join(default_model_dir, "tfidf_standalone.pkl")

        # Check paths exists
        if not os.path.isfile(tfidf_path):
            raise FileNotFoundError(f"Can't find the TFIDF file ({tfidf_path})")

        # Reload tfidf
        # NOTE(review): pickle.load on a model artifact - only load files produced by save()
        with open(tfidf_path, 'rb') as f:
            self.tfidf = pickle.load(f)

__init__(tfidf_params=None, **kwargs)

Initialization of the class (see ModelClass & ModelKeras for more arguments).

Kwargs

tfidf_params (dict) : Parameters for the tfidf

Source code in template_nlp/models_training/models_tensorflow/model_tfidf_dense.py
def __init__(self, tfidf_params: Union[dict, None] = None, **kwargs) -> None:
    '''Initialization of the class (see ModelClass & ModelKeras for more arguments).

    Kwargs:
        tfidf_params (dict) : Parameters for the tfidf
    '''
    # Parent init. comes first
    super().__init__(**kwargs)

    # The logger must be retrieved after the parent init.
    self.logger = logging.getLogger(__name__)

    # Build the TF-IDF vectorizer from the provided parameters (defaults when None)
    self.tfidf = TfidfVectorizer(**(tfidf_params if tfidf_params is not None else {}))

save(json_data=None)

Saves the model

Kwargs

json_data (dict): Additional configurations to be saved

Source code in template_nlp/models_training/models_tensorflow/model_tfidf_dense.py
def save(self, json_data: Union[dict, None] = None) -> None:
    '''Saves the model

    Kwargs:
        json_data (dict): Additional configurations to be saved
    '''
    # Start from an empty configuration when none is given
    json_data = {} if json_data is None else json_data

    # Expose the TF-IDF parameters in the configuration JSON
    confs = self.tfidf.get_params()
    # Cast the non JSON-serializable entries to strings
    for special_conf in ('dtype', 'base_estimator'):
        if special_conf in confs:
            confs[special_conf] = str(confs[special_conf])
    json_data['tfidf_confs'] = confs

    # Persist the vectorizer as a standalone pickle when the save level allows it
    if self.tfidf is not None and self.level_save in ('MEDIUM', 'HIGH'):
        tfidf_path = os.path.join(self.model_dir, "tfidf_standalone.pkl")
        with open(tfidf_path, 'wb') as f:
            pickle.dump(self.tfidf, f)

    # Delegate the rest of the save to the parent class
    super().save(json_data=json_data)