# Description: Pipeline running a model on user input.
# ML/pipeline.py

import numpy as np
import pandas as pd
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel


class Pipeline:
    """Pipeline running a model on user input."""

    def __init__(self) -> None:
        """Initializes the pipeline."""
        # Configuration for the model
        self.__config = {
            "max_seq_length": 512,
            "bert_model_name": "bert-base-uncased",
            "model_type": "bert_text",
        }

        # Initialize the BERT tokenizer
        self.__bert_tokenizer = BertTokenizer.from_pretrained(
            self.__config["bert_model_name"]
        )

        # Initialize the model
        self.__model = self.__init_model()

    def run(self, input_data: list[str]) -> float:
        """Runs the pipeline on the given input data.

        Args:
            input_data: A list of strings.

        Returns:
            A float representing the predicted value.
        """
        # Preprocess the input data
        preprocessed = self.__preprocessing(input_data)

        # Make a prediction using the preprocessed data
        result = self.__make_prediction(preprocessed)

        return result

    def __preprocessing(self, data: list[str]) -> tf.Tensor:
        """Preprocesses the input data.

        Returns a tensor of padded token ids.
        """
        # Convert the data to a pandas DataFrame
        df = pd.DataFrame({"text": data})

        # Convert the 'text' column to a numpy array
        texts = np.array(df["text"])

        # Tokenize the input using the BERT tokenizer
        input_ids = self.__bert_tokenizer(
            list(texts),
            padding=True,
            truncation=True,
            return_tensors="tf",
            max_length=self.__config["max_seq_length"],
        )["input_ids"]

        # Pad the tokenized input to match the max sequence length
        padded_ids = tf.pad(
            input_ids,
            [[0, 0], [0, self.__config["max_seq_length"] - input_ids.shape[1]]],
        )

        return padded_ids

    def __make_prediction(self, input_ids: tf.Tensor) -> float:
        """Makes a prediction using the model.

        Returns the prediction rounded to the nearest IELTS score.
        """
        # Make a prediction using the model
        prediction = float(self.__model.predict(input_ids)[0][0])

        # Round the prediction to the nearest available value
        result = self.__round_prediction(prediction)

        return result

    def __init_model(self) -> tf.keras.models.Model:
        """Initializes the model and loads the weights."""
        # Load the BERT model
        self.__bert_model = TFBertModel.from_pretrained(
            self.__config["bert_model_name"]
        )

        # Create a custom regression head for the model
        regression_head = tf.keras.models.Sequential([
            tf.keras.layers.Flatten(),
            tf.keras.layers.Dense(128, activation="relu"),
            tf.keras.layers.Dropout(0.3),
            tf.keras.layers.Dense(64, activation="relu"),
            tf.keras.layers.Dropout(0.3),
            tf.keras.layers.Dense(1, activation="linear"),
        ])

        # Combine BERT and the regression head
        input_ids = tf.keras.layers.Input(
            shape=(self.__config["max_seq_length"],), dtype=tf.int32
        )
        bert_output = self.__bert_model(input_ids)[0]  # BERT's last hidden state
        pooler_output = bert_output[:, 0, :]  # [CLS] token representation
        regression_output = regression_head(pooler_output)  # Custom regression head
        model = tf.keras.models.Model(inputs=input_ids, outputs=regression_output)

        # Set BERT layers as non-trainable
        for layer in self.__bert_model.layers:
            layer.trainable = False

        # Load the weights
        model.load_weights(
            "./app/ML/models/training_" + self.__config["model_type"] + "/cp.ckpt"
        )

        return model

    def __round_prediction(self, value: float) -> float:
        """Rounds a given value to the nearest IELTS score."""
        available_values = [
            1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0,
            5.5, 6.0, 6.5, 7.0, 7.5, 8.0, 8.5, 9.0,
        ]
        closest_value = min(available_values, key=lambda x: abs(x - value))
        return closest_value
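

if __name__ == "__main__":
    # Illustrative usage sketch (not part of the original module). Constructing
    # the pipeline assumes the checkpoint referenced in __init_model() exists at
    # ./app/ML/models/training_bert_text/cp.ckpt and that the
    # "bert-base-uncased" weights are available locally or downloadable.
    pipeline = Pipeline()

    # run() accepts a list of strings and returns a single score rounded to the
    # nearest half band on the 1.0-9.0 IELTS scale.
    sample_texts = ["This is an example essay submitted for scoring."]
    score = pipeline.run(sample_texts)
    print(f"Predicted score: {score}")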