import json import os from typing import List, Optional, Union, Dict, Any, Tuple from transformers import PreTrainedTokenizerFast from transformers.tokenization_utils_base import AddedToken from transformers.utils import logging logger = logging.get_logger(__name__) class TessarTokenizer(PreTrainedTokenizerFast): """ Tessar Tokenizer implementation for Hugging Face Transformers This custom tokenizer extends the PreTrainedTokenizerFast with specialized configuration and tokenization methods for the Tessar model. """ model_input_names = ['input_ids', 'attention_mask'] vocab_files_names = {"vocab_file": "vocab.json", "tokenizer_file": "tokenizer.json"} def __init__( self, vocab_file=None, tokenizer_file=None, do_lower_case=True, unk_token="", sep_token="", pad_token="", cls_token="", mask_token="", bos_token="", eos_token="", max_cell_length=15, **kwargs ): """ Initialize the Tessar Tokenizer with specific token configurations Args: vocab_file (str, optional): Path to the vocabulary file tokenizer_file (str, optional): Path to the pre-trained tokenizer file do_lower_case (bool, optional): Whether to lowercase the input. Defaults to True. max_cell_length (int, optional): Maximum length for cell tokenization. Defaults to 15. """ # Prepare special tokens special_tokens_dict = { "unk_token": unk_token, "sep_token": sep_token, "pad_token": pad_token, "cls_token": cls_token, "mask_token": mask_token, "bos_token": bos_token, "eos_token": eos_token, } # Convert string tokens to AddedToken objects if they're not already for token_name, token_value in special_tokens_dict.items(): if isinstance(token_value, str): special_tokens_dict[token_name] = AddedToken(token_value, lstrip=False, rstrip=False, normalized=True, special=True) # Call parent constructor super().__init__( vocab_file=vocab_file, tokenizer_file=tokenizer_file, **special_tokens_dict, **kwargs ) # Custom Tessar-specific attributes self.do_lower_case = do_lower_case self.max_cell_length = max_cell_length @property def vocab_size(self) -> int: """ Return the size of vocabulary Returns: int: The vocabulary size """ return len(self.vocab) def get_vocab(self) -> Dict[str, int]: """ Return the vocabulary mapping Returns: Dict[str, int]: The vocabulary mapping """ return dict(self.vocab) def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str, ...]: """ Save the tokenizer vocabulary and special tokens file Args: save_directory (str): Directory to save the vocabulary filename_prefix (str, optional): Prefix for the saved files Returns: tuple: Paths to the saved files """ # Ensure the save directory exists os.makedirs(save_directory, exist_ok=True) # Prepare file paths vocab_file = os.path.join( save_directory, f"{filename_prefix + '-' if filename_prefix else ''}vocab.json" ) # Save tokenizer file tokenizer_file = os.path.join( save_directory, f"{filename_prefix + '-' if filename_prefix else ''}tokenizer.json" ) # Save special tokens configuration special_tokens_file = os.path.join( save_directory, f"{filename_prefix + '-' if filename_prefix else ''}special_tokens.json" ) # Get vocabulary from tokenizer vocab_dict = self.get_vocab() # Save vocabulary with open(vocab_file, 'w', encoding='utf-8') as f: json.dump(vocab_dict, f, ensure_ascii=False, indent=2) # Save the tokenizer file if it exists if hasattr(self, "backend_tokenizer") and hasattr(self.backend_tokenizer, "save"): self.backend_tokenizer.save(tokenizer_file) # Save special tokens configuration special_tokens_config = { "unk_token": self.unk_token, "sep_token": self.sep_token, "pad_token": self.pad_token, "cls_token": self.cls_token, "mask_token": self.mask_token, "bos_token": self.bos_token, "eos_token": self.eos_token, "do_lower_case": self.do_lower_case, "max_cell_length": self.max_cell_length } # Convert token objects to strings for JSON serialization for key, token in special_tokens_config.items(): if hasattr(token, "content"): special_tokens_config[key] = token.content with open(special_tokens_file, 'w', encoding='utf-8') as f: json.dump(special_tokens_config, f, ensure_ascii=False, indent=2) return (vocab_file, tokenizer_file, special_tokens_file) def _tokenize(self, text: str) -> List[str]: """ Custom tokenization method Args: text (str): Input text to tokenize Returns: List[str]: List of tokens """ # Apply lowercase if required if self.do_lower_case: text = text.lower() # Use the parent tokenizer's tokenization method tokens = super()._tokenize(text) # Optional: Add custom cell-length truncation if self.max_cell_length > 0: tokens = tokens[:self.max_cell_length] return tokens def prepare_for_model( self, ids: List[int], pair_ids: Optional[List[int]] = None, add_special_tokens: bool = True, padding: Union[bool, str] = False, truncation: Union[bool, str] = False, max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, return_tensors: Optional[str] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, return_overflowing_tokens: bool = False, return_special_tokens_mask: bool = False, return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, **kwargs ) -> Dict[str, Any]: """ Prepare tokenized inputs for the model Args: ids (List[int]): List of input token ids pair_ids (Optional[List[int]], optional): List of pair token ids Returns: dict: Prepared model inputs """ # Implement any Tessar-specific model preparation logic # For example, you might want to handle table data differently return super().prepare_for_model( ids, pair_ids=pair_ids, add_special_tokens=add_special_tokens, padding=padding, truncation=truncation, max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, return_overflowing_tokens=return_overflowing_tokens, return_special_tokens_mask=return_special_tokens_mask, return_offsets_mapping=return_offsets_mapping, return_length=return_length, verbose=verbose, **kwargs ) def batch_encode_tables( self, tables: List[List[List[str]]], max_length: Optional[int] = None, padding: Union[bool, str] = True, truncation: Union[bool, str] = True, return_tensors: Optional[str] = "pt", **kwargs ) -> Dict[str, Any]: """ Encode a batch of tables for table question answering Args: tables (List[List[List[str]]]): List of tables, where each table is a list of rows, and each row is a list of cell values max_length (Optional[int], optional): Maximum sequence length padding (Union[bool, str], optional): Padding strategy truncation (Union[bool, str], optional): Truncation strategy return_tensors (Optional[str], optional): Type of tensors to return Returns: Dict[str, Any]: Encoded table batch """ # Flatten tables into text sequences with appropriate format flattened_inputs = [] for table in tables: # Convert table to a flattened text representation # This is a simplified example - real implementation would depend on your specific format table_text = "" for row_idx, row in enumerate(table): for col_idx, cell in enumerate(row): # Apply cell-level processing if self.do_lower_case: cell = cell.lower() # Add cell with position information table_text += f"[CELL_{row_idx}_{col_idx}] {cell} " # Add row separator table_text += "[ROW_END] " flattened_inputs.append(table_text.strip()) # Encode the flattened text inputs return self( flattened_inputs, max_length=max_length, padding=padding, truncation=truncation, return_tensors=return_tensors, **kwargs ) def load_tessar_tokenizer(pretrained_model_name_or_path: str, **kwargs): """ Load a pretrained Tessar tokenizer Args: pretrained_model_name_or_path (str): Path to the pretrained model **kwargs: Additional arguments to pass to from_pretrained Returns: TessarTokenizer: Initialized tokenizer """ return TessarTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs) # Register the tokenizer with the Transformers library from transformers import AutoTokenizer AutoTokenizer.register("SVECTOR-CORPORATION/Tessar-largest", TessarTokenizer) # Example usage if __name__ == "__main__": # Example of loading a pretrained tokenizer try: # Method 1: Direct loading with the class tokenizer = load_tessar_tokenizer("SVECTOR-CORPORATION/Tessar-largest") print("Tokenizer loaded successfully!") # Method 2: Loading through AutoTokenizer # This will work after the registration above auto_tokenizer = AutoTokenizer.from_pretrained("SVECTOR-CORPORATION/Tessar-largest") print("AutoTokenizer loaded successfully!") # Basic tokenization example text = "Hello, how are you doing today?" encoded = tokenizer(text, return_tensors="pt") print("Encoded Input:", encoded) # Example with table data table = [ ["Header1", "Header2", "Header3"], ["Value1", "Value2", "Value3"], ["Value4", "Value5", "Value6"] ] # Example of batch encoding tables encoded_table = tokenizer.batch_encode_tables([table], return_tensors="pt") print("Encoded Table:", encoded_table) except Exception as e: print(f"Error loading tokenizer: {e}")