SVECTOR-OFFICIAL committed
Commit 1ed1133 · verified · 1 parent: 0ca0f56

Upload 11 files

config.json ADDED
@@ -0,0 +1,37 @@
+ {
+   "_name_or_path": "SVECTOR-CORPORATION/Tessar-largest",
+   "activation_dropout": 0.0,
+   "activation_function": "gelu",
+   "architectures": [
+     "BartForConditionalGeneration"
+   ],
+   "attention_dropout": 0.1,
+   "bos_token_id": 0,
+   "classifier_dropout": 0.0,
+   "d_model": 1024,
+   "decoder_attention_heads": 16,
+   "decoder_ffn_dim": 4096,
+   "decoder_layerdrop": 0.0,
+   "decoder_layers": 12,
+   "decoder_start_token_id": 2,
+   "dropout": 0.1,
+   "encoder_attention_heads": 16,
+   "encoder_ffn_dim": 4096,
+   "encoder_layerdrop": 0.0,
+   "encoder_layers": 12,
+   "eos_token_id": 2,
+   "forced_bos_token_id": 0,
+   "forced_eos_token_id": 2,
+   "init_std": 0.02,
+   "is_encoder_decoder": true,
+   "max_length": 1024,
+   "max_position_embeddings": 1024,
+   "model_type": "bart",
+   "num_hidden_layers": 12,
+   "pad_token_id": 1,
+   "scale_embedding": false,
+   "torch_dtype": "float32",
+   "transformers_version": "4.17.0.dev0",
+   "use_cache": true,
+   "vocab_size": 50265
+ }
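This configuration describes a standard BART encoder-decoder (12 encoder and 12 decoder layers, d_model 1024, vocabulary 50265), so the checkpoint can be loaded with the stock `transformers` classes. A minimal loading sketch, assuming the Hub repo id from `_name_or_path` is reachable:

```python
from transformers import AutoConfig, BartForConditionalGeneration

# Repo id taken from "_name_or_path" above; a local clone of this repo also works.
config = AutoConfig.from_pretrained("SVECTOR-CORPORATION/Tessar-largest")
print(config.model_type, config.d_model, config.vocab_size)  # bart 1024 50265

model = BartForConditionalGeneration.from_pretrained("SVECTOR-CORPORATION/Tessar-largest")
```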
generation_config.json ADDED
@@ -0,0 +1,11 @@
+ {
+   "_from_model_config": true,
+   "bos_token_id": 0,
+   "decoder_start_token_id": 2,
+   "eos_token_id": 2,
+   "forced_bos_token_id": 0,
+   "forced_eos_token_id": 2,
+   "max_length": 1024,
+   "pad_token_id": 1,
+   "transformers_version": "4.27.0.dev0"
+ }
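These generation defaults (forced BOS/EOS ids, `max_length` 1024) are read automatically by `generate()`. A short hedged sketch, assuming `model` and `tokenizer` have already been loaded from this repo:

```python
# Hedged sketch: `model` and `tokenizer` are assumed to be loaded from this repo.
inputs = tokenizer("The quick brown fox jumps over the lazy dog.", return_tensors="pt")

# generate() picks up forced_bos_token_id, forced_eos_token_id and max_length
# from generation_config.json unless they are overridden here.
output_ids = model.generate(**inputs, num_beams=4)
print(tokenizer.batch_decode(output_ids, skip_special_tokens=True))
```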
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a3fd086d5435c71f07dbe525e859840b1e218490bfb974d5d5cdf91506f967ee
+ size 1625426996
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6d9dd92d3ee268740d9790bac260f0fd2fd6f7ad783b0d87769a11e7534c7cb3
+ size 1625481368
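Both weight files are Git LFS pointers: only the sha256 digest and size are versioned here, and the ~1.6 GB binaries are fetched by `git lfs pull` or by `from_pretrained`. A small sketch for checking a downloaded file against the recorded digest (the local path is an assumption):

```python
import hashlib

# Hypothetical local path; point this at your clone or cache copy of the file.
WEIGHTS_PATH = "model.safetensors"
EXPECTED_SHA256 = "a3fd086d5435c71f07dbe525e859840b1e218490bfb974d5d5cdf91506f967ee"  # from the pointer above

sha = hashlib.sha256()
with open(WEIGHTS_PATH, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
        sha.update(chunk)

print("checksum ok" if sha.hexdigest() == EXPECTED_SHA256 else "checksum mismatch")
```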
setup.py ADDED
@@ -0,0 +1,52 @@
+ from setuptools import find_packages, setup
+
+ with open("README.md", "r", encoding="utf-8") as fh:
+     long_description = fh.read()
+
+ setup(
+     name="tessar_tokenizer",
+     version="0.1.0",
+     description="Advanced Tokenizer for Table-based Transformations by SVECTOR",
+     long_description=long_description,
+     long_description_content_type="text/markdown",
+     author="SVECTOR",
+     author_email="[email protected]",
+     url="https://www.svector.co.in",
+     packages=find_packages(),
+     package_data={
+         "tessar_tokenizer": ["*.json"],
+     },
+     install_requires=[
+         "transformers>=4.27.0",
+         "torch>=1.10.0",
+         "numpy>=1.19.0"
+     ],
+     extras_require={
+         "dev": [
+             "pytest",
+             "black",
+             "mypy",
+             "isort"
+         ]
+     },
+     classifiers=[
+         "Development Status :: 3 - Alpha",
+         "Intended Audience :: Developers",
+         "Intended Audience :: Science/Research",
+         "License :: OSI Approved :: MIT License",
+         "Operating System :: OS Independent",
+         "Programming Language :: Python :: 3.7",
+         "Programming Language :: Python :: 3.8",
+         "Programming Language :: Python :: 3.9",
+         "Programming Language :: Python :: 3.10",
+         "Topic :: Scientific/Engineering :: Artificial Intelligence",
+         "Topic :: Software Development :: Libraries :: Python Modules",
+     ],
+     keywords="nlp tokenizer machine-learning table-transformations",
+     python_requires=">=3.7",
+     entry_points={
+         "console_scripts": [
+             "tessar-tokenizer=tessar_tokenizer.cli:main",
+         ],
+     },
+ )
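With this setup.py the package is typically installed with an editable install (`pip install -e .`), which also declares a `tessar-tokenizer` console script. Note that the entry point references `tessar_tokenizer.cli:main`, and no `cli` module is included in this commit, so the script only works once that module exists. A small hedged post-install check (Python 3.8+ for importlib.metadata):

```python
# Hedged post-install check; assumes the package was installed, e.g. via `pip install -e .`.
from importlib.metadata import version

print(version("tessar_tokenizer"))  # expected: 0.1.0, per setup.py above
```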
special_tokens_map.json ADDED
@@ -0,0 +1,51 @@
+ {
+   "bos_token": {
+     "content": "<s>",
+     "single_word": false,
+     "lstrip": false,
+     "rstrip": false,
+     "normalized": true
+   },
+   "eos_token": {
+     "content": "</s>",
+     "single_word": false,
+     "lstrip": false,
+     "rstrip": false,
+     "normalized": true
+   },
+   "unk_token": {
+     "content": "<unk>",
+     "single_word": false,
+     "lstrip": false,
+     "rstrip": false,
+     "normalized": true
+   },
+   "sep_token": {
+     "content": "</s>",
+     "single_word": false,
+     "lstrip": false,
+     "rstrip": false,
+     "normalized": true
+   },
+   "pad_token": {
+     "content": "<pad>",
+     "single_word": false,
+     "lstrip": false,
+     "rstrip": false,
+     "normalized": true
+   },
+   "cls_token": {
+     "content": "<s>",
+     "single_word": false,
+     "lstrip": false,
+     "rstrip": false,
+     "normalized": true
+   },
+   "mask_token": {
+     "content": "<mask>",
+     "single_word": false,
+     "lstrip": true,
+     "rstrip": false,
+     "normalized": true
+   }
+ }
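These special-token strings pair with the ids fixed in config.json (bos 0, pad 1, eos/sep 2). A quick hedged check, assuming a tokenizer for this repo has been loaded as `tokenizer`:

```python
# Assumes `tokenizer` was loaded from this repo (see tessar_tokenizer_example.py below).
for name in ("bos_token", "pad_token", "eos_token", "mask_token"):
    token = getattr(tokenizer, name)
    print(name, repr(token), tokenizer.convert_tokens_to_ids(token))
# Expected from config.json: <s> -> 0, <pad> -> 1, </s> -> 2; <mask> is typically the last vocab entry.
```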
tessar_tokenizer.py ADDED
@@ -0,0 +1,164 @@
+ import json
+ import os
+ from typing import List, Optional, Union
+
+ from transformers import PreTrainedTokenizerFast
+
+
+ class TessarTokenizer(PreTrainedTokenizerFast):
+     """
+     Tessar Tokenizer implementation for Hugging Face Transformers.
+     """
+
+     model_input_names = ["input_ids", "attention_mask"]
+
+     def __init__(
+         self,
+         vocab_file=None,
+         tokenizer_file=None,
+         do_lower_case=True,
+         unk_token="<unk>",
+         sep_token="</s>",
+         pad_token="<pad>",
+         cls_token="<s>",
+         mask_token="<mask>",
+         bos_token="<s>",
+         eos_token="</s>",
+         max_cell_length=15,
+         **kwargs
+     ):
+         """
+         Initialize the Tessar tokenizer with its specific token configuration.
+
+         Args:
+             vocab_file (str, optional): Path to the vocabulary file.
+             tokenizer_file (str, optional): Path to the pre-trained tokenizer file.
+             do_lower_case (bool, optional): Whether to lowercase the input. Defaults to True.
+             max_cell_length (int, optional): Maximum length for cell tokenization. Defaults to 15.
+         """
+         # Prepare special tokens
+         special_tokens = {
+             "unk_token": unk_token,
+             "sep_token": sep_token,
+             "pad_token": pad_token,
+             "cls_token": cls_token,
+             "mask_token": mask_token,
+             "bos_token": bos_token,
+             "eos_token": eos_token,
+         }
+
+         # Remove None values
+         special_tokens = {k: v for k, v in special_tokens.items() if v is not None}
+
+         # Call parent constructor
+         super().__init__(
+             vocab_file=vocab_file,
+             tokenizer_file=tokenizer_file,
+             do_lower_case=do_lower_case,
+             **special_tokens,
+             **kwargs
+         )
+
+         # Custom Tessar-specific attributes
+         self.do_lower_case = do_lower_case
+         self.max_cell_length = max_cell_length
+
+     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple:
+         """
+         Save the tokenizer vocabulary and special tokens file.
+
+         Args:
+             save_directory (str): Directory to save the vocabulary.
+             filename_prefix (str, optional): Prefix for the saved files.
+
+         Returns:
+             tuple: Paths to the saved files.
+         """
+         # Prepare file paths
+         vocab_file = os.path.join(
+             save_directory,
+             f"{filename_prefix + '-' if filename_prefix else ''}vocab.json"
+         )
+
+         # Save special tokens configuration
+         special_tokens_file = os.path.join(
+             save_directory,
+             f"{filename_prefix + '-' if filename_prefix else ''}special_tokens.json"
+         )
+
+         # Save vocabulary
+         with open(vocab_file, "w", encoding="utf-8") as f:
+             json.dump(self.vocab, f, ensure_ascii=False, indent=2)
+
+         # Save special tokens configuration
+         special_tokens_config = {
+             "unk_token": self.unk_token,
+             "sep_token": self.sep_token,
+             "pad_token": self.pad_token,
+             "cls_token": self.cls_token,
+             "mask_token": self.mask_token,
+             "bos_token": self.bos_token,
+             "eos_token": self.eos_token,
+             "do_lower_case": self.do_lower_case,
+             "max_cell_length": self.max_cell_length
+         }
+
+         with open(special_tokens_file, "w", encoding="utf-8") as f:
+             json.dump(special_tokens_config, f, ensure_ascii=False, indent=2)
+
+         return (vocab_file, special_tokens_file)
+
+     def _tokenize(self, text: str) -> List[str]:
+         """
+         Custom tokenization method.
+
+         Args:
+             text (str): Input text to tokenize.
+
+         Returns:
+             List[str]: List of tokens.
+         """
+         # Apply lowercase if required
+         if self.do_lower_case:
+             text = text.lower()
+
+         # PreTrainedTokenizerFast exposes tokenize() rather than _tokenize(),
+         # so delegate to the fast backend here.
+         tokens = super().tokenize(text)
+
+         # Optional: truncate to the configured maximum cell length
+         tokens = tokens[: self.max_cell_length]
+
+         return tokens
+
+     def prepare_for_model(
+         self,
+         ids: List[int],
+         pair_ids: Optional[List[int]] = None,
+         **kwargs
+     ) -> dict:
+         """
+         Prepare tokenized inputs for the model.
+
+         Args:
+             ids (List[int]): List of input token ids.
+             pair_ids (Optional[List[int]], optional): List of pair token ids.
+
+         Returns:
+             dict: Prepared model inputs.
+         """
+         # Implement any Tessar-specific model preparation logic here;
+         # this method can be extended to add Tessar-specific preprocessing.
+         return super().prepare_for_model(ids, pair_ids, **kwargs)
+
+
+ # Example usage and initialization
+ def load_tessar_tokenizer(pretrained_model_name_or_path: str):
+     """
+     Load a pretrained Tessar tokenizer.
+
+     Args:
+         pretrained_model_name_or_path (str): Path to the pretrained model.
+
+     Returns:
+         TessarTokenizer: Initialized tokenizer.
+     """
+     return TessarTokenizer.from_pretrained(pretrained_model_name_or_path)
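The save_vocabulary override above writes two sidecar files (a vocabulary dump and a special-tokens/config dump). A hedged sketch of that path, assuming the tokenizer can be instantiated from this repo and using a hypothetical output directory:

```python
import os

from tessar_tokenizer import load_tessar_tokenizer

# Hypothetical output directory; assumes the tokenizer loads from this repo.
tok = load_tessar_tokenizer("SVECTOR-CORPORATION/Tessar-largest")
os.makedirs("./tessar_export", exist_ok=True)

vocab_path, special_tokens_path = tok.save_vocabulary("./tessar_export", filename_prefix="tessar")
print(vocab_path)            # ./tessar_export/tessar-vocab.json
print(special_tokens_path)   # ./tessar_export/tessar-special_tokens.json
```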
tessar_tokenizer_example.py ADDED
@@ -0,0 +1,38 @@
+ from tessar_tokenizer import TessarTokenizer, load_tessar_tokenizer
+
+ # Example 1: Initialize a new Tessar Tokenizer
+ tokenizer = TessarTokenizer.from_pretrained("SVECTOR-CORPORATION/Tessar-largest")
+
+ # Example 2: Tokenize a simple text
+ text = "Hello, how are you doing today?"
+ encoded = tokenizer(text, return_tensors="pt")
+ print("Encoded Input:", encoded)
+
+ # Example 3: Batch tokenization
+ texts = [
+     "Hello, world!",
+     "This is a test sentence.",
+     "Tokenization is an important NLP task."
+ ]
+ batch_encoded = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
+ print("Batch Encoded Inputs:", batch_encoded)
+
+ # Example 4: Save and reload tokenizer
+ save_directory = "./tessar_tokenizer"
+ tokenizer.save_pretrained(save_directory)
+
+ # Reload the saved tokenizer
+ reloaded_tokenizer = load_tessar_tokenizer(save_directory)
+
+ # Example 5: Custom tokenization with specific parameters
+ # A fast tokenizer needs a backing tokenizer file; reuse the one saved above.
+ custom_tokenizer = TessarTokenizer(
+     tokenizer_file=f"{save_directory}/tokenizer.json",
+     do_lower_case=True,
+     max_cell_length=20,
+     unk_token="[UNK]",
+     pad_token="[PAD]"
+ )
+
+ # Tokenize with custom settings
+ custom_text = "A custom tokenization example"
+ custom_encoded = custom_tokenizer(custom_text, return_tensors="pt")
+ print("Custom Tokenizer Encoded:", custom_encoded)
tokenizer_config.json ADDED
@@ -0,0 +1,67 @@
+ {
+   "do_lower_case": true,
+   "errors": "replace",
+   "bos_token": {
+     "content": "<s>",
+     "single_word": false,
+     "lstrip": false,
+     "rstrip": false,
+     "normalized": true,
+     "__type": "AddedToken"
+   },
+   "eos_token": {
+     "content": "</s>",
+     "single_word": false,
+     "lstrip": false,
+     "rstrip": false,
+     "normalized": true,
+     "__type": "AddedToken"
+   },
+   "unk_token": {
+     "content": "<unk>",
+     "single_word": false,
+     "lstrip": false,
+     "rstrip": false,
+     "normalized": true,
+     "__type": "AddedToken"
+   },
+   "sep_token": {
+     "content": "</s>",
+     "single_word": false,
+     "lstrip": false,
+     "rstrip": false,
+     "normalized": true,
+     "__type": "AddedToken"
+   },
+   "cls_token": {
+     "content": "<s>",
+     "single_word": false,
+     "lstrip": false,
+     "rstrip": false,
+     "normalized": true,
+     "__type": "AddedToken"
+   },
+   "pad_token": {
+     "content": "<pad>",
+     "single_word": false,
+     "lstrip": false,
+     "rstrip": false,
+     "normalized": true,
+     "__type": "AddedToken"
+   },
+   "mask_token": {
+     "content": "<mask>",
+     "single_word": false,
+     "lstrip": true,
+     "rstrip": false,
+     "normalized": true,
+     "__type": "AddedToken"
+   },
+   "add_prefix_space": true,
+   "max_cell_length": 15,
+   "model_max_length": 1024,
+   "special_tokens_map_file": null,
+   "name_or_path": "SVECTOR-CORPORATION/Tessar-largest",
+   "use_fast": true,
+   "tokenizer_class": "TessarTokenizer"
+ }
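Since `tokenizer_class` is set to `TessarTokenizer`, the custom class has to be importable when the tokenizer is loaded. A hedged check that the values above are applied on load, assuming the tokenizer can be instantiated from the files in this repo:

```python
from tessar_tokenizer import TessarTokenizer

# Assumes the tokenizer can be built from the files uploaded in this commit.
tok = TessarTokenizer.from_pretrained("SVECTOR-CORPORATION/Tessar-largest")
print(tok.model_max_length)   # 1024, from "model_max_length"
print(tok.do_lower_case)      # True, from "do_lower_case"
print(tok.max_cell_length)    # 15, from "max_cell_length"
```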
vocab.json ADDED
The diff for this file is too large to render. See raw diff