Update config file and model.safetensors
- README.md +3 -0
- config.json +8 -11
- generation_config.json +1 -1
- pytorch_model.bin → model.safetensors +2 -2
- modeling_rotary_indictrans.py +3 -1
- tokenizer_config.json +1 -1
README.md
CHANGED
@@ -78,5 +78,8 @@ If you use these models directly or fine-tune them further for additional use ca
 }
 ```
 
+# Note
+These new and improved models are primarily built and tested for document-level and long-context translations; performance on shorter sentence-level tasks might be sub-optimal and might require generation parameter tuning. Please thoroughly verify the performance of the models for your use case before scaling up generation.
+
 # Warning
 Occasionally, you may notice some variation in the output, which may not be optimal. In such cases, you can experiment with adjusting the `num_beams`, `repetition_penalty`, and `length_penalty` parameters in the `generation_config`. Based on standard testing, the example with an input size of 1457 can be run on a single A100 GPU. However, the 1B model might require more compute resources or a lower beam size for generation.
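The Warning above points at beam-search tuning. As an illustration only (not part of this commit), here is a minimal sketch of overriding `num_beams`, `repetition_penalty`, and `length_penalty` at generation time; the checkpoint id is taken from `name_or_path` in tokenizer_config.json below, it is assumed the custom classes resolve via `trust_remote_code`, and the placeholder input stands in for the usual IndicTrans2 preprocessing.

```python
# Illustrative sketch: override the generation parameters named in the Warning.
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

ckpt = "prajdabre/rotary-indictrans2-indic-en-1B"  # from name_or_path below
tokenizer = AutoTokenizer.from_pretrained(ckpt, trust_remote_code=True)
model = AutoModelForSeq2SeqLM.from_pretrained(
    ckpt, trust_remote_code=True, torch_dtype=torch.float16
).eval()

# Placeholder: in practice the source text is preprocessed (language tags, etc.)
inputs = tokenizer("<preprocessed source document>", return_tensors="pt")
with torch.inference_mode():
    out = model.generate(
        **inputs,
        num_beams=4,             # lower this if the 1B model exceeds GPU memory
        repetition_penalty=1.2,  # example values, not tuned recommendations
        length_penalty=1.0,
        max_new_tokens=2048,
    )
print(tokenizer.batch_decode(out, skip_special_tokens=True))
```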
config.json
CHANGED
@@ -10,7 +10,7 @@
 "decoder_attention_heads": 16,
 "decoder_embed_dim": 1024,
 "decoder_ffn_dim": 8192,
-"decoder_layerdrop": 0,
+"decoder_layerdrop": 0.0,
 "decoder_layers": 18,
 "decoder_normalize_before": true,
 "decoder_start_token_id": 2,
@@ -19,7 +19,7 @@
 "encoder_attention_heads": 16,
 "encoder_embed_dim": 1024,
 "encoder_ffn_dim": 8192,
-"encoder_layerdrop": 0,
+"encoder_layerdrop": 0.0,
 "encoder_layers": 18,
 "encoder_normalize_before": true,
 "encoder_vocab_size": 122706,
@@ -27,21 +27,18 @@
 "init_std": 0.02,
 "is_encoder_decoder": true,
 "layernorm_embedding": false,
+"max_source_positions": 8192,
+"max_target_positions": 8192,
 "model_type": "RotaryIndicTrans",
 "num_hidden_layers": 18,
 "pad_token_id": 1,
 "rope_args": {
-  "theta":
+  "theta": 50000
 },
 "scale_embedding": true,
 "share_decoder_input_output_embed": false,
 "torch_dtype": "float32",
-"transformers_version": "4.
+"transformers_version": "4.47.1",
 "use_cache": true,
-"
-
-  "AutoConfig": "configuration_rotary_indictrans.RotaryIndicTransConfig",
-  "AutoModelForSeq2SeqLM": "modeling_rotary_indictrans.RotaryIndicTransForConditionalGeneration"
-},
-"tokenizer_class": "IndicTransTokenizer"
-}
+"vocab_size": 32296
+}
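For reference, a quick way to sanity-check the fields this commit adds (`max_source_positions`, `max_target_positions`, `rope_args.theta`, `vocab_size`) is to load the updated config. This is a sketch that assumes the custom config class still resolves via `trust_remote_code` and that the attribute names follow the JSON keys above.

```python
# Sketch: confirm the new config fields after pulling the updated revision.
from transformers import AutoConfig

config = AutoConfig.from_pretrained(
    "prajdabre/rotary-indictrans2-indic-en-1B",  # id from tokenizer_config.json
    trust_remote_code=True,
)
print(config.max_source_positions, config.max_target_positions)  # 8192 8192
print(config.rope_args)                                          # {'theta': 50000}
print(config.vocab_size)                                         # 32296
```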
generation_config.json
CHANGED
@@ -4,5 +4,5 @@
 "decoder_start_token_id": 2,
 "eos_token_id": 2,
 "pad_token_id": 1,
-"transformers_version": "4.
+"transformers_version": "4.47.1"
 }
pytorch_model.bin → model.safetensors
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:3a481c6abeee887ab91f82834081a4fb7d81ba00d3e16869db73eb5bd367fe39
+size 4092117552
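The file above is a standard git-lfs pointer, so the `oid` is simply the SHA-256 of the real weight file. A small sketch for verifying a local download against it (the local path is an assumption):

```python
# Verify a downloaded model.safetensors against the git-lfs pointer above.
import hashlib
import os

path = "model.safetensors"  # adjust to your local download path
expected_oid = "3a481c6abeee887ab91f82834081a4fb7d81ba00d3e16869db73eb5bd367fe39"
expected_size = 4092117552

digest = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        digest.update(chunk)

assert os.path.getsize(path) == expected_size, "size mismatch"
assert digest.hexdigest() == expected_oid, "sha256 mismatch"
print("model.safetensors matches the LFS pointer")
```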
modeling_rotary_indictrans.py
CHANGED
@@ -108,7 +108,9 @@ class RotaryEmbedding(torch.nn.Module):
         self.max_seq_len = max_seq_len
         self.scaling_factor = scaling_factor
 
-        inv_freq_ = 1.0 / (
+        inv_freq_ = 1.0 / (
+            theta ** (torch.arange(0, dim, 2, device=device).float() / dim)
+        )
 
         self.register_buffer("inv_freq", inv_freq_, persistent=False)
         self.precompute_freqs(max_seq_len)
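The reformatted expression above is the standard rotary-embedding inverse-frequency table. A standalone sketch of what it computes, using `theta = 50000` from `rope_args` in config.json and an assumed per-head dimension of 64 (1024 embed dim / 16 attention heads):

```python
# Standalone illustration of the inverse-frequency computation shown above.
import torch

dim, theta = 64, 50000  # head dim = 1024 / 16 heads (assumed); theta from rope_args
inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2).float() / dim))

# Rotation angles per position; 8192 matches max_source_positions/max_target_positions.
positions = torch.arange(8192).float()
freqs = torch.outer(positions, inv_freq)
print(inv_freq.shape, freqs.shape)  # torch.Size([32]) torch.Size([8192, 32])
```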
tokenizer_config.json
CHANGED
@@ -37,7 +37,7 @@
 "clean_up_tokenization_spaces": true,
 "do_lower_case": false,
 "eos_token": "</s>",
-"model_max_length":
+"model_max_length": 8192,
 "pad_token": "<pad>",
 "name_or_path": "prajdabre/rotary-indictrans2-indic-en-1B",
 "tokenizer_class": "IndicTransTokenizer",
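The only functional change here is `model_max_length`, which now matches the 8192-token context implied by `max_source_positions`/`max_target_positions` in config.json. A sketch of checking it, assuming the custom `IndicTransTokenizer` loads via `trust_remote_code` and follows the usual Hugging Face truncation behavior:

```python
# Sketch: confirm the raised context length and that truncation caps long inputs.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained(
    "prajdabre/rotary-indictrans2-indic-en-1B", trust_remote_code=True
)
print(tok.model_max_length)  # 8192

# With truncation=True and no explicit max_length, inputs are cut at model_max_length.
enc = tok("<a very long preprocessed document>", truncation=True)
print(len(enc["input_ids"]) <= tok.model_max_length)  # True
```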