prajdabre committed on
Commit 924b390 · verified · 1 Parent(s): 6179181

Update config file and model.safetensors

README.md CHANGED
@@ -78,5 +78,8 @@ If you use these models directly or fine-tune them further for additional use ca
   }
   ```
 
+# Note
+These new and improved models are primarily built and tested for document-level and long-context translation; performance on shorter, sentence-level tasks might be sub-optimal and might require tuning of the generation parameters. Please thoroughly verify the performance of the models for your use case before scaling up generation.
+
 # Warning
 Occasionally, you may notice some variation in the output, which may not be optimal. In such cases, you can experiment with adjusting the `num_beams`, `repetition_penalty`, and `length_penalty` parameters in the `generation_config`. Based on standard testing, the example with an input size of 1457 can be run on a single A100 GPU. However, the 1B model might require more compute resources or a lower beam size for generation.
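For reference, the tuning suggested in the warning above can be done through the `transformers` generation API. The snippet below is a minimal sketch, assuming the model loads via `AutoModelForSeq2SeqLM` with `trust_remote_code=True`; the parameter values are illustrative, not recommended defaults.

```python
from transformers import AutoModelForSeq2SeqLM, GenerationConfig

model_id = "prajdabre/rotary-indictrans2-indic-en-1B"

# Assumption: the custom architecture is loaded through remote code.
model = AutoModelForSeq2SeqLM.from_pretrained(model_id, trust_remote_code=True)

# Start from the shipped generation_config.json and adjust the knobs named in the warning.
gen_config = GenerationConfig.from_pretrained(model_id)
gen_config.num_beams = 4             # a lower beam size reduces memory use for the 1B model
gen_config.repetition_penalty = 1.2  # illustrative value; tune for your inputs
gen_config.length_penalty = 1.0      # illustrative value; tune for your inputs

# `batch` stands for tokenized inputs (input_ids, attention_mask) for your source text.
# outputs = model.generate(**batch, generation_config=gen_config)
```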
config.json CHANGED
@@ -10,7 +10,7 @@
     "decoder_attention_heads": 16,
     "decoder_embed_dim": 1024,
     "decoder_ffn_dim": 8192,
-    "decoder_layerdrop": 0,
+    "decoder_layerdrop": 0.0,
     "decoder_layers": 18,
     "decoder_normalize_before": true,
     "decoder_start_token_id": 2,
@@ -19,7 +19,7 @@
     "encoder_attention_heads": 16,
     "encoder_embed_dim": 1024,
     "encoder_ffn_dim": 8192,
-    "encoder_layerdrop": 0,
+    "encoder_layerdrop": 0.0,
     "encoder_layers": 18,
     "encoder_normalize_before": true,
     "encoder_vocab_size": 122706,
@@ -27,21 +27,18 @@
     "init_std": 0.02,
     "is_encoder_decoder": true,
     "layernorm_embedding": false,
+    "max_source_positions": 8192,
+    "max_target_positions": 8192,
     "model_type": "RotaryIndicTrans",
     "num_hidden_layers": 18,
     "pad_token_id": 1,
     "rope_args": {
-        "theta": 10000
+        "theta": 50000
     },
     "scale_embedding": true,
     "share_decoder_input_output_embed": false,
     "torch_dtype": "float32",
-    "transformers_version": "4.46.1",
+    "transformers_version": "4.47.1",
     "use_cache": true,
-    "name_or_path": "prajdabre/rotary-indictrans2-indic-en-1B",
-    "auto_map": {
-        "AutoConfig": "configuration_rotary_indictrans.RotaryIndicTransConfig",
-        "AutoModelForSeq2SeqLM": "modeling_rotary_indictrans.RotaryIndicTransForConditionalGeneration"
-    },
-    "tokenizer_class": "IndicTransTokenizer"
-    }
+    "vocab_size": 32296
+}
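
The key functional changes above are the larger RoPE base (`theta`: 10000 → 50000) and the 8192-token `max_source_positions`/`max_target_positions`. As a rough illustration of what `theta` controls, the sketch below mirrors the inverse-frequency formula from `modeling_rotary_indictrans.py`; the per-head dimension of 64 (decoder_embed_dim 1024 / 16 attention heads) is an inference from the config, not stated explicitly.

```python
import torch

# Rotary-embedding inverse frequencies, as in modeling_rotary_indictrans.py.
# Assumption: dim is the per-head size, decoder_embed_dim / decoder_attention_heads = 1024 / 16.
theta = 50000  # new rope_args value in this commit (previously 10000)
dim = 1024 // 16

inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2).float() / dim))
print(inv_freq)  # 32 frequencies; a larger theta yields smaller frequencies, i.e. slower rotation for longer contexts
```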
 
 
 
 
 
generation_config.json CHANGED
@@ -4,5 +4,5 @@
     "decoder_start_token_id": 2,
     "eos_token_id": 2,
     "pad_token_id": 1,
-    "transformers_version": "4.46.1"
+    "transformers_version": "4.47.1"
 }
pytorch_model.bin → model.safetensors RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:18ebcbe449d6e65a57135f40eef13c01279a13ac590ff380c6b10f8db333fb45
-size 4092276726
+oid sha256:3a481c6abeee887ab91f82834081a4fb7d81ba00d3e16869db73eb5bd367fe39
+size 4092117552
modeling_rotary_indictrans.py CHANGED
@@ -108,7 +108,9 @@ class RotaryEmbedding(torch.nn.Module):
         self.max_seq_len = max_seq_len
         self.scaling_factor = scaling_factor
 
-        inv_freq_ = 1.0 / (theta ** (torch.arange(0, dim, 2, device=device).float() / dim))
+        inv_freq_ = 1.0 / (
+            theta ** (torch.arange(0, dim, 2, device=device).float() / dim)
+        )
 
         self.register_buffer("inv_freq", inv_freq_, persistent=False)
         self.precompute_freqs(max_seq_len)
tokenizer_config.json CHANGED
@@ -37,7 +37,7 @@
     "clean_up_tokenization_spaces": true,
     "do_lower_case": false,
     "eos_token": "</s>",
-    "model_max_length": 4096,
+    "model_max_length": 8192,
     "pad_token": "<pad>",
     "name_or_path": "prajdabre/rotary-indictrans2-indic-en-1B",
     "tokenizer_class": "IndicTransTokenizer",