65rted6tfdjhgfjyrf committed on
Commit 40a14d6 · verified · 1 Parent(s): ebeabcf

Update app.py

Files changed (1)
  1. app.py +14 -16
app.py CHANGED
@@ -5,7 +5,7 @@ import gradio as gr
 import os
 from huggingface_hub import login
 
-# Load Hugging Face token (ensure it's set in 'Secrets')
+# Load Hugging Face token from environment variables
 hf_token = os.getenv("HUGGINGFACE_TOKEN")
 if hf_token:
     login(hf_token)
@@ -13,34 +13,32 @@ if hf_token:
 else:
     print("❌ Hugging Face token not found. Make sure it's set in 'Secrets'.")
 
-# Base model
+# Model paths
 BASE_MODEL = "mistralai/Mistral-7B-Instruct-v0.1"
-FINETUNED_MODEL_DIR = "./finetuned_model"  # Path to your adapter weights
+FINETUNED_MODEL_DIR = "./finetuned_model"  # Path to your fine-tuned adapter
+
+torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+device = "cuda" if torch.cuda.is_available() else "cpu"
 
 # Load tokenizer
 tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
 
-# Load base model (WITHOUT bitsandbytes)
+# Load base model efficiently
 base_model = AutoModelForCausalLM.from_pretrained(
-    BASE_MODEL,
-    torch_dtype=torch.float32  # Ensure CPU compatibility
+    BASE_MODEL,
+    torch_dtype=torch_dtype,
+    device_map="auto" if torch.cuda.is_available() else None  # Use GPU if available
 )
 
-# Move base model to CPU
-base_model.to("cpu")
-
-# Load LoRA adapter
+# Load and merge LoRA adapter
 model = PeftModel.from_pretrained(base_model, FINETUNED_MODEL_DIR)
-
-# Merge adapter with base model
 model = model.merge_and_unload()
 
-# Move model to CPU (again, just to be sure)
-model.to("cpu")
+# Move model to appropriate device
+model.to(device)
 
-# Inference function
 def chat(message):
-    inputs = tokenizer(message, return_tensors="pt").to("cpu")  # Ensure inputs are on CPU
+    inputs = tokenizer(message, return_tensors="pt").to(device)
     output = model.generate(**inputs, max_new_tokens=100)
     response = tokenizer.decode(output[0], skip_special_tokens=True)
     return response
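
The hunk context shows that app.py already does `import gradio as gr`, but the UI wiring sits outside the changed lines. As a minimal sketch only (not part of this commit; the actual interface in app.py may differ), the updated `chat` function could be exposed as a plain text-in/text-out Space like this:

# Hypothetical wiring, not in this diff: expose chat() through a simple
# Gradio text interface and launch it as the Space entry point.
demo = gr.Interface(fn=chat, inputs="text", outputs="text", title="Mistral-7B LoRA chat")
demo.launch()

Because merge_and_unload() folds the LoRA weights back into the base model, the interface serves requests from a single merged transformers model on whichever device (CUDA or CPU) was selected above.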