Update app.py
app.py
CHANGED
@@ -5,7 +5,7 @@ import gradio as gr
 import os
 from huggingface_hub import login

-# Load Hugging Face token
+# Load Hugging Face token from environment variables
 hf_token = os.getenv("HUGGINGFACE_TOKEN")
 if hf_token:
     login(hf_token)
@@ -13,34 +13,32 @@ if hf_token:
 else:
     print("❌ Hugging Face token not found. Make sure it's set in 'Secrets'.")

-#
+# Model paths
 BASE_MODEL = "mistralai/Mistral-7B-Instruct-v0.1"
-FINETUNED_MODEL_DIR = "./finetuned_model"  # Path to your adapter
+FINETUNED_MODEL_DIR = "./finetuned_model"  # Path to your fine-tuned adapter
+
+torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+device = "cuda" if torch.cuda.is_available() else "cpu"

 # Load tokenizer
 tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

-# Load base model
+# Load base model efficiently
 base_model = AutoModelForCausalLM.from_pretrained(
-    BASE_MODEL,
-    torch_dtype=
+    BASE_MODEL,
+    torch_dtype=torch_dtype,
+    device_map="auto" if torch.cuda.is_available() else None  # Use GPU if available
 )

-#
-base_model.to("cpu")
-
-# Load LoRA adapter
+# Load and merge LoRA adapter
 model = PeftModel.from_pretrained(base_model, FINETUNED_MODEL_DIR)
-
-# Merge adapter with base model
 model = model.merge_and_unload()

-# Move model to
-model.to(
+# Move model to appropriate device
+model.to(device)

-# Inference function
 def chat(message):
-    inputs = tokenizer(message, return_tensors="pt").to(
+    inputs = tokenizer(message, return_tensors="pt").to(device)
     output = model.generate(**inputs, max_new_tokens=100)
     response = tokenizer.decode(output[0], skip_special_tokens=True)
     return response
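For reference, a minimal self-contained sketch of what the full app.py could look like after this commit is shown below. Only lines 5-44 appear in the diff, so everything outside that range is an assumption: the hunk header indicates import gradio as gr sits on line 4, but the torch, transformers, and peft imports and the Gradio interface wiring at the bottom are illustrative guesses, not taken from the commit.

# Sketch of the full app.py after this commit; imports and the Gradio
# wiring outside the diffed range (lines 5-44) are assumed, not from the diff.
import os

import gradio as gr
import torch
from huggingface_hub import login
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load Hugging Face token from environment variables
hf_token = os.getenv("HUGGINGFACE_TOKEN")
if hf_token:
    login(hf_token)
else:
    print("❌ Hugging Face token not found. Make sure it's set in 'Secrets'.")

# Model paths
BASE_MODEL = "mistralai/Mistral-7B-Instruct-v0.1"
FINETUNED_MODEL_DIR = "./finetuned_model"  # Path to your fine-tuned adapter

# Pick dtype and device based on GPU availability, as in the commit
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

# Load base model, letting accelerate place it on GPU when one exists
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    torch_dtype=torch_dtype,
    device_map="auto" if torch.cuda.is_available() else None
)

# Load the LoRA adapter and merge it into the base weights
model = PeftModel.from_pretrained(base_model, FINETUNED_MODEL_DIR)
model = model.merge_and_unload()

# Move the merged model to the chosen device
model.to(device)

def chat(message):
    # Tokenize the prompt, generate up to 100 new tokens, and decode the result
    inputs = tokenizer(message, return_tensors="pt").to(device)
    output = model.generate(**inputs, max_new_tokens=100)
    response = tokenizer.decode(output[0], skip_special_tokens=True)
    return response

# Assumed Gradio wiring -- not part of the diffed range
demo = gr.Interface(fn=chat, inputs="text", outputs="text")
demo.launch()

Calling chat("Hello!") would return the decoded completion. Two practical notes: when device_map="auto" has already placed the model on the GPU, the explicit model.to(device) is redundant, and a 7B model in float32 needs on the order of 28 GB of RAM, so on a CPU-only Space the float16/GPU path this commit adds is the realistic configuration.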