Update handler.py
handler.py CHANGED (+5 -1)
@@ -1,11 +1,15 @@
 from typing import Dict, List, Any
 from llama_cpp import Llama
+import torch
 
 MAX_TOKENS=8192
 
 class EndpointHandler():
     def __init__(self, data):
-
+        n_gpu_layers = GPU_LAYERS
+        if not torch.cuda.is_available():
+            n_gpu_layers = 0
+        self.model = Llama.from_pretrained("lmstudio-ai/gemma-2b-it-GGUF", filename="gemma-2b-it-q4_k_m.gguf", n_ctx=8192, cache_dir="./", n_gpu_layers=n_gpu_layers)
 
     def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
         inputs = data.pop("inputs", "")
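In context, the change makes GPU offloading conditional: the handler requests GPU_LAYERS offloaded layers from llama.cpp, but falls back to CPU-only inference (n_gpu_layers=0) when torch reports no CUDA device, before loading the Gemma 2B GGUF model with Llama.from_pretrained. A minimal sketch of the updated handler is below, with two assumptions flagged in comments: GPU_LAYERS is not defined anywhere in this hunk, so the value used here (-1, i.e. offload all layers) is a guess at a definition that presumably lives elsewhere in handler.py, and the body of __call__ beyond reading "inputs" is not shown in the diff.

from typing import Dict, List, Any

import torch
from llama_cpp import Llama

MAX_TOKENS = 8192
# Assumption: GPU_LAYERS is defined elsewhere in handler.py (not visible in this hunk).
# -1 asks llama.cpp to offload every layer to the GPU.
GPU_LAYERS = -1


class EndpointHandler():
    def __init__(self, data):
        # Offload layers only when CUDA is actually available; otherwise run fully on CPU.
        n_gpu_layers = GPU_LAYERS
        if not torch.cuda.is_available():
            n_gpu_layers = 0
        # Download the GGUF file from the Hugging Face Hub into ./ and load it
        # with an 8192-token context window.
        self.model = Llama.from_pretrained(
            "lmstudio-ai/gemma-2b-it-GGUF",
            filename="gemma-2b-it-q4_k_m.gguf",
            n_ctx=8192,
            cache_dir="./",
            n_gpu_layers=n_gpu_layers,
        )

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        inputs = data.pop("inputs", "")
        # Assumption: the rest of __call__ (not part of this hunk) runs the model on
        # `inputs` and returns the generated text.
        ...

Gating on torch.cuda.is_available() lets the same handler.py run unchanged on both CPU-only and GPU inference endpoints, since llama.cpp treats n_gpu_layers=0 as "keep everything on the CPU".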