Update app.py

app.py CHANGED
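In summary, this commit rewires the streaming loop for Art-v0-3B's two-phase output: the model first emits a reasoning trace, then an `<|end_reasoning|>` marker, then the final answer. The reasoning is now folded into a collapsible `<details>` block while the answer streams below it. The commit also fixes the broken model link in the title and tidies some whitespace.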
```diff
@@ -6,11 +6,10 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
 import gradio as gr
 from threading import Thread
 
-
 HF_TOKEN = os.environ.get("HF_TOKEN", None)
 MODEL = "AGI-0/Art-v0-3B"
 
-TITLE = """<h2>Link to the model: <a href="https://huggingface.co/AGI-0/Art-v0-3B"</h2>"""
+TITLE = """<h2>Link to the model: <a href="https://huggingface.co/AGI-0/Art-v0-3B">click here</a></h2>"""
 
 PLACEHOLDER = """
 <center>
```
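The title fix is cosmetic but real: the old anchor was malformed (`<a href="https://huggingface.co/AGI-0/Art-v0-3B"` is never closed and has no link text), so the heading rendered without a clickable link. The new tag closes properly and adds "click here" as the visible text.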
```diff
@@ -18,7 +17,6 @@ PLACEHOLDER = """
 </center>
 """
 
-
 CSS = """
 .duplicate-button {
     margin: auto !important;
```
```diff
@@ -31,7 +29,7 @@ h3 {
 }
 """
 
-device = "cuda"
+device = "cuda" # for GPU usage or "cpu" for CPU usage
 
 tokenizer = AutoTokenizer.from_pretrained(MODEL)
 model = AutoModelForCausalLM.from_pretrained(
```
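The new comment documents the assumption rather than enforcing it. On ZeroGPU Spaces, CUDA is only guaranteed inside `@spaces.GPU()` calls; a more defensive variant (a sketch, not part of this commit) would probe at runtime:

```python
import torch

# Hypothetical fallback, not part of this commit: the Space hardcodes
# "cuda" because ZeroGPU attaches a GPU to every @spaces.GPU() call.
device = "cuda" if torch.cuda.is_available() else "cpu"
```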
```diff
@@ -39,6 +37,8 @@ model = AutoModelForCausalLM.from_pretrained(
     torch_dtype=torch.bfloat16,
     device_map="auto")
 end_of_sentence = tokenizer.convert_tokens_to_ids("<|im_end|>")
+end_reasoning_token = "<|im_end|>" if False else "<|end_reasoning|>"
@@ -39,6 +37,8 @@
+
 @spaces.GPU()
 def stream_chat(
     message: str,
```
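This is the core addition: `<|end_reasoning|>` separates the model's reasoning trace from its final answer. Note the marker is kept as a plain string rather than converted to a token id like `<|im_end|>` is, because the streaming loop below matches it in decoded text; that works whether or not the tokenizer treats it as a single special token. A quick way to check (a sketch, not in the commit):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("AGI-0/Art-v0-3B")
# If the marker is a registered special token this prints a single
# piece; otherwise it prints the BPE sub-tokens. The string-level match
# used in stream_chat works either way.
print(tokenizer.tokenize("<|end_reasoning|>"))
```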
```diff
@@ -53,8 +53,7 @@ def stream_chat(
     print(f'message: {message}')
     print(f'history: {history}')
 
-    conversation = [
-    ]
+    conversation = []
     for prompt, answer in history:
         conversation.extend([
             {"role": "user", "content": prompt},
```
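For context, this loop flattens Gradio's `(user, assistant)` history tuples into the message list expected by the chat template. A minimal illustration with hypothetical values (the assistant turn is presumably the line that follows the hunk in the full file):

```python
# Hypothetical two-turn state as gr.Chatbot supplies it in tuple format.
history = [("What is 2+2?", "4.")]
message = "And 3+3?"

conversation = []
for prompt, answer in history:
    conversation.extend([
        {"role": "user", "content": prompt},
        {"role": "assistant", "content": answer},
    ])
conversation.append({"role": "user", "content": message})
# conversation now alternates user/assistant turns and ends with the new
# user message, ready for tokenizer.apply_chat_template(...).
```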
```diff
@@ -69,11 +68,11 @@
 
     generate_kwargs = dict(
         input_ids=input_ids,
-        max_new_tokens = max_new_tokens,
-        do_sample = False if temperature == 0 else True,
-        top_p = top_p,
-        top_k = top_k,
-        temperature = temperature,
+        max_new_tokens=max_new_tokens,
+        do_sample=False if temperature == 0 else True,
+        top_p=top_p,
+        top_k=top_k,
+        temperature=temperature,
         repetition_penalty=penalty,
         eos_token_id=[end_of_sentence],
         streamer=streamer,
```
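Beyond the formatting, the `do_sample` expression is worth noting: when the temperature slider sits at 0, generation switches to greedy decoding instead of sampling with a degenerate temperature. The conditional is equivalent to a plain comparison:

```python
temperature = 0.0  # value from the Gradio slider, never negative
# Greedy decoding when the slider is at zero avoids sampling with a
# zero temperature; equivalent boolean form of the kwargs expression:
do_sample = temperature > 0
```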
```diff
@@ -82,12 +81,42 @@
     with torch.no_grad():
         thread = Thread(target=model.generate, kwargs=generate_kwargs)
         thread.start()
-
+
     buffer = ""
+    reasoning_text = ""
+    final_text = ""
+    in_reasoning = True
+
     for new_text in streamer:
         buffer += new_text
-        yield buffer
-
+
+        if end_reasoning_token in buffer and in_reasoning:
+            # Split the buffer at the end_reasoning_token
+            parts = buffer.split(end_reasoning_token)
+            reasoning_text = parts[0]
+            final_text = parts[1] if len(parts) > 1 else ""
+
+            # Format the output with the details tag
+            formatted_output = (
+                "<details><summary>Click to see reasoning</summary>\n\n"
+                f"{reasoning_text}\n\n"
+                "</details>\n\n"
+                f"{final_text}"
+            )
+            in_reasoning = False
+            yield formatted_output
+        elif in_reasoning:
+            # Still collecting reasoning text
+            yield "<details><summary>Click to see reasoning</summary>\n\n" + buffer + "\n\n</details>"
+        else:
+            # After end_reasoning_token, just append to the existing formatted output
+            formatted_output = (
+                "<details><summary>Click to see reasoning</summary>\n\n"
+                f"{reasoning_text}\n\n"
+                "</details>\n\n"
+                f"{buffer}"
+            )
+            yield formatted_output
 
 chatbot = gr.Chatbot(height=600, placeholder=PLACEHOLDER)
 
```
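One caveat in this hunk: the `else` branch re-emits the entire `buffer`, which at that point still contains the reasoning text and the marker, so chunks arriving after the split appear to duplicate the reasoning inside the answer. Below is a model-free sketch of the intended logic (hypothetical chunk values), useful for testing the `<details>` formatting without loading the 3B model; unlike the committed loop, it resets the buffer to the answer part after the split:

```python
# Sketch only. TextIteratorStreamer may split the marker across chunks;
# accumulating everything into one buffer makes the literal match safe.
def format_stream(chunks, marker="<|end_reasoning|>"):
    buffer = ""
    reasoning_text = ""
    in_reasoning = True
    for chunk in chunks:
        buffer += chunk
        if in_reasoning and marker in buffer:
            # Keep only the answer part in the buffer so later yields do
            # not repeat the reasoning or the marker.
            reasoning_text, _, buffer = buffer.partition(marker)
            in_reasoning = False
        if in_reasoning:
            yield ("<details><summary>Click to see reasoning</summary>\n\n"
                   f"{buffer}\n\n</details>")
        else:
            yield ("<details><summary>Click to see reasoning</summary>\n\n"
                   f"{reasoning_text}\n\n</details>\n\n"
                   f"{buffer}")

# Hypothetical chunks; note the marker arrives split across two chunks.
for output in format_stream(["Let me think", "...<|end_", "reasoning|>It is 4."]):
    print(output)
    print("---")
```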
```diff
@@ -155,6 +184,5 @@ with gr.Blocks(css=CSS, theme="soft") as demo:
         cache_examples=False,
     )
 
-
 if __name__ == "__main__":
     demo.launch()
```