import streamlit as st
import os
import importlib
import sys
from langchain_openai import OpenAI
import re
import json
from algos.PWS import *
from utils.util import *
from nodes.Worker import *
from prompts import fewshots

# Load API keys
# with open(os.path.join('./keys/', 'openai.key'), 'r') as f:
#     os.environ["OPENAI_API_KEY"] = f.read().strip()
# with open(os.path.join('./keys/', 'serpapi.key'), 'r') as f:
#     os.environ["SERPAPI_API_KEY"] = f.read().strip()

# Modules that can be hot-reloaded from the sidebar button.
_RELOADABLE_MODULES = (
    'nodes.Worker',
    'algos.PWS',
    'utils.util',
    'prompts.fewshots',
    'prompts.solver',
)


def reload_modules():
    """Reload all relevant project modules so on-disk edits take effect.

    Returns:
        A status string suitable for display in the UI.
    """
    for name in _RELOADABLE_MODULES:
        # Guard the lookup: 'prompts.solver' is not imported at the top of
        # this file, so indexing sys.modules unconditionally could raise
        # KeyError and break the whole refresh button.
        if name in sys.modules:
            importlib.reload(sys.modules[name])
    return "✅ Modules reloaded successfully!"


# Keywords (English and Vietnamese) that route a question to the
# study-abroad fewshot prompt instead of the generic TriviaQA one.
_STUDY_KEYWORDS = (
    "study", "student", "university", "college", "school", "abroad",
    "học", "trường", "du học", "học bổng", "gpa", "ielts", "tcf",
    "delf", "scholarship",
)


def process(tools, model, input_text):
    """Run the PWS (planner/worker/solver) pipeline on a single question.

    Args:
        tools: Tool names available to the planner, e.g. ["Google", "LLM"].
        model: Model name used for both the planner and the solver.
        input_text: The user's question.

    Returns:
        Tuple (plan, solve, output): planner log, solver log, final answer.

    Raises:
        AssertionError: For study-abroad questions when the required tools
            (LLM plus a search engine) are not selected.
    """
    if any(word in input_text.lower() for word in _STUDY_KEYWORDS):
        # Study-abroad questions need the LLM plus at least one search tool.
        # BUG FIX: the original wrote `("Google" or "Duckduckgo" in tools)`,
        # which always evaluates to the truthy string "Google"; each
        # membership test must be spelled out explicitly.
        assert "LLM" in tools and ("Google" in tools or "Duckduckgo" in tools)
        method = PWS_Base(planner_model=model, solver_model=model,
                          fewshot=fewshots.STUDY_ABROAD_PWS,
                          available_tools=tools)
    else:
        method = PWS_Base(planner_model=model, solver_model=model,
                          fewshot=fewshots.TRIVIAQA_PWS,
                          available_tools=tools)

    response = method.run(input_text)

    # Both logs echo the question; keep only the text after it.
    # NOTE(review): split(...)[1] raises IndexError if the question is not
    # echoed verbatim in the log — confirm against PWS_Base's log format.
    plan = response["planner_log"].split(input_text)[1].strip('\n')
    # Keep the full solver log (deliberately not truncated at
    # "Now begin to solve the task").
    solve = response["solver_log"].split(input_text)[1].strip('\n')
    output = response["output"]
    return plan, solve, output


def evaluate(response, plan, solve):
    """Evaluate whether the response is based on evidence or contains hallucinations.

    Args:
        response: The assistant's full response.
        plan: The planning process (currently unused by the prompt).
        solve: The solving process with evidence.

    Returns:
        Dict with keys "reasoning", "summary", "verdict" and
        "verdict_category" (one of "supported", "partially_supported",
        "unsupported", or "error").
    """
    # Temperature 0 for deterministic judgments.
    llm = OpenAI(temperature=0)

    # Collect only the "Evidence:" paragraphs from the solver log.
    evidence_blocks = []
    for block in solve.split("\n\n"):
        if "Evidence:" in block:
            evidence_part = block.split("Evidence:", 1)[1].strip()
            if evidence_part:
                evidence_blocks.append(evidence_part)

    evidence = "\n\n".join(evidence_blocks)
    if not evidence:
        # Fallback to using the entire solve text if no evidence was found.
        evidence = solve

    prompt = f"""
Evaluate whether the following response is factually supported by the provided evidence.

Response to evaluate:
{response}

Evidence:
{evidence}

Provide your evaluation in this format:
REASONING: Detailed analysis comparing the response against the evidence
SUMMARY: Brief summary of the evaluation
VERDICT: [SUPPORTED/PARTIALLY SUPPORTED/UNSUPPORTED] - Choose one verdict
"""

    try:
        result_text = llm.invoke(prompt).strip()

        # Parse the loosely structured output section by section.
        reasoning = ""
        summary = ""
        verdict = "UNSUPPORTED"
        if "REASONING:" in result_text:
            remainder = result_text.split("REASONING:", 1)[1]
            if "SUMMARY:" in remainder:
                reasoning, remainder = remainder.split("SUMMARY:", 1)
            if "VERDICT:" in remainder:
                summary, verdict = remainder.split("VERDICT:", 1)

        reasoning = reasoning.strip()
        summary = summary.strip()
        verdict = verdict.strip()

        # Map the free-text verdict onto a category.  Note that
        # "UNSUPPORTED" contains the substring "SUPPORTED", so it has to
        # be excluded explicitly before declaring support.
        verdict_category = "unsupported"
        if ("SUPPORTED" in verdict and "PARTIALLY" not in verdict
                and "UNSUPPORTED" not in verdict):
            verdict_category = "supported"
        elif "PARTIALLY" in verdict:
            verdict_category = "partially_supported"

        return {
            "reasoning": reasoning,
            "summary": summary,
            "verdict": verdict,
            "verdict_category": verdict_category,
        }
    except Exception as e:
        # Surface the failure to the UI instead of crashing the app.
        return {
            "reasoning": f"Error during evaluation: {str(e)}",
            "summary": "Could not complete evaluation",
            "verdict": "EVALUATION FAILED",
            "verdict_category": "error",
        }


# ---------------------------------------------------------------------------
# Main app
# ---------------------------------------------------------------------------
st.set_page_config(page_title="ReWOO Demo", layout="wide")
st.title("ReWOO Demo 🤗")
st.markdown("""
Demonstrating our recent work -- ReWOO: Decoupling Reasoning from Observations for Efficient Augmented Language Models. Note that this demo is only a conceptual impression of our work, we use a zero-shot set up and not optimizing the run time.
""")

# Initialize session state: the chat history must survive Streamlit reruns.
if 'messages' not in st.session_state:
    st.session_state.messages = []


def _run_example(example_text, tools, model):
    """Append a canned user question, run the pipeline, and store the answer."""
    st.session_state.messages.append({"role": "user", "content": example_text})
    with st.spinner('Processing...'):
        plan, solve, output = process(tools, model, example_text)
    st.session_state.messages.append(
        {"role": "assistant", "content": output, "plan": plan, "solve": solve})


# Sidebar
with st.sidebar:
    st.header("Configuration")

    # Tools selection
    tools = st.multiselect(
        "Select Tools",
        options=['Wikipedia', 'Google', 'LLM', 'WolframAlpha', 'Calculator',
                 'Duckduckgo'],
        default=['Duckduckgo', 'LLM']
    )

    # Model selection
    model = st.selectbox(
        "Select Model",
        options=["text-davinci-003", "gpt-3.5-turbo"],
        index=1
    )

    # Hot-reload project modules after editing them on disk.
    if st.button("🔄 Refresh Modules"):
        status = reload_modules()
        st.success(status)

    # Examples section — each button runs one canned question through the
    # pipeline with a fixed tool set.
    st.header("Examples")
    if st.button("Example 1: American Callan Pinckney's system"):
        _run_example(
            "American Callan Pinckney's eponymously named system became a best-selling (1980s-2000s) book/video franchise in what genre?",
            ["Wikipedia", "LLM"], "gpt-3.5-turbo")
    if st.button("Example 2: ReWOO paper"):
        _run_example(
            "What is the recent paper ReWOO: Decoupling Reasoning from Observations for Efficient Augmented Language Models about?",
            ["Google", "LLM"], "gpt-3.5-turbo")
    if st.button("Example 3: Car acceleration"):
        _run_example(
            "the car can accelerate from 0 to 27.8 m/s in a time of 3.85 seconds. Determine the acceleration of this car in m/s/s.",
            ["Calculator", "WolframAlpha"], "gpt-3.5-turbo")

# Display chat history
for i, message in enumerate(st.session_state.messages):
    if message["role"] == "user":
        st.chat_message("user").write(message["content"])
    else:
        with st.chat_message("assistant"):
            st.write(message["content"])
            with st.expander("Show reasoning process"):
                st.subheader("Planner")
                st.text(message["plan"])
                st.subheader("Solver")
                st.text(message["solve"])
            if "evaluation_results" not in message:
                # Evaluation is lazy: it only runs when the user asks for
                # it, and the result is cached on the message so a rerun
                # does not re-invoke the LLM.
                if st.button("🔍 Evaluate", key=f"eval_btn_{i}",
                             type="secondary"):
                    with st.spinner("Evaluating response..."):
                        results = evaluate(message["content"],
                                           message["plan"],
                                           message["solve"])
                    st.session_state.messages[i]["evaluation_results"] = results
                    st.rerun()
            else:
                # Show evaluation in an expander.
                # NOTE(review): placed as a sibling of the reasoning
                # expander — Streamlit does not allow nested expanders.
                with st.expander("Evaluation Results"):
                    results = message["evaluation_results"]
                    # Display verdict with color
                    verdict_color = "red"
                    if results["verdict_category"] == "supported":
                        verdict_color = "green"
                    elif results["verdict_category"] == "partially_supported":
                        verdict_color = "orange"
                    # NOTE(review): the original source was truncated
                    # mid-statement at this st.markdown call; the lines
                    # below are a minimal reconstruction — confirm against
                    # the upstream version.
                    st.markdown(
                        f"**Verdict:** :{verdict_color}[{results['verdict']}]")
                    st.markdown("**Summary:**")
                    st.write(results["summary"])
                    st.markdown("**Reasoning:**")
                    st.write(results["reasoning"])