Files
awesome-llm-apps/starter_ai_agents/agentic_rag_math_agent/app/benchmark.py
2025-04-25 15:04:51 -04:00

52 lines
1.5 KiB
Python

# Add the project root to the Python path
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
import pandas as pd
import time
from datetime import datetime
from rag.query_router import answer_math_question
from data.load_gsm8k_data import load_jeebench_dataset
def benchmark_math_agent(limit: int = 10):
# ✅ Always filter math-only questions
df = load_jeebench_dataset()
df = df.head(limit) # Limit the number of questions for benchmarking
total = len(df)
correct = 0
results = []
for idx, row in df.iterrows():
question = row["question"]
expected = row["gold"]
start = time.time()
try:
response = answer_math_question(question)
is_correct = expected.lower() in response.lower()
if is_correct:
correct += 1
results.append({
"Question": question,
"Expected": expected,
"Predicted": response,
"Correct": is_correct,
"TimeTakenSec": round(time.time() - start, 2)
})
except Exception as e:
results.append({
"Question": question,
"Expected": expected,
"Predicted": f"Error: {e}",
"Correct": False,
"TimeTakenSec": None
})
df_result = pd.DataFrame(results)
accuracy = correct / total * 100
return df_result, accuracy