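"""Evaluate the RAG QueryPipeline on a small WikiQA subset.

Reports retrieval metrics (Recall@10, Precision@10, MRR) against the gold answer
sentence, plus LLM-graded factuality (hallucination) and answer relevancy scores
for the generated answers.
"""
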
import os
import json

import numpy as np
from datasets import load_dataset
from tqdm import tqdm

from src.pipeline.query_pipeline import QueryPipeline
from src.eval.retrieval_metrics import recall_at_k, mrr_score, precision_at_k
from src.eval.hallucination import HallucinationGrader
from src.eval.relevancy import RelevancyGrader


def main():
    print("Initializing Pipeline...")
    pipeline = QueryPipeline()
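    # LLM-as-judge graders; both are assumed to return a dict with a numeric
    # 'score' key (see the .get('score', 0.0) calls below).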
    grader = HallucinationGrader(pipeline.llm)
    relevancy_grader = RelevancyGrader(pipeline.llm)

    print("Loading Evaluation Data (WikiQA train subset)...")
    # For a meaningful evaluation, we need questions whose answers are actually in the
    # indexed subset. Since ingestion indexed the first 100 rows of the WikiQA 'train'
    # split, we evaluate on that same subset to test retrieval ability (can the
    # pipeline find documents we know are in the index?).
    # In a real scenario you would evaluate on a hold-out set, but only if the whole
    # knowledge base had been indexed.
    try:
        ds = load_dataset("microsoft/wiki_qa", split="train[:20]", trust_remote_code=True)
    except Exception as e:
        print(f"Error loading dataset: {e}")
        return
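
    # Each WikiQA row pairs a question with one candidate answer sentence and a
    # binary label (1 = the sentence answers the question), so a question can
    # appear in several rows.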

    # Metrics
    recalls = []
    precisions = []
    mrrs = []
    hallucination_scores = []
    relevancy_scores = []

    print("Running Evaluation...")
    for row in tqdm(ds, total=len(ds)):
        query = row['question']
        relevant_doc_content = row['answer']  # The correct sentence
        is_correct = row['label'] == 1

        if not is_correct:
            # If this row isn't a correct answer pair, skip it for the retrieval
            # accuracy measurement (or treat it as a negative, but for RAG recall
            # we usually care about positive queries).
            continue

        result = pipeline.run(query, top_k_retrieval=10, top_k_rerank=3)
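        # pipeline.run is assumed to return a dict with the generated 'answer' and a
        # 'context' list of (doc, score) pairs, where each doc is either a raw string
        # or a dict with a 'content' field.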

        # Retrieval Metrics
        retrieved_contents = [doc if isinstance(doc, str) else doc['content'] for doc, _score in result['context']]

        # Check whether the relevant content appears in the retrieved chunks.
        # The ingestion pipeline might add metadata like "Source: ...", so we check
        # whether the relevant sentence appears as a SUBSTRING of a retrieved chunk.
        is_hit = False
        for content in retrieved_contents:
            if relevant_doc_content in content:
                is_hit = True
                break
        recalls.append(1.0 if is_hit else 0.0)

        # Precision (strict: is the retrieved doc the specific gold sentence?).
        # We retrieve 10 chunks and WikiQA usually has only one relevant sentence
        # per question, so precision is at best 0.1 when is_hit is True.
        precisions.append(1.0 / 10.0 if is_hit else 0.0)

        # MRR
        # Find the rank of the first retrieved chunk containing the relevant sentence.
        rank = -1
        for idx, content in enumerate(retrieved_contents):
            if relevant_doc_content in content:
                rank = idx + 1
                break
        if rank > 0:
            mrrs.append(1.0 / rank)
        else:
            mrrs.append(0.0)
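        # (The reported MRR is the mean of these per-query reciprocal ranks:
        # MRR = (1/|Q|) * sum_q 1/rank_q, with a miss contributing 0.)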

        # Generation / Hallucination Metric
        # We ask the LLM to grade whether the answer is supported by the context.
        grade = grader.grade(
            context="\n".join(retrieved_contents),
            answer=result['answer']
        )
        hallucination_scores.append(grade.get('score', 0.0))
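        # The grader's score is assumed to be higher when the answer is better
        # grounded in the context (reported as "Factuality" in the summary below).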

        # New: Answer Relevancy
        rel_grade = relevancy_grader.grade(query=query, answer=result['answer'])
        relevancy_scores.append(rel_grade.get('score', 0.0))
| print("\nXXX Evaluation Results XXX") | |
| print(f"Average Recall@10: {np.mean(recalls):.4f}") | |
| print(f"Average Precision@10: {np.mean(precisions):.4f}") | |
| print(f"Average MRR: {np.mean(mrrs):.4f}") | |
| print(f"Average Factuality Score: {np.mean(hallucination_scores):.4f}") | |
| print(f"Average Answer Relevancy: {np.mean(relevancy_scores):.4f}") | |
| if __name__ == "__main__": | |
| main() | |