Evaluating LlamaIndex
Quick Summary
DeepEval integrates nicely with LlamaIndex's ResponseEvaluator
class. Below is an example of using DeepEval's hallucination metric to check a LlamaIndex response for factual consistency.
from typing import List

from llama_index import (
    TreeIndex,
    VectorStoreIndex,
    ServiceContext,
)
from llama_index.llms import OpenAI
from llama_index.response.schema import Response
from llama_index.schema import Document
from llama_index.evaluation import ResponseEvaluator
import openai

# Set your OpenAI API key (placeholder shown below)
api_key = "sk-XXX"
openai.api_key = api_key

# Use GPT-4 for both the indices and LlamaIndex's built-in evaluator
gpt4 = OpenAI(temperature=0, model="gpt-4", api_key=api_key)
service_context_gpt4 = ServiceContext.from_defaults(llm=gpt4)
evaluator_gpt4 = ResponseEvaluator(service_context=service_context_gpt4)
Getting a LlamaHub Loader
from llama_index import download_loader

# Download LlamaHub's Wikipedia loader and pull the page we want to query
WikipediaReader = download_loader("WikipediaReader")
loader = WikipediaReader()
documents = loader.load_data(pages=["Tokyo"])

# Build two indices over the same documents
tree_index = TreeIndex.from_documents(documents=documents)
vector_index = VectorStoreIndex.from_documents(
    documents, service_context=service_context_gpt4
)
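Both indices expose a query engine that returns a Response object, and the Response carries the source nodes that were retrieved to answer the query; those nodes are what the evaluator below uses as context. The following is a minimal sketch of what that looks like, assuming the Wikipedia indices built above and an illustrative query:
# A quick look at a Response object and its source nodes (these become the evaluation context)
engine = vector_index.as_query_engine()
sample_response = engine.query("What is Tokyo known for?")  # illustrative query, not part of the original example
print(str(sample_response))               # the generated answer
print(len(sample_response.source_nodes))  # number of retrieved chunks backing the answer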
We then build an evaluator modeled on LlamaIndex's BaseEvaluator class, which requires an evaluate method.
In this example, we show how to write a factual consistency (hallucination) check using DeepEval.
from deepeval.test_case import LLMTestCase
from deepeval.metrics import HallucinationMetric


class HallucinationResponseEvaluator:
    def get_context(self, response: Response) -> List[Document]:
        """Get context information from a given Response object using its source nodes.

        Args:
            response (Response): Response object from an index based on the query.

        Returns:
            List of Documents built from the source nodes' content.
        """
        context = []
        for context_info in response.source_nodes:
            context.append(Document(text=context_info.node.get_content()))
        return context

    def evaluate(self, response: Response) -> str:
        # Evaluate the factual consistency of the answer against its retrieved context
        answer = str(response)
        context = self.get_context(response)
        metric = HallucinationMetric()
        test_case = LLMTestCase(
            input="This is an example input",  # replace with the original query if you have it
            actual_output=answer,
            context=[doc.text for doc in context],  # DeepEval expects context as a list of strings
        )
        metric.measure(test_case=test_case)
        return "YES" if metric.is_successful() else "NO"


evaluator = HallucinationResponseEvaluator()
You can then run an evaluation as follows:
query_engine = tree_index.as_query_engine()
response = query_engine.query("How did Tokyo get its name?")
eval_result = evaluator.evaluate(response)
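The evaluator returns "YES" when the hallucination metric passes and "NO" otherwise. As a follow-up sketch (assuming the objects built above), you can print the verdict and reuse the same evaluator with the vector index's query engine:
# Inspect the verdict from the tree index
print(eval_result)  # "YES" if the answer is grounded in the retrieved context, "NO" otherwise

# The same evaluator works with any LlamaIndex query engine, e.g. the vector index built earlier
vector_query_engine = vector_index.as_query_engine()
vector_response = vector_query_engine.query("How did Tokyo get its name?")
print(evaluator.evaluate(vector_response))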