Files
langchain/tests/unit_tests/evaluation/comparison/test_eval_chain.py
T
William FH a673a51efa [Breaking] Update Evaluation Functionality (#7388)
- Migrate from deprecated langchainplus_sdk to `langsmith` package
- Update the `run_on_dataset()` API to use an eval config
- Update a number of evaluators, as well as the loading logic
- Update docstrings / reference docs
- Update tracer to share single HTTP session
2023-07-13 02:13:06 -07:00

67 lines
2.0 KiB
Python

"""Test the comparison chains."""
import re
import pytest
from langchain.evaluation.comparison.eval_chain import (
LabeledPairwiseStringEvalChain,
PairwiseStringEvalChain,
)
from tests.unit_tests.llms.fake_llm import FakeLLM
def test_pairwise_string_comparison_chain() -> None:
    """Exercise ``PairwiseStringEvalChain`` against three canned LLM outputs.

    The fake LLM is built with ``sequential_responses=True`` and three
    outputs ending in ``[[C]]``, ``[[A]]`` and ``[[B]]``; the assertions
    below expect the evaluator calls to see them in that order, so the
    order of the calls matters.

    Expected results, in order:

    * ``[[C]]`` -> ``value`` is ``None``, ``score`` is ``0.5`` and the
      ``reasoning`` is the text preceding the verdict.
    * ``[[A]]`` -> ``value`` is ``"A"``, ``score`` is ``1``.
    * ``[[B]]`` -> ``value`` is ``"B"``, ``score`` is ``0``; because a
      ``reference`` is supplied, a ``UserWarning`` matching
      ``chain._skip_reference_warning`` must also be emitted.
    """
    canned_outputs = {
        "a": "The values are the same.\n[[C]]",
        "b": "A is clearly better than b.\n[[A]]",
        "c": "B is clearly better than a.\n[[B]]",
    }
    fake_llm = FakeLLM(queries=canned_outputs, sequential_responses=True)
    evaluator = PairwiseStringEvalChain.from_llm(llm=fake_llm)
    question = "What is your favorite food?"

    # First output carries the [[C]] verdict: no preferred prediction.
    tie_result = evaluator.evaluate_string_pairs(
        prediction="I like pie.",
        prediction_b="I love pie.",
        input=question,
    )
    assert tie_result["value"] is None
    assert tie_result["score"] == 0.5
    assert tie_result["reasoning"] == "The values are the same."

    # Second output carries the [[A]] verdict.
    a_wins_result = evaluator.evaluate_string_pairs(
        prediction="I like pie.",
        prediction_b="I like pie.",
        input=question,
    )
    assert a_wins_result["value"] == "A"
    assert a_wins_result["score"] == 1

    # Third output carries the [[B]] verdict. Supplying a reference here is
    # expected to trigger the chain's skip-reference warning.
    warning_pattern = re.escape(evaluator._skip_reference_warning)
    with pytest.warns(UserWarning, match=warning_pattern):
        b_wins_result = evaluator.evaluate_string_pairs(
            prediction="I like pie.",
            prediction_b="I hate pie.",
            input=question,
            reference="I enjoy pie.",
        )
    assert b_wins_result["value"] == "B"
    assert b_wins_result["score"] == 0
def test_labeled_pairwise_string_comparison_chain_missing_ref() -> None:
    """Check that the labeled pairwise chain rejects a call without a reference.

    ``LabeledPairwiseStringEvalChain.evaluate_string_pairs`` is invoked with
    two predictions and an input but no ``reference`` argument; the test
    passes only if that call raises ``ValueError``.
    """
    canned_outputs = {
        "a": "The values are the same.\n[[C]]",
        "b": "A is clearly better than b.\n[[A]]",
        "c": "B is clearly better than a.\n[[B]]",
    }
    fake_llm = FakeLLM(queries=canned_outputs, sequential_responses=True)
    evaluator = LabeledPairwiseStringEvalChain.from_llm(llm=fake_llm)

    # No ``reference`` keyword is passed on purpose.
    with pytest.raises(ValueError):
        evaluator.evaluate_string_pairs(
            prediction="I like pie.",
            prediction_b="I love pie.",
            input="What is your favorite food?",
        )