"""This module contains the evaluator classes for evaluating runs."""from__future__importannotationsimportasyncioimportinspectimportuuidfromabcimportabstractmethodfromtypingimport(Any,Awaitable,Callable,Dict,List,Literal,Optional,Sequence,Union,cast,)fromtyping_extensionsimportTypedDictfromlangsmithimportrun_helpersasrhfromlangsmithimportschemastry:frompydantic.v1import(# type: ignore[import]BaseModel,Field,ValidationError,validator,)exceptImportError:frompydanticimport(# type: ignore[assignment]BaseModel,Field,ValidationError,validator,)importloggingfromfunctoolsimportwrapsfromlangsmith.schemasimportSCORE_TYPE,VALUE_TYPE,Example,Runlogger=logging.getLogger(__name__)


class Category(TypedDict):
    """A category for categorical feedback."""

    value: Optional[Union[float, int]]
    """The numeric score/ordinal corresponding to this category."""
    label: str
    """The label for this category."""


class FeedbackConfig(TypedDict, total=False):
    """Configuration to define a type of feedback.

    Applied on the first creation of a feedback_key.
    """

    type: Literal["continuous", "categorical", "freeform"]
    """The type of feedback."""
    min: Optional[Union[float, int]]
    """The minimum permitted value (if continuous type)."""
    max: Optional[Union[float, int]]
    """The maximum permitted value (if continuous type)."""
    categories: Optional[List[Union[Category, dict]]]
    """The permitted categories (if categorical type)."""
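

# Illustrative sketch (not part of the library): a FeedbackConfig for a
# continuous metric bounded to [0, 1] and one for a categorical metric. The
# category labels and values below are arbitrary placeholders.
_EXAMPLE_CONTINUOUS_CONFIG: FeedbackConfig = {
    "type": "continuous",
    "min": 0.0,
    "max": 1.0,
}
_EXAMPLE_CATEGORICAL_CONFIG: FeedbackConfig = {
    "type": "categorical",
    "categories": [
        {"value": 1, "label": "helpful"},
        {"value": 0, "label": "unhelpful"},
    ],
}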


class EvaluationResult(BaseModel):
    """Evaluation result."""

    key: str
    """The aspect, metric name, or label for this evaluation."""
    score: SCORE_TYPE = None
    """The numeric score for this evaluation."""
    value: VALUE_TYPE = None
    """The value for this evaluation, if not numeric."""
    comment: Optional[str] = None
    """An explanation regarding the evaluation."""
    correction: Optional[Dict] = None
    """What the correct value should be, if applicable."""
    evaluator_info: Dict = Field(default_factory=dict)
    """Additional information about the evaluator."""
    feedback_config: Optional[Union[FeedbackConfig, dict]] = None
    """The configuration used to generate this feedback."""
    source_run_id: Optional[Union[uuid.UUID, str]] = None
    """The ID of the trace of the evaluator itself."""
    target_run_id: Optional[Union[uuid.UUID, str]] = None
    """The ID of the trace this evaluation is applied to.

    If none provided, the evaluation feedback is applied to the
    root trace being evaluated."""
    extra: Optional[Dict] = None
    """Metadata for the evaluator run."""

    class Config:
        """Pydantic model configuration."""

        allow_extra = False
[docs]@validator("value",pre=True)defcheck_value_non_numeric(cls,v,values):"""Check that the value is not numeric."""# If a score isn't provided and the value is numeric# it's more likely the user intended use the score fieldif"score"notinvaluesorvalues["score"]isNone:ifisinstance(v,(int,float)):logger.warning("Numeric values should be provided in"" the 'score' field, not 'value'."f" Got: {v}")returnv


class EvaluationResults(TypedDict, total=False):
    """Batch evaluation results.

    This makes it easy for your evaluator to return multiple metrics at once.
    """

    results: List[EvaluationResult]
    """The evaluation results."""
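

# Illustrative sketch (not part of the library): numeric feedback belongs in
# ``score`` and categorical or free-form feedback in ``value`` (the validator
# above warns when a number is passed as ``value``). The metric keys here are
# arbitrary placeholders.
_EXAMPLE_RESULT = EvaluationResult(key="correctness", score=1.0, comment="Exact match.")
_EXAMPLE_BATCH: EvaluationResults = {
    "results": [
        _EXAMPLE_RESULT,
        EvaluationResult(key="tone", value="friendly"),
    ],
}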


class RunEvaluator:
    """Evaluator interface class."""

    @abstractmethod
    def evaluate_run(
        self,
        run: Run,
        example: Optional[Example] = None,
        evaluator_run_id: Optional[uuid.UUID] = None,
    ) -> Union[EvaluationResult, EvaluationResults]:
        """Evaluate an example."""

    async def aevaluate_run(
        self,
        run: Run,
        example: Optional[Example] = None,
        evaluator_run_id: Optional[uuid.UUID] = None,
    ) -> Union[EvaluationResult, EvaluationResults]:
        """Evaluate an example asynchronously."""
        current_context = rh.get_tracing_context()

        def _run_with_context():
            with rh.tracing_context(**current_context):
                return self.evaluate_run(run, example, evaluator_run_id)

        return await asyncio.get_running_loop().run_in_executor(
            None, _run_with_context
        )
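

# Illustrative sketch (not part of the library): a custom evaluator only needs
# to implement ``evaluate_run``; the default ``aevaluate_run`` above runs it in
# an executor with the tracing context propagated. The exact-match logic and
# the "correctness" key are placeholder assumptions.
class _ExampleExactMatchEvaluator(RunEvaluator):
    """Score 1.0 when the run outputs equal the reference outputs."""

    def evaluate_run(
        self,
        run: Run,
        example: Optional[Example] = None,
        evaluator_run_id: Optional[uuid.UUID] = None,
    ) -> Union[EvaluationResult, EvaluationResults]:
        matches = bool(example) and (run.outputs or {}) == (example.outputs or {})
        return EvaluationResult(key="correctness", score=float(matches))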


class ComparisonEvaluationResult(BaseModel):
    """Feedback scores for the results of comparative evaluations.

    These are generated by functions that compare two or more runs,
    returning a ranking or other feedback.
    """

    key: str
    """The aspect, metric name, or label for this evaluation."""
    scores: Dict[Union[uuid.UUID, str], SCORE_TYPE]
    """The scores for each run in the comparison."""
    source_run_id: Optional[Union[uuid.UUID, str]] = None
    """The ID of the trace of the evaluator itself."""
    comment: Optional[Union[str, Dict[Union[uuid.UUID, str], str]]] = None
    """Comment for the scores. If a string, it's shared across all target runs.

    If a dict, it maps run IDs to individual comments."""
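

# Output type aliases for the dynamic evaluators below, reconstructed from how
# they are used in this module. `_serialize_inputs` is a minimal sketch
# (assumption): it only shortens the traced run/example inputs so evaluator
# traces stay small; the upstream helper may differ.
_RUNNABLE_OUTPUT = Union[EvaluationResult, EvaluationResults, dict]
_COMPARISON_OUTPUT = Union[ComparisonEvaluationResult, dict]


def _serialize_inputs(inputs: dict) -> dict:
    """Truncate the repr of traced evaluator inputs (sketch)."""
    return {k: repr(v)[:200] for k, v in inputs.items()}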


class DynamicRunEvaluator(RunEvaluator):
    """A dynamic evaluator that wraps a function and transforms it into a `RunEvaluator`.

    This class is designed to be used with the `@run_evaluator` decorator, allowing
    functions that take a `Run` and an optional `Example` as arguments, and return
    an `EvaluationResult` or `EvaluationResults`, to be used as instances of `RunEvaluator`.

    Attributes:
        func (Callable): The function that is wrapped by this evaluator.
    """  # noqa: E501

    def __init__(
        self,
        func: Callable[
            [Run, Optional[Example]],
            Union[_RUNNABLE_OUTPUT, Awaitable[_RUNNABLE_OUTPUT]],
        ],
        # Async function to be used for async evaluation. Optional
        afunc: Optional[
            Callable[
                [Run, Optional[Example]],
                Awaitable[_RUNNABLE_OUTPUT],
            ]
        ] = None,
    ):
        """Initialize the DynamicRunEvaluator with a given function.

        Args:
            func (Callable): A function that takes a `Run` and an optional `Example`
                as arguments, and returns an `EvaluationResult` or `EvaluationResults`.
            afunc (Optional[Callable]): An optional async counterpart of `func`.
        """
        func = _normalize_evaluator_func(func)
        if afunc:
            afunc = _normalize_evaluator_func(afunc)  # type: ignore[assignment]

        wraps(func)(self)
        from langsmith import run_helpers  # type: ignore

        if afunc is not None:
            self.afunc = run_helpers.ensure_traceable(
                afunc, process_inputs=_serialize_inputs
            )
            self._name = getattr(afunc, "__name__", "DynamicRunEvaluator")
        if inspect.iscoroutinefunction(func):
            if afunc is not None:
                raise TypeError(
                    "Func was provided as a coroutine function, but afunc was "
                    "also provided. If providing both, func should be a regular "
                    "function to avoid ambiguity."
                )
            self.afunc = run_helpers.ensure_traceable(
                func, process_inputs=_serialize_inputs
            )
            self._name = getattr(func, "__name__", "DynamicRunEvaluator")
        else:
            self.func = run_helpers.ensure_traceable(
                cast(Callable[[Run, Optional[Example]], _RUNNABLE_OUTPUT], func),
                process_inputs=_serialize_inputs,
            )
            self._name = getattr(func, "__name__", "DynamicRunEvaluator")

    def _coerce_evaluation_result(
        self,
        result: Union[EvaluationResult, dict],
        source_run_id: uuid.UUID,
        allow_no_key: bool = False,
    ) -> EvaluationResult:
        if isinstance(result, EvaluationResult):
            if not result.source_run_id:
                result.source_run_id = source_run_id
            return result
        try:
            if not result:
                raise ValueError(
                    "Expected an EvaluationResult object, or dict with a metric"
                    f" 'key' and optional 'score'; got empty result: {result}"
                )
            if "key" not in result and allow_no_key:
                result["key"] = self._name
            if all(k not in result for k in ("score", "value", "comment")):
                raise ValueError(
                    "Expected an EvaluationResult object, or dict with a metric"
                    f" 'key' and optional 'score' or categorical 'value'; got {result}"
                )
            return EvaluationResult(**{"source_run_id": source_run_id, **result})
        except ValidationError as e:
            raise ValueError(
                "Expected an EvaluationResult object, or dict with a metric"
                f" 'key' and optional 'score'; got {result}"
            ) from e

    def _coerce_evaluation_results(
        self,
        results: Union[dict, EvaluationResults],
        source_run_id: uuid.UUID,
    ) -> Union[EvaluationResult, EvaluationResults]:
        if "results" in results:
            cp = results.copy()
            cp["results"] = [
                self._coerce_evaluation_result(r, source_run_id=source_run_id)
                for r in results["results"]
            ]
            return EvaluationResults(**cp)

        return self._coerce_evaluation_result(
            cast(dict, results), source_run_id=source_run_id, allow_no_key=True
        )

    def _format_result(
        self,
        result: Union[
            EvaluationResult, EvaluationResults, dict, str, int, bool, float, list
        ],
        source_run_id: uuid.UUID,
    ) -> Union[EvaluationResult, EvaluationResults]:
        if isinstance(result, EvaluationResult):
            if not result.source_run_id:
                result.source_run_id = source_run_id
            return result
        result = _format_evaluator_result(result)
        return self._coerce_evaluation_results(result, source_run_id)

    @property
    def is_async(self) -> bool:
        """Check if the evaluator function is asynchronous.

        Returns:
            bool: True if the evaluator function is asynchronous, False otherwise.
        """
        return hasattr(self, "afunc")

    def evaluate_run(
        self,
        run: Run,
        example: Optional[Example] = None,
        evaluator_run_id: Optional[uuid.UUID] = None,
    ) -> Union[EvaluationResult, EvaluationResults]:
        """Evaluate a run using the wrapped function.

        This method directly invokes the wrapped function with the provided arguments.

        Args:
            run (Run): The run to be evaluated.
            example (Optional[Example]): An optional example to be used in the evaluation.

        Returns:
            Union[EvaluationResult, EvaluationResults]: The result of the evaluation.
        """  # noqa: E501
        if not hasattr(self, "func"):
            running_loop = asyncio.get_event_loop()
            if running_loop.is_running():
                raise RuntimeError(
                    "Cannot call `evaluate_run` on an async run evaluator from"
                    " within a running event loop. Use `aevaluate_run` instead."
                )
            else:
                return running_loop.run_until_complete(
                    self.aevaluate_run(run, example)
                )
        if evaluator_run_id is None:
            evaluator_run_id = uuid.uuid4()
        metadata: Dict[str, Any] = {"target_run_id": run.id}
        if getattr(run, "session_id", None):
            metadata["experiment"] = str(run.session_id)
        result = self.func(
            run,
            example,
            langsmith_extra={"run_id": evaluator_run_id, "metadata": metadata},
        )
        return self._format_result(result, evaluator_run_id)

    async def aevaluate_run(
        self,
        run: Run,
        example: Optional[Example] = None,
        evaluator_run_id: Optional[uuid.UUID] = None,
    ):
        """Evaluate a run asynchronously using the wrapped async function.

        This method directly invokes the wrapped async function with the
        provided arguments.

        Args:
            run (Run): The run to be evaluated.
            example (Optional[Example]): An optional example to be used in the evaluation.

        Returns:
            Union[EvaluationResult, EvaluationResults]: The result of the evaluation.
        """
        if not hasattr(self, "afunc"):
            return await super().aevaluate_run(run, example)
        if evaluator_run_id is None:
            evaluator_run_id = uuid.uuid4()
        metadata: Dict[str, Any] = {"target_run_id": run.id}
        if getattr(run, "session_id", None):
            metadata["experiment"] = str(run.session_id)
        result = await self.afunc(
            run,
            example,
            langsmith_extra={"run_id": evaluator_run_id, "metadata": metadata},
        )
        return self._format_result(result, evaluator_run_id)

    def __call__(
        self, run: Run, example: Optional[Example] = None
    ) -> Union[EvaluationResult, EvaluationResults]:
        """Make the evaluator callable, allowing it to be used like a function.

        This method enables the evaluator instance to be called directly,
        forwarding the call to `evaluate_run`.

        Args:
            run (Run): The run to be evaluated.
            example (Optional[Example]): An optional example to be used in the evaluation.

        Returns:
            Union[EvaluationResult, EvaluationResults]: The result of the evaluation.
        """  # noqa: E501
        return self.evaluate_run(run, example)

    def __repr__(self) -> str:
        """Represent the DynamicRunEvaluator object."""
        return f"<DynamicRunEvaluator {self._name}>"


def run_evaluator(
    func: Callable[
        [Run, Optional[Example]],
        Union[_RUNNABLE_OUTPUT, Awaitable[_RUNNABLE_OUTPUT]],
    ],
):
    """Create a run evaluator from a function.

    Decorator that transforms a function into a `RunEvaluator`.
    """
    return DynamicRunEvaluator(func)
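

# Illustrative usage sketch (not part of the library), wrapped in a helper so
# nothing executes at import time. `_normalize_evaluator_func` below lets the
# decorated function take named arguments such as `outputs` and
# `reference_outputs` instead of raw Run/Example objects; the "exact_match"
# key and the "answer" field are placeholder assumptions about the dataset.
def _example_run_evaluator_usage() -> "DynamicRunEvaluator":
    @run_evaluator
    def exact_match(outputs: dict, reference_outputs: dict) -> dict:
        """Score 1.0 when the predicted answer equals the reference answer."""
        return {
            "key": "exact_match",
            "score": float(outputs.get("answer") == reference_outputs.get("answer")),
        }

    # `exact_match` is now a DynamicRunEvaluator: it can be passed anywhere a
    # RunEvaluator is expected, or invoked directly as `exact_match(run, example)`.
    return exact_match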


class DynamicComparisonRunEvaluator:
    """Compare predictions (as traces) from 2 or more runs."""

    def __init__(
        self,
        func: Callable[
            [Sequence[Run], Optional[Example]],
            Union[_COMPARISON_OUTPUT, Awaitable[_COMPARISON_OUTPUT]],
        ],
        # Async function to be used for async evaluation. Optional
        afunc: Optional[
            Callable[
                [Sequence[Run], Optional[Example]],
                Awaitable[_COMPARISON_OUTPUT],
            ]
        ] = None,
    ):
        """Initialize the DynamicComparisonRunEvaluator with a given function.

        Args:
            func (Callable): A function that takes a sequence of `Run` objects and an
                optional `Example` as arguments, and returns a dict or
                `ComparisonEvaluationResult`.
            afunc (Optional[Callable]): An optional async counterpart of `func`.
        """
        func = _normalize_comparison_evaluator_func(func)
        if afunc:
            afunc = _normalize_comparison_evaluator_func(afunc)  # type: ignore[assignment]

        wraps(func)(self)
        from langsmith import run_helpers  # type: ignore

        if afunc is not None:
            self.afunc = run_helpers.ensure_traceable(
                afunc, process_inputs=_serialize_inputs
            )
            self._name = getattr(afunc, "__name__", "DynamicComparisonRunEvaluator")
        if inspect.iscoroutinefunction(func):
            if afunc is not None:
                raise TypeError(
                    "Func was provided as a coroutine function, but afunc was "
                    "also provided. If providing both, func should be a regular "
                    "function to avoid ambiguity."
                )
            self.afunc = run_helpers.ensure_traceable(
                func, process_inputs=_serialize_inputs
            )
            self._name = getattr(func, "__name__", "DynamicComparisonRunEvaluator")
        else:
            self.func = run_helpers.ensure_traceable(
                cast(
                    Callable[
                        [Sequence[Run], Optional[Example]],
                        _COMPARISON_OUTPUT,
                    ],
                    func,
                ),
                process_inputs=_serialize_inputs,
            )
            self._name = getattr(func, "__name__", "DynamicComparisonRunEvaluator")

    @property
    def is_async(self) -> bool:
        """Check if the evaluator function is asynchronous.

        Returns:
            bool: True if the evaluator function is asynchronous, False otherwise.
        """
        return hasattr(self, "afunc")

    def compare_runs(
        self, runs: Sequence[Run], example: Optional[Example] = None
    ) -> ComparisonEvaluationResult:
        """Compare runs to score preferences.

        Args:
            runs: A list of runs to compare.
            example: An optional example to be used in the evaluation.
        """  # noqa: E501
        if not hasattr(self, "func"):
            running_loop = asyncio.get_event_loop()
            if running_loop.is_running():
                raise RuntimeError(
                    "Cannot call `compare_runs` on an async comparison evaluator"
                    " from within a running event loop."
                    " Use `acompare_runs` instead."
                )
            else:
                return running_loop.run_until_complete(
                    self.acompare_runs(runs, example)
                )
        source_run_id = uuid.uuid4()
        tags = self._get_tags(runs)
        # TODO: Add metadata for the "comparison experiment" here
        result = self.func(
            runs,
            example,
            langsmith_extra={"run_id": source_run_id, "tags": tags},
        )
        return self._format_results(result, source_run_id, runs)

    async def acompare_runs(
        self, runs: Sequence[Run], example: Optional[Example] = None
    ) -> ComparisonEvaluationResult:
        """Compare runs asynchronously using the wrapped async function.

        This method directly invokes the wrapped async function with the
        provided arguments.

        Args:
            runs (Sequence[Run]): The runs to be compared.
            example (Optional[Example]): An optional example to be used in the evaluation.

        Returns:
            ComparisonEvaluationResult: The result of the evaluation.
        """
        if not hasattr(self, "afunc"):
            return self.compare_runs(runs, example)
        source_run_id = uuid.uuid4()
        tags = self._get_tags(runs)
        # TODO: Add metadata for the "comparison experiment" here
        result = await self.afunc(
            runs,
            example,
            langsmith_extra={"run_id": source_run_id, "tags": tags},
        )
        return self._format_results(result, source_run_id, runs)

    def __call__(
        self, runs: Sequence[Run], example: Optional[Example] = None
    ) -> ComparisonEvaluationResult:
        """Make the evaluator callable, allowing it to be used like a function.

        This method enables the evaluator instance to be called directly,
        forwarding the call to `compare_runs`.

        Args:
            runs (Sequence[Run]): The runs to be compared.
            example (Optional[Example]): An optional example to be used in the evaluation.

        Returns:
            ComparisonEvaluationResult: The result of the evaluation.
        """  # noqa: E501
        return self.compare_runs(runs, example)

    def __repr__(self) -> str:
        """Represent the DynamicComparisonRunEvaluator object."""
        return f"<DynamicComparisonRunEvaluator {self._name}>"

    @staticmethod
    def _get_tags(runs: Sequence[Run]) -> List[str]:
        """Extract tags from runs."""
        # Add tags to support filtering
        tags = []
        for run in runs:
            tags.append("run:" + str(run.id))
            if getattr(run, "session_id", None):
                tags.append("experiment:" + str(run.session_id))
        return tags

    def _format_results(
        self,
        result: Union[dict, list, ComparisonEvaluationResult],
        source_run_id: uuid.UUID,
        runs: Sequence[Run],
    ) -> ComparisonEvaluationResult:
        if isinstance(result, ComparisonEvaluationResult):
            if not result.source_run_id:
                result.source_run_id = source_run_id
            return result
        elif isinstance(result, list):
            result = {
                "scores": {run.id: score for run, score in zip(runs, result)},
                "key": self._name,
                "source_run_id": source_run_id,
            }
        elif isinstance(result, dict):
            if "key" not in result:
                result["key"] = self._name
        else:
            msg = (
                "Expected 'dict', 'list' or 'ComparisonEvaluationResult' result "
                f"object. Received: {result=}"
            )
            raise ValueError(msg)
        try:
            return ComparisonEvaluationResult(
                **{"source_run_id": source_run_id, **result}
            )
        except ValidationError as e:
            raise ValueError(
                "Expected a dictionary with a 'key' and a dictionary of scores mapping"
                " run IDs to numeric scores, or a ComparisonEvaluationResult object;"
                f" got {result}"
            ) from e


def comparison_evaluator(
    func: Callable[
        [Sequence[Run], Optional[Example]],
        Union[_COMPARISON_OUTPUT, Awaitable[_COMPARISON_OUTPUT]],
    ],
) -> DynamicComparisonRunEvaluator:
    """Create a comparison evaluator from a function."""
    return DynamicComparisonRunEvaluator(func)
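

# Illustrative usage sketch (not part of the library), wrapped in a helper so
# nothing executes at import time. Returning a list of scores maps them onto
# the runs positionally via `_format_results`; the "answer" field is a
# placeholder assumption about the runs' outputs.
def _example_comparison_evaluator_usage() -> "DynamicComparisonRunEvaluator":
    @comparison_evaluator
    def prefer_concise(outputs: list) -> list:
        """Give 1.0 to the run(s) with the shortest answer, 0.0 to the rest."""
        lengths = [len(str(o.get("answer", ""))) for o in outputs]
        shortest = min(lengths) if lengths else 0
        return [1.0 if length == shortest else 0.0 for length in lengths]

    return prefer_concise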


def _normalize_evaluator_func(
    func: Callable,
) -> Union[
    Callable[[Run, Optional[Example]], _RUNNABLE_OUTPUT],
    Callable[[Run, Optional[Example]], Awaitable[_RUNNABLE_OUTPUT]],
]:
    supported_args = (
        "run",
        "example",
        "inputs",
        "outputs",
        "reference_outputs",
        "attachments",
    )
    sig = inspect.signature(func)
    all_args = [
        pname for pname, p in sig.parameters.items() if p.kind != p.VAR_KEYWORD
    ]
    args_with_defaults = [
        pname
        for pname, p in sig.parameters.items()
        if p.default is not inspect.Parameter.empty
    ]
    if not all_args or (
        not all(
            pname in supported_args or pname in args_with_defaults
            for pname in all_args
        )
        and len([a for a in all_args if a not in args_with_defaults]) != 2
    ):
        msg = (
            "Invalid evaluator function. Must have at least one "
            f"argument. Supported arguments are {supported_args}. Please "
            "see https://docs.smith.langchain.com/evaluation/how_to_guides/evaluation/evaluate_llm_application#use-custom-evaluators"  # noqa: E501
        )
        raise ValueError(msg)
    # For backwards compatibility we assume custom arg names are Run and Example
    # types, respectively.
    elif (
        not all(
            pname in supported_args or pname in args_with_defaults
            for pname in all_args
        )
        or all_args == ["run", "example"]
    ):
        return func
    else:
        if inspect.iscoroutinefunction(func):

            async def awrapper(
                run: Run, example: Optional[Example]
            ) -> _RUNNABLE_OUTPUT:
                arg_map = {
                    "run": run,
                    "example": example,
                    "inputs": example.inputs if example else {},
                    "outputs": run.outputs or {},
                    "attachments": (example.attachments or {}) if example else {},
                    "reference_outputs": (example.outputs or {}) if example else {},
                }
                kwargs = {}
                args = []
                for param_name, param in sig.parameters.items():
                    # Could have params with defaults that are not in the arg map
                    if param_name in arg_map:
                        if param.kind in (
                            param.POSITIONAL_OR_KEYWORD,
                            param.POSITIONAL_ONLY,
                        ):
                            args.append(arg_map[param_name])
                        else:
                            kwargs[param_name] = arg_map[param_name]
                return await func(*args, **kwargs)

            awrapper.__name__ = (
                getattr(func, "__name__")
                if hasattr(func, "__name__")
                else awrapper.__name__
            )
            return awrapper  # type: ignore[return-value]

        else:

            def wrapper(run: Run, example: Optional[Example]) -> _RUNNABLE_OUTPUT:
                arg_map = {
                    "run": run,
                    "example": example,
                    "inputs": example.inputs if example else {},
                    "outputs": run.outputs or {},
                    "attachments": (example.attachments or {}) if example else {},
                    "reference_outputs": (example.outputs or {}) if example else {},
                }
                kwargs = {}
                args = []
                for param_name, param in sig.parameters.items():
                    # Could have params with defaults that are not in the arg map
                    if param_name in arg_map:
                        if param.kind in (
                            param.POSITIONAL_OR_KEYWORD,
                            param.POSITIONAL_ONLY,
                        ):
                            args.append(arg_map[param_name])
                        else:
                            kwargs[param_name] = arg_map[param_name]
                return func(*args, **kwargs)

            wrapper.__name__ = (
                getattr(func, "__name__")
                if hasattr(func, "__name__")
                else wrapper.__name__
            )
            return wrapper  # type: ignore[return-value]


def _normalize_comparison_evaluator_func(
    func: Callable,
) -> Union[
    Callable[[Sequence[Run], Optional[Example]], _COMPARISON_OUTPUT],
    Callable[[Sequence[Run], Optional[Example]], Awaitable[_COMPARISON_OUTPUT]],
]:
    supported_args = ("runs", "example", "inputs", "outputs", "reference_outputs")
    sig = inspect.signature(func)
    all_args = [
        pname for pname, p in sig.parameters.items() if p.kind != p.VAR_KEYWORD
    ]
    args_with_defaults = [
        pname
        for pname, p in sig.parameters.items()
        if p.default is not inspect.Parameter.empty
    ]
    if not all_args or (
        not all(
            pname in supported_args or pname in args_with_defaults
            for pname in all_args
        )
        and len([a for a in all_args if a not in args_with_defaults]) != 2
    ):
        msg = (
            "Invalid evaluator function. Must have at least one "
            f"argument. Supported arguments are {supported_args}. Please "
            "see https://docs.smith.langchain.com/evaluation/how_to_guides/evaluation/evaluate_llm_application#use-custom-evaluators"  # noqa: E501
        )
        raise ValueError(msg)
    # For backwards compatibility we assume custom arg names are List[Run] and
    # List[Example] types, respectively.
    elif (
        not all(
            pname in supported_args or pname in args_with_defaults
            for pname in all_args
        )
        or all_args == ["runs", "example"]
    ):
        return func
    else:
        if inspect.iscoroutinefunction(func):

            async def awrapper(
                runs: Sequence[Run], example: Optional[Example]
            ) -> _COMPARISON_OUTPUT:
                arg_map = {
                    "runs": runs,
                    "example": example,
                    "inputs": example.inputs if example else {},
                    "outputs": [run.outputs or {} for run in runs],
                    "reference_outputs": (example.outputs or {}) if example else {},
                }
                kwargs = {}
                args = []
                for param_name, param in sig.parameters.items():
                    # Could have params with defaults that are not in the arg map
                    if param_name in arg_map:
                        if param.kind in (
                            param.POSITIONAL_OR_KEYWORD,
                            param.POSITIONAL_ONLY,
                        ):
                            args.append(arg_map[param_name])
                        else:
                            kwargs[param_name] = arg_map[param_name]
                return await func(*args, **kwargs)

            awrapper.__name__ = (
                getattr(func, "__name__")
                if hasattr(func, "__name__")
                else awrapper.__name__
            )
            return awrapper  # type: ignore[return-value]

        else:

            def wrapper(
                runs: Sequence[Run], example: Optional[Example]
            ) -> _COMPARISON_OUTPUT:
                arg_map = {
                    "runs": runs,
                    "example": example,
                    "inputs": example.inputs if example else {},
                    "outputs": [run.outputs or {} for run in runs],
                    "reference_outputs": (example.outputs or {}) if example else {},
                }
                kwargs = {}
                args = []
                for param_name, param in sig.parameters.items():
                    # Could have params with defaults that are not in the arg map
                    if param_name in arg_map:
                        if param.kind in (
                            param.POSITIONAL_OR_KEYWORD,
                            param.POSITIONAL_ONLY,
                        ):
                            args.append(arg_map[param_name])
                        else:
                            kwargs[param_name] = arg_map[param_name]
                return func(*args, **kwargs)

            wrapper.__name__ = (
                getattr(func, "__name__")
                if hasattr(func, "__name__")
                else wrapper.__name__
            )
            return wrapper  # type: ignore[return-value]


def _format_evaluator_result(
    result: Union[EvaluationResults, dict, str, int, bool, float, list],
) -> Union[EvaluationResults, dict]:
    if isinstance(result, (bool, float, int)):
        result = {"score": result}
    elif not result:
        raise ValueError(
            "Expected a non-empty dict, str, bool, int, float, list, "
            f"EvaluationResult, or EvaluationResults. Got {result}"
        )
    elif isinstance(result, list):
        if not all(isinstance(x, dict) for x in result):
            raise ValueError(
                f"Expected a list of dicts or EvaluationResults. Received {result}."
            )
        result = {"results": result}  # type: ignore[misc]
    elif isinstance(result, str):
        result = {"value": result}
    elif isinstance(result, dict):
        pass
    else:
        raise ValueError(
            "Expected a dict, str, bool, int, float, list, EvaluationResult, or "
            f"EvaluationResults. Got {result}"
        )
    return result


SUMMARY_EVALUATOR_T = Union[
    Callable[
        [Sequence[schemas.Run], Sequence[schemas.Example]],
        Union[EvaluationResult, EvaluationResults],
    ],
    Callable[
        [List[schemas.Run], List[schemas.Example]],
        Union[EvaluationResult, EvaluationResults],
    ],
]


def _normalize_summary_evaluator(func: Callable) -> SUMMARY_EVALUATOR_T:
    supported_args = ("runs", "examples", "inputs", "outputs", "reference_outputs")
    sig = inspect.signature(func)
    all_args = [pname for pname, p in sig.parameters.items()]
    args_with_defaults = [
        pname
        for pname, p in sig.parameters.items()
        if p.default is not inspect.Parameter.empty
    ]
    if not all_args or (
        not all(
            pname in supported_args or pname in args_with_defaults
            for pname in all_args
        )
        and len([a for a in all_args if a not in args_with_defaults]) != 2
    ):
        msg = (
            "Invalid evaluator function. Must have at least one "
            f"argument. Supported arguments are {supported_args}."
        )
        if all_args:
            msg += f" Received arguments {all_args}."
        raise ValueError(msg)
    # For backwards compatibility we assume custom arg names are Sequence[Run] and
    # Sequence[Example] types, respectively.
    elif not all(pname in supported_args for pname in all_args) or all_args == [
        "runs",
        "examples",
    ]:
        return func
    else:

        def wrapper(
            runs: Sequence[schemas.Run], examples: Sequence[schemas.Example]
        ) -> Union[EvaluationResult, EvaluationResults]:
            arg_map = {
                "runs": runs,
                "examples": examples,
                "inputs": [example.inputs for example in examples],
                "outputs": [run.outputs or {} for run in runs],
                "reference_outputs": [example.outputs or {} for example in examples],
            }
            kwargs = {}
            args = []
            for param_name, param in sig.parameters.items():
                # Could have params with defaults that are not in the arg map
                if param_name in arg_map:
                    if param.kind in (
                        param.POSITIONAL_OR_KEYWORD,
                        param.POSITIONAL_ONLY,
                    ):
                        args.append(arg_map[param_name])
                    else:
                        kwargs[param_name] = arg_map[param_name]
            result = func(*args, **kwargs)
            if isinstance(result, EvaluationResult):
                return result
            return _format_evaluator_result(result)  # type: ignore

        wrapper.__name__ = (
            getattr(func, "__name__")
            if hasattr(func, "__name__")
            else wrapper.__name__
        )
        return wrapper  # type: ignore[return-value]
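

# Illustrative sketch (not part of the library): a summary evaluator written
# against the named-argument signature accepted by `_normalize_summary_evaluator`.
# The "accuracy" key and the "answer" field are placeholder assumptions about
# the dataset schema.
def _example_accuracy_summary(outputs: list, reference_outputs: list) -> dict:
    """Return the fraction of runs whose answer matches the reference."""
    if not outputs:
        return {"key": "accuracy", "score": 0.0}
    correct = sum(
        o.get("answer") == ref.get("answer")
        for o, ref in zip(outputs, reference_outputs)
    )
    return {"key": "accuracy", "score": correct / len(outputs)}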