Benchmarks

Module for benchmarking language models.

This module provides an Evaluator class to run evaluations with various metrics.

Evaluator

Evaluator class to execute language model evaluations using various metrics.

Attributes:

Name      Type      Description
metrics   list      List of metric classes to evaluate outputs.
online    bool      Flag indicating if evaluation should be run online using a pipeline.
pipeline  callable  A callable that runs the model inference on input text.

Source code in langbench/benchmarks.py
class Evaluator:
    """
    Evaluator class to execute language model evaluations using various metrics.

    Attributes:
        metrics (list): List of metric classes to evaluate outputs.
        online (bool): Flag indicating if evaluation should be run online using a pipeline.
        pipeline (callable): A callable that runs the model inference on input text.
    """

    def __init__(self, pipeline=None, online=False):
        """
        Initializes the Evaluator instance.

        Args:
            pipeline (callable, optional): A callable function for processing text.
                Required if 'online' is True.
            online (bool, optional): If True, executes pipeline processing during evaluation.

        Raises:
            ValueError: If 'online' is True but pipeline is not provided.
        """
        self.metrics = []
        self.online = online
        self.pipeline = pipeline
        if online and pipeline is None:
            raise ValueError("Pipeline must be provided for online evaluation")

    def add_metric(self, metric):
        """
        Adds a metric to the evaluator.

        Args:
            metric (class): A metric class to be added to the evaluator.
        """
        self.metrics.append(metric)

    def call_pipeline(self, text):
        """
        Calls the pipeline function on the provided text.

        Args:
            text (str): The input text to evaluate.

        Returns:
            str: The output content from the pipeline's processing.
        """
        return self.pipeline.invoke(text).content

    def execute(self, data):
        """
        Executes the pipeline on the 'input' column of the data frame and calculates latency.

        Args:
            data (DataFrame): A pandas DataFrame containing the 'input' column.
        """
        outputs = []
        latencies = []
        with Progress() as progress:
            task = progress.add_task("Calculating latency...", total=len(data["input"]))
            for text in data["input"]:
                start_time = time.time()
                output = self.call_pipeline(text)
                end_time = time.time()
                outputs.append(output)
                latencies.append(end_time - start_time)
                progress.update(task, advance=1)
        data["output"] = outputs
        data["latency"] = latencies

    def evaluate(self, input_data):
        """
        Evaluates the input data using all added metrics.

        Args:
            input_data (DataFrame): A pandas DataFrame with at least an 'input' column.

        Returns:
            DataFrame: A DataFrame containing original data along with evaluation metric outputs.
        """
        data = input_data.copy()
        if self.online:
            self.execute(data)
        for metric in self.metrics:
            metric.run(data)
        self.generate_report(data)
        return data

    def generate_report(self, data):
        """
        Generates an HTML report of evaluation metrics using Plotly box plots.
        The report is saved as 'report.html' in the current directory.

        Args:
            data (DataFrame): The evaluated DataFrame containing metrics.

        Returns:
            str: An HTML string containing the report with embedded Plotly graphs.
        """
        import plotly.express as px
        import plotly.io as pio

        # Identify metric columns that are not standard columns
        exclude = ["input", "output", "latency"]
        metric_columns = [col for col in data.columns if col not in exclude]

        html_parts = []
        # Use a box plot for latency if it is present
        if "latency" in data.columns:
            fig_latency = px.box(
                data, y="latency", points="all", title="Latency Distribution"
            )
            html_parts.append(
                pio.to_html(fig_latency, full_html=False, include_plotlyjs="cdn")
            )

        # Use a box plot for each metric column
        for col in metric_columns:
            fig = px.box(data, y=col, points="all", title=f"{col} Distribution")
            # Only include plotlyjs once in the report
            include_plotlyjs = None if html_parts else "cdn"
            html_parts.append(
                pio.to_html(fig, full_html=False, include_plotlyjs=include_plotlyjs)
            )

        html_report = "<html><head><title>Benchmark Report</title></head><body>"
        for part in html_parts:
            html_report += part
        html_report += "</body></html>"

        with open("report.html", "w") as f:
            f.write(html_report)

        return html_report
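
A minimal end-to-end sketch of offline usage (the import path is inferred from the source file location, and LengthMetric is a hypothetical metric shown only to illustrate the run(data) contract):

import pandas as pd
from langbench.benchmarks import Evaluator

# Offline evaluation: the frame already holds model outputs, so no
# pipeline is needed and 'online' stays False.
data = pd.DataFrame({
    "input": ["Summarize the quarterly report.", "Translate to French: hello"],
    "output": ["The report covers Q3 revenue and costs.", "bonjour"],
})

class LengthMetric:
    """Hypothetical metric: writes an 'output_length' column in place."""
    def run(self, data):
        data["output_length"] = data["output"].str.len()

evaluator = Evaluator()              # offline, so no pipeline is required
evaluator.add_metric(LengthMetric())
results = evaluator.evaluate(data)   # also writes report.html
print(results[["output", "output_length"]])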

__init__(pipeline=None, online=False)

Initializes the Evaluator instance.

Parameters:

Name      Type      Description                                                              Default
pipeline  callable  A callable function for processing text. Required if 'online' is True.  None
online    bool      If True, executes pipeline processing during evaluation.                 False

Raises:

Type        Description
ValueError  If 'online' is True but pipeline is not provided.

Source code in langbench/benchmarks.py
def __init__(self, pipeline=None, online=False):
    """
    Initializes the Evaluator instance.

    Args:
        pipeline (callable, optional): A callable function for processing text.
            Required if 'online' is True.
        online (bool, optional): If True, executes pipeline processing during evaluation.

    Raises:
        ValueError: If 'online' is True but pipeline is not provided.
    """
    self.metrics = []
    self.online = online
    self.pipeline = pipeline
    if online and pipeline is None:
        raise ValueError("Pipeline must be provided for online evaluation")
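
As a quick illustration of the constructor contract, requesting online evaluation without a pipeline fails immediately (sketch, assuming the langbench.benchmarks import path):

from langbench.benchmarks import Evaluator

try:
    Evaluator(online=True)   # no pipeline supplied
except ValueError as err:
    print(err)               # "Pipeline must be provided for online evaluation"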

add_metric(metric)

Adds a metric to the evaluator.

Parameters:

Name    Type   Description                                    Default
metric  class  A metric class to be added to the evaluator.   required
Source code in langbench/benchmarks.py
def add_metric(self, metric):
    """
    Adds a metric to the evaluator.

    Args:
        metric (class): A metric class to be added to the evaluator.
    """
    self.metrics.append(metric)
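
The only contract a metric must satisfy, as used later by evaluate(), is exposing a run(data) method that annotates the DataFrame; a hypothetical example:

from langbench.benchmarks import Evaluator

class WordCountMetric:
    """Hypothetical metric: counts words in each generated output."""
    def run(self, data):
        data["word_count"] = data["output"].str.split().str.len()

evaluator = Evaluator()
evaluator.add_metric(WordCountMetric())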

call_pipeline(text)

Calls the pipeline function on the provided text.

Parameters:

Name  Type  Description                  Default
text  str   The input text to evaluate.  required

Returns:

Type  Description
str   The output content from the pipeline's processing.

Source code in langbench/benchmarks.py
def call_pipeline(self, text):
    """
    Calls the pipeline function on the provided text.

    Args:
        text (str): The input text to evaluate.

    Returns:
        str: The output content from the pipeline's processing.
    """
    return self.pipeline.invoke(text).content
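
call_pipeline assumes the pipeline exposes invoke(text) and that the returned object carries a .content attribute (as LangChain-style chat models do). A minimal stand-in that satisfies this contract, for illustration only:

from dataclasses import dataclass
from langbench.benchmarks import Evaluator

@dataclass
class _Reply:
    content: str

class EchoPipeline:
    """Hypothetical pipeline with the expected invoke(...).content interface."""
    def invoke(self, text):
        return _Reply(content=f"echo: {text}")

evaluator = Evaluator(pipeline=EchoPipeline(), online=True)
print(evaluator.call_pipeline("hello"))   # "echo: hello"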

evaluate(input_data)

Evaluates the input data using all added metrics.

Parameters:

Name        Type       Description                                           Default
input_data  DataFrame  A pandas DataFrame with at least an 'input' column.   required

Returns:

Type       Description
DataFrame  A DataFrame containing original data along with evaluation metric outputs.

Source code in langbench/benchmarks.py
def evaluate(self, input_data):
    """
    Evaluates the input data using all added metrics.

    Args:
        input_data (DataFrame): A pandas DataFrame with at least an 'input' column.

    Returns:
        DataFrame: A DataFrame containing original data along with evaluation metric outputs.
    """
    data = input_data.copy()
    if self.online:
        self.execute(data)
    for metric in self.metrics:
        metric.run(data)
    self.generate_report(data)
    return data
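
Because evaluate() works on a copy, the caller's frame is left untouched; in online mode the returned copy gains 'output' and 'latency' columns, plus one column per registered metric. A self-contained sketch with a hypothetical pipeline stub:

from types import SimpleNamespace
import pandas as pd
from langbench.benchmarks import Evaluator

class _Echo:
    """Hypothetical stub exposing the invoke(...).content interface."""
    def invoke(self, text):
        return SimpleNamespace(content=f"echo: {text}")

frame = pd.DataFrame({"input": ["hello", "how are you?"]})
evaluator = Evaluator(pipeline=_Echo(), online=True)
results = evaluator.evaluate(frame)

print(frame.columns.tolist())    # ['input'] -- the original is unchanged
print(results.columns.tolist())  # ['input', 'output', 'latency']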

execute(data)

Executes the pipeline on the 'input' column of the data frame and calculates latency.

Parameters:

Name  Type       Description                                         Default
data  DataFrame  A pandas DataFrame containing the 'input' column.   required
Source code in langbench/benchmarks.py
def execute(self, data):
    """
    Executes the pipeline on the 'input' column of the data frame and calculates latency.

    Args:
        data (DataFrame): A pandas DataFrame containing the 'input' column.
    """
    outputs = []
    latencies = []
    with Progress() as progress:
        task = progress.add_task("Calculating latency...", total=len(data["input"]))
        for text in data["input"]:
            start_time = time.time()
            output = self.call_pipeline(text)
            end_time = time.time()
            outputs.append(output)
            latencies.append(end_time - start_time)
            progress.update(task, advance=1)
    data["output"] = outputs
    data["latency"] = latencies

generate_report(data)

Generates an HTML report of evaluation metrics using Plotly box plots. The report is saved as 'report.html' in the current directory.

Parameters:

Name  Type       Description                                   Default
data  DataFrame  The evaluated DataFrame containing metrics.   required

Returns:

Type  Description
str   An HTML string containing the report with embedded Plotly graphs.

Source code in langbench/benchmarks.py
def generate_report(self, data):
    """
    Generates an HTML report of evaluation metrics using Plotly box plots.
    The report is saved as 'report.html' in the current directory.

    Args:
        data (DataFrame): The evaluated DataFrame containing metrics.

    Returns:
        str: An HTML string containing the report with embedded Plotly graphs.
    """
    import plotly.express as px
    import plotly.io as pio

    # Identify metric columns that are not standard columns
    exclude = ["input", "output", "latency"]
    metric_columns = [col for col in data.columns if col not in exclude]

    html_parts = []
    # Use a box plot for latency if it is present
    if "latency" in data.columns:
        fig_latency = px.box(
            data, y="latency", points="all", title="Latency Distribution"
        )
        html_parts.append(
            pio.to_html(fig_latency, full_html=False, include_plotlyjs="cdn")
        )

    # Use a box plot for each metric column
    for col in metric_columns:
        fig = px.box(data, y=col, points="all", title=f"{col} Distribution")
        # Only include plotlyjs once in the report
        include_plotlyjs = None if html_parts else "cdn"
        html_parts.append(
            pio.to_html(fig, full_html=False, include_plotlyjs=include_plotlyjs)
        )

    html_report = "<html><head><title>Benchmark Report</title></head><body>"
    for part in html_parts:
        html_report += part
    html_report += "</body></html>"

    with open("report.html", "w") as f:
        f.write(html_report)

    return html_report
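
generate_report() can also be called directly on any already-annotated frame: it writes report.html to the working directory and returns the same HTML, with one box plot for latency and one per remaining column. A sketch, where 'toxicity' is a hypothetical metric column:

import pandas as pd
from langbench.benchmarks import Evaluator

scored = pd.DataFrame({
    "input": ["a", "b", "c"],
    "output": ["x", "y", "z"],
    "latency": [0.12, 0.34, 0.25],
    "toxicity": [0.01, 0.02, 0.00],   # hypothetical metric scores
})
html = Evaluator().generate_report(scored)   # writes report.html as a side effect
print(len(html))                             # length of the returned HTML string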