Metrics

Module for benchmark metrics.

This module provides metrics (e.g., ToxicityMetric and BiasMetric) for evaluating language outputs.

BiasMetric

Bases: Metric

Source code in langbench/metrics.py
class BiasMetric(Metric):
    def __init__(
        self,
        classes=[
            "physical",
            "socioeconomic",
            "disability",
            "political",
            "gender",
            "sexuality",
            "racial",
            "educational",
            "nationality",
            "age",
            "religious",
        ],
        model_name="maximuspowers/bias-type-classifier",
    ):
        super().__init__("bias")
        self.model_name = model_name
        self.pipeline = download_model(model_name)
        self.classes = classes

    def calculate(self, text) -> float:
        res = self.pipeline(text, top_k=None)
        return res

    def run(self, data) -> None:
        """
        Evaluates bias for each text in data["output"] and adds new columns for each bias dimension in the DataFrame.

        Args:
            data (DataFrame): Data containing an "output" column.

        Returns:
            None
        """
        results = []
        with Progress() as progress:
            task = progress.add_task(
                f"Calculating {self.name}...", total=len(data["output"])
            )
            for text in data["output"]:
                results.append(self.calculate(text))
                progress.update(task, advance=1)
        for i, class_ in enumerate(self.classes):
            data[f"{self.name}_{class_}"] = [res[i]["score"] for res in results]

    def details(self) -> str:
        return "Measures bias."

run(data)

Evaluates bias for each text in data["output"] and adds new columns for each bias dimension in the DataFrame.

Parameters:

Name   Type        Description                           Default
data   DataFrame   Data containing an "output" column.   required

Returns:

Type   Description
None   None

Source code in langbench/metrics.py
def run(self, data) -> None:
    """
    Evaluates bias for each text in data["output"] and adds new columns for each bias dimension in the DataFrame.

    Args:
        data (DataFrame): Data containing an "output" column.

    Returns:
        None
    """
    results = []
    with Progress() as progress:
        task = progress.add_task(
            f"Calculating {self.name}...", total=len(data["output"])
        )
        for text in data["output"]:
            results.append(self.calculate(text))
            progress.update(task, advance=1)
    for i, class_ in enumerate(self.classes):
        data[f"{self.name}_{class_}"] = [res[i]["score"] for res in results]
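
A minimal usage sketch (not part of the library source), assuming a pandas DataFrame with an "output" column; the example strings are hypothetical:

import pandas as pd

from langbench.metrics import BiasMetric

# Hypothetical example data; any DataFrame with an "output" column works.
data = pd.DataFrame(
    {
        "output": [
            "Everyone over 50 struggles with technology.",
            "The committee reviewed the proposal carefully.",
        ]
    }
)

metric = BiasMetric()  # defaults to "maximuspowers/bias-type-classifier"
metric.run(data)       # adds one column per class, e.g. "bias_age", "bias_gender", ...

print(data.filter(like="bias_").round(3))

Because run() writes its results back into the DataFrame, the scores can be inspected or aggregated with ordinary pandas operations afterwards.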

ToxicityMetric

Bases: Metric

Source code in langbench/metrics.py
class ToxicityMetric(Metric):
    def __init__(self, model_name="s-nlp/roberta_toxicity_classifier"):
        super().__init__("toxicity")
        self.model_name = model_name
        self.pipeline = download_model(self.model_name)

    def calculate(self, text) -> float:
        res = self.pipeline(text, top_k=None)
        return res[0]["score"] if res[0]["label"] == "toxic" else res[1]["score"]

    def run(self, data) -> None:
        """
        Evaluates toxicity for each text in data["output"] and adds a new column 'toxicity' to the DataFrame.

        Args:
            data (DataFrame): Data containing an "output" column.

        Returns:
            None
        """
        results = []
        # Use rich progress to iterate through each text entry in the output
        with Progress() as progress:
            task = progress.add_task(
                "Calculating toxicity...", total=len(data["output"])
            )
            for text in data["output"]:
                results.append(self.calculate(text))
                progress.update(task, advance=1)
        data[f"{self.name}"] = results

    def details(self) -> str:
        return "Measures the extent of harmful or offensive language in the output."

run(data)

Evaluates toxicity for each text in data["output"] and adds a new column 'toxicity' to the DataFrame.

Parameters:

Name   Type        Description                           Default
data   DataFrame   Data containing an "output" column.   required

Returns:

Type   Description
None   None

Source code in langbench/metrics.py
def run(self, data) -> None:
    """
    Evaluates toxicity for each text in data["output"] and adds a new column 'toxicity' to the DataFrame.

    Args:
        data (DataFrame): Data containing an "output" column.

    Returns:
        None
    """
    results = []
    # Use rich progress to iterate through each text entry in the output
    with Progress() as progress:
        task = progress.add_task(
            "Calculating toxicity...", total=len(data["output"])
        )
        for text in data["output"]:
            results.append(self.calculate(text))
            progress.update(task, advance=1)
    data[f"{self.name}"] = results
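
A minimal usage sketch (illustrative only, assuming a pandas DataFrame with an "output" column and hypothetical example strings):

import pandas as pd

from langbench.metrics import ToxicityMetric

# Hypothetical example data; the metric only requires an "output" column.
data = pd.DataFrame(
    {
        "output": [
            "You are a wonderful colleague.",
            "Nobody wants your useless opinion.",
        ]
    }
)

metric = ToxicityMetric()  # defaults to "s-nlp/roberta_toxicity_classifier"
metric.run(data)           # adds a single "toxicity" column of scores in [0, 1]

print(data[["output", "toxicity"]])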

download_model(model_name)

Downloads a pretrained model and tokenizer, and creates a Hugging Face text classification pipeline.

Parameters:

Name         Type   Description                                  Default
model_name   str    The name or path of the pretrained model.    required

Returns:

Type       Description
Pipeline   A Hugging Face pipeline for text classification.

Source code in langbench/metrics.py
def download_model(model_name):
    """
    Downloads a pretrained model and tokenizer, and creates a Hugging Face text classification pipeline.

    Args:
        model_name (str): The name or path of the pretrained model.

    Returns:
        Pipeline: A Hugging Face pipeline for text classification.
    """
    total_steps = 3  # Updated to match the number of updates

    progress = Progress()
    progress.start()
    task = progress.add_task("[cyan]Downloading Model...", total=total_steps)

    model = AutoModelForSequenceClassification.from_pretrained(
        pretrained_model_name_or_path=model_name,
    )
    progress.update(task, advance=1)

    tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)
    progress.update(task, advance=1)

    pipeline = hf_pipeline(
        "text-classification",
        model=model,
        tokenizer=tokenizer,
    )
    progress.update(task, advance=1)
    progress.stop()
    return pipeline
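
For reference, a hedged sketch of calling download_model directly; the model name below is the library's default toxicity classifier, and the input sentence is hypothetical:

from langbench.metrics import download_model

# Builds a standard Hugging Face text-classification pipeline for the given model.
pipe = download_model("s-nlp/roberta_toxicity_classifier")

# The returned pipeline can be called directly on strings;
# top_k=None returns a score for every label rather than only the top one.
scores = pipe("This is a friendly sentence.", top_k=None)
print(scores)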