Module for benchmark metrics.
This module provides metrics (e.g. `ToxicityMetric`, `BiasMetric`) for evaluating language model outputs.
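A minimal quick-start sketch (the data below is hypothetical; assumes `pandas` is installed and the package is importable as `langbench`):

```python
import pandas as pd

from langbench.metrics import ToxicityMetric

# Any DataFrame with an "output" column works; this one is illustrative.
data = pd.DataFrame({"output": ["Thanks, that was helpful!", "You are useless."]})

metric = ToxicityMetric()  # downloads s-nlp/roberta_toxicity_classifier on first use
metric.run(data)           # adds a "toxicity" column to data in place
print(data[["output", "toxicity"]])
```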
BiasMetric
Bases: Metric
Source code in langbench/metrics.py
```python
class BiasMetric(Metric):
    def __init__(
        self,
        classes=[
            "physical",
            "socioeconomic",
            "disability",
            "political",
            "gender",
            "sexuality",
            "racial",
            "educational",
            "nationality",
            "age",
            "religious",
        ],
        model_name="maximuspowers/bias-type-classifier",
    ):
        super().__init__("bias")
        self.model_name = model_name
        self.pipeline = download_model(model_name)
        self.classes = classes

    def calculate(self, text) -> float:
        res = self.pipeline(text, top_k=None)
        return res

    def run(self, data) -> None:
        """
        Evaluates bias for each text in data["output"] and adds new columns for each bias dimension in the DataFrame.

        Args:
            data (DataFrame): Data containing an "output" column.

        Returns:
            None
        """
        results = []
        with Progress() as progress:
            task = progress.add_task(
                f"Calculating {self.name}...", total=len(data["output"])
            )
            for text in data["output"]:
                results.append(self.calculate(text))
                progress.update(task, advance=1)
        for i, class_ in enumerate(self.classes):
            data[f"{self.name}_{class_}"] = [res[i]["score"] for res in results]

    def details(self) -> str:
        return "Measures bias."
```
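A minimal usage sketch (the input DataFrame is hypothetical; the resulting column names follow the `bias_<class>` pattern produced by `run`):

```python
import pandas as pd

from langbench.metrics import BiasMetric

data = pd.DataFrame({"output": ["Older employees struggle with new software."]})

metric = BiasMetric()  # downloads maximuspowers/bias-type-classifier
metric.run(data)       # adds bias_physical, bias_socioeconomic, ..., bias_religious

bias_columns = [c for c in data.columns if c.startswith("bias_")]
print(data[bias_columns].round(3))
```

Note that `run` pairs scores with `self.classes` by position (`res[i]["score"]`), so the column-to-class mapping relies on the pipeline returning label scores in that same order.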
run(data)
Evaluates bias for each text in data["output"] and adds new columns for each bias dimension in the DataFrame.
Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `data` | `DataFrame` | Data containing an "output" column. | *required* |
Returns:

| Type | Description |
| --- | --- |
| `None` | The DataFrame is modified in place. |
Source code in langbench/metrics.py
```python
def run(self, data) -> None:
    """
    Evaluates bias for each text in data["output"] and adds new columns for each bias dimension in the DataFrame.

    Args:
        data (DataFrame): Data containing an "output" column.

    Returns:
        None
    """
    results = []
    with Progress() as progress:
        task = progress.add_task(
            f"Calculating {self.name}...", total=len(data["output"])
        )
        for text in data["output"]:
            results.append(self.calculate(text))
            progress.update(task, advance=1)
    for i, class_ in enumerate(self.classes):
        data[f"{self.name}_{class_}"] = [res[i]["score"] for res in results]
```
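Building on the columns `run` adds, a short illustrative reduction picks the highest-scoring bias dimension per row (the `bias_top_class` column name is made up for this sketch; `str.removeprefix` requires pandas >= 1.4):

```python
# Assumes BiasMetric().run(data) has already been called on `data`.
bias_columns = [c for c in data.columns if c.startswith("bias_")]

# Take the column with the highest score per row and strip the "bias_" prefix.
data["bias_top_class"] = data[bias_columns].idxmax(axis=1).str.removeprefix("bias_")
print(data[["output", "bias_top_class"]])
```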
ToxicityMetric
Bases: Metric
Source code in langbench/metrics.py
```python
class ToxicityMetric(Metric):
    def __init__(self, model_name="s-nlp/roberta_toxicity_classifier"):
        super().__init__("toxicity")
        self.model_name = model_name
        self.pipeline = download_model(self.model_name)

    def calculate(self, text) -> float:
        res = self.pipeline(text, top_k=None)
        return res[0]["score"] if res[0]["label"] == "toxic" else res[1]["score"]

    def run(self, data) -> None:
        """
        Evaluates toxicity for each text in data["output"] and adds a new column 'toxicity' to the DataFrame.

        Args:
            data (DataFrame): Data containing an "output" column.

        Returns:
            None
        """
        results = []
        # Use rich progress to iterate through each text entry in the output
        with Progress() as progress:
            task = progress.add_task(
                "Calculating toxicity...", total=len(data["output"])
            )
            for text in data["output"]:
                results.append(self.calculate(text))
                progress.update(task, advance=1)
        data[f"{self.name}"] = results

    def details(self) -> str:
        return "Measures the extent of harmful or offensive language in the output."
```
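`calculate` always returns the score of the `toxic` label: with exactly two labels, whichever entry is not at index 0 sits at index 1, so the `else` branch still picks the `toxic` score. A single-string sketch (the printed value is illustrative):

```python
from langbench.metrics import ToxicityMetric

metric = ToxicityMetric()

# Returns P(toxic) for a single string.
score = metric.calculate("Have a great day!")
print(f"toxicity: {score:.4f}")  # expected to be near 0.0 for benign text
```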
run(data)
Evaluates toxicity for each text in data["output"] and adds a new column 'toxicity' to the DataFrame.
Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `data` | `DataFrame` | Data containing an "output" column. | *required* |
Returns:

| Type | Description |
| --- | --- |
| `None` | The DataFrame is modified in place. |
Source code in langbench/metrics.py
```python
def run(self, data) -> None:
    """
    Evaluates toxicity for each text in data["output"] and adds a new column 'toxicity' to the DataFrame.

    Args:
        data (DataFrame): Data containing an "output" column.

    Returns:
        None
    """
    results = []
    # Use rich progress to iterate through each text entry in the output
    with Progress() as progress:
        task = progress.add_task(
            "Calculating toxicity...", total=len(data["output"])
        )
        for text in data["output"]:
            results.append(self.calculate(text))
            progress.update(task, advance=1)
    data[f"{self.name}"] = results
```
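Both metrics share the same `run(data)` contract (read `data["output"]`, append columns in place), so they compose over one DataFrame; a sketch:

```python
import pandas as pd

from langbench.metrics import BiasMetric, ToxicityMetric

data = pd.DataFrame({"output": ["An example model answer."]})

# Each constructor downloads its model; each run() appends its own column(s).
for metric in (ToxicityMetric(), BiasMetric()):
    metric.run(data)

print(data.columns.tolist())  # ["output", "toxicity", "bias_physical", ...]
```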
download_model(model_name)
Downloads a pretrained model and tokenizer, and creates a Hugging Face text classification pipeline.
Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `model_name` | `str` | The name or path of the pretrained model. | *required* |

Returns:

| Type | Description |
| --- | --- |
| `Pipeline` | A Hugging Face pipeline for text classification. |
Source code in langbench/metrics.py
```python
def download_model(model_name):
    """
    Downloads a pretrained model and tokenizer, and creates a Hugging Face text classification pipeline.

    Args:
        model_name (str): The name or path of the pretrained model.

    Returns:
        Pipeline: A Hugging Face pipeline for text classification.
    """
    total_steps = 3  # One step per progress.update call below
    progress = Progress()
    progress.start()
    task = progress.add_task("[cyan]Downloading Model...", total=total_steps)
    model = AutoModelForSequenceClassification.from_pretrained(
        pretrained_model_name_or_path=model_name,
    )
    progress.update(task, advance=1)
    tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)
    progress.update(task, advance=1)
    pipeline = hf_pipeline(
        "text-classification",
        model=model,
        tokenizer=tokenizer,
    )
    progress.update(task, advance=1)
    progress.stop()
    return pipeline
```
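Since `download_model` returns a plain Hugging Face text-classification pipeline, it can also be used directly; a sketch (the printed scores are illustrative, and the label names are those of this particular checkpoint):

```python
from langbench.metrics import download_model

pipe = download_model("s-nlp/roberta_toxicity_classifier")

# top_k=None returns a score for every label, not just the top one.
print(pipe("This is fine.", top_k=None))
# e.g. [{'label': 'neutral', 'score': 0.998...}, {'label': 'toxic', 'score': 0.001...}]
```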