Skip to content

Commit 5541f3e

Browse files
committed
Merge branch 'main' into support-ov-models-via-genai
2 parents 423abcd + a4987bb commit 5541f3e

File tree

182 files changed

+1745
-527
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

182 files changed

+1745
-527
lines changed

CODEOWNERS

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
* @haileyschoelkopf @lintangsutawika
1+
* @haileyschoelkopf @lintangsutawika @baberabb

docs/interface.md

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,11 @@ This mode supports a number of command-line arguments, the details of which can
4646

4747
- `--system_instruction`: Specifies a system instruction string to prepend to the prompt.
4848

49-
- `--apply_chat_template` : If this flag is on, a chat template will be applied to the prompt. For Hugging Face models, the chat template is taken from the tokenizer, if the tokenizer does not have a chat template, a default one will be applied. For other models, chat templating is not currently implemented.
49+
- `--apply_chat_template` : This flag specifies whether to apply a chat template to the prompt. It can be used in the following ways:
50+
- `--apply_chat_template` : When used without an argument, applies the only available chat template to the prompt. For Hugging Face models, if no dedicated chat template exists, the default chat template will be applied.
51+
- `--apply_chat_template template_name` : If the model has multiple chat templates, apply the specified template to the prompt.
52+
53+
For Hugging Face models, the default chat template can be found in the [`default_chat_template`](https://github.com/huggingface/transformers/blob/fc35907f95459d7a6c5281dfadd680b6f7b620e3/src/transformers/tokenization_utils_base.py#L1912) property of the Transformers Tokenizer.
5054

5155
- `--fewshot_as_multiturn` : If this flag is on, the Fewshot examples are treated as a multi-turn conversation. Questions are provided as user content and answers are provided as assistant responses. Requires `--num_fewshot` to be set to be greater than 0, and `--apply_chat_template` to be on.
5256

docs/model_guide.md

Lines changed: 37 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -118,17 +118,45 @@ class MyCustomLM(LM):
118118
#...
119119
@property
120120
def tokenizer_name(self) -> str:
121-
# should return a string denoting the name of the model's tokenizer and/or the accompanying chat template.
122-
123-
@property
124-
def chat_template(self) -> str:
125-
# should return a chat template formatting string that is used to build prompt from a user/assistant chat history.
126-
# this will be saved in the evaluation results for reproducibility.
121+
"""
122+
Return the name of the model's tokenizer and/or the accompanying chat template.
123+
The returned string is used to cache requests.
124+
125+
Returns:
126+
str: The name of the model's tokenizer and/or chat template.
127+
"""
128+
129+
def chat_template(self, chat_template: Union[bool, str] = False) -> str:
130+
"""
131+
Get the appropriate chat template for the model based on the `chat_template` argument.
132+
133+
This method returns the chat template string to build the prompt from a chat history.
134+
The chat template is saved in the evaluation results for reproducibility.
135+
Boolean arguments should be used with models that have only one chat template,
136+
while string arguments are used with models that have multiple chat templates.
137+
For the reference implementation, see HFLM class in `lm_eval.models.huggingface`.
138+
139+
Args:
140+
chat_template (Union[bool, str]): Specifies whether to apply a chat template:
141+
- If False: Do not apply any chat template.
142+
- If True: Apply the default chat template.
143+
- If str: Apply the specified chat template by name.
144+
145+
Returns:
146+
str: The selected chat template in Jinja format.
147+
"""
127148

128149
def apply_chat_template(self, chat_history: List[Dict[str, str]]) -> str:
129-
# responsible for taking as input a chat history that would be fed into the model, and
130-
# rendering it as a string that can be then tokenized and input into the model.
131-
#...
150+
"""
151+
Process a chat history to create a string that can be tokenized and input into the model.
152+
153+
Args:
154+
chat_history (List[Dict[str, str]]): A list of dictionaries representing the chat history,
155+
where each dictionary has "role" and "content" keys.
156+
157+
Returns:
158+
str: A string representing the chat history that can be tokenized and fed into the model.
159+
"""
132160
```
133161

134162
- `apply_chat_template`

lm_eval/__main__.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -170,9 +170,16 @@ def setup_parser() -> argparse.ArgumentParser:
170170
)
171171
parser.add_argument(
172172
"--apply_chat_template",
173-
action="store_true",
173+
type=str,
174+
nargs="?",
175+
const=True,
174176
default=False,
175-
help="If True, applies the chat template to the prompt",
177+
help=(
178+
"If True, apply chat template to the prompt. "
179+
"Providing `--apply_chat_template` without an argument will apply the default chat template to the prompt. "
180+
"To apply a specific template from the available list of templates, provide the template name as an argument. "
181+
"E.g. `--apply_chat_template template_name`"
182+
),
176183
)
177184
parser.add_argument(
178185
"--fewshot_as_multiturn",
@@ -289,7 +296,7 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
289296

290297
if args.fewshot_as_multiturn and args.apply_chat_template is False:
291298
raise ValueError(
292-
"If fewshot_as_multiturn is set, apply_chat_template must be set to True."
299+
"When `fewshot_as_multiturn` is selected, `apply_chat_template` must be set (either to `True` or to the chosen template name)."
293300
)
294301

295302
if (

lm_eval/evaluator.py

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ def simple_evaluate(
6464
log_samples: bool = True,
6565
evaluation_tracker: Optional[EvaluationTracker] = None,
6666
system_instruction: Optional[str] = None,
67-
apply_chat_template: bool = False,
67+
apply_chat_template: Union[bool, str] = False,
6868
fewshot_as_multiturn: bool = False,
6969
gen_kwargs: Optional[str] = None,
7070
task_manager: Optional[TaskManager] = None,
@@ -112,8 +112,11 @@ def simple_evaluate(
112112
If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis
113113
:param system_instruction: str
114114
System instruction to be applied to the prompt
115-
:param apply_chat_template: bool
116-
If True, apply chat template to the prompt
115+
:param apply_chat_template: Union[bool, str]
116+
Specifies whether to apply a chat template to the prompt.
117+
- If set to True, the default chat template is applied.
118+
- If set to a string, applies the specified chat template by name.
119+
Defaults to False (no chat template applied).
117120
:param fewshot_as_multiturn: bool
118121
Whether to provide the fewshot examples as a multiturn conversation or a single user turn.
119122
:param gen_kwargs: str
@@ -289,7 +292,7 @@ def _adjust_config(task_dict):
289292
model_source=model,
290293
model_args=model_args,
291294
system_instruction=system_instruction,
292-
chat_template=lm.chat_template if apply_chat_template else None,
295+
chat_template=lm.chat_template(apply_chat_template),
293296
fewshot_as_multiturn=fewshot_as_multiturn,
294297
)
295298

@@ -362,7 +365,7 @@ def evaluate(
362365
write_out: bool = False,
363366
log_samples: bool = True,
364367
system_instruction: Optional[str] = None,
365-
apply_chat_template: bool = False,
368+
apply_chat_template: Union[bool, str] = False,
366369
fewshot_as_multiturn: bool = False,
367370
verbosity: str = "INFO",
368371
):
@@ -382,8 +385,11 @@ def evaluate(
382385
If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis
383386
:param system_instruction: str
384387
System instruction to be applied to the prompt
385-
:param apply_chat_template: bool
386-
If True, apply chat template to the prompt
388+
:param apply_chat_template: Union[bool, str]
389+
Specifies whether to apply a chat template to the prompt.
390+
- If set to True, the default chat template is applied.
391+
- If set to a string, applies the specified chat template by name.
392+
Defaults to False (no chat template applied).
387393
:param fewshot_as_multiturn: bool
388394
Whether to provide the fewshot examples as a multiturn conversation or a single user turn.
389395
:return
@@ -416,7 +422,7 @@ def evaluate(
416422
cache_requests=cache_requests,
417423
rewrite_requests_cache=rewrite_requests_cache,
418424
system_instruction=system_instruction,
419-
apply_chat_template=apply_chat_template,
425+
apply_chat_template=bool(apply_chat_template),
420426
fewshot_as_multiturn=fewshot_as_multiturn,
421427
chat_template=getattr(lm, "apply_chat_template")
422428
if apply_chat_template

lm_eval/models/huggingface.py

Lines changed: 91 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -438,11 +438,97 @@ def world_size(self):
438438
def tokenizer_name(self) -> str:
439439
return self.tokenizer.name_or_path.replace("/", "__")
440440

441-
@property
442-
def chat_template(self) -> str:
443-
if self.tokenizer.chat_template is not None:
444-
return self.tokenizer.chat_template
445-
return self.tokenizer.default_chat_template
441+
def chat_template(self, chat_template: Union[bool, str] = False) -> Optional[str]:
442+
"""
443+
Get the appropriate chat template for the model based on configuration and input.
444+
This method determines and returns the correct chat template, ensuring reproducibility.
445+
446+
The template selection logic is adapted from the Transformers library's `apply_chat_template`
447+
method in the Tokenizer class. The original implementation can be found at:
448+
https://github.com/huggingface/transformers/blob/fc35907f95459d7a6c5281dfadd680b6f7b620e3/src/transformers/tokenization_utils_base.py#L1687
449+
450+
This method ensures that the right template is chosen based on the following:
451+
1. If the model's tokenizer has multiple templates:
452+
a. Use the specified template if it exists in the dictionary.
453+
b. Use the default template from the list if no specific template is provided.
454+
c. Raise an error if no default template exists and no specific template is provided.
455+
2. If the model's tokenizer has a single template or no template:
456+
a. Use the tokenizer's chat template if available.
457+
b. Fall back to the default chat template if no tokenizer chat template exists.
458+
459+
Args:
460+
chat_template (Union[bool, str]): Specifies the chat template to use.
461+
- If False or None, no template is applied.
462+
- If True, the default or only available template is used.
463+
- If a string, the template with the matching name is used.
464+
465+
Returns:
466+
Optional[str]: The selected chat template, or None if no template is applied.
467+
"""
468+
if chat_template is False or chat_template is None:
469+
eval_logger.warning(
470+
"model.chat_template was called with the chat_template set to False or None. "
471+
"Therefore no chat template will be applied. Make sure this is an intended behavior."
472+
)
473+
return None
474+
475+
# Convert boolean chat_template to None to ensure compatibility with the adapted logic
476+
if isinstance(chat_template, bool):
477+
chat_template = None
478+
using_default_template = False
479+
480+
# First, handle the cases when the model has a dict of multiple templates
481+
template = self.tokenizer.chat_template or self.tokenizer.default_chat_template
482+
483+
if isinstance(template, dict):
484+
using_default_dict = self.tokenizer.chat_template is None
485+
486+
if chat_template is not None:
487+
if chat_template in template:
488+
selected_template = template[chat_template]
489+
if using_default_dict:
490+
using_default_template = True
491+
else:
492+
raise ValueError(
493+
f"The specified chat template '{chat_template}' is not available. "
494+
f"Available template names are {sorted(template.keys())}."
495+
)
496+
else:
497+
# If user didn't pass a chat template, use the default template from the dict
498+
if "default" in template:
499+
selected_template = template["default"]
500+
using_default_template = True
501+
else:
502+
raise ValueError(
503+
"This model has multiple chat templates with no default specified! Please either pass a chat "
504+
"template or the name of the template you wish to use to the `chat_template` argument. Available "
505+
f"template names are {sorted(template.keys())}."
506+
)
507+
508+
# Cases when the model has a single template or no template
509+
else:
510+
# priority: `chat_template` argument > `tokenizer.chat_template` > `tokenizer.default_chat_template`
511+
if isinstance(chat_template, str):
512+
eval_logger.warning(
513+
"Chat template name provided, but the tokenizer's chat template is not a dictionary. "
514+
"Using the tokenizer's chat template or the default template instead."
515+
)
516+
if self.tokenizer.chat_template is not None:
517+
selected_template = self.tokenizer.chat_template
518+
else:
519+
selected_template = self.tokenizer.default_chat_template
520+
using_default_template = True
521+
522+
if using_default_template:
523+
eval_logger.warning(
524+
"No chat template is set for this tokenizer, falling back to a default class-level template. This is "
525+
"very error-prone, because models are often trained with templates different from the class default! "
526+
"Default chat templates are a legacy feature and will be removed in Transformers v4.43, at which "
527+
"point any code depending on them will stop working. We recommend setting a valid chat template before "
528+
"then to ensure that this model continues working without issues."
529+
)
530+
531+
return selected_template
446532

447533
def _get_backend(
448534
self,

lm_eval/tasks/README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@
5858
| [lambada_multilingual](lambada_multilingual/README.md) | Multilingual LAMBADA dataset. This is a legacy version of the multilingual dataset, and users should instead use `lambada_multilingual_stablelm`. | German, English, Spanish, French, Italian |
5959
| [lambada_multilingual_stablelm](lambada_multilingual_stablelm/README.md) | Multilingual LAMBADA dataset. Users should prefer evaluating on this version of the multilingual dataset instead of on `lambada_multilingual`. | German, English, Spanish, French, Italian, Dutch, Portuguese |
6060
| [leaderboard](leaderboard/README.md) | Task group used by Hugging Face's [Open LLM Leaderboard v2](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard). Those tasks are static and will not change through time | English |
61+
| [lingoly](lingoly/README.md) | Challenging logical reasoning benchmark in low-resource languages with controls for memorization | English, Multilingual |
6162
| [logiqa](logiqa/README.md) | Logical reasoning tasks requiring advanced inference and deduction. | English, Chinese |
6263
| [logiqa2](logiqa2/README.md) | Large-scale logical reasoning dataset adapted from the Chinese Civil Service Examination. | English, Chinese |
6364
| [mathqa](mathqa/README.md) | Question answering tasks involving mathematical reasoning and problem-solving. | English |

lm_eval/tasks/belebele/_belebele.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -130,4 +130,4 @@ aggregate_metric_list:
130130
metric: acc_norm
131131
weight_by_size: true
132132
metadata:
133-
version: 0.0
133+
version: 0.1
Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
"fewshot_split": "acm_Arab"
2-
"include": "_default_template_yaml"
3-
"task": "belebele_acm_Arab"
4-
"test_split": "acm_Arab"
1+
dataset_name: acm_Arab
2+
fewshot_split: test
3+
include: _default_template_yaml
4+
task: belebele_acm_Arab
5+
test_split: test
Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
"fewshot_split": "afr_Latn"
2-
"include": "_default_template_yaml"
3-
"task": "belebele_afr_Latn"
4-
"test_split": "afr_Latn"
1+
dataset_name: afr_Latn
2+
fewshot_split: test
3+
include: _default_template_yaml
4+
task: belebele_afr_Latn
5+
test_split: test
Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
"fewshot_split": "als_Latn"
2-
"include": "_default_template_yaml"
3-
"task": "belebele_als_Latn"
4-
"test_split": "als_Latn"
1+
dataset_name: als_Latn
2+
fewshot_split: test
3+
include: _default_template_yaml
4+
task: belebele_als_Latn
5+
test_split: test
Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
"fewshot_split": "amh_Ethi"
2-
"include": "_default_template_yaml"
3-
"task": "belebele_amh_Ethi"
4-
"test_split": "amh_Ethi"
1+
dataset_name: amh_Ethi
2+
fewshot_split: test
3+
include: _default_template_yaml
4+
task: belebele_amh_Ethi
5+
test_split: test
Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
"fewshot_split": "apc_Arab"
2-
"include": "_default_template_yaml"
3-
"task": "belebele_apc_Arab"
4-
"test_split": "apc_Arab"
1+
dataset_name: apc_Arab
2+
fewshot_split: test
3+
include: _default_template_yaml
4+
task: belebele_apc_Arab
5+
test_split: test
Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
"fewshot_split": "arb_Arab"
2-
"include": "_default_template_yaml"
3-
"task": "belebele_arb_Arab"
4-
"test_split": "arb_Arab"
1+
dataset_name: arb_Arab
2+
fewshot_split: test
3+
include: _default_template_yaml
4+
task: belebele_arb_Arab
5+
test_split: test
Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
"fewshot_split": "arb_Latn"
2-
"include": "_default_template_yaml"
3-
"task": "belebele_arb_Latn"
4-
"test_split": "arb_Latn"
1+
dataset_name: arb_Latn
2+
fewshot_split: test
3+
include: _default_template_yaml
4+
task: belebele_arb_Latn
5+
test_split: test
Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
"fewshot_split": "ars_Arab"
2-
"include": "_default_template_yaml"
3-
"task": "belebele_ars_Arab"
4-
"test_split": "ars_Arab"
1+
dataset_name: ars_Arab
2+
fewshot_split: test
3+
include: _default_template_yaml
4+
task: belebele_ars_Arab
5+
test_split: test
Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
"fewshot_split": "ary_Arab"
2-
"include": "_default_template_yaml"
3-
"task": "belebele_ary_Arab"
4-
"test_split": "ary_Arab"
1+
dataset_name: ary_Arab
2+
fewshot_split: test
3+
include: _default_template_yaml
4+
task: belebele_ary_Arab
5+
test_split: test
Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
"fewshot_split": "arz_Arab"
2-
"include": "_default_template_yaml"
3-
"task": "belebele_arz_Arab"
4-
"test_split": "arz_Arab"
1+
dataset_name: arz_Arab
2+
fewshot_split: test
3+
include: _default_template_yaml
4+
task: belebele_arz_Arab
5+
test_split: test
Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
"fewshot_split": "asm_Beng"
2-
"include": "_default_template_yaml"
3-
"task": "belebele_asm_Beng"
4-
"test_split": "asm_Beng"
1+
dataset_name: asm_Beng
2+
fewshot_split: test
3+
include: _default_template_yaml
4+
task: belebele_asm_Beng
5+
test_split: test
Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
"fewshot_split": "azj_Latn"
2-
"include": "_default_template_yaml"
3-
"task": "belebele_azj_Latn"
4-
"test_split": "azj_Latn"
1+
dataset_name: azj_Latn
2+
fewshot_split: test
3+
include: _default_template_yaml
4+
task: belebele_azj_Latn
5+
test_split: test

0 commit comments

Comments
 (0)