Skip to content

Commit 5541f3e

Browse files
committed
Merge branch 'main' into support-ov-models-via-genai
2 parents 423abcd + a4987bb commit 5541f3e

File tree

182 files changed

+1745
-527
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

182 files changed

+1745
-527
lines changed

CODEOWNERS

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
* @haileyschoelkopf @lintangsutawika
1+
* @haileyschoelkopf @lintangsutawika @baberabb

docs/interface.md

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,11 @@ This mode supports a number of command-line arguments, the details of which can
4646

4747
- `--system_instruction`: Specifies a system instruction string to prepend to the prompt.
4848

49-
- `--apply_chat_template` : If this flag is on, a chat template will be applied to the prompt. For Hugging Face models, the chat template is taken from the tokenizer, if the tokenizer does not have a chat template, a default one will be applied. For other models, chat templating is not currently implemented.
49+
- `--apply_chat_template` : This flag specifies whether to apply a chat template to the prompt. It can be used in the following ways:
50+
- `--apply_chat_template` : When used without an argument, applies the only available chat template to the prompt. For Hugging Face models, if no dedicated chat template exists, the default chat template will be applied.
51+
- `--apply_chat_template template_name` : If the model has multiple chat templates, apply the specified template to the prompt.
52+
53+
For Hugging Face models, the default chat template can be found in the [`default_chat_template`](https://github.com/huggingface/transformers/blob/fc35907f95459d7a6c5281dfadd680b6f7b620e3/src/transformers/tokenization_utils_base.py#L1912) property of the Transformers Tokenizer.
5054

5155
- `--fewshot_as_multiturn` : If this flag is on, the Fewshot examples are treated as a multi-turn conversation. Questions are provided as user content and answers are provided as assistant responses. Requires `--num_fewshot` to be set to be greater than 0, and `--apply_chat_template` to be on.
5256

docs/model_guide.md

Lines changed: 37 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -118,17 +118,45 @@ class MyCustomLM(LM):
118118
#...
119119
@property
120120
def tokenizer_name(self) -> str:
121-
# should return a string denoting the name of the model's tokenizer and/or the accompanying chat template.
122-
123-
@property
124-
def chat_template(self) -> str:
125-
# should return a chat template formatting string that is used to build prompt from a user/assistant chat history.
126-
# this will be saved in the evaluation results for reproducibility.
121+
"""
122+
Return the name of the model's tokenizer and/or the accompanying chat template.
123+
The returned string is used to cache requests.
124+
125+
Returns:
126+
str: The name of the model's tokenizer and/or chat template.
127+
"""
128+
129+
def chat_template(self, chat_template: Union[bool, str] = False) -> str:
130+
"""
131+
Get the appropriate chat template for the model based on the `chat_template` argument.
132+
133+
This method returns the chat template string to build the prompt from a chat history.
134+
The chat template is saved in the evaluation results for reproducibility.
135+
Boolean arguments should be used with models that have only one chat template,
136+
while string arguments are used with models that have multiple chat templates.
137+
For the reference implementation, see HFLM class in `lm_eval.models.huggingface`.
138+
139+
Args:
140+
chat_template (Union[bool, str]): Specifies whether to apply a chat template:
141+
- If False: Do not apply any chat template.
142+
- If True: Apply the default chat template.
143+
- If str: Apply the specified chat template by name.
144+
145+
Returns:
146+
str: The selected chat template in Jinja format.
147+
"""
127148

128149
def apply_chat_template(self, chat_history: List[Dict[str, str]]) -> str:
129-
# responsible for taking as input a chat history that would be fed into the model, and
130-
# rendering it as a string that can be then tokenized and input into the model.
131-
#...
150+
"""
151+
Process a chat history to create a string that can be tokenized and input into the model.
152+
153+
Args:
154+
chat_history (List[Dict[str, str]]): A list of dictionaries representing the chat history,
155+
where each dictionary has "role" and "content" keys.
156+
157+
Returns:
158+
str: A string representing the chat history that can be tokenized and fed into the model.
159+
"""
132160
```
133161

134162
- `apply_chat_template`

lm_eval/__main__.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -170,9 +170,16 @@ def setup_parser() -> argparse.ArgumentParser:
170170
)
171171
parser.add_argument(
172172
"--apply_chat_template",
173-
action="store_true",
173+
type=str,
174+
nargs="?",
175+
const=True,
174176
default=False,
175-
help="If True, applies the chat template to the prompt",
177+
help=(
178+
"If True, apply chat template to the prompt. "
179+
"Providing `--apply_chat_template` without an argument will apply the default chat template to the prompt. "
180+
"To apply a specific template from the available list of templates, provide the template name as an argument. "
181+
"E.g. `--apply_chat_template template_name`"
182+
),
176183
)
177184
parser.add_argument(
178185
"--fewshot_as_multiturn",
@@ -289,7 +296,7 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
289296

290297
if args.fewshot_as_multiturn and args.apply_chat_template is False:
291298
raise ValueError(
292-
"If fewshot_as_multiturn is set, apply_chat_template must be set to True."
299+
"When `fewshot_as_multiturn` is selected, `apply_chat_template` must be set (either to `True` or to the chosen template name)."
293300
)
294301

295302
if (

lm_eval/evaluator.py

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ def simple_evaluate(
6464
log_samples: bool = True,
6565
evaluation_tracker: Optional[EvaluationTracker] = None,
6666
system_instruction: Optional[str] = None,
67-
apply_chat_template: bool = False,
67+
apply_chat_template: Union[bool, str] = False,
6868
fewshot_as_multiturn: bool = False,
6969
gen_kwargs: Optional[str] = None,
7070
task_manager: Optional[TaskManager] = None,
@@ -112,8 +112,11 @@ def simple_evaluate(
112112
If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis
113113
:param system_instruction: str
114114
System instruction to be applied to the prompt
115-
:param apply_chat_template: bool
116-
If True, apply chat template to the prompt
115+
:param apply_chat_template: Union[bool, str]
116+
Specifies whether to apply a chat template to the prompt.
117+
- If set to True, the default chat template is applied.
118+
- If set to a string, applies the specified chat template by name.
119+
Defaults to False (no chat template applied).
117120
:param fewshot_as_multiturn: bool
118121
Whether to provide the fewshot examples as a multiturn conversation or a single user turn.
119122
:param gen_kwargs: str
@@ -289,7 +292,7 @@ def _adjust_config(task_dict):
289292
model_source=model,
290293
model_args=model_args,
291294
system_instruction=system_instruction,
292-
chat_template=lm.chat_template if apply_chat_template else None,
295+
chat_template=lm.chat_template(apply_chat_template),
293296
fewshot_as_multiturn=fewshot_as_multiturn,
294297
)
295298

@@ -362,7 +365,7 @@ def evaluate(
362365
write_out: bool = False,
363366
log_samples: bool = True,
364367
system_instruction: Optional[str] = None,
365-
apply_chat_template: bool = False,
368+
apply_chat_template: Union[bool, str] = False,
366369
fewshot_as_multiturn: bool = False,
367370
verbosity: str = "INFO",
368371
):
@@ -382,8 +385,11 @@ def evaluate(
382385
If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis
383386
:param system_instruction: str
384387
System instruction to be applied to the prompt
385-
:param apply_chat_template: bool
386-
If True, apply chat template to the prompt
388+
:param apply_chat_template: Union[bool, str]
389+
Specifies whether to apply a chat template to the prompt.
390+
- If set to True, the default chat template is applied.
391+
- If set to a string, applies the specified chat template by name.
392+
Defaults to False (no chat template applied).
387393
:param fewshot_as_multiturn: bool
388394
Whether to provide the fewshot examples as a multiturn conversation or a single user turn.
389395
:return
@@ -416,7 +422,7 @@ def evaluate(
416422
cache_requests=cache_requests,
417423
rewrite_requests_cache=rewrite_requests_cache,
418424
system_instruction=system_instruction,
419-
apply_chat_template=apply_chat_template,
425+
apply_chat_template=bool(apply_chat_template),
420426
fewshot_as_multiturn=fewshot_as_multiturn,
421427
chat_template=getattr(lm, "apply_chat_template")
422428
if apply_chat_template

lm_eval/models/huggingface.py

Lines changed: 91 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -438,11 +438,97 @@ def world_size(self):
438438
def tokenizer_name(self) -> str:
439439
return self.tokenizer.name_or_path.replace("/", "__")
440440

441-
@property
442-
def chat_template(self) -> str:
443-
if self.tokenizer.chat_template is not None:
444-
return self.tokenizer.chat_template
445-
return self.tokenizer.default_chat_template
441+
def chat_template(self, chat_template: Union[bool, str] = False) -> Optional[str]:
442+
"""
443+
Get the appropriate chat template for the model based on configuration and input.
444+
This method determines and returns the correct chat template, ensuring reproducibility.
445+
446+
The template selection logic is adapted from the Transformers library's `apply_chat_template`
447+
method in the Tokenizer class. The original implementation can be found at:
448+
https://github.com/huggingface/transformers/blob/fc35907f95459d7a6c5281dfadd680b6f7b620e3/src/transformers/tokenization_utils_base.py#L1687
449+
450+
This method ensures that the right template is chosen based on the following:
451+
1. If the model's tokenizer has multiple templates:
452+
a. Use the specified template if it exists in the dictionary.
453+
b. Use the default template from the list if no specific template is provided.
454+
c. Raise an error if no default template exists and no specific template is provided.
455+
2. If the model's tokenizer has a single template or no template:
456+
a. Use the tokenizer's chat template if available.
457+
b. Fall back to the default chat template if no tokenizer chat template exists.
458+
459+
Args:
460+
chat_template (Union[bool, str]): Specifies the chat template to use.
461+
- If False or None, no template is applied.
462+
- If True, the default or only available template is used.
463+
- If a string, the template with the matching name is used.
464+
465+
Returns:
466+
Optional[str]: The selected chat template, or None if no template is applied.
467+
"""
468+
if chat_template is False or chat_template is None:
469+
eval_logger.warning(
470+
"model.chat_template was called with the chat_template set to False or None. "
471+
"Therefore no chat template will be applied. Make sure this is an intended behavior."
472+
)
473+
return None
474+
475+
# Convert boolean chat_template to None to ensure compatibility with the adapted logic
476+
if isinstance(chat_template, bool):
477+
chat_template = None
478+
using_default_template = False
479+
480+
# First, handle the cases when the model has a dict of multiple templates
481+
template = self.tokenizer.chat_template or self.tokenizer.default_chat_template
482+
483+
if isinstance(template, dict):
484+
using_default_dict = self.tokenizer.chat_template is None
485+
486+
if chat_template is not None:
487+
if chat_template in template:
488+
selected_template = template[chat_template]
489+
if using_default_dict:
490+
using_default_template = True
491+
else:
492+
raise ValueError(
493+
f"The specified chat template '{chat_template}' is not available. "
494+
f"Available template names are {sorted(template.keys())}."
495+
)
496+
else:
497+
# If user didn't pass a chat template, use the default template from the dict
498+
if "default" in template:
499+
selected_template = template["default"]
500+
using_default_template = True
501+
else:
502+
raise ValueError(
503+
"This model has multiple chat templates with no default specified! Please either pass a chat "
504+
"template or the name of the template you wish to use to the `chat_template` argument. Available "
505+
f"template names are {sorted(template.keys())}."
506+
)
507+
508+
# Cases when the model has a single template or no template
509+
else:
510+
# priority: `chat_template` argument > `tokenizer.chat_template` > `tokenizer.default_chat_template`
511+
if isinstance(chat_template, str):
512+
eval_logger.warning(
513+
"Chat template name provided, but the tokenizer's chat template is not a dictionary. "
514+
"Using the tokenizer's chat template or the default template instead."
515+
)
516+
if self.tokenizer.chat_template is not None:
517+
selected_template = self.tokenizer.chat_template
518+
else:
519+
selected_template = self.tokenizer.default_chat_template
520+
using_default_template = True
521+
522+
if using_default_template:
523+
eval_logger.warning(
524+
"No chat template is set for this tokenizer, falling back to a default class-level template. This is "
525+
"very error-prone, because models are often trained with templates different from the class default! "
526+
"Default chat templates are a legacy feature and will be removed in Transformers v4.43, at which "
527+
"point any code depending on them will stop working. We recommend setting a valid chat template before "
528+
"then to ensure that this model continues working without issues."
529+
)
530+
531+
return selected_template
446532

447533
def _get_backend(
448534
self,

lm_eval/tasks/README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@
5858
| [lambada_multilingual](lambada_multilingual/README.md) | Multilingual LAMBADA dataset. This is a legacy version of the multilingual dataset, and users should instead use `lambada_multilingual_stablelm`. | German, English, Spanish, French, Italian |
5959
| [lambada_multilingual_stablelm](lambada_multilingual_stablelm/README.md) | Multilingual LAMBADA dataset. Users should prefer evaluating on this version of the multilingual dataset instead of on `lambada_multilingual`. | German, English, Spanish, French, Italian, Dutch, Portuguese |
6060
| [leaderboard](leaderboard/README.md) | Task group used by Hugging Face's [Open LLM Leaderboard v2](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard). Those tasks are static and will not change through time | English |
61+
| [lingoly](lingoly/README.md) | Challenging logical reasoning benchmark in low-resource languages with controls for memorization | English, Multilingual |
6162
| [logiqa](logiqa/README.md) | Logical reasoning tasks requiring advanced inference and deduction. | English, Chinese |
6263
| [logiqa2](logiqa2/README.md) | Large-scale logical reasoning dataset adapted from the Chinese Civil Service Examination. | English, Chinese |
6364
| [mathqa](mathqa/README.md) | Question answering tasks involving mathematical reasoning and problem-solving. | English |

lm_eval/tasks/belebele/_belebele.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -130,4 +130,4 @@ aggregate_metric_list:
130130
metric: acc_norm
131131
weight_by_size: true
132132
metadata:
133-
version: 0.0
133+
version: 0.1
Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
"fewshot_split": "acm_Arab"
2-
"include": "_default_template_yaml"
3-
"task": "belebele_acm_Arab"
4-
"test_split": "acm_Arab"
1+
dataset_name: acm_Arab
2+
fewshot_split: test
3+
include: _default_template_yaml
4+
task: belebele_acm_Arab
5+
test_split: test
Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
"fewshot_split": "afr_Latn"
2-
"include": "_default_template_yaml"
3-
"task": "belebele_afr_Latn"
4-
"test_split": "afr_Latn"
1+
dataset_name: afr_Latn
2+
fewshot_split: test
3+
include: _default_template_yaml
4+
task: belebele_afr_Latn
5+
test_split: test
Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
"fewshot_split": "als_Latn"
2-
"include": "_default_template_yaml"
3-
"task": "belebele_als_Latn"
4-
"test_split": "als_Latn"
1+
dataset_name: als_Latn
2+
fewshot_split: test
3+
include: _default_template_yaml
4+
task: belebele_als_Latn
5+
test_split: test
Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
"fewshot_split": "amh_Ethi"
2-
"include": "_default_template_yaml"
3-
"task": "belebele_amh_Ethi"
4-
"test_split": "amh_Ethi"
1+
dataset_name: amh_Ethi
2+
fewshot_split: test
3+
include: _default_template_yaml
4+
task: belebele_amh_Ethi
5+
test_split: test
Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
"fewshot_split": "apc_Arab"
2-
"include": "_default_template_yaml"
3-
"task": "belebele_apc_Arab"
4-
"test_split": "apc_Arab"
1+
dataset_name: apc_Arab
2+
fewshot_split: test
3+
include: _default_template_yaml
4+
task: belebele_apc_Arab
5+
test_split: test
Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
"fewshot_split": "arb_Arab"
2-
"include": "_default_template_yaml"
3-
"task": "belebele_arb_Arab"
4-
"test_split": "arb_Arab"
1+
dataset_name: arb_Arab
2+
fewshot_split: test
3+
include: _default_template_yaml
4+
task: belebele_arb_Arab
5+
test_split: test
Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
"fewshot_split": "arb_Latn"
2-
"include": "_default_template_yaml"
3-
"task": "belebele_arb_Latn"
4-
"test_split": "arb_Latn"
1+
dataset_name: arb_Latn
2+
fewshot_split: test
3+
include: _default_template_yaml
4+
task: belebele_arb_Latn
5+
test_split: test
Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
"fewshot_split": "ars_Arab"
2-
"include": "_default_template_yaml"
3-
"task": "belebele_ars_Arab"
4-
"test_split": "ars_Arab"
1+
dataset_name: ars_Arab
2+
fewshot_split: test
3+
include: _default_template_yaml
4+
task: belebele_ars_Arab
5+
test_split: test
Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
"fewshot_split": "ary_Arab"
2-
"include": "_default_template_yaml"
3-
"task": "belebele_ary_Arab"
4-
"test_split": "ary_Arab"
1+
dataset_name: ary_Arab
2+
fewshot_split: test
3+
include: _default_template_yaml
4+
task: belebele_ary_Arab
5+
test_split: test
Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
"fewshot_split": "arz_Arab"
2-
"include": "_default_template_yaml"
3-
"task": "belebele_arz_Arab"
4-
"test_split": "arz_Arab"
1+
dataset_name: arz_Arab
2+
fewshot_split: test
3+
include: _default_template_yaml
4+
task: belebele_arz_Arab
5+
test_split: test
Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
"fewshot_split": "asm_Beng"
2-
"include": "_default_template_yaml"
3-
"task": "belebele_asm_Beng"
4-
"test_split": "asm_Beng"
1+
dataset_name: asm_Beng
2+
fewshot_split: test
3+
include: _default_template_yaml
4+
task: belebele_asm_Beng
5+
test_split: test
Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
"fewshot_split": "azj_Latn"
2-
"include": "_default_template_yaml"
3-
"task": "belebele_azj_Latn"
4-
"test_split": "azj_Latn"
1+
dataset_name: azj_Latn
2+
fewshot_split: test
3+
include: _default_template_yaml
4+
task: belebele_azj_Latn
5+
test_split: test

0 commit comments

Comments
 (0)