
Commit 47cff55

Merge branch 'main' into support-ov-models-via-genai
2 parents: dc60929 + 928e8bb

30 files changed: +278 -88 lines

lm_eval/api/model.py

Lines changed: 5 additions & 2 deletions
@@ -283,8 +283,11 @@ def fn(requests):
             eval_logger.info(
                 f"Cached requests: {len(requests) - len(remaining_reqs)}, Requests remaining: {len(remaining_reqs)}"
             )
-            # actually run the LM on the requests that do not have cached results
-            rem_res = getattr(self.lm, attr)(remaining_reqs)
+            if remaining_reqs:
+                # actually run the LM on the requests that do not have cached results
+                rem_res = getattr(self.lm, attr)(remaining_reqs)
+            else:
+                rem_res = []
 
             # stick the new ones back into the list and also cache any of the new ones
             resptr = 0
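
For context, a minimal sketch of the behavior this guard introduces (illustrative names and a stub model, not the harness's exact code): when every request is already cached, the wrapped model is no longer called at all.

```python
# Illustrative sketch (assumed names) of the cache-miss dispatch above.
class _StubLM:
    def loglikelihood(self, reqs):
        print(f"model called with {len(reqs)} requests")
        return [(-1.0, True) for _ in reqs]

lm = _StubLM()
cache = {"req-a": (-0.5, True), "req-b": (-0.9, False)}  # pretend both results are cached
requests = ["req-a", "req-b"]

remaining_reqs = [r for r in requests if r not in cache]
if remaining_reqs:
    rem_res = lm.loglikelihood(remaining_reqs)  # only query the model for cache misses
else:
    rem_res = []  # fully cached: previously the model was still called with an empty list
```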

lm_eval/api/samplers.py

Lines changed: 2 additions & 1 deletion
@@ -82,8 +82,9 @@ def get_context(self, doc, num_fewshot):
                 if self.config.doc_to_choice is None or isinstance(doc_content, str)
                 else self.doc_to_choice(doc)[doc_content]
             )
-            labeled_examples += self.target_delimiter
+
             if doc_target != "":
+                labeled_examples += self.target_delimiter
                 labeled_examples += (
                     str(doc_target[0])
                     if isinstance(doc_target, list)

lm_eval/evaluator.py

Lines changed: 10 additions & 2 deletions
@@ -208,7 +208,9 @@ def simple_evaluate(
         )
     else:
         if not isinstance(model, lm_eval.api.model.LM):
-            raise TypeError
+            raise TypeError(
+                f"The value of `model` passed to simple_evaluate() was of type {type(model)}, but is required to be a subclass of lm_eval.api.model.LM . This may be because you are passing an initialized Hugging Face PreTrainedModel without having wrapped it in `lm_eval.models.huggingface.HFLM(pretrained=my_model)` first."
+            )
         eval_logger.info("Using pre-initialized model")
         lm = model
 

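The new error message points at the expected fix; here is a rough sketch of that wrapping (the model checkpoint and task name are placeholders, not part of this commit):

```python
# Sketch: wrapping an already-initialized Hugging Face model before calling simple_evaluate().
import lm_eval
from lm_eval.models.huggingface import HFLM
from transformers import AutoModelForCausalLM, AutoTokenizer

my_model = AutoModelForCausalLM.from_pretrained("gpt2")  # placeholder checkpoint
my_tokenizer = AutoTokenizer.from_pretrained("gpt2")

lm = HFLM(pretrained=my_model, tokenizer=my_tokenizer)   # an lm_eval.api.model.LM subclass
results = lm_eval.simple_evaluate(model=lm, tasks=["lambada_openai"])  # example task
```
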
@@ -287,12 +289,18 @@ def _adjust_config(task_dict):
     if check_integrity:
         run_task_tests(task_list=tasks)
 
+    # hotfix: delete when chat_template fixed
+    try:
+        chat = lm.chat_template(apply_chat_template)
+    except:  # noqa: E722
+        chat = None
+
     if evaluation_tracker is not None:
         evaluation_tracker.general_config_tracker.log_experiment_args(
             model_source=model,
             model_args=model_args,
             system_instruction=system_instruction,
-            chat_template=lm.chat_template(apply_chat_template),
+            chat_template=chat,
             fewshot_as_multiturn=fewshot_as_multiturn,
         )
 

lm_eval/models/api_models.py

Lines changed: 10 additions & 3 deletions
@@ -104,7 +104,9 @@ def __init__(
         self._truncate = truncate
         self._max_gen_toks = int(max_gen_toks)
         self._seed = int(seed)
-        self.max_length = max_length
+        # max_length - 1 as we always have 1 token for generation
+        eval_logger.info(f"Using max length {max_length} - 1")
+        self.max_length = max_length - 1
         if int(num_concurrent) <= 1:
             eval_logger.info(
                 "Concurrent requests are disabled. To enable concurrent requests, set `num_concurrent` > 1."

@@ -417,6 +419,7 @@ def batch_logliklehood_requests(
         cache_keys = []
         for chunk in chunks:
             for cache_key, context_enc, continuation_enc in chunk:
+                # max_length - 1 as we always have 1 token for generation
                 inp = (context_enc + continuation_enc)[-(self.max_length) :]
                 ctxlen = len(context_enc) - max(
                     0, len(context_enc) + len(continuation_enc) - (self.max_length)

@@ -510,7 +513,7 @@ def _collate(req: LogLikelihoodInputs):
         ):
             if answer_ is not None:
                 res.append(answer_)
-                # partial caching
+                # cache requests that aren't from a loglikelihood_rolling request
                 if cache_key is not None:
                     self.cache_hook.add_partial(
                         "loglikelihood", cache_key, answer_

@@ -619,7 +622,8 @@ def loglikelihood_rolling(
                     utils.get_rolling_token_windows(
                         token_list=self.tok_encode(string),
                         prefix_token=self.prefix_token_id,
-                        max_seq_len=self.max_length,
+                        # max_seq_len - (1 for context)
+                        max_seq_len=self.max_length - 1,
                         context_len=1,
                     ),
                 )

@@ -638,4 +642,7 @@ def loglikelihood_rolling(
 
             string_nll = sum(string_nll)
             loglikelihoods.append(string_nll)
+
+            # cache this loglikelihood_rolling request
+            self.cache_hook.add_partial("loglikelihood_rolling", (string,), string_nll)
         return loglikelihoods
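
The caching comments above capture a convention this commit applies across several backends; here is a rough, self-contained sketch of it (stubbed cache hook and made-up values, not the harness's actual CacheHook implementation):

```python
# Rough sketch of the caching convention: per-window scores issued on behalf of
# loglikelihood_rolling carry cache_key=None and are skipped, while the rolling
# request itself is cached once per document under its own hook name.
class StubCacheHook:
    def __init__(self):
        self.store = {}

    def add_partial(self, attr, req, res):
        self.store[(attr, repr(req))] = res

cache_hook = StubCacheHook()

scored = [(("ctx", "cont"), (-0.7, True)),  # ordinary loglikelihood request
          (None, (-1.3, False))]            # window belonging to a rolling request
for cache_key, answer in scored:
    if cache_key is not None:
        cache_hook.add_partial("loglikelihood", cache_key, answer)

# the rolling request is cached once, keyed by the raw string, after summing its windows
string, string_nll = "some long document ...", -42.0
cache_hook.add_partial("loglikelihood_rolling", (string,), string_nll)
```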

lm_eval/models/huggingface.py

Lines changed: 11 additions & 2 deletions
@@ -688,10 +688,10 @@ def _create_model(
                 raise AssertionError("load_in_4bit requires peft >= 0.4.0")
             if self._model.config.vocab_size != len(self.tokenizer):
                 # resize model for LoRAs with added tokens
-                self._model.resize_token_embeddings(len(self.tokenizer))
                 eval_logger.info(
                     f"Model config indicates vocab_size='{self._model.config.vocab_size}', but found tokenizer with vocab size '{len(self.tokenizer)}'. Resizing model embedding layer..."
                 )
+                self._model.resize_token_embeddings(len(self.tokenizer))
             self._model = PeftModel.from_pretrained(
                 self._model, peft, revision=revision
             )

@@ -1018,6 +1018,9 @@ def loglikelihood_rolling(
             string_nll = sum(string_nll)
             loglikelihoods.append(string_nll)
 
+            # cache this loglikelihood_rolling request
+            self.cache_hook.add_partial("loglikelihood_rolling", (string,), string_nll)
+
         return loglikelihoods
 
     def _batch_scheduler(self, pos, n_reordered_requests):

@@ -1246,7 +1249,13 @@ def _lookup_one_token_cont(req: Tuple[Tuple[str, str], List[int], List[int]]):
 
                 res.append(answer)
 
-                self.cache_hook.add_partial("loglikelihood", request_str, answer)
+                if request_str is not None:
+                    # special case: loglikelihood_rolling produces a number of loglikelihood requests
+                    # all with cache key None. instead do add_partial on the per-example level
+                    # in the loglikelihood_rolling() function for those.
+                    self.cache_hook.add_partial(
+                        "loglikelihood", request_str, answer
+                    )
                 pbar.update(1)
 
         pbar.close()

lm_eval/models/nemo_lm.py

Lines changed: 6 additions & 0 deletions
@@ -386,6 +386,9 @@ def loglikelihood_rolling(
 
             string_nll = sum(string_nll)
             loglikelihoods.append(string_nll)
+
+            # cache this loglikelihood_rolling request
+            self.cache_hook.add_partial("loglikelihood_rolling", (string,), string_nll)
         return loglikelihoods
 
     def _loglikelihood_tokens(self, requests, disable_tqdm=False):

@@ -468,6 +471,9 @@ def _collate(x):
             answer = (logprob, is_greedy)
 
             if cache_key is not None:
+                # special case: loglikelihood_rolling produces a number of loglikelihood requests
+                # all with cache key None. instead do add_partial on the per-example level
+                # in the loglikelihood_rolling() function for those.
                 self.cache_hook.add_partial("loglikelihood", cache_key, answer)
 
             res.append(answer)

lm_eval/models/neuralmagic.py

Lines changed: 3 additions & 0 deletions
@@ -321,6 +321,9 @@ def _collate(x):
             res.append(answer)
 
             if cache_key is not None:
+                # special case: loglikelihood_rolling produces a number of loglikelihood requests
+                # all with cache key None. instead do add_partial on the per-example level
+                # in the loglikelihood_rolling() function for those.
                 self.cache_hook.add_partial("loglikelihood", cache_key, answer)
 
         return re_ord.get_original(res)

lm_eval/models/neuron_optimum.py

Lines changed: 7 additions & 2 deletions
@@ -502,7 +502,8 @@ def loglikelihood_rolling(self, requests, disable_tqdm: bool = False):
 
             string_nll = sum(string_nll)
             loglikelihoods.append(string_nll)
-
+            # cache this loglikelihood_rolling request
+            self.cache_hook.add_partial("loglikelihood_rolling", (string,), string_nll)
         return loglikelihoods
 
     def _loglikelihood_tokens(

@@ -620,7 +621,11 @@ def _collate(x):
 
             res.append(answer)
 
-            self.cache_hook.add_partial("loglikelihood", cache_key, answer)
+            if cache_key is not None:
+                # special case: loglikelihood_rolling produces a number of loglikelihood requests
+                # all with cache key None. instead do add_partial on the per-example level
+                # in the loglikelihood_rolling() function for those.
+                self.cache_hook.add_partial("loglikelihood", cache_key, answer)
 
         return re_ord.get_original(res)
 

lm_eval/models/vllm_causallms.py

Lines changed: 9 additions & 2 deletions
@@ -289,7 +289,8 @@ def loglikelihood_rolling(
                     make_disjoint_window,
                     get_rolling_token_windows(
                         token_list=self.tok_encode(string),
-                        prefix_token=self.eot_token_id,
+                        prefix_token=self.prefix_token_id,
+                        # max_seq_len - (1 for context)
                         max_seq_len=self.max_length - 1,
                         context_len=1,
                     ),

@@ -307,6 +308,10 @@ def loglikelihood_rolling(
 
             string_nll = sum(string_nll)
             loglikelihoods.append(string_nll)
+
+            # cache this loglikelihood_rolling request
+            self.cache_hook.add_partial("loglikelihood_rolling", (string,), string_nll)
+
         return loglikelihoods
 
     def generate_until(

@@ -453,8 +458,10 @@ def _collate(x):
 
             res.append(answer)
 
-            # partial caching
             if cache_key is not None:
+                # special case: loglikelihood_rolling produces a number of loglikelihood requests
+                # all with cache key None. instead do add_partial on the per-example level
+                # in the loglikelihood_rolling() function for those.
                 self.cache_hook.add_partial("loglikelihood", cache_key, answer)
             pbar.update(1)
         pbar.close()

lm_eval/tasks/README.md

Lines changed: 1 addition & 1 deletion
@@ -69,7 +69,7 @@
 | [mgsm](mgsm/README.md) | Benchmark of multilingual grade-school math problems. | Spanish, French, German, Russian, Chinese, Japanese, Thai, Swahili, Bengali, Telugu |
 | [minerva_math](minerva_math/README.md) | Mathematics-focused tasks requiring numerical reasoning and problem-solving skills. | English |
 | mmlu | Massive Multitask Language Understanding benchmark for broad domain language evaluation. Several variants are supported. | English |
-| [mmlusr](mmlusr/README.md) | Variation of MMLU designed to be more rigourous. | English |
+| [mmlusr](mmlusr/README.md) | Variation of MMLU designed to be more rigorous. | English |
 | model_written_evals | Evaluation tasks auto-generated for evaluating a collection of AI Safety concerns. | |
 | [mutual](mutual/README.md) | A retrieval-based dataset for multi-turn dialogue reasoning. | English |
 | [nq_open](nq_open/README.md) | Open domain question answering tasks based on the Natural Questions dataset. | English |

lm_eval/tasks/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -492,7 +492,7 @@ def _get_task_and_group(self, task_dir: str):
                     "`group` and `group_alias` keys in tasks' configs will no longer be used in the next release of lm-eval. "
                     "`tag` will be used to allow to call a collection of tasks just like `group`. "
                     "`group` will be removed in order to not cause confusion with the new ConfigurableGroup "
-                    "which will be the offical way to create groups with addition of group-wide configuations."
+                    "which will be the official way to create groups with addition of group-wide configurations."
                 )
                 print_info = False
                 # attr = "tag"

lm_eval/tasks/aclue/README.md

Lines changed: 1 addition & 1 deletion
@@ -14,7 +14,7 @@ Homepage: https://github.com/isen-zhang/ACLUE
 
 ```bibtex
 @inproceedings{zhang-li-2023-large,
-    title = "Can Large Langauge Model Comprehend {A}ncient {C}hinese? A Preliminary Test on {ACLUE}",
+    title = "Can Large Language Model Comprehend {A}ncient {C}hinese? A Preliminary Test on {ACLUE}",
     author = "Zhang, Yixuan and Li, Haonan",
     booktitle = "Proceedings of the Ancient Language Processing Workshop",
     month = sep,

lm_eval/tasks/asdiv/README.md

Lines changed: 5 additions & 0 deletions
@@ -41,6 +41,11 @@ Homepage: https://github.com/chaochun/nlu-asdiv-dataset
 #### Tasks
 
 * `asdiv`
+* `asdiv_cot_llama`: ASDIV with prompt formatting modified to conform to the evaluation settings described by Meta here: https://huggingface.co/datasets/meta-llama/Meta-Llama-3.1-8B-Instruct-evals/viewer/Meta-Llama-3.1-8B-Instruct-evals__gsm8k__details?row=0
+  - Note that the CoT prompt from (https://arxiv.org/pdf/2201.11903) is used exactly as in GSM8k-CoT
+  - This file is setup to run identically to the task `gsm8k_cot_llama` but for asdiv.
+  - Use this task with --fewshot_as_multiturn and --apply_chat_template to run correctly with Llama Instruct models.
+
 
 ### Checklist
 

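A rough sketch of running the new task through the Python API, with the two options the README note mentions mapped to their keyword arguments (the model checkpoint is a placeholder, not part of this commit):

```python
# Sketch: evaluating asdiv_cot_llama with chat templating and multiturn few-shot,
# as the README suggests for Llama Instruct models. The pretrained path is a placeholder.
import lm_eval

results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=meta-llama/Meta-Llama-3.1-8B-Instruct",
    tasks=["asdiv_cot_llama"],
    apply_chat_template=True,   # --apply_chat_template on the CLI
    fewshot_as_multiturn=True,  # --fewshot_as_multiturn on the CLI
)
print(results["results"]["asdiv_cot_llama"])
```
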
Lines changed: 88 additions & 0 deletions
@@ -0,0 +1,88 @@
+dataset_path: EleutherAI/asdiv
+doc_to_target: "{{answer.split(' (')[0] if answer is defined else target}}"
+doc_to_text: "Given the following problem, reason and give a final answer to the problem.\nProblem: {{body if body is defined}} {{question}}\nYour response should end with \"The final answer is [answer]\" where [answer] is the response to the problem.\n"
+fewshot_config:
+  sampler: first_n
+  samples:
+  - question: There are 15 trees in the grove. Grove workers will plant trees in the
+      grove today. After they are done, there will be 21 trees. How many trees did
+      the grove workers plant today?
+    target: There are 15 trees originally. Then there were 21 trees after some more
+      were planted. So there must have been 21 - 15 = 6. The final answer is 6
+  - question: If there are 3 cars in the parking lot and 2 more cars arrive, how many
+      cars are in the parking lot?
+    target: There are originally 3 cars. 2 more cars arrive. 3 + 2 = 5. The final answer
+      is 5
+  - question: Leah had 32 chocolates and her sister had 42. If they ate 35, how many
+      pieces do they have left in total?
+    target: Originally, Leah had 32 chocolates. Her sister had 42. So in total they
+      had 32 + 42 = 74. After eating 35, they had 74 - 35 = 39. The final answer is 39
+  - question: Jason had 20 lollipops. He gave Denny some lollipops. Now Jason has 12
+      lollipops. How many lollipops did Jason give to Denny?
+    target: Jason started with 20 lollipops. Then he had 12 after giving some to Denny.
+      So he gave Denny 20 - 12 = 8. The final answer is 8
+  - question: Shawn has five toys. For Christmas, he got two toys each from his mom and
+      dad. How many toys does he have now?
+    target: Shawn started with 5 toys. If he got 2 toys each from his mom and dad,
+      then that is 4 more toys. 5 + 4 = 9. The final answer is 9
+  - question: There were nine computers in the server room. Five more computers were
+      installed each day, from monday to thursday. How many computers are now in the
+      server room?
+    target: There were originally 9 computers. For each of 4 days, 5 more computers
+      were added. So 5 * 4 = 20 computers were added. 9 + 20 is 29. The final answer is
+      29
+  - question: Michael had 58 golf balls. On tuesday, he lost 23 golf balls. On wednesday,
+      he lost 2 more. How many golf balls did he have at the end of wednesday?
+    target: Michael started with 58 golf balls. After losing 23 on tuesday, he had
+      58 - 23 = 35. After losing 2 more, he had 35 - 2 = 33 golf balls. The final answer
+      is 33
+  - question: Olivia has $23. She bought five bagels for $3 each. How much money does
+      she have left?
+    target: Olivia had 23 dollars. 5 bagels for 3 dollars each will be 5 x 3 = 15
+      dollars. So she has 23 - 15 dollars left. 23 - 15 is 8. The final answer is 8
+filter_list:
+- filter:
+  - function: regex
+    group_select: -1
+    regex_pattern: The final answer is ((-?[$0-9.,]{2,})|(-?[0-9]+))
+  - function: take_first
+  name: strict-match
+- filter:
+  - function: regex
+    group_select: -1
+    regex_pattern: (-?[$0-9.,]{2,})|(-?[0-9]+)
+  - function: take_first
+  name: flexible-extract
+generation_kwargs:
+  do_sample: false
+  until:
+  - '<|eot_id|>'
+  - '<|start_header_id|>user<|end_header_id|>'
+  - 'Q:'
+  - </s>
+  - <|im_end|>
+tag:
+- chain_of_thought
+metadata:
+  version: 1.0
+metric_list:
+- aggregation: mean
+  higher_is_better: true
+  ignore_case: true
+  ignore_punctuation: false
+  metric: exact_match
+  regexes_to_ignore:
+  - ','
+  - \$
+  - '(?s).*#### '
+  - \.$
+num_fewshot: 8
+output_type: generate_until
+repeats: 1
+task: asdiv_cot_llama
+validation_split: validation
+test_split: validation
+should_decontaminate: true
+doc_to_decontamination_query: "{{body}} {{question}}"
+dataset_kwargs:
+  trust_remote_code: true
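
For readers unfamiliar with the filter configuration, a hedged illustration of what the strict-match filter above extracts (plain `re`, not the harness's actual filter classes; `group_select: -1` corresponds to keeping the last match):

```python
import re

# Illustration only: extract the final answer the way the strict-match filter is configured to.
STRICT = r"The final answer is ((-?[$0-9.,]{2,})|(-?[0-9]+))"
completion = (
    "There are 15 trees originally. Then there were 21 trees after some more were "
    "planted. So there must have been 21 - 15 = 6. The final answer is 6"
)

matches = re.findall(STRICT, completion)  # one tuple of capture groups per match
answer = matches[-1][0] if matches else "[invalid]"  # last match, outer capture group
print(answer)  # -> 6
```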

lm_eval/tasks/eq_bench/README.md

Lines changed: 2 additions & 2 deletions
@@ -16,8 +16,8 @@ Homepage: https://eqbench.com/
 NOTE: There are some key differences between the lm-evaluation-harness version and the implementation described in the EQ-Bench paper (These have been OK'd by the author):
 
 - The lm-eval version uses the EQ-Bench v2 test set (171 questions) and score calculation. It does not incorporate the revision part of the prompt, as per v2.1 (https://github.com/EQ-bench/EQ-Bench)
-- No retries in lm-eval version (EQ-Bench pipeline retries with successively higher temps if it encounters unparseable answers)
-- In the original implementation, unparseable answers are excluded from the final score, and 83% of answers have to be parseable or a fail is returned. The lm-eval version instead assigns 0 to unparsable answers and has no fail criteria. So for lower performing models, there may be differences with the EQ-Bench leaderboard.
+- No retries in lm-eval version (EQ-Bench pipeline retries with successively higher temps if it encounters unparsable answers)
+- In the original implementation, unparsable answers are excluded from the final score, and 83% of answers have to be parseable or a fail is returned. The lm-eval version instead assigns 0 to unparsable answers and has no fail criteria. So for lower performing models, there may be differences with the EQ-Bench leaderboard.
 
 
 ### Citation

lm_eval/tasks/ifeval/ifeval.yaml

Lines changed: 1 addition & 1 deletion
@@ -26,4 +26,4 @@ metric_list:
     aggregation: !function utils.agg_inst_level_acc
     higher_is_better: true
 metadata:
-  version: 3.0
+  version: 4.0
