add arc_challenge

EleutherAI · Dec 10, 2024 · 29ac037 · 29ac037
1 parent f44f2c5
commit 29ac037
Show file tree

Hide file tree

Showing 3 changed files with 79 additions and 0 deletions.
diff --git a/lm_eval/tasks/llama3/README.md b/lm_eval/tasks/llama3/README.md
@@ -0,0 +1,44 @@
+# llama3
+
+### Paper
+
+Title: ``
+
+Abstract: ``
+
+
+Homepage: ``
+
+
+### Citation
+
+```
+
+```
+
+### Groups, Tags, and Tasks
+
+#### Groups
+
+
+
+#### Subgroups
+
+
+#### Tasks
+
+* `llama_arc_challenge`: 25-shot multiple-choice ARC challenge.
+* `mgsm_chat`: 0-shot mgsm benchmark. Use with chat-template.
+
+### Checklist
+
+For adding novel benchmarks/datasets to the library:
+* [x] Is the task an existing benchmark in the literature?
+  * [x] Have you referenced the original paper that introduced the task?
+  * [x] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
+
+
+If other tasks on this dataset are already supported:
+* [x] Is the "Main" variant of this task clearly denoted?
+* [x] Have you provided a short sentence in a README on what each new variant adds / evaluates?
+* [x] Have you noted which, if any, published evaluation setups are matched by this variant?
diff --git a/lm_eval/tasks/llama3/base/arc_challenge.yaml b/lm_eval/tasks/llama3/base/arc_challenge.yaml
@@ -0,0 +1,20 @@
+tag:
+  - llama3
+task: llama_arc_challenge
+dataset_path: meta-llama/Llama-3.1-8B-evals
+dataset_name: Llama-3.1-8B-evals__arc_challenge__details
+output_type: multiple_choice
+test_split: latest
+process_docs: !function utils.process_docs  # utils.py defines process_docs; it adds the three fields templated below
+doc_to_text: "{{doc_to_text}}"
+doc_to_target: "{{doc_to_target}}"
+doc_to_choice: "{{doc_to_choice}}"
+metric_list:
+  - metric: acc
+    aggregation: mean
+    higher_is_better: true
+  - metric: acc_norm
+    aggregation: mean
+    higher_is_better: true
+metadata:
+  version: 1.0
diff --git a/lm_eval/tasks/llama3/base/utils.py b/lm_eval/tasks/llama3/base/utils.py
@@ -0,0 +1,15 @@
+import datasets
+
+
+def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
+    COLUMNS = dataset.column_names
+
+    def map_(doc):
+        doc["doc_to_text"] = doc["input_final_prompts"][0].strip()[:-2].strip()
+        doc["doc_to_choice"] = [
+            x.replace("Answer:", "").strip() for x in doc["output_choice_completions"]
+        ]
+        doc["doc_to_target"] = doc["input_correct_responses"][0].strip()[-1]
+        return doc
+
+    return dataset.map(map_, remove_columns=COLUMNS)