EleutherAI · baberabb · Jan 20, 2025 · Nov 18, 2024 · Dec 4, 2024 · Jan 20, 2025
@@ -13,46 +13,48 @@
 eval_logger = logging.getLogger("lm-eval")
 
 
-SUBJECTS = {'Islamic Studies': 'humanities',
- 'Driving Test': 'other',
- 'Natural Science (Middle School)': 'stem',
- 'Natural Science (Primary School)': 'stem',
- 'History (Primary School)': 'humanities',
- 'History (Middle School)': 'humanities',
- 'History (High School)': 'humanities',
- 'General Knowledge': 'other',
- 'General Knowledge (Primary School)': 'other',
- 'General Knowledge (Middle School)': 'other',
- 'Law (Professional)': 'humanities',
- 'Physics (High School)': 'stem',
- 'Social Science (Middle School)': 'social_science',
- 'Social Science (Primary School)': 'social_science',
- 'Management (University)': 'other',
- 'Arabic Language (Primary School)': 'language',
- 'Arabic Language (Middle School)': 'language',
- 'Arabic Language (High School)': 'language',
- 'Political Science (University)': 'social_science',
- 'Philosophy (High School)': 'humanities',
- 'Accounting (University)': 'social_science',
- 'Computer Science (University)': 'stem',
- 'Computer Science (Middle School)': 'stem',
- 'Computer Science (Primary School)': 'stem',
- 'Computer Science (High School)': 'stem',
- 'Geography (Primary School)': 'social_science',
- 'Geography (Middle School)': 'social_science',
- 'Geography (High School)': 'social_science',
- 'Math (Primary School)': 'stem',
- 'Biology (High School)': 'stem',
- 'Economics (University)': 'social_science',
- 'Economics (Middle School)': 'social_science',
- 'Economics (High School)': 'social_science',
- 'Arabic Language (General)': 'language',
- 'Arabic Language (Grammar)': 'language',
- 'Islamic Studies (High School)': 'humanities',
- 'Islamic Studies (Middle School)': 'humanities',
- 'Islamic Studies (Primary School)': 'humanities',
- 'Civics (Middle School)': 'social_science',
- 'Civics (High School)': 'social_science'}
+SUBJECTS = {
+    "Islamic Studies": "humanities",
+    "Driving Test": "other",
+    "Natural Science (Middle School)": "stem",
+    "Natural Science (Primary School)": "stem",
+    "History (Primary School)": "humanities",
+    "History (Middle School)": "humanities",
+    "History (High School)": "humanities",
+    "General Knowledge": "other",
+    "General Knowledge (Primary School)": "other",
+    "General Knowledge (Middle School)": "other",
+    "Law (Professional)": "humanities",
+    "Physics (High School)": "stem",
+    "Social Science (Middle School)": "social_science",
+    "Social Science (Primary School)": "social_science",
+    "Management (University)": "other",
+    "Arabic Language (Primary School)": "language",
+    "Arabic Language (Middle School)": "language",
+    "Arabic Language (High School)": "language",
+    "Political Science (University)": "social_science",
+    "Philosophy (High School)": "humanities",
+    "Accounting (University)": "social_science",
+    "Computer Science (University)": "stem",
+    "Computer Science (Middle School)": "stem",
+    "Computer Science (Primary School)": "stem",
+    "Computer Science (High School)": "stem",
+    "Geography (Primary School)": "social_science",
+    "Geography (Middle School)": "social_science",
+    "Geography (High School)": "social_science",
+    "Math (Primary School)": "stem",
+    "Biology (High School)": "stem",
+    "Economics (University)": "social_science",
+    "Economics (Middle School)": "social_science",
+    "Economics (High School)": "social_science",
+    "Arabic Language (General)": "language",
+    "Arabic Language (Grammar)": "language",
+    "Islamic Studies (High School)": "humanities",
+    "Islamic Studies (Middle School)": "humanities",
+    "Islamic Studies (Primary School)": "humanities",
+    "Civics (Middle School)": "social_science",
+    "Civics (High School)": "social_science",
+}
 
 
 def parse_args():

@@ -1,3 +1,4 @@
+# noqa
 """
 Take in a YAML, and output all "other" splits with this YAML
 """

@@ -14,7 +14,21 @@ metric_list:
   - metric: exact_match
     aggregation: mean
     higher_is_better: true
+    ignore_punctuation: true
+    ignore_case: true
+filter_list:
+  - name: get_response
+    filter:
+      # Keep only the text before the first line break
+      - function: "regex"
+        regex_pattern: "^(.*?)(?=\\n|$)"
+      # Remove leading whitespace
+      - function: remove_whitespace
+      # Strip trailing whitespace, including line breaks
+      - function: "regex"
+        regex_pattern: "^(.*?)\\s*$"
+      - function: take_first
 metadata:
-  version: 2.0
+  version: 3.0
 dataset_kwargs:
   trust_remote_code: true
@@ -5,29 +5,29 @@ task:
     task:
       - mmlu_stem_generative
     aggregate_metric_list:
-      - metric: acc
-        weight_by_size: True
+      - metric: exact_match
+        weight_by_size: true
   - group: other
     task:
       - mmlu_other_generative
     aggregate_metric_list:
-      - metric: acc
-        weight_by_size: True
+      - metric: exact_match
+        weight_by_size: true
   - group: social sciences
     task:
       - mmlu_social_sciences_generative
     aggregate_metric_list:
-      - metric: acc
-        weight_by_size: True
+      - metric: exact_match
+        weight_by_size: true
   - group: humanities
     task:
       - mmlu_humanities_generative
     aggregate_metric_list:
-      - metric: acc
-        weight_by_size: True
+      - metric: exact_match
+        weight_by_size: true
 aggregate_metric_list:
   - aggregation: mean
     metric: exact_match
-    weight_by_size: True
+    weight_by_size: true
 metadata:
-  version: 2
+  version: 3
-Original file line number
+Diff line change
@@ -1,3 +1,4 @@
+    # noqa
     """
     Take in a YAML, and output all "other" splits with this YAML
     """
@@ Expand Down @@