Commit c05af4d

Merge pull request #143 from danmcp/aggfix: Remove task logic with lm_eval 0.4.4 for agg_score

2 parents 40cc370 + b896d0a

File tree: 1 file changed (+31 −64)
src/instructlab/eval/mmlu.py
@@ -122,6 +122,37 @@ def __init__(
         self.batch_size = batch_size
         self.device = device
 
+    def run(self, server_url: str | None = None) -> tuple:
+        """
+        Runs evaluation
+
+        Attributes
+            server_url          Model server endpoint (Ex: http://localhost:8000/v1) for the model being evaluated
+
+        Returns:
+            overall_score       Average score for the task group
+            individual_scores   Individual scores for each task in the task group
+        """
+        logger.debug(locals())
+
+        # TODO: make this a parameter for class?
+        os.environ["TOKENIZERS_PARALLELISM"] = "true"
+
+        individual_scores: dict = {}
+        agg_score: float = 0.0
+
+        results = self._run_mmlu(server_url)
+        for task, result in results.items():
+            agg_score += float(result["acc,none"])
+            individual_scores[task] = {
+                "score": float(result["acc,none"]),
+                "stderr": float(result["acc_stderr,none"]),
+            }
+
+        overall_score = float(agg_score / len(self.tasks))
+
+        return overall_score, individual_scores
+
     def _run_mmlu(self, server_url: str | None = None) -> dict:
         if server_url is not None:
             # Requires lm_eval >= 0.4.4
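The added run method now lives once on the shared AbstractMMLUEvaluator base, so MMLUEvaluator and MMLUBranchEvaluator aggregate results identically. A minimal usage sketch, assuming MMLUEvaluator is importable from instructlab.eval.mmlu; the model path and server URL below are hypothetical placeholders, and the remaining constructor arguments visible in the diff (tasks, model_dtype, few_shots, batch_size, device) are assumed to have defaults:

# Minimal sketch; model path and server URL are hypothetical placeholders.
from instructlab.eval.mmlu import MMLUEvaluator

evaluator = MMLUEvaluator(model_path="models/example-7b")  # hypothetical path
overall_score, individual_scores = evaluator.run(
    server_url="http://localhost:8000/v1"  # endpoint format from the docstring
)

print(f"overall: {overall_score:.4f}")
for task, scores in individual_scores.items():
    print(f"{task}: {scores['score']:.4f} (stderr {scores['stderr']:.4f})")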
@@ -205,36 +236,6 @@ def __init__(
             model_path, None, tasks, model_dtype, few_shots, batch_size, device
         )
 
-    def run(self, server_url: str | None = None) -> tuple:
-        """
-        Runs MMLU evaluation
-
-        Attributes
-            server_url          Model server endpoint (Ex: http://localhost:8000/v1) for the model being evaluated
-
-        Returns:
-            overall_score       MMLU score for the overall model evaluation
-            individual_scores   Individual MMLU score for each task
-        """
-        logger.debug(locals())
-        # TODO: make this a parameter for class?
-        os.environ["TOKENIZERS_PARALLELISM"] = "true"
-
-        individual_scores: dict = {}
-        agg_score: float = 0.0
-
-        results = self._run_mmlu(server_url)
-
-        for task in self.tasks:
-            mmlu_res = results[task]
-            agg_score += float(mmlu_res["acc,none"])
-            individual_scores[task] = {}
-            individual_scores[task]["score"] = float(mmlu_res["acc,none"])
-            individual_scores[task]["stderr"] = float(mmlu_res["acc_stderr,none"])
-
-        overall_score = float(agg_score / len(self.tasks))
-        return overall_score, individual_scores
-
 
 class MMLUBranchEvaluator(AbstractMMLUEvaluator):
     """
@@ -251,37 +252,3 @@ class MMLUBranchEvaluator(AbstractMMLUEvaluator):
     """
 
     name = "mmlu_branch"
-
-    def run(self, server_url: str | None = None) -> tuple:
-        """
-        Runs MMLUBranch evaluation
-
-        Attributes
-            server_url          Model server endpoint (Ex: http://localhost:8000/v1) for the model being evaluated
-
-        Returns:
-            overall_score       Average MMLUBranch score for the task group
-            individual_scores   Individual MMLUBranch scores for each task in the task group
-        """
-        logger.debug(locals())
-
-        # TODO: make this a parameter for class?
-        os.environ["TOKENIZERS_PARALLELISM"] = "true"
-
-        individual_scores: dict = {}
-        agg_score: float = 0.0
-
-        results = self._run_mmlu(server_url)
-
-        for task, result in results.items():
-            if task in self.tasks:
-                agg_score += float(result["acc,none"])
-            else:
-                individual_scores[task] = {
-                    "score": float(result["acc,none"]),
-                    "stderr": float(result["acc_stderr,none"]),
-                }
-
-        overall_score = float(agg_score / len(self.tasks))
-
-        return overall_score, individual_scores
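This removed method held the "task logic" named in the commit title: each returned task fed either agg_score (when it was in self.tasks) or individual_scores (when it was not), never both, while the shared run records every returned task in both. A toy sketch of that behavioral difference, again with invented data:

# Invented data to contrast the two behaviors; not real lm_eval output.
results = {"task_a": {"acc,none": 0.6}, "task_b": {"acc,none": 0.4}}
configured = ["task_a"]  # stand-in for self.tasks

# Removed MMLUBranch logic: each score lands in exactly one output.
old_agg, old_scores = 0.0, {}
for task, result in results.items():
    if task in configured:
        old_agg += float(result["acc,none"])
    else:
        old_scores[task] = {"score": float(result["acc,none"])}

# New shared logic: every returned task contributes to both outputs.
new_agg, new_scores = 0.0, {}
for task, result in results.items():
    new_agg += float(result["acc,none"])
    new_scores[task] = {"score": float(result["acc,none"])}

assert (old_agg, sorted(old_scores)) == (0.6, ["task_b"])
assert (new_agg, sorted(new_scores)) == (1.0, ["task_a", "task_b"])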
