@@ -122,6 +122,37 @@ def __init__(
122122 self .batch_size = batch_size
123123 self .device = device
124124
def run(self, server_url: str | None = None) -> tuple:
    """
    Runs evaluation

    Attributes
        server_url          Model server endpoint (Ex: http://localhost:8000/v1) for the model being evaluated

    Returns:
        overall_score       Average score for the task group, i.e. the mean of the
                            individual scores (0.0 when no task produced a result)
        individual_scores   Individual scores for each task in the task group, keyed by
                            task name, each holding a "score" and a "stderr" entry
    """
    # Must stay the first statement so only the call arguments are logged.
    logger.debug(locals())

    # TODO: make this a parameter for class?
    # NOTE: this changes the environment of the whole process, not just this run.
    os.environ["TOKENIZERS_PARALLELISM"] = "true"

    individual_scores: dict = {}
    agg_score: float = 0.0

    results = self._run_mmlu(server_url)
    for task, result in results.items():
        score = float(result["acc,none"])
        agg_score += score
        individual_scores[task] = {
            "score": score,
            "stderr": float(result["acc_stderr,none"]),
        }

    # Average over the entries that were actually summed. Dividing by
    # len(self.tasks) is only a true mean when the result keys match the
    # configured tasks one-to-one, and it raises ZeroDivisionError for an
    # empty task list.
    num_scored = len(individual_scores)
    overall_score = float(agg_score / num_scored) if num_scored else 0.0

    return overall_score, individual_scores
155+
125156 def _run_mmlu (self , server_url : str | None = None ) -> dict :
126157 if server_url is not None :
127158 # Requires lm_eval >= 0.4.4
@@ -205,36 +236,6 @@ def __init__(
205236 model_path , None , tasks , model_dtype , few_shots , batch_size , device
206237 )
207238
def run(self, server_url: str | None = None) -> tuple:
    """
    Runs MMLU evaluation

    Attributes
        server_url          Model server endpoint (Ex: http://localhost:8000/v1) for the model being evaluated

    Returns:
        overall_score       Sum of the per-task accuracies divided by the number of configured tasks
        individual_scores   Per-task dict with the task's "score" (accuracy) and its "stderr"
    """
    # Logged before any other local is bound so only the call arguments appear.
    logger.debug(locals())
    # TODO: make this a parameter for class?
    os.environ["TOKENIZERS_PARALLELISM"] = "true"

    raw_results = self._run_mmlu(server_url)

    per_task: dict = {}
    running_total: float = 0.0

    # Every configured task is looked up directly; a task missing from the
    # results raises KeyError.
    for task_name in self.tasks:
        entry = raw_results[task_name]
        accuracy = float(entry["acc,none"])
        running_total += accuracy
        per_task[task_name] = {
            "score": accuracy,
            "stderr": float(entry["acc_stderr,none"]),
        }

    return float(running_total / len(self.tasks)), per_task
237-
238239
239240class MMLUBranchEvaluator (AbstractMMLUEvaluator ):
240241 """
@@ -251,37 +252,3 @@ class MMLUBranchEvaluator(AbstractMMLUEvaluator):
251252 """
252253
253254 name = "mmlu_branch"
254-
def run(self, server_url: str | None = None) -> tuple:
    """
    Runs MMLUBranch evaluation

    Attributes
        server_url          Model server endpoint (Ex: http://localhost:8000/v1) for the model being evaluated

    Returns:
        overall_score       Summed accuracy of the result entries named in self.tasks,
                            divided by the number of configured tasks
        individual_scores   "score" and "stderr" for every result entry that is NOT
                            named in self.tasks
    """
    # Logged before any other local is bound so only the call arguments appear.
    logger.debug(locals())

    # TODO: make this a parameter for class?
    os.environ["TOKENIZERS_PARALLELISM"] = "true"

    raw_results = self._run_mmlu(server_url)

    breakdown: dict = {}
    group_total: float = 0.0

    for name, metrics in raw_results.items():
        # Entries for the configured tasks feed the aggregate only.
        if name in self.tasks:
            group_total += float(metrics["acc,none"])
            continue

        # Everything else is reported individually and not aggregated.
        breakdown[name] = {
            "score": float(metrics["acc,none"]),
            "stderr": float(metrics["acc_stderr,none"]),
        }

    overall_score = float(group_total / len(self.tasks))
    return overall_score, breakdown
0 commit comments