-
Notifications
You must be signed in to change notification settings - Fork 8
Evaluation: Use Config Management #477
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
3f8ddcf
5280622
13eb778
7bdd322
8f9561c
f612da4
4f89f43
82bee43
a2c8a95
b9fd664
31d9523
6b00e0f
ceb3970
82c7b70
ebdda81
3faa3ab
866443c
a29bb77
f00e7e0
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,60 @@ | ||
| """add config in evals run table | ||
| Revision ID: 041 | ||
| Revises: 040 | ||
| Create Date: 2025-12-15 14:03:22.082746 | ||
| """ | ||
| from alembic import op | ||
| import sqlalchemy as sa | ||
| import sqlmodel.sql.sqltypes | ||
| from sqlalchemy.dialects import postgresql | ||
|
|
||
| # revision identifiers, used by Alembic. | ||
| revision = "041" | ||
| down_revision = "040" | ||
| branch_labels = None | ||
| depends_on = None | ||
|
|
||
|
|
||
def upgrade() -> None:
    """Replace the inline ``config`` JSONB column on ``evaluation_run`` with a
    reference (``config_id`` + ``config_version``) into the ``config`` table.

    WARNING: ``op.drop_column`` is destructive — existing rows lose their
    inline configuration unless it has been backfilled into the config store
    (populating ``config_id``/``config_version``) before this migration runs.
    """
    op.add_column(
        "evaluation_run",
        sa.Column(
            "config_id",
            sa.Uuid(),
            nullable=True,
            comment="Reference to the stored config used",
        ),
    )
    op.add_column(
        "evaluation_run",
        sa.Column(
            "config_version",
            sa.Integer(),
            nullable=True,
            comment="Version of the config used",
        ),
    )
    # Name the FK explicitly: passing None relies on dialect auto-naming,
    # which downgrade() cannot reference to drop the constraint.
    op.create_foreign_key(
        "fk_evaluation_run_config_id",
        "evaluation_run",
        "config",
        ["config_id"],
        ["id"],
    )
    op.drop_column("evaluation_run", "config")


def downgrade() -> None:
    """Restore the inline ``config`` JSONB column and remove the config reference.

    The column is re-added as nullable: rows created after the upgrade have no
    inline config to restore, so a NOT NULL re-add would fail on them.
    """
    op.add_column(
        "evaluation_run",
        sa.Column(
            "config",
            postgresql.JSONB(astext_type=sa.Text()),
            autoincrement=False,
            nullable=True,
            comment="Evaluation configuration (model, instructions, etc.)",
        ),
    )
    # Drop the FK by the explicit name given in upgrade().
    op.drop_constraint(
        "fk_evaluation_run_config_id", "evaluation_run", type_="foreignkey"
    )
    op.drop_column("evaluation_run", "config_version")
    op.drop_column("evaluation_run", "config_id")
| Original file line number | Diff line number | Diff line change | ||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| @@ -1,12 +1,16 @@ | ||||||||||||||||||
| import logging | ||||||||||||||||||
| from uuid import UUID | ||||||||||||||||||
|
|
||||||||||||||||||
| from langfuse import Langfuse | ||||||||||||||||||
| from sqlmodel import Session, select | ||||||||||||||||||
|
|
||||||||||||||||||
| from app.core.util import now | ||||||||||||||||||
| from app.crud.config.version import ConfigVersionCrud | ||||||||||||||||||
| from app.crud.evaluations.langfuse import fetch_trace_scores_from_langfuse | ||||||||||||||||||
| from app.crud.evaluations.score import EvaluationScore | ||||||||||||||||||
| from app.models import EvaluationRun | ||||||||||||||||||
| from app.models.llm.request import LLMCallConfig | ||||||||||||||||||
| from app.services.llm.jobs import resolve_config_blob | ||||||||||||||||||
|
|
||||||||||||||||||
| logger = logging.getLogger(__name__) | ||||||||||||||||||
|
|
||||||||||||||||||
|
|
@@ -16,7 +20,8 @@ def create_evaluation_run( | |||||||||||||||||
| run_name: str, | ||||||||||||||||||
| dataset_name: str, | ||||||||||||||||||
| dataset_id: int, | ||||||||||||||||||
| config: dict, | ||||||||||||||||||
| config_id: UUID, | ||||||||||||||||||
| config_version: int, | ||||||||||||||||||
| organization_id: int, | ||||||||||||||||||
| project_id: int, | ||||||||||||||||||
| ) -> EvaluationRun: | ||||||||||||||||||
|
|
@@ -28,7 +33,8 @@ def create_evaluation_run( | |||||||||||||||||
| run_name: Name of the evaluation run/experiment | ||||||||||||||||||
| dataset_name: Name of the dataset being used | ||||||||||||||||||
| dataset_id: ID of the dataset | ||||||||||||||||||
| config: Configuration dict for the evaluation | ||||||||||||||||||
| config_id: UUID of the stored config | ||||||||||||||||||
| config_version: Version number of the config | ||||||||||||||||||
| organization_id: Organization ID | ||||||||||||||||||
| project_id: Project ID | ||||||||||||||||||
|
|
@@ -39,7 +45,8 @@ def create_evaluation_run( | |||||||||||||||||
| run_name=run_name, | ||||||||||||||||||
| dataset_name=dataset_name, | ||||||||||||||||||
| dataset_id=dataset_id, | ||||||||||||||||||
| config=config, | ||||||||||||||||||
| config_id=config_id, | ||||||||||||||||||
| config_version=config_version, | ||||||||||||||||||
| status="pending", | ||||||||||||||||||
| organization_id=organization_id, | ||||||||||||||||||
| project_id=project_id, | ||||||||||||||||||
|
|
@@ -56,7 +63,10 @@ def create_evaluation_run( | |||||||||||||||||
| logger.error(f"Failed to create EvaluationRun: {e}", exc_info=True) | ||||||||||||||||||
| raise | ||||||||||||||||||
|
|
||||||||||||||||||
| logger.info(f"Created EvaluationRun record: id={eval_run.id}, run_name={run_name}") | ||||||||||||||||||
| logger.info( | ||||||||||||||||||
| f"Created EvaluationRun record: id={eval_run.id}, run_name={run_name}, " | ||||||||||||||||||
| f"config_id={config_id}, config_version={config_version}" | ||||||||||||||||||
| ) | ||||||||||||||||||
|
Comment on lines
+66
to
+69
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Prefix the new info log with the function name. Line 66-69 doesn’t include the required 🔧 Proposed fix- logger.info(
- f"Created EvaluationRun record: id={eval_run.id}, run_name={run_name}, "
- f"config_id={config_id}, config_version={config_version}"
- )
+ logger.info(
+ f"[create_evaluation_run] Created EvaluationRun record: id={eval_run.id}, "
+ f"run_name={run_name}, config_id={config_id}, config_version={config_version}"
+ )📝 Committable suggestion
Suggested change
🤖 Prompt for AI Agents |
||||||||||||||||||
|
|
||||||||||||||||||
| return eval_run | ||||||||||||||||||
|
|
||||||||||||||||||
|
|
@@ -311,3 +321,47 @@ def save_score( | |||||||||||||||||
| f"traces={len(score.get('traces', []))}" | ||||||||||||||||||
| ) | ||||||||||||||||||
| return eval_run | ||||||||||||||||||
|
|
||||||||||||||||||
|
|
||||||||||||||||||
def resolve_model_from_config(
    session: Session,
    eval_run: EvaluationRun,
) -> str:
    """
    Resolve the model name from the evaluation run's stored config.

    Args:
        session: Database session used to load the config version
        eval_run: EvaluationRun whose config reference is resolved

    Returns:
        Model name from the resolved config's completion params

    Raises:
        ValueError: If the run has no config reference, or the config
            cannot be resolved
    """
    # Use explicit `is None` checks: truthiness (`not x`) would wrongly
    # reject a legitimate config_version of 0.
    if eval_run.config_id is None or eval_run.config_version is None:
        raise ValueError(
            f"Evaluation run {eval_run.id} has no config reference "
            f"(config_id={eval_run.config_id}, config_version={eval_run.config_version})"
        )

    config_version_crud = ConfigVersionCrud(
        session=session,
        config_id=eval_run.config_id,
        project_id=eval_run.project_id,
    )

    config, error = resolve_config_blob(
        config_crud=config_version_crud,
        config=LLMCallConfig(id=eval_run.config_id, version=eval_run.config_version),
    )

    if error or config is None:
        raise ValueError(
            f"Config resolution failed for evaluation {eval_run.id} "
            f"(config_id={eval_run.config_id}, version={eval_run.config_version}): {error}"
        )

    # Model name lives under the completion params of the resolved config.
    return config.completion.params.model
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Add return type hints to migration functions.
Both `upgrade()` and `downgrade()` functions are missing return type hints. As per coding guidelines, all functions should have type hints.
📝 Proposed fix
Also applies to: 45-45
🤖 Prompt for AI Agents