19 commits
3f8ddcf
Refactor evaluation endpoint to use stored configuration and remove a…
avirajsingh7 Dec 9, 2025
5280622
Refactor evaluation run to use config ID and version instead of confi…
avirajsingh7 Dec 9, 2025
13eb778
Add config_id, config_version, and model fields to evaluation run table
avirajsingh7 Dec 9, 2025
7bdd322
Refactor batch evaluation tests to use config_id and config_version i…
avirajsingh7 Dec 10, 2025
8f9561c
Update EvaluationRunPublic model to allow nullable config_id and conf…
avirajsingh7 Dec 10, 2025
f612da4
Refactor evaluation run model handling: remove model field, add resol…
avirajsingh7 Dec 15, 2025
4f89f43
fix migration number
avirajsingh7 Dec 15, 2025
82bee43
fix test
avirajsingh7 Dec 15, 2025
a2c8a95
fix status code
avirajsingh7 Dec 15, 2025
b9fd664
remove old mirgation
avirajsingh7 Dec 15, 2025
31d9523
Merge branch 'main' into evals/config_addition
nishika26 Jan 7, 2026
6b00e0f
added depends as import
AkhileshNegi Jan 12, 2026
ceb3970
fix: spread config object while building batch eval jsonl
Prajna1999 Jan 13, 2026
82c7b70
chore: remove audio poc code
Prajna1999 Jan 14, 2026
ebdda81
fix: add comprehensive expansion of 'tools' key while building evalua…
Prajna1999 Jan 14, 2026
3faa3ab
fix: merge conflict resolution old eval
Prajna1999 Jan 20, 2026
866443c
Merge main into evals/config_addition and update to use config_id/ver…
AkhileshNegi Jan 24, 2026
a29bb77
fix: resolve merge conflicts after pull
AkhileshNegi Jan 24, 2026
f00e7e0
fixing endpoints
AkhileshNegi Jan 24, 2026
60 changes: 60 additions & 0 deletions backend/app/alembic/versions/041_add_config_in_evals_run_table.py
@@ -0,0 +1,60 @@
"""add config in evals run table
Revision ID: 041
Revises: 040
Create Date: 2025-12-15 14:03:22.082746
"""
from alembic import op
import sqlalchemy as sa
import sqlmodel.sql.sqltypes
from sqlalchemy.dialects import postgresql

# revision identifiers, used by Alembic.
revision = "041"
down_revision = "040"
branch_labels = None
depends_on = None


def upgrade():
⚠️ Potential issue | 🟡 Minor

Add return type hints to migration functions.

Both upgrade() and downgrade() functions are missing return type hints.

As per coding guidelines, all functions should have type hints.

📝 Proposed fix
-def upgrade():
+def upgrade() -> None:
-def downgrade():
+def downgrade() -> None:

Also applies to: 45-45

🤖 Prompt for AI Agents
In @backend/app/alembic/versions/041_add_config_in_evals_run_table.py at line
20, The migration functions upgrade() and downgrade() lack return type hints;
update both function definitions (upgrade and downgrade) to include explicit
return types (e.g., change "def upgrade():" and "def downgrade():" to "def
upgrade() -> None:" and "def downgrade() -> None:") so they conform to the
project's typing guidelines.

# ### commands auto generated by Alembic - please adjust! ###
op.add_column(
"evaluation_run",
sa.Column(
"config_id",
sa.Uuid(),
nullable=True,
comment="Reference to the stored config used",
),
)
op.add_column(
"evaluation_run",
sa.Column(
"config_version",
sa.Integer(),
nullable=True,
comment="Version of the config used",
),
)
op.create_foreign_key(None, "evaluation_run", "config", ["config_id"], ["id"])
op.drop_column("evaluation_run", "config")
Comment on lines +22 to +41
⚠️ Potential issue | 🔴 Critical

Critical: Data loss and foreign key constraint naming issues.

This migration has several critical problems:

  1. Data loss: Line 41 drops the config column without migrating existing data to the new config_id/config_version columns. Any existing evaluation runs will lose their configuration data permanently.

  2. Foreign key constraint naming: Line 40 creates a foreign key with None as the constraint name, causing Alembic to auto-generate a name. However, the downgrade function (Line 57) also uses None to drop the constraint, which won't match the auto-generated name and will fail.

Required actions:

  1. Add a data migration step before dropping the config column. You'll need to:

    • Parse each existing config JSONB object
    • Look up or create corresponding config records with appropriate versions
    • Update config_id and config_version for each evaluation_run
    • Or, if data migration isn't feasible, add a comment explaining why data loss is acceptable
  2. Specify an explicit constraint name instead of None:

🔧 Proposed fix for FK constraint naming
-    op.create_foreign_key(None, "evaluation_run", "config", ["config_id"], ["id"])
+    op.create_foreign_key(
+        "fk_evaluation_run_config_id", 
+        "evaluation_run", 
+        "config", 
+        ["config_id"], 
+        ["id"]
+    )

And update the downgrade:

-    op.drop_constraint(None, "evaluation_run", type_="foreignkey")
+    op.drop_constraint("fk_evaluation_run_config_id", "evaluation_run", type_="foreignkey")

Committable suggestion skipped: line range outside the PR's diff.
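A minimal sketch of such a backfill step, to run before op.drop_column, is shown below. The stored-config table and column names (config_version, config_blob) are assumptions and must be adapted to the actual schema; the matching logic is likewise only illustrative.

import json

import sqlalchemy as sa
from alembic import op


def _backfill_config_references() -> None:
    """Hypothetical backfill: map each inline config blob to a stored config record."""
    conn = op.get_bind()
    rows = conn.execute(
        sa.text("SELECT id, config FROM evaluation_run WHERE config IS NOT NULL")
    ).fetchall()
    for run_id, config_blob in rows:
        # Assumed lookup table/columns; adjust to the real stored-config schema.
        match = conn.execute(
            sa.text(
                "SELECT config_id, version FROM config_version "
                "WHERE config_blob = CAST(:blob AS jsonb) LIMIT 1"
            ),
            {"blob": json.dumps(config_blob)},
        ).first()
        if match is not None:
            conn.execute(
                sa.text(
                    "UPDATE evaluation_run SET config_id = :cid, config_version = :ver "
                    "WHERE id = :rid"
                ),
                {"cid": match.config_id, "ver": match.version, "rid": run_id},
            )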

# ### end Alembic commands ###


def downgrade():
# ### commands auto generated by Alembic - please adjust! ###
op.add_column(
"evaluation_run",
sa.Column(
"config",
postgresql.JSONB(astext_type=sa.Text()),
autoincrement=False,
nullable=False,
comment="Evaluation configuration (model, instructions, etc.)",
),
)
Comment on lines +47 to +56
⚠️ Potential issue | 🔴 Critical

Critical: Downgrade will fail with existing data.

The downgrade re-adds the config column with nullable=False (Line 53). If the evaluation_run table contains any records when downgrading, this operation will fail because PostgreSQL cannot add a non-nullable column to a table with existing rows without specifying a default value.

Either:

  1. Make the column nullable during downgrade: nullable=True
  2. Provide a server default value
  3. Add a data migration to populate the column before setting it non-nullable
🔧 Proposed fix (Option 1: Make nullable)
     op.add_column(
         "evaluation_run",
         sa.Column(
             "config",
             postgresql.JSONB(astext_type=sa.Text()),
             autoincrement=False,
-            nullable=False,
+            nullable=True,
             comment="Evaluation configuration (model, instructions, etc.)",
         ),
     )
🤖 Prompt for AI Agents
In @backend/app/alembic/versions/041_add_config_in_evals_run_table.py around
lines 47 - 56, The downgrade currently re-adds the "config" column on the
"evaluation_run" table using op.add_column with sa.Column(..., nullable=False)
which will fail if rows exist; update that op.add_column call in the downgrade
to use nullable=True (or alternatively add a server_default or a prior data
migration to populate values before setting non-nullable), ensuring the column
is created nullable during downgrade to avoid PostgreSQL errors.
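For completeness, Option 2 would keep nullable=False but supply a server default so existing rows receive a value. A sketch is below; the empty-object default is an assumption, and the real default should reflect what downstream code expects from the config column.

    # Hypothetical Option 2: non-nullable column with a server default for existing rows.
    op.add_column(
        "evaluation_run",
        sa.Column(
            "config",
            postgresql.JSONB(astext_type=sa.Text()),
            autoincrement=False,
            nullable=False,
            server_default=sa.text("'{}'::jsonb"),
            comment="Evaluation configuration (model, instructions, etc.)",
        ),
    )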

op.drop_constraint(None, "evaluation_run", type_="foreignkey")
op.drop_column("evaluation_run", "config_version")
op.drop_column("evaluation_run", "config_id")
# ### end Alembic commands ###
4 changes: 2 additions & 2 deletions backend/app/api/routes/evaluations/dataset.py
@@ -48,7 +48,7 @@ def _dataset_to_response(dataset: EvaluationDataset) -> DatasetUploadResponse:


@router.post(
"/",
"",
description=load_description("evaluation/upload_dataset.md"),
response_model=APIResponse[DatasetUploadResponse],
dependencies=[Depends(require_permission(Permission.REQUIRE_PROJECT))],
@@ -87,7 +87,7 @@ async def upload_dataset(


@router.get(
"/",
"",
description=load_description("evaluation/list_datasets.md"),
response_model=APIResponse[list[DatasetUploadResponse]],
dependencies=[Depends(require_permission(Permission.REQUIRE_PROJECT))],
16 changes: 7 additions & 9 deletions backend/app/api/routes/evaluations/evaluation.py
@@ -1,6 +1,7 @@
"""Evaluation run API routes."""

import logging
from uuid import UUID

from fastapi import (
APIRouter,
@@ -29,7 +30,7 @@


@router.post(
"/",
"",
description=load_description("evaluation/create_evaluation.md"),
response_model=APIResponse[EvaluationRunPublic],
dependencies=[Depends(require_permission(Permission.REQUIRE_PROJECT))],
@@ -41,19 +42,16 @@ def evaluate(
experiment_name: str = Body(
..., description="Name for this evaluation experiment/run"
),
config: dict = Body(default_factory=dict, description="Evaluation configuration"),
assistant_id: str
| None = Body(
None, description="Optional assistant ID to fetch configuration from"
),
config_id: UUID = Body(..., description="Stored config ID"),
config_version: int = Body(..., ge=1, description="Stored config version"),
) -> APIResponse[EvaluationRunPublic]:
"""Start an evaluation run."""
eval_run = start_evaluation(
session=_session,
dataset_id=dataset_id,
experiment_name=experiment_name,
config=config,
assistant_id=assistant_id,
config_id=config_id,
config_version=config_version,
organization_id=auth_context.organization_.id,
project_id=auth_context.project_.id,
)
@@ -68,7 +66,7 @@


@router.get(
"/",
"",
description=load_description("evaluation/list_evaluations.md"),
response_model=APIResponse[list[EvaluationRunPublic]],
dependencies=[Depends(require_permission(Permission.REQUIRE_PROJECT))],
42 changes: 42 additions & 0 deletions backend/app/crud/evaluations/__init__.py
@@ -5,6 +5,7 @@
create_evaluation_run,
get_evaluation_run_by_id,
list_evaluation_runs,
resolve_model_from_config,
save_score,
)
from app.crud.evaluations.cron import (
@@ -43,3 +44,44 @@
TraceData,
TraceScore,
)

__all__ = [
# Core
"create_evaluation_run",
"get_evaluation_run_by_id",
"list_evaluation_runs",
"resolve_model_from_config",
"save_score",
# Cron
"process_all_pending_evaluations",
"process_all_pending_evaluations_sync",
# Dataset
"create_evaluation_dataset",
"delete_dataset",
"get_dataset_by_id",
"list_datasets",
"upload_csv_to_object_store",
# Batch
"start_evaluation_batch",
# Processing
"check_and_process_evaluation",
"poll_all_pending_evaluations",
"process_completed_embedding_batch",
"process_completed_evaluation",
# Embeddings
"calculate_average_similarity",
"calculate_cosine_similarity",
"start_embedding_batch",
# Langfuse
"create_langfuse_dataset_run",
"fetch_trace_scores_from_langfuse",
"update_traces_with_cosine_scores",
"upload_dataset_to_langfuse",
# Score types
"CategoricalSummaryScore",
"EvaluationScore",
"NumericSummaryScore",
"SummaryScore",
"TraceData",
"TraceScore",
]
21 changes: 16 additions & 5 deletions backend/app/crud/evaluations/batch.py
@@ -16,6 +16,7 @@

from app.core.batch import OpenAIBatchProvider, start_batch_job
from app.models import EvaluationRun
from app.models.llm.request import KaapiLLMParams

logger = logging.getLogger(__name__)

@@ -59,7 +60,7 @@ def fetch_dataset_items(langfuse: Langfuse, dataset_name: str) -> list[dict[str,


def build_evaluation_jsonl(
dataset_items: list[dict[str, Any]], config: dict[str, Any]
dataset_items: list[dict[str, Any]], config: KaapiLLMParams
) -> list[dict[str, Any]]:
"""
Build JSONL data for evaluation batch using OpenAI Responses API.
@@ -88,7 +89,6 @@
List of dictionaries (JSONL data)
"""
jsonl_data = []

for item in dataset_items:
# Extract question from input
question = item["input"].get("question", "")
@@ -105,7 +105,18 @@
"method": "POST",
"url": "/v1/responses",
"body": {
**config, # Use config as-is
# Use config as-is
"model": config.model,
"instructions": config.instructions,
"temperature": config.temperature,
"reasoning": {"effort": config.reasoning} if config.reasoning else None,
"tools": [
{
"type": "file_search",
"vector_store_ids": config.knowledge_base_ids,
"max_num_results": config.max_num_results or 20,
}
],
"input": question, # Add input from dataset
},
}
@@ -119,7 +130,7 @@ def start_evaluation_batch(
openai_client: OpenAI,
session: Session,
eval_run: EvaluationRun,
config: dict[str, Any],
config: KaapiLLMParams,
) -> EvaluationRun:
"""
Fetch data, build JSONL, and start evaluation batch.
@@ -166,7 +177,7 @@
"description": f"Evaluation: {eval_run.run_name}",
"completion_window": "24h",
# Store complete config for reference
"evaluation_config": config,
"evaluation_config": config.model_dump(exclude_none=True),
}

# Step 5: Start batch job using generic infrastructure
62 changes: 58 additions & 4 deletions backend/app/crud/evaluations/core.py
@@ -1,12 +1,16 @@
import logging
from uuid import UUID

from langfuse import Langfuse
from sqlmodel import Session, select

from app.core.util import now
from app.crud.config.version import ConfigVersionCrud
from app.crud.evaluations.langfuse import fetch_trace_scores_from_langfuse
from app.crud.evaluations.score import EvaluationScore
from app.models import EvaluationRun
from app.models.llm.request import LLMCallConfig
from app.services.llm.jobs import resolve_config_blob

logger = logging.getLogger(__name__)

@@ -16,7 +20,8 @@ def create_evaluation_run(
run_name: str,
dataset_name: str,
dataset_id: int,
config: dict,
config_id: UUID,
config_version: int,
organization_id: int,
project_id: int,
) -> EvaluationRun:
@@ -28,7 +33,8 @@
run_name: Name of the evaluation run/experiment
dataset_name: Name of the dataset being used
dataset_id: ID of the dataset
config: Configuration dict for the evaluation
config_id: UUID of the stored config
config_version: Version number of the config
organization_id: Organization ID
project_id: Project ID
@@ -39,7 +45,8 @@
run_name=run_name,
dataset_name=dataset_name,
dataset_id=dataset_id,
config=config,
config_id=config_id,
config_version=config_version,
status="pending",
organization_id=organization_id,
project_id=project_id,
@@ -56,7 +63,10 @@
logger.error(f"Failed to create EvaluationRun: {e}", exc_info=True)
raise

logger.info(f"Created EvaluationRun record: id={eval_run.id}, run_name={run_name}")
logger.info(
f"Created EvaluationRun record: id={eval_run.id}, run_name={run_name}, "
f"config_id={config_id}, config_version={config_version}"
)
Comment on lines +66 to +69
⚠️ Potential issue | 🟡 Minor

Prefix the new info log with the function name.

Lines 66-69 don't include the required [create_evaluation_run] prefix.

🔧 Proposed fix
-    logger.info(
-        f"Created EvaluationRun record: id={eval_run.id}, run_name={run_name}, "
-        f"config_id={config_id}, config_version={config_version}"
-    )
+    logger.info(
+        f"[create_evaluation_run] Created EvaluationRun record: id={eval_run.id}, "
+        f"run_name={run_name}, config_id={config_id}, config_version={config_version}"
+    )
As per coding guidelines, ...
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
logger.info(
f"Created EvaluationRun record: id={eval_run.id}, run_name={run_name}, "
f"config_id={config_id}, config_version={config_version}"
)
logger.info(
f"[create_evaluation_run] Created EvaluationRun record: id={eval_run.id}, "
f"run_name={run_name}, config_id={config_id}, config_version={config_version}"
)
🤖 Prompt for AI Agents
In `@backend/app/crud/evaluations/core.py` around lines 66 - 69, The logger.info
call that reports the created EvaluationRun (the one referencing eval_run.id,
run_name, config_id, config_version) must be prefixed with the function name
tag; update the log message in create_evaluation_run to start with
"[create_evaluation_run]" so the entry reads like "[create_evaluation_run]
Created EvaluationRun record: id=..., run_name=..., config_id=...,
config_version=...". Ensure you only change the log string (the logger.info
call) and keep the existing variables (eval_run, run_name, config_id,
config_version) intact.


return eval_run

@@ -311,3 +321,47 @@ def save_score(
f"traces={len(score.get('traces', []))}"
)
return eval_run


def resolve_model_from_config(
session: Session,
eval_run: EvaluationRun,
) -> str:
"""
Resolve the model name from the evaluation run's config.
Args:
session: Database session
eval_run: EvaluationRun instance
Returns:
Model name from config
Raises:
ValueError: If config is missing, invalid, or has no model
"""
if not eval_run.config_id or not eval_run.config_version:
raise ValueError(
f"Evaluation run {eval_run.id} has no config reference "
f"(config_id={eval_run.config_id}, config_version={eval_run.config_version})"
)

config_version_crud = ConfigVersionCrud(
session=session,
config_id=eval_run.config_id,
project_id=eval_run.project_id,
)

config, error = resolve_config_blob(
config_crud=config_version_crud,
config=LLMCallConfig(id=eval_run.config_id, version=eval_run.config_version),
)

if error or config is None:
raise ValueError(
f"Config resolution failed for evaluation {eval_run.id} "
f"(config_id={eval_run.config_id}, version={eval_run.config_version}): {error}"
)

model = config.completion.params.model
return model
14 changes: 1 addition & 13 deletions backend/app/crud/evaluations/embeddings.py
@@ -363,19 +363,7 @@
logger.info(f"Starting embedding batch for evaluation run {eval_run.id}")

# Get embedding model from config (default: text-embedding-3-large)
embedding_model = eval_run.config.get(
"embedding_model", "text-embedding-3-large"
)

# Validate and fallback to default if invalid
try:
validate_embedding_model(embedding_model)
except ValueError as e:
logger.warning(
f"Invalid embedding model '{embedding_model}' in config: {e}. "
f"Falling back to text-embedding-3-large"
)
embedding_model = "text-embedding-3-large"
embedding_model = "text-embedding-3-large"

# Step 1: Build embedding JSONL with trace_ids
jsonl_data = build_embedding_jsonl(