Quickstart
Before starting, navigate to docent-alpha.transluce.org and sign up for an account.
To ingest your own agent runs, install the Python SDK:
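A typical install command (the PyPI package name docent-python is an assumption; check the Docent repository if your distribution differs):

pip install docent-python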
Then create a new Python script that instantiates a client object:
import os
from docent import Docent

client = Docent(
    email=os.getenv("DOCENT_EMAIL"),        # this is the default and can be omitted
    password=os.getenv("DOCENT_PASSWORD"),  # this is the default and can be omitted
    # Uncomment and adjust these if you're self-hosting
    # server_url="http://localhost:8889",
    # web_url="http://localhost:3001",
)
You can think of each frame grid as a collection of agent runs. Let's create a fresh one:
fg_id = client.create_framegrid(
    name="sample framegrid",
    description="example that comes with the Docent repo",
)
Now we're ready to ingest some logs!
Note: To directly run the code in this section, see examples/ingest_simple.ipynb.
Say we have three simple agent runs.
transcript_1 = [
    {
        "role": "user",
        "content": "What's the weather like in New York today?"
    },
    {
        "role": "assistant",
        "content": "The weather in New York today is mostly sunny with a high of 75°F (24°C)."
    }
]
metadata_1 = {"model": "gpt-3.5-turbo", "agent_scaffold": "foo", "hallucinated": True}

transcript_2 = [
    {
        "role": "user",
        "content": "What's the weather like in San Francisco today?"
    },
    {
        "role": "assistant",
        "content": "The weather in San Francisco today is mostly cloudy with a high of 65°F (18°C)."
    }
]
metadata_2 = {"model": "gpt-3.5-turbo", "agent_scaffold": "foo", "hallucinated": True}

transcript_3 = [
    {
        "role": "user",
        "content": "What's the weather like in Paris today?"
    },
    {
        "role": "assistant",
        "content": "I'm sorry, I don't know because I don't have access to weather tools."
    }
]
metadata_3 = {"model": "gpt-3.5-turbo", "agent_scaffold": "bar", "hallucinated": False}

transcripts = [transcript_1, transcript_2, transcript_3]
metadata = [metadata_1, metadata_2, metadata_3]
We need to convert each input into an AgentRun object, which holds Transcript objects whose messages must each be a ChatMessage. We could construct the messages manually, but it's easier to use the parse_chat_message function, since the raw dicts already conform to the expected schema.
from docent.data_models.chat import parse_chat_message
from docent.data_models import Transcript

parsed_transcripts = [
    Transcript(messages=[parse_chat_message(msg) for msg in transcript])
    for transcript in transcripts
]
We also need to convert the metadata into a list of BaseAgentRunMetadata objects. Let's subclass the base class to add some additional metadata.
from pydantic import Field
from docent.data_models import BaseAgentRunMetadata

class MyMetadata(BaseAgentRunMetadata):
    model: str = Field(description="LLM API model used to generate the transcript")
    agent_scaffold: str = Field(description="Agent scaffold in which the agent was run")
Now we can create the AgentRun objects.
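A rough sketch of this step (it assumes that scores is the required field on BaseAgentRunMetadata introduced below, and folds the hallucinated flag into it):

from docent.data_models import AgentRun

agent_runs = [
    AgentRun(
        transcripts={"default": transcript},
        metadata=MyMetadata(
            model=md["model"],
            agent_scaffold=md["agent_scaffold"],
            # Assumption: the hallucinated flag is recorded in the required scores field
            scores={"hallucinated": md["hallucinated"]},
        ),
    )
    for transcript, md in zip(parsed_transcripts, metadata)
]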
Note: To directly run the code in this section, see examples/ingest_tau_bench.ipynb.
For a more complex case that involves tool calls, Docent ships with a sample τ-bench log file, generated by running Sonnet 3.5 (new) on one task from the τ-bench-airline dataset.
To inspect the log, we can load it as a dictionary.
from docent.samples import get_tau_bench_airline_fpath
import json

with open(get_tau_bench_airline_fpath(), "r") as f:
    tb_log = json.load(f)

print(tb_log)
First, we need to define a metadata class. In addition to the required scores field, we'll add a few additional fields:
from typing import Any
from docent.data_models import BaseAgentRunMetadata
from pydantic import Field

class TauBenchMetadata(BaseAgentRunMetadata):
    benchmark_id: str = Field(
        description="The benchmark that the task belongs to", default="tau_bench"
    )
    task_id: str = Field(description="The task within the benchmark that the agent is solving")
    model: str = Field(description="The LLM used by the agent")
    additional_metadata: dict[str, Any] = Field(description="Additional metadata about the task")
    scoring_metadata: dict[str, Any] | None = Field(
        description="Additional metadata about the scoring process"
    )
Next, we write a function that parses the dict into an AgentRun object, complete with TauBenchMetadata. Most of the effort is in converting the raw tool calls into the expected format.
from docent.data_models import AgentRun, Transcript
from docent.data_models.chat import ChatMessage, ToolCall, parse_chat_message

def load_tau_bench_log(data: dict[str, Any]) -> AgentRun:
    traj, info, reward, task_id = data["traj"], data["info"], data["reward"], data["task_id"]

    messages: list[ChatMessage] = []
    for msg in traj:
        # Extract raw message data
        role = msg.get("role")
        content = msg.get("content", "")
        raw_tool_calls = msg.get("tool_calls")
        tool_call_id = msg.get("tool_call_id")

        # Create a message data dictionary
        message_data = {
            "role": role,
            "content": content,
        }

        # For tool messages, include the tool name
        if role == "tool":
            message_data["name"] = msg.get("name")
            message_data["tool_call_id"] = tool_call_id

        # For assistant messages, include tool calls if present
        if role == "assistant" and raw_tool_calls:
            # Convert tool calls to the expected format
            parsed_tool_calls: list[ToolCall] = []
            for tc in raw_tool_calls:
                tool_call = ToolCall(
                    id=tc.get("id"),
                    function=tc.get("function", {}).get("name"),
                    arguments=tc.get("function", {}).get("arguments", {}),
                    type="function",
                    parse_error=None,
                )
                parsed_tool_calls.append(tool_call)
            message_data["tool_calls"] = parsed_tool_calls

        # Parse the message into the appropriate type
        chat_message = parse_chat_message(message_data)
        messages.append(chat_message)

    # Extract metadata from the sample
    task_id = info["task"]["user_id"]
    scores = {"reward": round(reward, 3)}

    # Build metadata
    metadata = TauBenchMetadata(
        benchmark_id=task_id,
        task_id=task_id,
        model="sonnet-35-new",
        scores=scores,
        additional_metadata=info,
        scoring_metadata=info["reward_info"],
    )

    # Create the transcript and wrap in AgentRun
    transcript = Transcript(
        messages=messages,
        metadata=metadata,
    )
    agent_run = AgentRun(
        transcripts={"default": transcript},
        metadata=metadata,
    )
    return agent_run
Let's load the single run and print its string representation:
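A minimal sketch using the tb_log dictionary and the loader defined above (printing relies on the object's default string representation):

agent_run = load_tau_bench_log(tb_log)
print(agent_run)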
Note: To directly run the code in this section, see examples/ingest_inspect.ipynb.
Our ChatMessage schema is compatible with Inspect AI's format (as of inspect-ai==0.3.93), which means you can directly use the parse_chat_message function to parse Inspect messages.
Docent ships with a sample Inspect log file, generated by running GPT-4o on a subset of the Intercode CTF benchmark.
Inspect provides a library function to read the log; we can convert it to a dictionary for easier viewing.
from docent.samples import get_inspect_fpath
from inspect_ai.log import read_eval_log
from pydantic_core import to_jsonable_python
ctf_log = read_eval_log(get_inspect_fpath())
ctf_log_dict = to_jsonable_python(ctf_log)
Now we define a metadata class with some fields relevant to the CTF task.
from typing import Any
from docent.data_models import BaseAgentRunMetadata
from pydantic import Field

class InspectAgentRunMetadata(BaseAgentRunMetadata):
    task_id: str = Field(
        description="The ID of the 'benchmark' or 'set of evals' that the transcript belongs to"
    )

    # Identification of this particular run
    sample_id: str = Field(
        description="The specific task inside of the `task_id` benchmark that the transcript was run on"
    )
    epoch_id: int = Field(
        description="Each `sample_id` should be run multiple times due to stochasticity; `epoch_id` is the integer index of a specific run."
    )

    # Parameters for the run
    model: str = Field(description="The model that was used to generate the transcript")

    # Outcome
    scoring_metadata: dict[str, Any] | None = Field(
        description="Additional metadata about the scoring process"
    )

    # Inspect metadata
    additional_metadata: dict[str, Any] | None = Field(
        description="Additional metadata about the transcript"
    )
Now we can write a function that takes the Inspect log and converts each of its samples into an AgentRun object.
from inspect_ai.log import EvalLog
from docent.data_models import AgentRun, Transcript
from docent.data_models.chat import parse_chat_message

def load_inspect_log(log: EvalLog) -> list[AgentRun]:
    if log.samples is None:
        return []

    agent_runs: list[AgentRun] = []
    for s in log.samples:
        # Extract sample_id from the sample ID
        sample_id = s.id
        epoch_id = s.epoch

        # Gather scores
        scores: dict[str, int | float | bool] = {}

        # Evaluate correctness (for this CTF benchmark)
        if s.scores and "includes" in s.scores:
            scores["correct"] = s.scores["includes"].value == "C"

        # Set metadata
        metadata = InspectAgentRunMetadata(
            task_id=log.eval.task,
            sample_id=str(sample_id),
            epoch_id=epoch_id,
            model=log.eval.model,
            scores=scores,
            additional_metadata=s.metadata,
            scoring_metadata=s.scores,
        )

        # Create transcript
        agent_runs.append(
            AgentRun(
                transcripts={
                    "default": Transcript(
                        messages=[parse_chat_message(m.model_dump()) for m in s.messages]
                    )
                },
                metadata=metadata,
            )
        )

    return agent_runs
Let's check on our loaded run:
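A minimal sketch using the ctf_log object read above:

agent_runs = load_inspect_log(ctf_log)
print(agent_runs[0])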
We can finally ingest the agent runs and watch the UI update:
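A rough sketch of the ingestion call (the add_agent_runs method name is an assumption; substitute whatever ingestion method your installed SDK version exposes):

# Assumed method name; check your SDK version
client.add_agent_runs(fg_id, agent_runs)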
If you navigate to the frontend URL printed by client.create_framegrid(...), you should see the runs available for viewing.
Tips and tricks
Including sufficient context
Docent can only catch issues that are evident from the context it has about your evaluation. For example:
- If you're looking to catch issues with solution labels, you should provide the exact label in the metadata, not just the agent's score.
- For software engineering tasks, if you want to know why agents failed, you should include information about what tests were run and their traceback/execution logs (see the sketch after this list).
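A hypothetical metadata sketch covering both points above (the class and field names here are illustrative, not part of the Docent SDK):

from pydantic import Field
from docent.data_models import BaseAgentRunMetadata

class SWETaskMetadata(BaseAgentRunMetadata):
    # Illustrative fields: include the exact label and test evidence, not just the score
    solution_label: str = Field(description="Exact ground-truth label for the task")
    tests_run: list[str] = Field(description="Names of the tests that were executed")
    test_output: str = Field(description="Traceback / execution logs from the test run")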