chore: initial public snapshot for github upload

This commit is contained in:
Your Name
2026-03-26 20:06:14 +08:00
commit 0e5ecd930e
3497 changed files with 1586236 additions and 0 deletions

View File

@@ -0,0 +1,3 @@
"""Package exports for the Anthropic-messages-to-chat-completions adapter."""
from .transformation import LiteLLMAnthropicMessagesAdapter

__all__ = ["LiteLLMAnthropicMessagesAdapter"]

View File

@@ -0,0 +1,345 @@
from typing import (
TYPE_CHECKING,
Any,
AsyncIterator,
Coroutine,
Dict,
List,
Optional,
Tuple,
Union,
cast,
)
import litellm
from litellm.llms.anthropic.experimental_pass_through.adapters.transformation import (
AnthropicAdapter,
)
from litellm.types.llms.anthropic_messages.anthropic_response import (
AnthropicMessagesResponse,
)
from litellm.types.utils import ModelResponse
from litellm.utils import get_model_info
if TYPE_CHECKING:
    # Placeholder for type-checking-only imports; currently none are needed.
    pass
########################################################
# init adapter
# Module-level singleton shared by all handler calls below.
# NOTE(review): assumes AnthropicAdapter holds no per-request state -- confirm
# before relying on it under concurrent use.
ANTHROPIC_ADAPTER = AnthropicAdapter()
########################################################
class LiteLLMMessagesToCompletionTransformationHandler:
    """Bridge Anthropic `/v1/messages` requests onto ``litellm.completion`` /
    ``litellm.acompletion`` for providers without a native Anthropic Messages
    implementation.

    Flow: build an Anthropic-shaped request dict -> translate it to OpenAI
    chat-completions kwargs via ``ANTHROPIC_ADAPTER`` -> call litellm ->
    translate the (streaming or non-streaming) response back to the Anthropic
    response format.
    """

    @staticmethod
    def _route_openai_thinking_to_responses_api_if_needed(
        completion_kwargs: Dict[str, Any],
        *,
        thinking: Optional[Dict[str, Any]],
    ) -> None:
        """
        When users call `litellm.anthropic.messages.*` with a non-Anthropic model and
        `thinking={"type": "enabled", ...}`, LiteLLM converts this into OpenAI
        `reasoning_effort`.
        For OpenAI models, Chat Completions typically does not return reasoning text
        (only token accounting). To return a thinking-like content block in the
        Anthropic response format, we route the request through OpenAI's Responses API
        and request a reasoning summary.

        Mutates ``completion_kwargs`` in place (``model`` and
        ``reasoning_effort`` keys); returns None.
        """
        custom_llm_provider = completion_kwargs.get("custom_llm_provider")
        if custom_llm_provider is None:
            # Provider not passed explicitly -- infer it from the model string.
            try:
                _, inferred_provider, _, _ = litellm.utils.get_llm_provider(
                    model=cast(str, completion_kwargs.get("model"))
                )
                custom_llm_provider = inferred_provider
            except Exception:
                custom_llm_provider = None
        # Only OpenAI requests with thinking explicitly enabled are re-routed.
        if custom_llm_provider != "openai":
            return
        if not isinstance(thinking, dict) or thinking.get("type") != "enabled":
            return
        model = completion_kwargs.get("model")
        try:
            model_info = get_model_info(
                model=cast(str, model), custom_llm_provider=custom_llm_provider
            )
            if model_info and model_info.get("supports_reasoning") is False:
                # Model doesn't support reasoning/responses API, don't route
                return
        except Exception:
            # Unknown model -- fall through and attempt routing anyway.
            pass
        if isinstance(model, str) and model and not model.startswith("responses/"):
            # Prefix model with "responses/" to route to OpenAI Responses API
            completion_kwargs["model"] = f"responses/{model}"
        reasoning_effort = completion_kwargs.get("reasoning_effort")
        if isinstance(reasoning_effort, str) and reasoning_effort:
            # Promote the plain effort string to the Responses API dict form
            # and request a detailed reasoning summary.
            completion_kwargs["reasoning_effort"] = {
                "effort": reasoning_effort,
                "summary": "detailed",
            }
        elif isinstance(reasoning_effort, dict):
            if (
                "summary" not in reasoning_effort
                and "generate_summary" not in reasoning_effort
            ):
                # Copy before modifying so the caller's dict is not mutated.
                updated_reasoning_effort = dict(reasoning_effort)
                updated_reasoning_effort["summary"] = "detailed"
                completion_kwargs["reasoning_effort"] = updated_reasoning_effort

    @staticmethod
    def _prepare_completion_kwargs(
        *,
        max_tokens: int,
        messages: List[Dict],
        model: str,
        metadata: Optional[Dict] = None,
        stop_sequences: Optional[List[str]] = None,
        stream: Optional[bool] = False,
        system: Optional[str] = None,
        temperature: Optional[float] = None,
        thinking: Optional[Dict] = None,
        tool_choice: Optional[Dict] = None,
        tools: Optional[List[Dict]] = None,
        top_k: Optional[int] = None,
        top_p: Optional[float] = None,
        output_format: Optional[Dict] = None,
        extra_kwargs: Optional[Dict[str, Any]] = None,
    ) -> Tuple[Dict[str, Any], Dict[str, str]]:
        """Prepare kwargs for litellm.completion/acompletion.
        Returns:
            Tuple of (completion_kwargs, tool_name_mapping)
            - tool_name_mapping maps truncated tool names back to original names
              for tools that exceeded OpenAI's 64-char limit
        """
        from litellm.litellm_core_utils.litellm_logging import (
            Logging as LiteLLMLoggingObject,
        )
        # Assemble the Anthropic-shaped request; optional fields are included
        # only when set so the adapter sees exactly what the caller provided.
        request_data = {
            "model": model,
            "messages": messages,
            "max_tokens": max_tokens,
        }
        if metadata:
            request_data["metadata"] = metadata
        if stop_sequences:
            request_data["stop_sequences"] = stop_sequences
        if system:
            request_data["system"] = system
        if temperature is not None:
            request_data["temperature"] = temperature
        if thinking:
            request_data["thinking"] = thinking
        if tool_choice:
            request_data["tool_choice"] = tool_choice
        if tools:
            request_data["tools"] = tools
        if top_k is not None:
            request_data["top_k"] = top_k
        if top_p is not None:
            request_data["top_p"] = top_p
        if output_format:
            request_data["output_format"] = output_format
        # Translate Anthropic request -> OpenAI chat-completions request.
        (
            openai_request,
            tool_name_mapping,
        ) = ANTHROPIC_ADAPTER.translate_completion_input_params_with_tool_mapping(
            request_data
        )
        if openai_request is None:
            raise ValueError("Failed to translate request to OpenAI format")
        completion_kwargs: Dict[str, Any] = dict(openai_request)
        if stream:
            completion_kwargs["stream"] = stream
            # Usage must be included in the final stream chunk so it can be
            # mapped back into the Anthropic message_delta event.
            completion_kwargs["stream_options"] = {
                "include_usage": True,
            }
        # Pass through any remaining caller kwargs, except keys the adapter
        # owns and keys already produced by the translation above.
        excluded_keys = {"anthropic_messages"}
        extra_kwargs = extra_kwargs or {}
        for key, value in extra_kwargs.items():
            if (
                key == "litellm_logging_obj"
                and value is not None
                and isinstance(value, LiteLLMLoggingObject)
            ):
                # Re-tag the logging object as a completion call so downstream
                # logging matches the actual request being made.
                from litellm.types.utils import CallTypes

                setattr(value, "call_type", CallTypes.completion.value)
                setattr(
                    value, "stream_options", completion_kwargs.get("stream_options")
                )
            if (
                key not in excluded_keys
                and key not in completion_kwargs
                and value is not None
            ):
                completion_kwargs[key] = value
        # May rewrite `model`/`reasoning_effort` in place (OpenAI + thinking).
        LiteLLMMessagesToCompletionTransformationHandler._route_openai_thinking_to_responses_api_if_needed(
            completion_kwargs,
            thinking=thinking,
        )
        return completion_kwargs, tool_name_mapping

    @staticmethod
    async def async_anthropic_messages_handler(
        max_tokens: int,
        messages: List[Dict],
        model: str,
        metadata: Optional[Dict] = None,
        stop_sequences: Optional[List[str]] = None,
        stream: Optional[bool] = False,
        system: Optional[str] = None,
        temperature: Optional[float] = None,
        thinking: Optional[Dict] = None,
        tool_choice: Optional[Dict] = None,
        tools: Optional[List[Dict]] = None,
        top_k: Optional[int] = None,
        top_p: Optional[float] = None,
        output_format: Optional[Dict] = None,
        **kwargs,
    ) -> Union[AnthropicMessagesResponse, AsyncIterator]:
        """Handle non-Anthropic models asynchronously using the adapter.

        Returns either an Anthropic-format response dict or, when
        ``stream=True``, an async iterator of Anthropic stream events.
        Raises ValueError if either translation step fails.
        """
        (
            completion_kwargs,
            tool_name_mapping,
        ) = LiteLLMMessagesToCompletionTransformationHandler._prepare_completion_kwargs(
            max_tokens=max_tokens,
            messages=messages,
            model=model,
            metadata=metadata,
            stop_sequences=stop_sequences,
            stream=stream,
            system=system,
            temperature=temperature,
            thinking=thinking,
            tool_choice=tool_choice,
            tools=tools,
            top_k=top_k,
            top_p=top_p,
            output_format=output_format,
            extra_kwargs=kwargs,
        )
        completion_response = await litellm.acompletion(**completion_kwargs)
        if stream:
            transformed_stream = (
                ANTHROPIC_ADAPTER.translate_completion_output_params_streaming(
                    completion_response,
                    model=model,
                    tool_name_mapping=tool_name_mapping,
                )
            )
            if transformed_stream is not None:
                return transformed_stream
            raise ValueError("Failed to transform streaming response")
        else:
            anthropic_response = ANTHROPIC_ADAPTER.translate_completion_output_params(
                cast(ModelResponse, completion_response),
                tool_name_mapping=tool_name_mapping,
            )
            if anthropic_response is not None:
                return anthropic_response
            raise ValueError("Failed to transform response to Anthropic format")

    @staticmethod
    def anthropic_messages_handler(
        max_tokens: int,
        messages: List[Dict],
        model: str,
        metadata: Optional[Dict] = None,
        stop_sequences: Optional[List[str]] = None,
        stream: Optional[bool] = False,
        system: Optional[str] = None,
        temperature: Optional[float] = None,
        thinking: Optional[Dict] = None,
        tool_choice: Optional[Dict] = None,
        tools: Optional[List[Dict]] = None,
        top_k: Optional[int] = None,
        top_p: Optional[float] = None,
        output_format: Optional[Dict] = None,
        _is_async: bool = False,
        **kwargs,
    ) -> Union[
        AnthropicMessagesResponse,
        AsyncIterator[Any],
        Coroutine[Any, Any, Union[AnthropicMessagesResponse, AsyncIterator[Any]]],
    ]:
        """Handle non-Anthropic models using the adapter.

        Synchronous entry point. With ``_is_async=True`` it returns the
        (un-awaited) coroutine from the async handler for the caller to await;
        otherwise it performs the request via ``litellm.completion``.
        """
        if _is_async is True:
            # Delegate to the async path; the caller awaits the coroutine.
            return LiteLLMMessagesToCompletionTransformationHandler.async_anthropic_messages_handler(
                max_tokens=max_tokens,
                messages=messages,
                model=model,
                metadata=metadata,
                stop_sequences=stop_sequences,
                stream=stream,
                system=system,
                temperature=temperature,
                thinking=thinking,
                tool_choice=tool_choice,
                tools=tools,
                top_k=top_k,
                top_p=top_p,
                output_format=output_format,
                **kwargs,
            )
        (
            completion_kwargs,
            tool_name_mapping,
        ) = LiteLLMMessagesToCompletionTransformationHandler._prepare_completion_kwargs(
            max_tokens=max_tokens,
            messages=messages,
            model=model,
            metadata=metadata,
            stop_sequences=stop_sequences,
            stream=stream,
            system=system,
            temperature=temperature,
            thinking=thinking,
            tool_choice=tool_choice,
            tools=tools,
            top_k=top_k,
            top_p=top_p,
            output_format=output_format,
            extra_kwargs=kwargs,
        )
        completion_response = litellm.completion(**completion_kwargs)
        if stream:
            transformed_stream = (
                ANTHROPIC_ADAPTER.translate_completion_output_params_streaming(
                    completion_response,
                    model=model,
                    tool_name_mapping=tool_name_mapping,
                )
            )
            if transformed_stream is not None:
                return transformed_stream
            raise ValueError("Failed to transform streaming response")
        else:
            anthropic_response = ANTHROPIC_ADAPTER.translate_completion_output_params(
                cast(ModelResponse, completion_response),
                tool_name_mapping=tool_name_mapping,
            )
            if anthropic_response is not None:
                return anthropic_response
            raise ValueError("Failed to transform response to Anthropic format")

View File

@@ -0,0 +1,488 @@
# What is this?
## Translates OpenAI call to Anthropic `/v1/messages` format
import json
import traceback
from collections import deque
from typing import TYPE_CHECKING, Any, AsyncIterator, Dict, Iterator, Literal, Optional
from litellm import verbose_logger
from litellm._uuid import uuid
from litellm.types.llms.anthropic import UsageDelta
from litellm.types.utils import AdapterCompletionStreamWrapper
if TYPE_CHECKING:
from litellm.types.utils import ModelResponseStream
class AnthropicStreamWrapper(AdapterCompletionStreamWrapper):
    """
    - first chunk return 'message_start'
    - content block must be started and stopped
    - finish_reason must map exactly to anthropic reason, else anthropic client won't be able to parse it.

    Wraps an OpenAI-format completion stream and re-emits it as Anthropic
    Messages stream events, buffering multi-event sequences in a per-instance
    queue.
    """

    from litellm.types.llms.anthropic import (
        ContentBlockContentBlockDict,
        ContentBlockStart,
        ContentBlockStartText,
        TextBlock,
    )

    # Per-stream state. Declared here for type checkers only; all values are
    # assigned in __init__. Fix: the previous class-level `deque()` and
    # `TextBlock(...)` defaults were mutable objects shared by EVERY wrapper
    # instance, so chunks queued by one stream could leak into another.
    sent_first_chunk: bool
    sent_content_block_start: bool
    sent_content_block_finish: bool
    current_content_block_type: Literal["text", "tool_use", "thinking"]
    sent_last_message: bool
    holding_chunk: Optional[Any]
    holding_stop_reason_chunk: Optional[Any]
    queued_usage_chunk: bool
    current_content_block_index: int
    current_content_block_start: ContentBlockContentBlockDict
    chunk_queue: deque

    def __init__(
        self,
        completion_stream: Any,
        model: str,
        tool_name_mapping: Optional[Dict[str, str]] = None,
    ):
        from litellm.types.llms.anthropic import TextBlock

        super().__init__(completion_stream)
        self.model = model
        # Mapping of truncated tool names to original names (for OpenAI's 64-char limit)
        self.tool_name_mapping = tool_name_mapping or {}
        # Initialize all per-stream mutable state here (not at class level)
        # so independent streams never share a queue or content block.
        self.sent_first_chunk = False
        self.sent_content_block_start = False
        self.sent_content_block_finish = False
        self.current_content_block_type = "text"
        self.sent_last_message = False
        self.holding_chunk = None
        self.holding_stop_reason_chunk = None
        self.queued_usage_chunk = False
        self.current_content_block_index = 0
        self.current_content_block_start = TextBlock(type="text", text="")
        self.chunk_queue = deque()  # Queue for buffering multiple chunks

    def _create_initial_usage_delta(self) -> UsageDelta:
        """
        Create the initial UsageDelta for the message_start event.
        Initializes cache token fields (cache_creation_input_tokens, cache_read_input_tokens)
        to 0 to indicate to clients (like Claude Code) that prompt caching is supported.
        The actual cache token values will be provided in the message_delta event at the
        end of the stream, since Bedrock Converse API only returns usage data in the final
        response chunk.
        Returns:
            UsageDelta with all token counts initialized to 0.
        """
        return UsageDelta(
            input_tokens=0,
            output_tokens=0,
            cache_creation_input_tokens=0,
            cache_read_input_tokens=0,
        )

    def __next__(self):
        from .transformation import LiteLLMAnthropicMessagesAdapter

        try:
            # Always return queued chunks first
            if self.chunk_queue:
                return self.chunk_queue.popleft()
            # Queue initial chunks if not sent yet
            if self.sent_first_chunk is False:
                self.sent_first_chunk = True
                self.chunk_queue.append(
                    {
                        "type": "message_start",
                        "message": {
                            "id": "msg_{}".format(uuid.uuid4()),
                            "type": "message",
                            "role": "assistant",
                            "content": [],
                            "model": self.model,
                            "stop_reason": None,
                            "stop_sequence": None,
                            "usage": self._create_initial_usage_delta(),
                        },
                    }
                )
                return self.chunk_queue.popleft()
            if self.sent_content_block_start is False:
                self.sent_content_block_start = True
                self.chunk_queue.append(
                    {
                        "type": "content_block_start",
                        "index": self.current_content_block_index,
                        "content_block": {"type": "text", "text": ""},
                    }
                )
                return self.chunk_queue.popleft()
            for chunk in self.completion_stream:
                if chunk == "None" or chunk is None:
                    raise Exception
                should_start_new_block = self._should_start_new_content_block(chunk)
                if should_start_new_block:
                    self._increment_content_block_index()
                processed_chunk = LiteLLMAnthropicMessagesAdapter().translate_streaming_openai_response_to_anthropic(
                    response=chunk,
                    current_content_block_index=self.current_content_block_index,
                )
                if should_start_new_block and not self.sent_content_block_finish:
                    # Queue the sequence: content_block_stop -> content_block_start
                    # The trigger chunk itself is not emitted as a delta since the
                    # content_block_start already carries the relevant information.
                    self.chunk_queue.append(
                        {
                            "type": "content_block_stop",
                            "index": max(self.current_content_block_index - 1, 0),
                        }
                    )
                    self.chunk_queue.append(
                        {
                            "type": "content_block_start",
                            "index": self.current_content_block_index,
                            "content_block": self.current_content_block_start,
                        }
                    )
                    self.sent_content_block_finish = False
                    return self.chunk_queue.popleft()
                if (
                    processed_chunk["type"] == "message_delta"
                    and self.sent_content_block_finish is False
                ):
                    # Queue both the content_block_stop and the message_delta
                    self.chunk_queue.append(
                        {
                            "type": "content_block_stop",
                            "index": self.current_content_block_index,
                        }
                    )
                    self.sent_content_block_finish = True
                    self.chunk_queue.append(processed_chunk)
                    return self.chunk_queue.popleft()
                elif self.holding_chunk is not None:
                    self.chunk_queue.append(self.holding_chunk)
                    self.chunk_queue.append(processed_chunk)
                    self.holding_chunk = None
                    return self.chunk_queue.popleft()
                else:
                    self.chunk_queue.append(processed_chunk)
                    return self.chunk_queue.popleft()
            # Handle any remaining held chunks after stream ends
            if self.holding_chunk is not None:
                self.chunk_queue.append(self.holding_chunk)
                self.holding_chunk = None
            if not self.sent_last_message:
                self.sent_last_message = True
                self.chunk_queue.append({"type": "message_stop"})
            if self.chunk_queue:
                return self.chunk_queue.popleft()
            raise StopIteration
        except StopIteration:
            if self.chunk_queue:
                return self.chunk_queue.popleft()
            if self.sent_last_message is False:
                self.sent_last_message = True
                return {"type": "message_stop"}
            raise StopIteration
        except Exception as e:
            verbose_logger.error(
                "Anthropic Adapter - {}\n{}".format(e, traceback.format_exc())
            )
            # Fix: a synchronous iterator must end with StopIteration.
            # Raising StopAsyncIteration here escaped `for` loops as an error.
            raise StopIteration

    async def __anext__(self):  # noqa: PLR0915
        from .transformation import LiteLLMAnthropicMessagesAdapter

        try:
            # Always return queued chunks first
            if self.chunk_queue:
                return self.chunk_queue.popleft()
            # Queue initial chunks if not sent yet
            if self.sent_first_chunk is False:
                self.sent_first_chunk = True
                self.chunk_queue.append(
                    {
                        "type": "message_start",
                        "message": {
                            "id": "msg_{}".format(uuid.uuid4()),
                            "type": "message",
                            "role": "assistant",
                            "content": [],
                            "model": self.model,
                            "stop_reason": None,
                            "stop_sequence": None,
                            "usage": self._create_initial_usage_delta(),
                        },
                    }
                )
                return self.chunk_queue.popleft()
            if self.sent_content_block_start is False:
                self.sent_content_block_start = True
                self.chunk_queue.append(
                    {
                        "type": "content_block_start",
                        "index": self.current_content_block_index,
                        "content_block": {"type": "text", "text": ""},
                    }
                )
                return self.chunk_queue.popleft()
            async for chunk in self.completion_stream:
                if chunk == "None" or chunk is None:
                    raise Exception
                # Check if we need to start a new content block
                should_start_new_block = self._should_start_new_content_block(chunk)
                if should_start_new_block:
                    self._increment_content_block_index()
                processed_chunk = LiteLLMAnthropicMessagesAdapter().translate_streaming_openai_response_to_anthropic(
                    response=chunk,
                    current_content_block_index=self.current_content_block_index,
                )
                # Check if this is a usage chunk and we have a held stop_reason chunk
                if (
                    self.holding_stop_reason_chunk is not None
                    and getattr(chunk, "usage", None) is not None
                ):
                    # Merge usage into the held stop_reason chunk
                    merged_chunk = self.holding_stop_reason_chunk.copy()
                    if "delta" not in merged_chunk:
                        merged_chunk["delta"] = {}
                    # Add usage to the held chunk; report only uncached input
                    # tokens as Anthropic `input_tokens`.
                    uncached_input_tokens = chunk.usage.prompt_tokens or 0
                    if (
                        hasattr(chunk.usage, "prompt_tokens_details")
                        and chunk.usage.prompt_tokens_details
                    ):
                        cached_tokens = (
                            getattr(
                                chunk.usage.prompt_tokens_details, "cached_tokens", 0
                            )
                            or 0
                        )
                        uncached_input_tokens -= cached_tokens
                    usage_dict: UsageDelta = {
                        "input_tokens": uncached_input_tokens,
                        "output_tokens": chunk.usage.completion_tokens or 0,
                    }
                    # Add cache tokens if available (for prompt caching support)
                    if (
                        hasattr(chunk.usage, "_cache_creation_input_tokens")
                        and chunk.usage._cache_creation_input_tokens > 0
                    ):
                        usage_dict[
                            "cache_creation_input_tokens"
                        ] = chunk.usage._cache_creation_input_tokens
                    if (
                        hasattr(chunk.usage, "_cache_read_input_tokens")
                        and chunk.usage._cache_read_input_tokens > 0
                    ):
                        usage_dict[
                            "cache_read_input_tokens"
                        ] = chunk.usage._cache_read_input_tokens
                    merged_chunk["usage"] = usage_dict
                    # Queue the merged chunk and reset
                    self.chunk_queue.append(merged_chunk)
                    self.queued_usage_chunk = True
                    self.holding_stop_reason_chunk = None
                    return self.chunk_queue.popleft()
                # Check if this processed chunk has a stop_reason - hold it for next chunk
                if not self.queued_usage_chunk:
                    if should_start_new_block and not self.sent_content_block_finish:
                        # Queue the sequence: content_block_stop -> content_block_start
                        # The trigger chunk itself is not emitted as a delta since the
                        # content_block_start already carries the relevant information.
                        # 1. Stop current content block
                        self.chunk_queue.append(
                            {
                                "type": "content_block_stop",
                                "index": max(self.current_content_block_index - 1, 0),
                            }
                        )
                        # 2. Start new content block
                        self.chunk_queue.append(
                            {
                                "type": "content_block_start",
                                "index": self.current_content_block_index,
                                "content_block": self.current_content_block_start,
                            }
                        )
                        # Reset state for new block
                        self.sent_content_block_finish = False
                        # Return the first queued item
                        return self.chunk_queue.popleft()
                    if (
                        processed_chunk["type"] == "message_delta"
                        and self.sent_content_block_finish is False
                    ):
                        # Queue both the content_block_stop and the holding chunk
                        self.chunk_queue.append(
                            {
                                "type": "content_block_stop",
                                "index": self.current_content_block_index,
                            }
                        )
                        self.sent_content_block_finish = True
                        if (
                            processed_chunk.get("delta", {}).get("stop_reason")
                            is not None
                        ):
                            self.holding_stop_reason_chunk = processed_chunk
                        else:
                            self.chunk_queue.append(processed_chunk)
                        return self.chunk_queue.popleft()
                    elif self.holding_chunk is not None:
                        # Queue both chunks
                        self.chunk_queue.append(self.holding_chunk)
                        self.chunk_queue.append(processed_chunk)
                        self.holding_chunk = None
                        return self.chunk_queue.popleft()
                    else:
                        # Queue the current chunk
                        self.chunk_queue.append(processed_chunk)
                        return self.chunk_queue.popleft()
            # Handle any remaining held chunks after stream ends
            if not self.queued_usage_chunk:
                if self.holding_stop_reason_chunk is not None:
                    self.chunk_queue.append(self.holding_stop_reason_chunk)
                    self.holding_stop_reason_chunk = None
                if self.holding_chunk is not None:
                    self.chunk_queue.append(self.holding_chunk)
                    self.holding_chunk = None
            if not self.sent_last_message:
                self.sent_last_message = True
                self.chunk_queue.append({"type": "message_stop"})
            # Return queued items if any
            if self.chunk_queue:
                return self.chunk_queue.popleft()
            raise StopIteration
        except StopIteration:
            # Handle any remaining queued chunks before stopping
            if self.chunk_queue:
                return self.chunk_queue.popleft()
            # Handle any held stop_reason chunk
            if self.holding_stop_reason_chunk is not None:
                return self.holding_stop_reason_chunk
            if not self.sent_last_message:
                self.sent_last_message = True
                return {"type": "message_stop"}
            raise StopAsyncIteration

    def anthropic_sse_wrapper(self) -> Iterator[bytes]:
        """
        Convert AnthropicStreamWrapper dict chunks to Server-Sent Events format.
        Similar to the Bedrock bedrock_sse_wrapper implementation.
        This wrapper ensures dict chunks are SSE formatted with both event and data lines.
        """
        for chunk in self:
            if isinstance(chunk, dict):
                event_type: str = str(chunk.get("type", "message"))
                payload = f"event: {event_type}\ndata: {json.dumps(chunk)}\n\n"
                yield payload.encode()
            else:
                # For non-dict chunks, forward the original value unchanged
                yield chunk

    async def async_anthropic_sse_wrapper(self) -> AsyncIterator[bytes]:
        """
        Async version of anthropic_sse_wrapper.
        Convert AnthropicStreamWrapper dict chunks to Server-Sent Events format.
        """
        async for chunk in self:
            if isinstance(chunk, dict):
                event_type: str = str(chunk.get("type", "message"))
                payload = f"event: {event_type}\ndata: {json.dumps(chunk)}\n\n"
                yield payload.encode()
            else:
                # For non-dict chunks, forward the original value unchanged
                yield chunk

    def _increment_content_block_index(self):
        self.current_content_block_index += 1

    def _should_start_new_content_block(self, chunk: "ModelResponseStream") -> bool:
        """
        Determine if we should start a new content block based on the processed chunk.
        Override this method with your specific logic for detecting new content blocks.
        Examples of when you might want to start a new content block:
        - Switching from text to tool calls
        - Different content types in the response
        - Specific markers in the content
        """
        from .transformation import LiteLLMAnthropicMessagesAdapter

        # Example logic - customize based on your needs:
        # If chunk indicates a tool call
        if chunk.choices[0].finish_reason is not None:
            return False
        (
            block_type,
            content_block_start,
        ) = LiteLLMAnthropicMessagesAdapter()._translate_streaming_openai_chunk_to_anthropic_content_block(
            choices=chunk.choices  # type: ignore
        )
        # Restore original tool name if it was truncated for OpenAI's 64-char limit
        if block_type == "tool_use":
            # Type narrowing: content_block_start is ToolUseBlock when block_type is "tool_use"
            from typing import cast

            from litellm.types.llms.anthropic import ToolUseBlock

            tool_block = cast(ToolUseBlock, content_block_start)
            if tool_block.get("name"):
                truncated_name = tool_block["name"]
                original_name = self.tool_name_mapping.get(
                    truncated_name, truncated_name
                )
                tool_block["name"] = original_name
        if block_type != self.current_content_block_type:
            self.current_content_block_type = block_type
            self.current_content_block_start = content_block_start
            return True
        # For parallel tool calls, we'll necessarily have a new content block
        # if we get a function name since it signals a new tool call
        if block_type == "tool_use":
            from typing import cast

            from litellm.types.llms.anthropic import ToolUseBlock

            tool_block = cast(ToolUseBlock, content_block_start)
            if tool_block.get("name"):
                self.current_content_block_type = block_type
                self.current_content_block_start = content_block_start
                return True
        return False

View File

@@ -0,0 +1,51 @@
# Anthropic Messages Pass-Through Architecture
## Request Flow
```mermaid
flowchart TD
A[litellm.anthropic.messages.acreate] --> B{Provider?}
B -->|anthropic| C[AnthropicMessagesConfig]
B -->|azure_ai| D[AzureAnthropicMessagesConfig]
B -->|bedrock invoke| E[BedrockAnthropicMessagesConfig]
B -->|vertex_ai| F[VertexAnthropicMessagesConfig]
B -->|Other providers| G[LiteLLMAnthropicMessagesAdapter]
C --> H[Direct Anthropic API]
D --> I[Azure AI Foundry API]
E --> J[Bedrock Invoke API]
F --> K[Vertex AI API]
G --> L[translate_anthropic_to_openai]
L --> M[litellm.completion]
M --> N[Provider API]
N --> O[translate_openai_response_to_anthropic]
O --> P[Anthropic Response Format]
H --> P
I --> P
J --> P
K --> P
```
## Adapter Flow (Non-Native Providers)
```mermaid
sequenceDiagram
participant User
participant Handler as anthropic_messages_handler
participant Adapter as LiteLLMAnthropicMessagesAdapter
participant LiteLLM as litellm.completion
participant Provider as Provider API
User->>Handler: Anthropic Messages Request
Handler->>Adapter: translate_anthropic_to_openai()
Note over Adapter: messages, tools, thinking,<br/>output_format → response_format
Adapter->>LiteLLM: OpenAI Format Request
LiteLLM->>Provider: Provider-specific Request
Provider->>LiteLLM: Provider Response
LiteLLM->>Adapter: OpenAI Format Response
Adapter->>Handler: translate_openai_response_to_anthropic()
Handler->>User: Anthropic Messages Response
```

View File

@@ -0,0 +1,251 @@
"""
Fake Streaming Iterator for Anthropic Messages
This module provides a fake streaming iterator that converts non-streaming
Anthropic Messages responses into proper streaming format.
Used when WebSearch interception converts stream=True to stream=False but
the LLM doesn't make a tool call, and we need to return a stream to the user.
"""
import json
from typing import Any, Dict, List, cast
from litellm.types.llms.anthropic_messages.anthropic_response import (
AnthropicMessagesResponse,
)
class FakeAnthropicMessagesStreamIterator:
    """
    Fake streaming iterator for Anthropic Messages responses.
    Used when we need to convert a non-streaming response to a streaming format,
    such as when WebSearch interception converts stream=True to stream=False but
    the LLM doesn't make a tool call.
    This creates a proper Anthropic-style streaming response with multiple events:
    - message_start
    - content_block_start (for each content block)
    - content_block_delta (for text content, chunked)
    - content_block_stop
    - message_delta (for usage)
    - message_stop
    """

    # Forward-reference annotation so the class does not need the litellm
    # response type at class-definition time (still resolvable by checkers).
    def __init__(self, response: "AnthropicMessagesResponse"):
        self.response = response
        self.chunks = self._create_streaming_chunks()
        self.current_index = 0

    @staticmethod
    def _sse(event_type: str, payload: Dict[str, Any]) -> bytes:
        """Encode one payload as an SSE frame: `event: <type>` + `data: <json>`."""
        return f"event: {event_type}\ndata: {json.dumps(payload)}\n\n".encode()

    def _content_block_events(self, index: int, block: Any) -> List[bytes]:
        """Return the start/delta/stop SSE events for one content block.

        Unknown block types yield no events (unchanged from prior behavior).
        """
        block_dict = cast(Dict[str, Any], block)
        block_type = block_dict.get("type")
        if block_type == "text":
            return self._text_block_events(index, block_dict)
        if block_type == "thinking":
            return self._thinking_block_events(index, block_dict)
        if block_type == "redacted_thinking":
            return self._redacted_thinking_block_events(index)
        if block_type == "tool_use":
            return self._tool_use_block_events(index, block_dict)
        return []

    def _text_block_events(self, index: int, block_dict: Dict[str, Any]) -> List[bytes]:
        """Text block: start, full text as a single text_delta, stop."""
        return [
            self._sse(
                "content_block_start",
                {
                    "type": "content_block_start",
                    "index": index,
                    "content_block": {"type": "text", "text": ""},
                },
            ),
            self._sse(
                "content_block_delta",
                {
                    "type": "content_block_delta",
                    "index": index,
                    "delta": {"type": "text_delta", "text": block_dict.get("text", "")},
                },
            ),
            self._sse(
                "content_block_stop", {"type": "content_block_stop", "index": index}
            ),
        ]

    def _thinking_block_events(
        self, index: int, block_dict: Dict[str, Any]
    ) -> List[bytes]:
        """Thinking block: start, optional thinking/signature deltas, stop."""
        events = [
            self._sse(
                "content_block_start",
                {
                    "type": "content_block_start",
                    "index": index,
                    "content_block": {
                        "type": "thinking",
                        "thinking": "",
                        "signature": "",
                    },
                },
            )
        ]
        thinking_text = block_dict.get("thinking", "")
        if thinking_text:
            events.append(
                self._sse(
                    "content_block_delta",
                    {
                        "type": "content_block_delta",
                        "index": index,
                        "delta": {"type": "thinking_delta", "thinking": thinking_text},
                    },
                )
            )
        signature = block_dict.get("signature", "")
        if signature:
            events.append(
                self._sse(
                    "content_block_delta",
                    {
                        "type": "content_block_delta",
                        "index": index,
                        "delta": {"type": "signature_delta", "signature": signature},
                    },
                )
            )
        events.append(
            self._sse(
                "content_block_stop", {"type": "content_block_stop", "index": index}
            )
        )
        return events

    def _redacted_thinking_block_events(self, index: int) -> List[bytes]:
        """Redacted thinking block: start and stop only (no delta)."""
        return [
            self._sse(
                "content_block_start",
                {
                    "type": "content_block_start",
                    "index": index,
                    "content_block": {"type": "redacted_thinking"},
                },
            ),
            self._sse(
                "content_block_stop", {"type": "content_block_stop", "index": index}
            ),
        ]

    def _tool_use_block_events(
        self, index: int, block_dict: Dict[str, Any]
    ) -> List[bytes]:
        """Tool-use block: start, full input as one input_json_delta, stop."""
        return [
            self._sse(
                "content_block_start",
                {
                    "type": "content_block_start",
                    "index": index,
                    "content_block": {
                        "type": "tool_use",
                        "id": block_dict.get("id"),
                        "name": block_dict.get("name"),
                        "input": {},
                    },
                },
            ),
            self._sse(
                "content_block_delta",
                {
                    "type": "content_block_delta",
                    "index": index,
                    "delta": {
                        "type": "input_json_delta",
                        "partial_json": json.dumps(block_dict.get("input", {})),
                    },
                },
            ),
            self._sse(
                "content_block_stop", {"type": "content_block_stop", "index": index}
            ),
        ]

    def _create_streaming_chunks(self) -> List[bytes]:
        """Convert the non-streaming response to the full list of SSE chunks."""
        response_dict = cast(Dict[str, Any], self.response)
        usage = response_dict.get("usage", {})
        # 1. message_start event (output_tokens deliberately 0 here; the real
        # count is delivered in the trailing message_delta event).
        chunks: List[bytes] = [
            self._sse(
                "message_start",
                {
                    "type": "message_start",
                    "message": {
                        "id": response_dict.get("id"),
                        "type": "message",
                        "role": response_dict.get("role", "assistant"),
                        "model": response_dict.get("model"),
                        "content": [],
                        "stop_reason": None,
                        "stop_sequence": None,
                        "usage": {
                            "input_tokens": usage.get("input_tokens", 0) if usage else 0,
                            "output_tokens": 0,
                        },
                    },
                },
            )
        ]
        # 2-4. start/delta/stop events for each content block
        for index, block in enumerate(response_dict.get("content", []) or []):
            chunks.extend(self._content_block_events(index, block))
        # 5. message_delta event (with final usage and stop_reason)
        chunks.append(
            self._sse(
                "message_delta",
                {
                    "type": "message_delta",
                    "delta": {
                        "stop_reason": response_dict.get("stop_reason"),
                        "stop_sequence": response_dict.get("stop_sequence"),
                    },
                    "usage": {
                        "output_tokens": usage.get("output_tokens", 0) if usage else 0
                    },
                },
            )
        )
        # 6. message_stop event
        chunks.append(
            self._sse("message_stop", {"type": "message_stop", "usage": usage if usage else {}})
        )
        return chunks

    def __aiter__(self):
        return self

    async def __anext__(self):
        if self.current_index >= len(self.chunks):
            raise StopAsyncIteration
        chunk = self.chunks[self.current_index]
        self.current_index += 1
        return chunk

    def __iter__(self):
        return self

    def __next__(self):
        if self.current_index >= len(self.chunks):
            raise StopIteration
        chunk = self.chunks[self.current_index]
        self.current_index += 1
        return chunk

View File

@@ -0,0 +1,362 @@
"""
- call /messages on Anthropic API
- Make streaming + non-streaming request - just pass it through direct to Anthropic. No need to do anything special here
- Ensure requests are logged in the DB - stream + non-stream
"""
import asyncio
import contextvars
from functools import partial
from typing import Any, AsyncIterator, Coroutine, Dict, List, Optional, Union
import litellm
from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj
from litellm.llms.base_llm.anthropic_messages.transformation import (
BaseAnthropicMessagesConfig,
)
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler
from litellm.llms.custom_httpx.llm_http_handler import BaseLLMHTTPHandler
from litellm.types.llms.anthropic_messages.anthropic_request import AnthropicMetadata
from litellm.types.llms.anthropic_messages.anthropic_response import (
AnthropicMessagesResponse,
)
from litellm.types.router import GenericLiteLLMParams
from litellm.utils import ProviderConfigManager, client
from ..adapters.handler import LiteLLMMessagesToCompletionTransformationHandler
from ..responses_adapters.handler import LiteLLMMessagesToResponsesAPIHandler
from .utils import AnthropicMessagesRequestUtils, mock_response
# Providers that are routed directly to the OpenAI Responses API instead of
# going through chat/completions.
_RESPONSES_API_PROVIDERS = frozenset({"openai"})


def _should_route_to_responses_api(custom_llm_provider: Optional[str]) -> bool:
    """Return True when the provider should use the Responses API path.

    Set ``litellm.use_chat_completions_url_for_anthropic_messages = True`` to
    opt out and route OpenAI/Azure requests through chat/completions instead.
    """
    opted_out = litellm.use_chat_completions_url_for_anthropic_messages
    return (not opted_out) and (custom_llm_provider in _RESPONSES_API_PROVIDERS)
####### ENVIRONMENT VARIABLES ###################
# Initialize any necessary instances or variables here
# Shared HTTP handler used for direct (non-adapted) /v1/messages provider calls.
base_llm_http_handler = BaseLLMHTTPHandler()
#################################################
async def _execute_pre_request_hooks(
    model: str,
    messages: List[Dict],
    tools: Optional[List[Dict]],
    stream: Optional[bool],
    custom_llm_provider: Optional[str],
    **kwargs,
) -> Dict:
    """
    Run CustomLogger pre-request hooks and return the (possibly modified)
    request parameters.

    Lets CustomLoggers rewrite request params before the API call — e.g.
    WebSearch tool conversion or stream-flag modification.

    Args:
        model: Model name.
        messages: List of messages.
        tools: Optional tools list.
        stream: Optional stream flag.
        custom_llm_provider: Provider name (derived from model when absent).
        **kwargs: Additional request parameters.

    Returns:
        Dict with all (potentially modified) request parameters, including
        ``tools`` and ``stream``.
    """
    # Derive the provider from the model name when it was not supplied.
    if not custom_llm_provider:
        try:
            _, custom_llm_provider, _, _ = litellm.get_llm_provider(model=model)
        except Exception:
            # Best-effort: proceed without a provider if detection fails.
            pass

    # Assemble the complete kwargs dict that hooks are allowed to rewrite.
    request_kwargs: Dict = {
        "tools": tools,
        "stream": stream,
        "litellm_params": {"custom_llm_provider": custom_llm_provider},
        **kwargs,
    }

    if not litellm.callbacks:
        return request_kwargs

    # Imported lazily so the hook machinery is only loaded when callbacks exist.
    from litellm.integrations.custom_logger import CustomLogger as _CustomLogger

    for cb in litellm.callbacks:
        if not isinstance(cb, _CustomLogger):
            continue
        updated = await cb.async_pre_request_hook(model, messages, request_kwargs)
        # A hook may return None to signal "no changes".
        if updated is not None:
            request_kwargs = updated

    return request_kwargs
@client
async def anthropic_messages(
    max_tokens: int,
    messages: List[Dict],
    model: str,
    metadata: Optional[Dict] = None,
    stop_sequences: Optional[List[str]] = None,
    stream: Optional[bool] = False,
    system: Optional[str] = None,
    temperature: Optional[float] = None,
    thinking: Optional[Dict] = None,
    tool_choice: Optional[Dict] = None,
    tools: Optional[List[Dict]] = None,
    top_k: Optional[int] = None,
    top_p: Optional[float] = None,
    api_key: Optional[str] = None,
    api_base: Optional[str] = None,
    client: Optional[AsyncHTTPHandler] = None,
    custom_llm_provider: Optional[str] = None,
    **kwargs,
) -> Union[AnthropicMessagesResponse, AsyncIterator]:
    """
    Async: Make llm api request in Anthropic /messages API spec.

    Runs CustomLogger pre-request hooks first (which may rewrite ``tools`` and
    ``stream``), then dispatches the sync handler on a thread-pool executor with
    ``is_async=True`` so the handler returns a coroutine we can await.
    """
    # Execute pre-request hooks to allow CustomLoggers to modify the request.
    request_kwargs = await _execute_pre_request_hooks(
        model=model,
        messages=messages,
        tools=tools,
        stream=stream,
        custom_llm_provider=custom_llm_provider,
        **kwargs,
    )
    # Extract the (possibly hook-modified) parameters back out.
    tools = request_kwargs.pop("tools", tools)
    stream = request_kwargs.pop("stream", stream)
    # Remove litellm_params from kwargs (only needed for hooks).
    request_kwargs.pop("litellm_params", None)
    # Merge back any other hook modifications into the passthrough kwargs.
    kwargs.update(request_kwargs)
    loop = asyncio.get_event_loop()
    # Tell the handler to take its async code path.
    kwargs["is_async"] = True
    func = partial(
        anthropic_messages_handler,
        max_tokens=max_tokens,
        messages=messages,
        model=model,
        metadata=metadata,
        stop_sequences=stop_sequences,
        stream=stream,
        system=system,
        temperature=temperature,
        thinking=thinking,
        tool_choice=tool_choice,
        tools=tools,
        top_k=top_k,
        top_p=top_p,
        api_key=api_key,
        api_base=api_base,
        client=client,
        custom_llm_provider=custom_llm_provider,
        **kwargs,
    )
    # Copy the current context so contextvars (logging metadata, etc.)
    # survive the hop onto the executor thread.
    ctx = contextvars.copy_context()
    func_with_context = partial(ctx.run, func)
    init_response = await loop.run_in_executor(None, func_with_context)
    # The handler may return a coroutine (async path) or a concrete result.
    if asyncio.iscoroutine(init_response):
        response = await init_response
    else:
        response = init_response
    return response
def validate_anthropic_api_metadata(metadata: Optional[Dict] = None) -> Optional[Dict]:
    """
    Validate Anthropic API metadata, keeping only fields the Anthropic API
    accepts.

    LiteLLM-specific metadata should be passed under the ``litellm_metadata``
    key instead of ``metadata``.
    """
    if metadata is None:
        return None
    # AnthropicMetadata acts as the validator; dump back to a plain dict,
    # dropping unset/None fields.
    return AnthropicMetadata(**metadata).model_dump(exclude_none=True)
def anthropic_messages_handler(
    max_tokens: int,
    messages: List[Dict],
    model: str,
    metadata: Optional[Dict] = None,
    stop_sequences: Optional[List[str]] = None,
    stream: Optional[bool] = False,
    system: Optional[str] = None,
    temperature: Optional[float] = None,
    thinking: Optional[Dict] = None,
    tool_choice: Optional[Dict] = None,
    tools: Optional[List[Dict]] = None,
    top_k: Optional[int] = None,
    top_p: Optional[float] = None,
    container: Optional[Dict] = None,
    api_key: Optional[str] = None,
    api_base: Optional[str] = None,
    client: Optional[AsyncHTTPHandler] = None,
    custom_llm_provider: Optional[str] = None,
    **kwargs,
) -> Union[
    AnthropicMessagesResponse,
    AsyncIterator[Any],
    Coroutine[Any, Any, Union[AnthropicMessagesResponse, AsyncIterator[Any]]],
]:
    """
    Makes Anthropic `/v1/messages` API calls in the Anthropic API spec.

    Routing:
      - providers with a native /v1/messages config -> direct passthrough via
        ``base_llm_http_handler``
      - otherwise OpenAI -> Responses API adapter; everything else -> the
        chat/completions adapter.

    Args:
        container: Container config with skills for code execution.
    """
    from litellm.types.utils import LlmProviders

    metadata = validate_anthropic_api_metadata(metadata)
    # Snapshot of all named params; used later to collect optional request
    # params. NOTE(review): locals() is captured *before* get_llm_provider
    # rewrites `model`/`custom_llm_provider` — confirm that is intended.
    local_vars = locals()
    is_async = kwargs.pop("is_async", False)
    # Use provided client or create a new one
    litellm_logging_obj: LiteLLMLoggingObj = kwargs.get("litellm_logging_obj")  # type: ignore
    # Store original model name before get_llm_provider strips the provider prefix
    # This is needed by agentic hooks (e.g., websearch_interception) to make follow-up requests
    original_model = model
    litellm_params = GenericLiteLLMParams(
        **kwargs,
        api_key=api_key,
        api_base=api_base,
        custom_llm_provider=custom_llm_provider,
    )
    # Resolve the concrete model name, provider, and any dynamic credentials.
    (
        model,
        custom_llm_provider,
        dynamic_api_key,
        dynamic_api_base,
    ) = litellm.get_llm_provider(
        model=model,
        custom_llm_provider=custom_llm_provider,
        api_base=litellm_params.api_base,
        api_key=litellm_params.api_key,
    )
    # Store agentic loop params in logging object for agentic hooks
    # This provides original request context needed for follow-up calls
    if litellm_logging_obj is not None:
        litellm_logging_obj.model_call_details["agentic_loop_params"] = {
            "model": original_model,
            "custom_llm_provider": custom_llm_provider,
        }
        # Check if stream was converted for WebSearch interception
        # This is set in the async wrapper above when stream=True is converted
        # to stream=False. (Kept inside the None-guard: it writes to
        # litellm_logging_obj.)
        if kwargs.get("_websearch_interception_converted_stream", False):
            litellm_logging_obj.model_call_details[
                "websearch_interception_converted_stream"
            ] = True
    # Short-circuit for mock responses (used in tests / dry runs).
    if litellm_params.mock_response and isinstance(litellm_params.mock_response, str):
        return mock_response(
            model=model,
            messages=messages,
            max_tokens=max_tokens,
            mock_response=litellm_params.mock_response,
        )
    anthropic_messages_provider_config: Optional[BaseAnthropicMessagesConfig] = None
    if custom_llm_provider is not None and custom_llm_provider in [
        provider.value for provider in LlmProviders
    ]:
        anthropic_messages_provider_config = (
            ProviderConfigManager.get_provider_anthropic_messages_config(
                model=model,
                provider=litellm.LlmProviders(custom_llm_provider),
            )
        )
    if anthropic_messages_provider_config is None:
        # Route to Responses API for OpenAI / Azure, chat/completions for everything else.
        _shared_kwargs = dict(
            max_tokens=max_tokens,
            messages=messages,
            model=model,
            metadata=metadata,
            stop_sequences=stop_sequences,
            stream=stream,
            system=system,
            temperature=temperature,
            thinking=thinking,
            tool_choice=tool_choice,
            tools=tools,
            top_k=top_k,
            top_p=top_p,
            _is_async=is_async,
            api_key=api_key,
            api_base=api_base,
            client=client,
            custom_llm_provider=custom_llm_provider,
            **kwargs,
        )
        if _should_route_to_responses_api(custom_llm_provider):
            return LiteLLMMessagesToResponsesAPIHandler.anthropic_messages_handler(
                **_shared_kwargs
            )
        return (
            LiteLLMMessagesToCompletionTransformationHandler.anthropic_messages_handler(
                **_shared_kwargs
            )
        )
    if custom_llm_provider is None:
        raise ValueError(
            f"custom_llm_provider is required for Anthropic messages, passed in model={model}, custom_llm_provider={custom_llm_provider}"
        )
    # Fold runtime kwargs into the early locals() snapshot, then filter down to
    # the optional params the Anthropic API actually accepts.
    local_vars.update(kwargs)
    anthropic_messages_optional_request_params = (
        AnthropicMessagesRequestUtils.get_requested_anthropic_messages_optional_param(
            params=local_vars
        )
    )
    # Native passthrough: provider speaks /v1/messages directly.
    return base_llm_http_handler.anthropic_messages_handler(
        model=model,
        messages=messages,
        anthropic_messages_provider_config=anthropic_messages_provider_config,
        anthropic_messages_optional_request_params=dict(
            anthropic_messages_optional_request_params
        ),
        _is_async=is_async,
        client=client,
        custom_llm_provider=custom_llm_provider,
        litellm_params=litellm_params,
        logging_obj=litellm_logging_obj,
        api_key=api_key,
        api_base=api_base,
        stream=stream,
        kwargs=kwargs,
    )

View File

@@ -0,0 +1,108 @@
import asyncio
import json
from datetime import datetime
from typing import Any, AsyncIterator, List, Union
from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj
from litellm.proxy.pass_through_endpoints.success_handler import (
PassThroughEndpointLogging,
)
from litellm.types.passthrough_endpoints.pass_through_endpoints import EndpointType
from litellm.types.utils import GenericStreamingChunk, ModelResponseStream
# Shared passthrough success logger used by every streaming iterator below.
GLOBAL_PASS_THROUGH_SUCCESS_HANDLER_OBJ = PassThroughEndpointLogging()


class BaseAnthropicMessagesStreamingIterator:
    """
    Base class for Anthropic Messages streaming iterators that provides common logic
    for streaming response handling and logging.
    """

    def __init__(
        self,
        litellm_logging_obj: LiteLLMLoggingObj,
        request_body: dict,
    ):
        # Kept so the post-stream logging task has the original request context.
        self.litellm_logging_obj = litellm_logging_obj
        self.request_body = request_body
        # Lower bound of the logging window; captured at iterator creation.
        self.start_time = datetime.now()

    async def _handle_streaming_logging(self, collected_chunks: List[bytes]):
        """Handle the logging after all chunks have been collected."""
        from litellm.proxy.pass_through_endpoints.streaming_handler import (
            PassThroughStreamingHandler,
        )

        end_time = datetime.now()
        # Fire-and-forget: logging must never block or delay the response stream.
        asyncio.create_task(
            PassThroughStreamingHandler._route_streaming_logging_to_handler(
                litellm_logging_obj=self.litellm_logging_obj,
                passthrough_success_handler_obj=GLOBAL_PASS_THROUGH_SUCCESS_HANDLER_OBJ,
                url_route="/v1/messages",
                request_body=self.request_body or {},
                endpoint_type=EndpointType.ANTHROPIC,
                start_time=self.start_time,
                raw_bytes=collected_chunks,
                end_time=end_time,
            )
        )

    def get_async_streaming_response_iterator(
        self,
        httpx_response,
        request_body: dict,
        litellm_logging_obj: LiteLLMLoggingObj,
    ) -> AsyncIterator:
        """Helper function to handle Anthropic streaming responses using the existing logging handlers"""
        from litellm.proxy.pass_through_endpoints.streaming_handler import (
            PassThroughStreamingHandler,
        )

        # Use the existing streaming handler for Anthropic
        return PassThroughStreamingHandler.chunk_processor(
            response=httpx_response,
            request_body=request_body,
            litellm_logging_obj=litellm_logging_obj,
            endpoint_type=EndpointType.ANTHROPIC,
            start_time=self.start_time,
            passthrough_success_handler_obj=GLOBAL_PASS_THROUGH_SUCCESS_HANDLER_OBJ,
            url_route="/v1/messages",
        )

    def _convert_chunk_to_sse_format(self, chunk: Union[dict, Any]) -> bytes:
        """
        Convert a chunk to Server-Sent Events format.

        This method should be overridden by subclasses if they need custom
        chunk formatting logic.
        """
        if isinstance(chunk, dict):
            # SSE event name comes from the chunk's "type" field, defaulting
            # to "message" when absent.
            event_type: str = str(chunk.get("type", "message"))
            payload = f"event: {event_type}\n" f"data: {json.dumps(chunk)}\n\n"
            return payload.encode()
        else:
            # For non-dict chunks, return as is
            return chunk

    async def async_sse_wrapper(
        self,
        completion_stream: AsyncIterator[
            Union[bytes, GenericStreamingChunk, ModelResponseStream, dict]
        ],
    ) -> AsyncIterator[bytes]:
        """
        Generic async SSE wrapper that converts streaming chunks to SSE format
        and handles logging.

        This method provides the common logic for both Anthropic and Bedrock implementations.
        """
        collected_chunks = []
        async for chunk in completion_stream:
            encoded_chunk = self._convert_chunk_to_sse_format(chunk)
            # Buffer every emitted chunk so the full raw stream can be logged.
            collected_chunks.append(encoded_chunk)
            yield encoded_chunk
        # Handle logging after all chunks are processed
        await self._handle_streaming_logging(collected_chunks)

View File

@@ -0,0 +1,308 @@
from typing import Any, AsyncIterator, Dict, List, Optional, Tuple
import httpx
from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj
from litellm.litellm_core_utils.litellm_logging import verbose_logger
from litellm.llms.base_llm.anthropic_messages.transformation import (
BaseAnthropicMessagesConfig,
)
from litellm.types.llms.anthropic import (
ANTHROPIC_BETA_HEADER_VALUES,
AnthropicMessagesRequest,
)
from litellm.types.llms.anthropic_messages.anthropic_response import (
AnthropicMessagesResponse,
)
from litellm.types.llms.anthropic_tool_search import get_tool_search_beta_header
from litellm.types.router import GenericLiteLLMParams
from ...common_utils import (
AnthropicError,
AnthropicModelInfo,
optionally_handle_anthropic_oauth,
)
# Defaults used when the caller supplies no api_base / API version header.
DEFAULT_ANTHROPIC_API_BASE = "https://api.anthropic.com"
DEFAULT_ANTHROPIC_API_VERSION = "2023-06-01"


class AnthropicMessagesConfig(BaseAnthropicMessagesConfig):
    """
    Provider config for Anthropic's native /v1/messages endpoint.

    Requests and responses are already in the Anthropic spec, so the
    "transform" hooks below are mostly validation, billing-header filtering,
    and anthropic-beta header bookkeeping rather than format conversion.
    """

    def get_supported_anthropic_messages_params(self, model: str) -> list:
        """Return the /v1/messages request params this config forwards."""
        return [
            "messages",
            "model",
            "system",
            "max_tokens",
            "stop_sequences",
            "temperature",
            "top_p",
            "top_k",
            "tools",
            "tool_choice",
            "thinking",
            "context_management",
            "output_format",
            "inference_geo",
            "speed",
            "output_config",
            # TODO: Add Anthropic `metadata` support
            # "metadata",
        ]

    @staticmethod
    def _filter_billing_headers_from_system(system_param):
        """
        Filter out x-anthropic-billing-header metadata from system parameter.

        Args:
            system_param: Can be a string or a list of system message content blocks

        Returns:
            Filtered system parameter (string or list), or None if all content was filtered
        """
        if isinstance(system_param, str):
            # If it's a string and starts with billing header, filter it out
            if system_param.startswith("x-anthropic-billing-header:"):
                return None
            return system_param
        elif isinstance(system_param, list):
            # Filter list of system content blocks
            filtered_list = []
            for content_block in system_param:
                if isinstance(content_block, dict):
                    text = content_block.get("text", "")
                    content_type = content_block.get("type", "")
                    # Skip text blocks that start with billing header
                    if content_type == "text" and text.startswith(
                        "x-anthropic-billing-header:"
                    ):
                        continue
                    filtered_list.append(content_block)
                else:
                    # Keep non-dict items as-is
                    filtered_list.append(content_block)
            return filtered_list if len(filtered_list) > 0 else None
        else:
            # Unknown shape: pass through untouched.
            return system_param

    def get_complete_url(
        self,
        api_base: Optional[str],
        api_key: Optional[str],
        model: str,
        optional_params: dict,
        litellm_params: dict,
        stream: Optional[bool] = None,
    ) -> str:
        """Return the full /v1/messages URL, appending the path if missing."""
        api_base = api_base or DEFAULT_ANTHROPIC_API_BASE
        # NOTE(review): an api_base with a trailing slash would yield
        # ".../v1/messages" with a double slash — confirm callers normalize.
        if not api_base.endswith("/v1/messages"):
            api_base = f"{api_base}/v1/messages"
        return api_base

    def validate_anthropic_messages_environment(
        self,
        headers: dict,
        model: str,
        messages: List[Any],
        optional_params: dict,
        litellm_params: dict,
        api_key: Optional[str] = None,
        api_base: Optional[str] = None,
    ) -> Tuple[dict, Optional[str]]:
        """
        Fill in auth/version/content-type headers and beta headers.

        Returns the (headers, api_base) pair; api_base is passed through
        unchanged.
        """
        import os

        # Check for Anthropic OAuth token in Authorization header
        headers, api_key = optionally_handle_anthropic_oauth(
            headers=headers, api_key=api_key
        )

        if api_key is None:
            api_key = os.getenv("ANTHROPIC_API_KEY")
        # Only set x-api-key when no explicit auth header is already present.
        if "x-api-key" not in headers and "authorization" not in headers and api_key:
            headers["x-api-key"] = api_key
        if "anthropic-version" not in headers:
            headers["anthropic-version"] = DEFAULT_ANTHROPIC_API_VERSION
        if "content-type" not in headers:
            headers["content-type"] = "application/json"

        headers = self._update_headers_with_anthropic_beta(
            headers=headers,
            optional_params=optional_params,
        )
        return headers, api_base

    def transform_anthropic_messages_request(
        self,
        model: str,
        messages: List[Dict],
        anthropic_messages_optional_request_params: Dict,
        litellm_params: GenericLiteLLMParams,
        headers: dict,
    ) -> Dict:
        """
        No transformation is needed for Anthropic messages

        This takes in a request in the Anthropic /v1/messages API spec -> transforms it to /v1/messages API spec (i.e) no transformation is needed

        Note: mutates ``anthropic_messages_optional_request_params`` in place
        (pops max_tokens, may rewrite/remove system and context_management).
        """
        max_tokens = anthropic_messages_optional_request_params.pop("max_tokens", None)
        if max_tokens is None:
            raise AnthropicError(
                message="max_tokens is required for Anthropic /v1/messages API",
                status_code=400,
            )

        # Filter out x-anthropic-billing-header from system messages
        system_param = anthropic_messages_optional_request_params.get("system")
        if system_param is not None:
            filtered_system = self._filter_billing_headers_from_system(system_param)
            if filtered_system is not None and len(filtered_system) > 0:
                anthropic_messages_optional_request_params["system"] = filtered_system
            else:
                # Remove system parameter if all content was filtered out
                anthropic_messages_optional_request_params.pop("system", None)

        # Transform context_management from OpenAI format to Anthropic format if needed
        context_management_param = anthropic_messages_optional_request_params.get(
            "context_management"
        )
        if context_management_param is not None:
            from litellm.llms.anthropic.chat.transformation import AnthropicConfig

            transformed_context_management = (
                AnthropicConfig.map_openai_context_management_to_anthropic(
                    context_management_param
                )
            )
            if transformed_context_management is not None:
                anthropic_messages_optional_request_params[
                    "context_management"
                ] = transformed_context_management

        ####### get required params for all anthropic messages requests ######
        verbose_logger.debug(f"TRANSFORMATION DEBUG - Messages: {messages}")
        anthropic_messages_request: AnthropicMessagesRequest = AnthropicMessagesRequest(
            messages=messages,
            max_tokens=max_tokens,
            model=model,
            **anthropic_messages_optional_request_params,
        )
        return dict(anthropic_messages_request)

    def transform_anthropic_messages_response(
        self,
        model: str,
        raw_response: httpx.Response,
        logging_obj: LiteLLMLoggingObj,
    ) -> AnthropicMessagesResponse:
        """
        No transformation is needed for Anthropic messages, since we want the response in the Anthropic /v1/messages API spec
        """
        try:
            raw_response_json = raw_response.json()
        except Exception:
            # Non-JSON body (e.g. HTML error page) -> surface raw text + status.
            raise AnthropicError(
                message=raw_response.text, status_code=raw_response.status_code
            )
        return AnthropicMessagesResponse(**raw_response_json)

    def get_async_streaming_response_iterator(
        self,
        model: str,
        httpx_response: httpx.Response,
        request_body: dict,
        litellm_logging_obj: LiteLLMLoggingObj,
    ) -> AsyncIterator:
        """Helper function to handle Anthropic streaming responses using the existing logging handlers"""
        from litellm.llms.anthropic.experimental_pass_through.messages.streaming_iterator import (
            BaseAnthropicMessagesStreamingIterator,
        )

        # Use the shared streaming handler for Anthropic
        handler = BaseAnthropicMessagesStreamingIterator(
            litellm_logging_obj=litellm_logging_obj,
            request_body=request_body,
        )
        return handler.get_async_streaming_response_iterator(
            httpx_response=httpx_response,
            request_body=request_body,
            litellm_logging_obj=litellm_logging_obj,
        )

    @staticmethod
    def _update_headers_with_anthropic_beta(
        headers: dict,
        optional_params: dict,
        custom_llm_provider: str = "anthropic",
    ) -> dict:
        """
        Auto-inject anthropic-beta headers based on features used.

        Handles:
        - context_management: adds compact and/or context-management beta values
        - tool_search: adds provider-specific tool search header
        - output_format: adds the structured-outputs beta value
          (ANTHROPIC_BETA_HEADER_VALUES.STRUCTURED_OUTPUT_2025_09_25)
        - speed == "fast": adds the fast-mode beta value

        Existing anthropic-beta values supplied by the caller are preserved
        and merged (deduplicated, sorted).

        Args:
            headers: Request headers dict
            optional_params: Optional parameters including tools, context_management, output_format, speed
            custom_llm_provider: Provider name for looking up correct tool search header
        """
        beta_values: set = set()

        # Get existing beta headers if any
        existing_beta = headers.get("anthropic-beta")
        if existing_beta:
            beta_values.update(b.strip() for b in existing_beta.split(","))

        # Check for context management
        context_management_param = optional_params.get("context_management")
        if context_management_param is not None:
            # Check edits array for compact_20260112 type
            edits = context_management_param.get("edits", [])
            has_compact = False
            has_other = False
            for edit in edits:
                edit_type = edit.get("type", "")
                if edit_type == "compact_20260112":
                    has_compact = True
                else:
                    has_other = True
            # Add compact header if any compact edits exist
            if has_compact:
                beta_values.add(ANTHROPIC_BETA_HEADER_VALUES.COMPACT_2026_01_12.value)
            # Add context management header if any other edits exist
            if has_other:
                beta_values.add(
                    ANTHROPIC_BETA_HEADER_VALUES.CONTEXT_MANAGEMENT_2025_06_27.value
                )

        # Check for structured outputs
        if optional_params.get("output_format") is not None:
            beta_values.add(
                ANTHROPIC_BETA_HEADER_VALUES.STRUCTURED_OUTPUT_2025_09_25.value
            )

        # Check for fast mode
        if optional_params.get("speed") == "fast":
            beta_values.add(ANTHROPIC_BETA_HEADER_VALUES.FAST_MODE_2026_02_01.value)

        # Check for tool search tools
        tools = optional_params.get("tools")
        if tools:
            anthropic_model_info = AnthropicModelInfo()
            if anthropic_model_info.is_tool_search_used(tools):
                # Use provider-specific tool search header
                tool_search_header = get_tool_search_beta_header(custom_llm_provider)
                beta_values.add(tool_search_header)

        if beta_values:
            # Sorted join keeps the header value deterministic.
            headers["anthropic-beta"] = ",".join(sorted(beta_values))

        return headers

View File

@@ -0,0 +1,75 @@
from typing import Any, Dict, List, cast, get_type_hints
from litellm.types.llms.anthropic import AnthropicMessagesRequestOptionalParams
from litellm.types.llms.anthropic_messages.anthropic_response import (
AnthropicMessagesResponse,
)
class AnthropicMessagesRequestUtils:
    @staticmethod
    def get_requested_anthropic_messages_optional_param(
        params: Dict[str, Any],
    ) -> AnthropicMessagesRequestOptionalParams:
        """
        Keep only the entries of ``params`` that are declared fields of
        AnthropicMessagesRequestOptionalParams and are not None.

        Args:
            params: Dictionary of parameters to filter.

        Returns:
            AnthropicMessagesRequestOptionalParams containing only the
            recognized, non-None parameters.
        """
        allowed = get_type_hints(AnthropicMessagesRequestOptionalParams).keys()
        selected = {
            key: value
            for key, value in params.items()
            if value is not None and key in allowed
        }
        return cast(AnthropicMessagesRequestOptionalParams, selected)
def mock_response(
    model: str,
    messages: List[Dict],
    max_tokens: int,
    mock_response: str = "Hi! My name is Claude.",
    **kwargs,
) -> AnthropicMessagesResponse:
    """
    Build a canned Anthropic /v1/messages response, or raise a mock error
    when ``mock_response`` names one of the supported litellm exceptions.
    """
    from litellm.exceptions import (
        ContextWindowExceededError,
        InternalServerError,
        RateLimitError,
    )

    # Sentinel string -> (exception class, mock message) dispatch table.
    error_dispatch = {
        "litellm.InternalServerError": (
            InternalServerError,
            "this is a mock internal server error",
        ),
        "litellm.ContextWindowExceededError": (
            ContextWindowExceededError,
            "this is a mock context window exceeded error",
        ),
        "litellm.RateLimitError": (
            RateLimitError,
            "this is a mock rate limit error",
        ),
    }
    if mock_response in error_dispatch:
        exc_cls, exc_message = error_dispatch[mock_response]
        raise exc_cls(
            message=exc_message,
            llm_provider="anthropic",
            model=model,
        )

    return AnthropicMessagesResponse(
        **{
            "content": [{"text": mock_response, "type": "text"}],
            "id": "msg_013Zva2CMHLNnXjNJJKqJ2EF",
            "model": "claude-sonnet-4-20250514",
            "role": "assistant",
            "stop_reason": "end_turn",
            "stop_sequence": None,
            "type": "message",
            "usage": {"input_tokens": 2095, "output_tokens": 503},
        }
    )

View File

@@ -0,0 +1,3 @@
from .transformation import LiteLLMAnthropicToResponsesAPIAdapter
__all__ = ["LiteLLMAnthropicToResponsesAPIAdapter"]

View File

@@ -0,0 +1,239 @@
"""
Handler for the Anthropic v1/messages -> OpenAI Responses API path.
Used when the target model is an OpenAI or Azure model.
"""
from typing import Any, AsyncIterator, Coroutine, Dict, List, Optional, Union
import litellm
from litellm.types.llms.anthropic import AnthropicMessagesRequest
from litellm.types.llms.anthropic_messages.anthropic_response import (
AnthropicMessagesResponse,
)
from litellm.types.llms.openai import ResponsesAPIResponse
from .streaming_iterator import AnthropicResponsesStreamWrapper
from .transformation import LiteLLMAnthropicToResponsesAPIAdapter
# Module-level adapter instance; stateless, so safe to share across requests.
_ADAPTER = LiteLLMAnthropicToResponsesAPIAdapter()


def _build_responses_kwargs(
    *,
    max_tokens: int,
    messages: List[Dict],
    model: str,
    context_management: Optional[Dict] = None,
    metadata: Optional[Dict] = None,
    output_config: Optional[Dict] = None,
    stop_sequences: Optional[List[str]] = None,
    stream: Optional[bool] = False,
    system: Optional[str] = None,
    temperature: Optional[float] = None,
    thinking: Optional[Dict] = None,
    tool_choice: Optional[Dict] = None,
    tools: Optional[List[Dict]] = None,
    top_k: Optional[int] = None,
    top_p: Optional[float] = None,
    output_format: Optional[Dict] = None,
    extra_kwargs: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
    """
    Build the kwargs dict to pass directly to litellm.responses() / litellm.aresponses().

    NOTE(review): ``stop_sequences`` and ``top_k`` are accepted but never
    placed into the request — confirm the Responses API adapter intentionally
    drops them.
    """
    # Build a typed AnthropicMessagesRequest for the adapter
    request_data: Dict[str, Any] = {
        "model": model,
        "messages": messages,
        "max_tokens": max_tokens,
    }
    # Only include optional fields the caller actually set.
    if context_management:
        request_data["context_management"] = context_management
    if output_config:
        request_data["output_config"] = output_config
    if metadata:
        request_data["metadata"] = metadata
    if system:
        request_data["system"] = system
    if temperature is not None:
        request_data["temperature"] = temperature
    if thinking:
        request_data["thinking"] = thinking
    if tool_choice:
        request_data["tool_choice"] = tool_choice
    if tools:
        request_data["tools"] = tools
    if top_p is not None:
        request_data["top_p"] = top_p
    if output_format:
        request_data["output_format"] = output_format

    anthropic_request = AnthropicMessagesRequest(**request_data)  # type: ignore[typeddict-item]
    responses_kwargs = _ADAPTER.translate_request(anthropic_request)

    if stream:
        responses_kwargs["stream"] = True

    # Forward litellm-specific kwargs (api_key, api_base, logging obj, etc.)
    excluded = {"anthropic_messages"}
    for key, value in (extra_kwargs or {}).items():
        if key == "litellm_logging_obj" and value is not None:
            from litellm.litellm_core_utils.litellm_logging import (
                Logging as LiteLLMLoggingObject,
            )
            from litellm.types.utils import CallTypes

            if isinstance(value, LiteLLMLoggingObject):
                # Reclassify as acompletion so the success handler doesn't try to
                # validate the Responses API event as an AnthropicResponse.
                # (Mirrors the pattern used in LiteLLMMessagesToCompletionTransformationHandler.)
                setattr(value, "call_type", CallTypes.acompletion.value)
            responses_kwargs[key] = value
        elif key not in excluded and key not in responses_kwargs and value is not None:
            responses_kwargs[key] = value

    return responses_kwargs
class LiteLLMMessagesToResponsesAPIHandler:
    """
    Handles Anthropic /v1/messages requests for OpenAI / Azure models by
    calling litellm.responses() / litellm.aresponses() directly and translating
    the response back to Anthropic format.
    """

    @staticmethod
    async def async_anthropic_messages_handler(
        max_tokens: int,
        messages: List[Dict],
        model: str,
        context_management: Optional[Dict] = None,
        metadata: Optional[Dict] = None,
        output_config: Optional[Dict] = None,
        stop_sequences: Optional[List[str]] = None,
        stream: Optional[bool] = False,
        system: Optional[str] = None,
        temperature: Optional[float] = None,
        thinking: Optional[Dict] = None,
        tool_choice: Optional[Dict] = None,
        tools: Optional[List[Dict]] = None,
        top_k: Optional[int] = None,
        top_p: Optional[float] = None,
        output_format: Optional[Dict] = None,
        **kwargs,
    ) -> Union[AnthropicMessagesResponse, AsyncIterator]:
        """Async path: call litellm.aresponses() and translate the result."""
        responses_kwargs = _build_responses_kwargs(
            max_tokens=max_tokens,
            messages=messages,
            model=model,
            context_management=context_management,
            metadata=metadata,
            output_config=output_config,
            stop_sequences=stop_sequences,
            stream=stream,
            system=system,
            temperature=temperature,
            thinking=thinking,
            tool_choice=tool_choice,
            tools=tools,
            top_k=top_k,
            top_p=top_p,
            output_format=output_format,
            extra_kwargs=kwargs,
        )

        result = await litellm.aresponses(**responses_kwargs)

        if stream:
            # Wrap the Responses API event stream so callers receive
            # Anthropic-formatted SSE bytes.
            wrapper = AnthropicResponsesStreamWrapper(
                responses_stream=result, model=model
            )
            return wrapper.async_anthropic_sse_wrapper()

        if not isinstance(result, ResponsesAPIResponse):
            raise ValueError(f"Expected ResponsesAPIResponse, got {type(result)}")
        return _ADAPTER.translate_response(result)

    @staticmethod
    def anthropic_messages_handler(
        max_tokens: int,
        messages: List[Dict],
        model: str,
        context_management: Optional[Dict] = None,
        metadata: Optional[Dict] = None,
        output_config: Optional[Dict] = None,
        stop_sequences: Optional[List[str]] = None,
        stream: Optional[bool] = False,
        system: Optional[str] = None,
        temperature: Optional[float] = None,
        thinking: Optional[Dict] = None,
        tool_choice: Optional[Dict] = None,
        tools: Optional[List[Dict]] = None,
        top_k: Optional[int] = None,
        top_p: Optional[float] = None,
        output_format: Optional[Dict] = None,
        _is_async: bool = False,
        **kwargs,
    ) -> Union[
        AnthropicMessagesResponse,
        AsyncIterator[Any],
        Coroutine[Any, Any, Union[AnthropicMessagesResponse, AsyncIterator[Any]]],
    ]:
        """
        Entry point: returns a coroutine when ``_is_async`` is True, otherwise
        performs the request synchronously via litellm.responses().
        """
        if _is_async:
            # Hand back the coroutine; the caller awaits it.
            return (
                LiteLLMMessagesToResponsesAPIHandler.async_anthropic_messages_handler(
                    max_tokens=max_tokens,
                    messages=messages,
                    model=model,
                    context_management=context_management,
                    metadata=metadata,
                    output_config=output_config,
                    stop_sequences=stop_sequences,
                    stream=stream,
                    system=system,
                    temperature=temperature,
                    thinking=thinking,
                    tool_choice=tool_choice,
                    tools=tools,
                    top_k=top_k,
                    top_p=top_p,
                    output_format=output_format,
                    **kwargs,
                )
            )

        # Sync path
        responses_kwargs = _build_responses_kwargs(
            max_tokens=max_tokens,
            messages=messages,
            model=model,
            context_management=context_management,
            metadata=metadata,
            output_config=output_config,
            stop_sequences=stop_sequences,
            stream=stream,
            system=system,
            temperature=temperature,
            thinking=thinking,
            tool_choice=tool_choice,
            tools=tools,
            top_k=top_k,
            top_p=top_p,
            output_format=output_format,
            extra_kwargs=kwargs,
        )

        result = litellm.responses(**responses_kwargs)

        if stream:
            # NOTE(review): the sync path also returns an *async* SSE wrapper
            # when stream=True — callers must consume it from an event loop;
            # confirm this is intended.
            wrapper = AnthropicResponsesStreamWrapper(
                responses_stream=result, model=model
            )
            return wrapper.async_anthropic_sse_wrapper()

        if not isinstance(result, ResponsesAPIResponse):
            raise ValueError(f"Expected ResponsesAPIResponse, got {type(result)}")
        return _ADAPTER.translate_response(result)

View File

@@ -0,0 +1,344 @@
# What is this?
## Translates OpenAI call to Anthropic `/v1/messages` format
import json
import traceback
from collections import deque
from typing import Any, AsyncIterator, Dict
from litellm import verbose_logger
from litellm._uuid import uuid
class AnthropicResponsesStreamWrapper:
    """
    Wraps a Responses API streaming iterator and re-emits events in Anthropic SSE format.

    Responses API event flow (relevant subset):
        response.created                       -> message_start
        response.output_item.added             -> content_block_start (message / function_call / reasoning)
        response.output_text.delta             -> content_block_delta (text_delta)
        response.reasoning_summary_text.delta  -> content_block_delta (thinking_delta)
        response.function_call_arguments.delta -> content_block_delta (input_json_delta)
        response.output_item.done              -> content_block_stop
        response.completed                     -> message_delta + message_stop

    Upstream events may be either objects (attribute access) or plain dicts;
    every accessor below tries ``getattr`` first and falls back to ``dict.get``.
    """

    def __init__(
        self,
        responses_stream: Any,
        model: str,
    ) -> None:
        self.responses_stream = responses_stream
        self.model = model
        # Synthetic Anthropic-style message id for the whole stream.
        self._message_id: str = f"msg_{uuid.uuid4()}"
        # Index of the most recently opened content block; -1 before any block opens.
        self._current_block_index: int = -1
        # Map item_id -> content_block_index so we can stop the right block later
        self._item_id_to_block_index: Dict[str, int] = {}
        # Track open function_call items by item_id so we can emit tool_use start
        self._pending_tool_ids: Dict[
            str, str
        ] = {}  # item_id -> call_id / name accumulator
        self._sent_message_start = False
        self._sent_message_stop = False
        # FIFO of already-translated Anthropic chunks awaiting emission.
        self._chunk_queue: deque = deque()

    def _make_message_start(self) -> Dict[str, Any]:
        """Build the Anthropic ``message_start`` chunk (usage filled in at message_delta)."""
        return {
            "type": "message_start",
            "message": {
                "id": self._message_id,
                "type": "message",
                "role": "assistant",
                "content": [],
                "model": self.model,
                "stop_reason": None,
                "stop_sequence": None,
                "usage": {
                    "input_tokens": 0,
                    "output_tokens": 0,
                    "cache_creation_input_tokens": 0,
                    "cache_read_input_tokens": 0,
                },
            },
        }

    def _next_block_index(self) -> int:
        """Allocate and return the next sequential content block index."""
        self._current_block_index += 1
        return self._current_block_index

    def _process_event(self, event: Any) -> None:  # noqa: PLR0915
        """Convert one Responses API event into zero or more Anthropic chunks queued for emission."""
        event_type = getattr(event, "type", None)
        if event_type is None and isinstance(event, dict):
            event_type = event.get("type")
        if event_type is None:
            # Unknown payload shape — nothing we can translate.
            return
        # ---- message_start ----
        if event_type == "response.created":
            self._sent_message_start = True
            self._chunk_queue.append(self._make_message_start())
            return
        # ---- content_block_start for a new output item ----
        if event_type == "response.output_item.added":
            item = getattr(event, "item", None) or (
                event.get("item") if isinstance(event, dict) else None
            )
            if item is None:
                return
            item_type = getattr(item, "type", None) or (
                item.get("type") if isinstance(item, dict) else None
            )
            item_id = getattr(item, "id", None) or (
                item.get("id") if isinstance(item, dict) else None
            )
            if item_type == "message":
                block_idx = self._next_block_index()
                if item_id:
                    self._item_id_to_block_index[item_id] = block_idx
                self._chunk_queue.append(
                    {
                        "type": "content_block_start",
                        "index": block_idx,
                        "content_block": {"type": "text", "text": ""},
                    }
                )
            elif item_type == "function_call":
                call_id = (
                    getattr(item, "call_id", None)
                    or (item.get("call_id") if isinstance(item, dict) else None)
                    or ""
                )
                name = (
                    getattr(item, "name", None)
                    or (item.get("name") if isinstance(item, dict) else None)
                    or ""
                )
                block_idx = self._next_block_index()
                if item_id:
                    self._item_id_to_block_index[item_id] = block_idx
                    self._pending_tool_ids[item_id] = call_id
                self._chunk_queue.append(
                    {
                        "type": "content_block_start",
                        "index": block_idx,
                        "content_block": {
                            "type": "tool_use",
                            "id": call_id,
                            "name": name,
                            "input": {},
                        },
                    }
                )
            elif item_type == "reasoning":
                block_idx = self._next_block_index()
                if item_id:
                    self._item_id_to_block_index[item_id] = block_idx
                self._chunk_queue.append(
                    {
                        "type": "content_block_start",
                        "index": block_idx,
                        "content_block": {"type": "thinking", "thinking": ""},
                    }
                )
            return
        # ---- text delta ----
        if event_type == "response.output_text.delta":
            item_id = getattr(event, "item_id", None) or (
                event.get("item_id") if isinstance(event, dict) else None
            )
            delta = getattr(event, "delta", "") or (
                event.get("delta", "") if isinstance(event, dict) else ""
            )
            # Fall back to the most recent block if the item_id is unknown.
            block_idx = (
                self._item_id_to_block_index.get(item_id, self._current_block_index)
                if item_id
                else self._current_block_index
            )
            self._chunk_queue.append(
                {
                    "type": "content_block_delta",
                    "index": block_idx,
                    "delta": {"type": "text_delta", "text": delta},
                }
            )
            return
        # ---- reasoning summary text delta ----
        if event_type == "response.reasoning_summary_text.delta":
            item_id = getattr(event, "item_id", None) or (
                event.get("item_id") if isinstance(event, dict) else None
            )
            delta = getattr(event, "delta", "") or (
                event.get("delta", "") if isinstance(event, dict) else ""
            )
            block_idx = (
                self._item_id_to_block_index.get(item_id, self._current_block_index)
                if item_id
                else self._current_block_index
            )
            self._chunk_queue.append(
                {
                    "type": "content_block_delta",
                    "index": block_idx,
                    "delta": {"type": "thinking_delta", "thinking": delta},
                }
            )
            return
        # ---- function call arguments delta ----
        if event_type == "response.function_call_arguments.delta":
            item_id = getattr(event, "item_id", None) or (
                event.get("item_id") if isinstance(event, dict) else None
            )
            delta = getattr(event, "delta", "") or (
                event.get("delta", "") if isinstance(event, dict) else ""
            )
            block_idx = (
                self._item_id_to_block_index.get(item_id, self._current_block_index)
                if item_id
                else self._current_block_index
            )
            self._chunk_queue.append(
                {
                    "type": "content_block_delta",
                    "index": block_idx,
                    "delta": {"type": "input_json_delta", "partial_json": delta},
                }
            )
            return
        # ---- output item done -> content_block_stop ----
        if event_type == "response.output_item.done":
            item = getattr(event, "item", None) or (
                event.get("item") if isinstance(event, dict) else None
            )
            item_id = (
                getattr(item, "id", None)
                or (item.get("id") if isinstance(item, dict) else None)
                if item
                else None
            )
            block_idx = (
                self._item_id_to_block_index.get(item_id, self._current_block_index)
                if item_id
                else self._current_block_index
            )
            self._chunk_queue.append(
                {
                    "type": "content_block_stop",
                    "index": block_idx,
                }
            )
            return
        # ---- response completed -> message_delta + message_stop ----
        if event_type in (
            "response.completed",
            "response.failed",
            "response.incomplete",
        ):
            response_obj = getattr(event, "response", None) or (
                event.get("response") if isinstance(event, dict) else None
            )
            stop_reason = "end_turn"
            input_tokens = 0
            output_tokens = 0
            cache_creation_tokens = 0
            cache_read_tokens = 0
            if response_obj is not None:
                status = getattr(response_obj, "status", None)
                if status == "incomplete":
                    stop_reason = "max_tokens"
                usage = getattr(response_obj, "usage", None)
                if usage is not None:
                    input_tokens = getattr(usage, "input_tokens", 0) or 0
                    output_tokens = getattr(usage, "output_tokens", 0) or 0
                    # Prefer direct cache fields if present
                    cache_creation_tokens = int(
                        getattr(usage, "cache_creation_input_tokens", 0) or 0
                    )
                    cache_read_tokens = int(
                        getattr(usage, "cache_read_input_tokens", 0) or 0
                    )
                    if not cache_read_tokens:
                        # Responses API reports cached prompt tokens under
                        # usage.input_tokens_details.cached_tokens.
                        details = getattr(usage, "input_tokens_details", None)
                        cache_read_tokens = int(
                            getattr(details, "cached_tokens", 0) or 0
                        )
            # Check if tool_use was in the output to override stop_reason
            if response_obj is not None:
                output = getattr(response_obj, "output", []) or []
                for out_item in output:
                    out_type = getattr(out_item, "type", None) or (
                        out_item.get("type") if isinstance(out_item, dict) else None
                    )
                    if out_type == "function_call":
                        stop_reason = "tool_use"
                        break
            usage_delta: Dict[str, Any] = {
                "input_tokens": input_tokens,
                "output_tokens": output_tokens,
            }
            if cache_creation_tokens:
                usage_delta["cache_creation_input_tokens"] = cache_creation_tokens
            if cache_read_tokens:
                usage_delta["cache_read_input_tokens"] = cache_read_tokens
            self._chunk_queue.append(
                {
                    "type": "message_delta",
                    "delta": {"stop_reason": stop_reason, "stop_sequence": None},
                    "usage": usage_delta,
                }
            )
            self._chunk_queue.append({"type": "message_stop"})
            self._sent_message_stop = True
            return

    def __aiter__(self) -> "AnthropicResponsesStreamWrapper":
        return self

    async def __anext__(self) -> Dict[str, Any]:
        """Return the next Anthropic-format chunk, pulling upstream events as needed."""
        # Return any queued chunks first
        if self._chunk_queue:
            return self._chunk_queue.popleft()
        # Emit message_start if not yet done (fallback if response.created wasn't fired)
        if not self._sent_message_start:
            self._sent_message_start = True
            self._chunk_queue.append(self._make_message_start())
            return self._chunk_queue.popleft()
        # Consume the upstream stream; each event may queue zero or more chunks.
        try:
            async for event in self.responses_stream:
                self._process_event(event)
                if self._chunk_queue:
                    return self._chunk_queue.popleft()
        except StopAsyncIteration:
            pass
        except Exception as e:
            # Best-effort: log and fall through to drain whatever was translated.
            verbose_logger.error(
                f"AnthropicResponsesStreamWrapper error: {e}\n{traceback.format_exc()}"
            )
        # Drain any remaining queued chunks
        if self._chunk_queue:
            return self._chunk_queue.popleft()
        raise StopAsyncIteration

    async def async_anthropic_sse_wrapper(self) -> AsyncIterator[bytes]:
        """Yield SSE-encoded bytes for each Anthropic event chunk."""
        async for chunk in self:
            if isinstance(chunk, dict):
                event_type: str = str(chunk.get("type", "message"))
                payload = f"event: {event_type}\ndata: {json.dumps(chunk)}\n\n"
                yield payload.encode()
            else:
                # Pass through anything already encoded upstream.
                yield chunk

View File

@@ -0,0 +1,488 @@
"""
Transformation layer: Anthropic /v1/messages <-> OpenAI Responses API.
This module owns all format conversions for the direct v1/messages -> Responses API
path used for OpenAI and Azure models.
"""
import json
from typing import Any, Dict, List, Optional, Union, cast
from litellm.types.llms.anthropic import (
AllAnthropicToolsValues,
AnthopicMessagesAssistantMessageParam,
AnthropicFinishReason,
AnthropicMessagesRequest,
AnthropicMessagesToolChoice,
AnthropicMessagesUserMessageParam,
AnthropicResponseContentBlockText,
AnthropicResponseContentBlockThinking,
AnthropicResponseContentBlockToolUse,
)
from litellm.types.llms.anthropic_messages.anthropic_response import (
AnthropicMessagesResponse,
AnthropicUsage,
)
from litellm.types.llms.openai import ResponsesAPIResponse
class LiteLLMAnthropicToResponsesAPIAdapter:
"""
Converts Anthropic /v1/messages requests to OpenAI Responses API format and
converts Responses API responses back to Anthropic format.
"""
# ------------------------------------------------------------------ #
# Request translation: Anthropic -> Responses API #
# ------------------------------------------------------------------ #
@staticmethod
def _translate_anthropic_image_source_to_url(source: dict) -> Optional[str]:
"""Convert Anthropic image source to a URL string."""
source_type = source.get("type")
if source_type == "base64":
media_type = source.get("media_type", "image/jpeg")
data = source.get("data", "")
return f"data:{media_type};base64,{data}" if data else None
elif source_type == "url":
return source.get("url")
return None
    def translate_messages_to_responses_input(  # noqa: PLR0915
        self,
        messages: List[
            Union[
                AnthropicMessagesUserMessageParam,
                AnthopicMessagesAssistantMessageParam,
            ]
        ],
    ) -> List[Dict[str, Any]]:
        """
        Convert Anthropic messages list to Responses API `input` items.

        Mapping:
            user text          -> message(role=user, input_text)
            user image         -> message(role=user, input_image)
            user tool_result   -> function_call_output (top-level item)
            assistant text     -> message(role=assistant, output_text)
            assistant tool_use -> function_call (top-level item)
            assistant thinking -> message(role=assistant, output_text)

        NOTE(review): within a single user message, tool_result blocks are
        emitted before the aggregated text/image message regardless of their
        position in the original content list — confirm this ordering is
        acceptable for the Responses API.
        """
        input_items: List[Dict[str, Any]] = []
        for m in messages:
            role = m["role"]
            content = m.get("content")
            if role == "user":
                # A bare string becomes a single input_text part.
                if isinstance(content, str):
                    input_items.append(
                        {
                            "type": "message",
                            "role": "user",
                            "content": [{"type": "input_text", "text": content}],
                        }
                    )
                elif isinstance(content, list):
                    # Text/image parts accumulate into one message; tool_result
                    # blocks are hoisted into top-level function_call_output items.
                    user_parts: List[Dict[str, Any]] = []
                    for block in content:
                        if not isinstance(block, dict):
                            continue
                        btype = block.get("type")
                        if btype == "text":
                            user_parts.append(
                                {"type": "input_text", "text": block.get("text", "")}
                            )
                        elif btype == "image":
                            url = self._translate_anthropic_image_source_to_url(
                                block.get("source", {})
                            )
                            # Images with no resolvable URL are silently dropped.
                            if url:
                                user_parts.append(
                                    {"type": "input_image", "image_url": url}
                                )
                        elif btype == "tool_result":
                            tool_use_id = block.get("tool_use_id", "")
                            inner = block.get("content")
                            # Flatten the tool result to a single output string:
                            # None -> "", str as-is, list -> joined text parts,
                            # anything else -> str().
                            if inner is None:
                                output_text = ""
                            elif isinstance(inner, str):
                                output_text = inner
                            elif isinstance(inner, list):
                                parts = [
                                    c.get("text", "")
                                    for c in inner
                                    if isinstance(c, dict) and c.get("type") == "text"
                                ]
                                output_text = "\n".join(parts)
                            else:
                                output_text = str(inner)
                            # tool_result is a top-level item, not inside the message
                            input_items.append(
                                {
                                    "type": "function_call_output",
                                    "call_id": tool_use_id,
                                    "output": output_text,
                                }
                            )
                    if user_parts:
                        input_items.append(
                            {
                                "type": "message",
                                "role": "user",
                                "content": user_parts,
                            }
                        )
            elif role == "assistant":
                if isinstance(content, str):
                    input_items.append(
                        {
                            "type": "message",
                            "role": "assistant",
                            "content": [{"type": "output_text", "text": content}],
                        }
                    )
                elif isinstance(content, list):
                    asst_parts: List[Dict[str, Any]] = []
                    for block in content:
                        if not isinstance(block, dict):
                            continue
                        btype = block.get("type")
                        if btype == "text":
                            asst_parts.append(
                                {"type": "output_text", "text": block.get("text", "")}
                            )
                        elif btype == "tool_use":
                            # tool_use becomes a top-level function_call item
                            input_items.append(
                                {
                                    "type": "function_call",
                                    "call_id": block.get("id", ""),
                                    "name": block.get("name", ""),
                                    "arguments": json.dumps(block.get("input", {})),
                                }
                            )
                        elif btype == "thinking":
                            # Responses API input has no thinking type; replay
                            # prior thinking as plain assistant text.
                            thinking_text = block.get("thinking", "")
                            if thinking_text:
                                asst_parts.append(
                                    {"type": "output_text", "text": thinking_text}
                                )
                    if asst_parts:
                        input_items.append(
                            {
                                "type": "message",
                                "role": "assistant",
                                "content": asst_parts,
                            }
                        )
        return input_items
def translate_tools_to_responses_api(
self,
tools: List[AllAnthropicToolsValues],
) -> List[Dict[str, Any]]:
"""Convert Anthropic tool definitions to Responses API function tools."""
result: List[Dict[str, Any]] = []
for tool in tools:
tool_dict = cast(Dict[str, Any], tool)
tool_type = tool_dict.get("type", "")
tool_name = tool_dict.get("name", "")
# web_search tool
if (
isinstance(tool_type, str) and tool_type.startswith("web_search")
) or tool_name == "web_search":
result.append({"type": "web_search_preview"})
continue
func_tool: Dict[str, Any] = {"type": "function", "name": tool_name}
if "description" in tool_dict:
func_tool["description"] = tool_dict["description"]
if "input_schema" in tool_dict:
func_tool["parameters"] = tool_dict["input_schema"]
result.append(func_tool)
return result
@staticmethod
def translate_tool_choice_to_responses_api(
tool_choice: AnthropicMessagesToolChoice,
) -> Dict[str, Any]:
"""Convert Anthropic tool_choice to Responses API tool_choice."""
tc_type = tool_choice.get("type")
if tc_type == "any":
return {"type": "required"}
elif tc_type == "tool":
return {"type": "function", "name": tool_choice.get("name", "")}
return {"type": "auto"}
@staticmethod
def translate_context_management_to_responses_api(
context_management: Dict[str, Any],
) -> Optional[List[Dict[str, Any]]]:
"""
Convert Anthropic context_management dict to OpenAI Responses API array format.
Anthropic format: {"edits": [{"type": "compact_20260112", "trigger": {"type": "input_tokens", "value": 150000}}]}
OpenAI format: [{"type": "compaction", "compact_threshold": 150000}]
"""
if not isinstance(context_management, dict):
return None
edits = context_management.get("edits", [])
if not isinstance(edits, list):
return None
result: List[Dict[str, Any]] = []
for edit in edits:
if not isinstance(edit, dict):
continue
edit_type = edit.get("type", "")
if edit_type == "compact_20260112":
entry: Dict[str, Any] = {"type": "compaction"}
trigger = edit.get("trigger")
if isinstance(trigger, dict) and trigger.get("value") is not None:
entry["compact_threshold"] = int(trigger["value"])
result.append(entry)
return result if result else None
@staticmethod
def translate_thinking_to_reasoning(
thinking: Dict[str, Any]
) -> Optional[Dict[str, Any]]:
"""
Convert Anthropic thinking param to Responses API reasoning param.
thinking.budget_tokens maps to reasoning effort:
>= 10000 -> high, >= 5000 -> medium, >= 2000 -> low, < 2000 -> minimal
"""
if not isinstance(thinking, dict) or thinking.get("type") != "enabled":
return None
budget = thinking.get("budget_tokens", 0)
if budget >= 10000:
effort = "high"
elif budget >= 5000:
effort = "medium"
elif budget >= 2000:
effort = "low"
else:
effort = "minimal"
return {"effort": effort, "summary": "detailed"}
    def translate_request(
        self,
        anthropic_request: AnthropicMessagesRequest,
    ) -> Dict[str, Any]:
        """
        Translate a full Anthropic /v1/messages request dict to
        litellm.responses() / litellm.aresponses() kwargs.

        Each optional Anthropic field is mapped independently; absent fields
        simply produce no corresponding Responses API kwarg.
        """
        model: str = anthropic_request["model"]
        messages_list = cast(
            List[
                Union[
                    AnthropicMessagesUserMessageParam,
                    AnthopicMessagesAssistantMessageParam,
                ]
            ],
            anthropic_request["messages"],
        )
        responses_kwargs: Dict[str, Any] = {
            "model": model,
            "input": self.translate_messages_to_responses_input(messages_list),
        }
        # system -> instructions (list form: join the text blocks, dropping empties)
        system = anthropic_request.get("system")
        if system:
            if isinstance(system, str):
                responses_kwargs["instructions"] = system
            elif isinstance(system, list):
                text_parts = [
                    b.get("text", "")
                    for b in system
                    if isinstance(b, dict) and b.get("type") == "text"
                ]
                responses_kwargs["instructions"] = "\n".join(filter(None, text_parts))
        # max_tokens -> max_output_tokens
        max_tokens = anthropic_request.get("max_tokens")
        if max_tokens:
            responses_kwargs["max_output_tokens"] = max_tokens
        # temperature / top_p passed through
        if "temperature" in anthropic_request:
            responses_kwargs["temperature"] = anthropic_request["temperature"]
        if "top_p" in anthropic_request:
            responses_kwargs["top_p"] = anthropic_request["top_p"]
        # tools
        tools = anthropic_request.get("tools")
        if tools:
            responses_kwargs["tools"] = self.translate_tools_to_responses_api(
                cast(List[AllAnthropicToolsValues], tools)
            )
        # tool_choice
        tool_choice = anthropic_request.get("tool_choice")
        if tool_choice:
            responses_kwargs[
                "tool_choice"
            ] = self.translate_tool_choice_to_responses_api(
                cast(AnthropicMessagesToolChoice, tool_choice)
            )
        # thinking -> reasoning (only set when thinking is enabled)
        thinking = anthropic_request.get("thinking")
        if isinstance(thinking, dict):
            reasoning = self.translate_thinking_to_reasoning(thinking)
            if reasoning:
                responses_kwargs["reasoning"] = reasoning
        # output_format / output_config.format -> text format
        # output_format: {"type": "json_schema", "schema": {...}}
        # output_config: {"format": {"type": "json_schema", "schema": {...}}}
        # output_format takes precedence; output_config.format is the fallback.
        output_format: Any = anthropic_request.get("output_format")
        output_config = anthropic_request.get("output_config")
        if not isinstance(output_format, dict) and isinstance(output_config, dict):
            output_format = output_config.get("format")  # type: ignore[assignment]
        if (
            isinstance(output_format, dict)
            and output_format.get("type") == "json_schema"
        ):
            schema = output_format.get("schema")
            if schema:
                responses_kwargs["text"] = {
                    "format": {
                        "type": "json_schema",
                        "name": "structured_output",
                        "schema": schema,
                        "strict": True,
                    }
                }
        # context_management: Anthropic dict -> OpenAI array
        context_management = anthropic_request.get("context_management")
        if isinstance(context_management, dict):
            openai_cm = self.translate_context_management_to_responses_api(
                context_management
            )
            if openai_cm is not None:
                responses_kwargs["context_management"] = openai_cm
        # metadata user_id -> user, truncated to 64 chars
        # NOTE(review): 64 appears to match an upstream length limit on the
        # OpenAI `user` field — confirm against the Responses API spec.
        metadata = anthropic_request.get("metadata")
        if isinstance(metadata, dict) and "user_id" in metadata:
            responses_kwargs["user"] = str(metadata["user_id"])[:64]
        return responses_kwargs
# ------------------------------------------------------------------ #
# Response translation: Responses API -> Anthropic #
# ------------------------------------------------------------------ #
    def translate_response(
        self,
        response: ResponsesAPIResponse,
    ) -> AnthropicMessagesResponse:
        """
        Translate an OpenAI ResponsesAPIResponse to AnthropicMessagesResponse.

        Output items are mapped as:
            ResponseReasoningItem     -> thinking blocks (one per non-empty summary)
            ResponseOutputMessage     -> text blocks (output_text parts)
            ResponseFunctionToolCall  -> tool_use block (stop_reason becomes "tool_use")
            plain dicts               -> same mapping for "message"/"function_call" types

        NOTE(review): cached/creation token details from Responses usage are not
        propagated into AnthropicUsage here — only input/output totals.
        """
        # Imported lazily; these types are only needed when translating a response.
        from openai.types.responses import (
            ResponseFunctionToolCall,
            ResponseOutputMessage,
            ResponseReasoningItem,
        )

        from litellm.types.llms.openai import ResponseAPIUsage

        content: List[Dict[str, Any]] = []
        stop_reason: AnthropicFinishReason = "end_turn"
        for item in response.output:
            if isinstance(item, ResponseReasoningItem):
                for summary in item.summary:
                    text = getattr(summary, "text", "")
                    if text:
                        content.append(
                            AnthropicResponseContentBlockThinking(
                                type="thinking",
                                thinking=text,
                                signature=None,
                            ).model_dump()
                        )
            elif isinstance(item, ResponseOutputMessage):
                for part in item.content:
                    if getattr(part, "type", None) == "output_text":
                        content.append(
                            AnthropicResponseContentBlockText(
                                type="text", text=getattr(part, "text", "")
                            ).model_dump()
                        )
            elif isinstance(item, ResponseFunctionToolCall):
                # Malformed JSON arguments degrade to an empty input dict.
                try:
                    input_data = json.loads(item.arguments) if item.arguments else {}
                except (json.JSONDecodeError, TypeError):
                    input_data = {}
                content.append(
                    AnthropicResponseContentBlockToolUse(
                        type="tool_use",
                        id=item.call_id or item.id or "",
                        name=item.name,
                        input=input_data,
                    ).model_dump()
                )
                stop_reason = "tool_use"
            elif isinstance(item, dict):
                # Fallback for providers that return plain dicts instead of
                # typed openai response objects.
                item_type = item.get("type")
                if item_type == "message":
                    for part in item.get("content", []):
                        if isinstance(part, dict) and part.get("type") == "output_text":
                            content.append(
                                AnthropicResponseContentBlockText(
                                    type="text", text=part.get("text", "")
                                ).model_dump()
                            )
                elif item_type == "function_call":
                    try:
                        input_data = json.loads(item.get("arguments", "{}"))
                    except (json.JSONDecodeError, TypeError):
                        input_data = {}
                    content.append(
                        AnthropicResponseContentBlockToolUse(
                            type="tool_use",
                            id=item.get("call_id") or item.get("id", ""),
                            name=item.get("name", ""),
                            input=input_data,
                        ).model_dump()
                    )
                    stop_reason = "tool_use"
        # status -> stop_reason override
        if response.status == "incomplete":
            stop_reason = "max_tokens"
        # usage: getattr tolerates a None usage object (falls back to 0)
        raw_usage: Optional[ResponseAPIUsage] = response.usage
        input_tokens = int(getattr(raw_usage, "input_tokens", 0) or 0)
        output_tokens = int(getattr(raw_usage, "output_tokens", 0) or 0)
        anthropic_usage = AnthropicUsage(
            input_tokens=input_tokens,
            output_tokens=output_tokens,
        )
        return AnthropicMessagesResponse(
            id=response.id,
            type="message",
            role="assistant",
            model=response.model or "unknown-model",
            stop_sequence=None,
            usage=anthropic_usage,  # type: ignore
            content=content,  # type: ignore
            stop_reason=stop_reason,
        )