"""
|
|
This file contains the transformation logic for Bedrock Nova Sonic realtime API.
|
|
|
|
Transforms between OpenAI Realtime API format and Bedrock Nova Sonic format.
|
|
"""
|
|
|
|
import json
|
|
import uuid as uuid_lib
|
|
from typing import Any, List, Optional, Union
|
|
|
|
from litellm._logging import verbose_logger
|
|
from litellm._uuid import uuid
|
|
from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj
|
|
from litellm.llms.base_llm.realtime.transformation import BaseRealtimeConfig
|
|
from litellm.types.llms.openai import (
|
|
OpenAIRealtimeContentPartDone,
|
|
OpenAIRealtimeDoneEvent,
|
|
OpenAIRealtimeEvents,
|
|
OpenAIRealtimeOutputItemDone,
|
|
OpenAIRealtimeResponseAudioDone,
|
|
OpenAIRealtimeResponseContentPartAdded,
|
|
OpenAIRealtimeResponseDelta,
|
|
OpenAIRealtimeResponseDoneObject,
|
|
OpenAIRealtimeResponseTextDone,
|
|
OpenAIRealtimeStreamResponseBaseObject,
|
|
OpenAIRealtimeStreamResponseOutputItemAdded,
|
|
OpenAIRealtimeStreamSession,
|
|
OpenAIRealtimeStreamSessionEvents,
|
|
)
|
|
from litellm.types.realtime import (
|
|
ALL_DELTA_TYPES,
|
|
RealtimeResponseTransformInput,
|
|
RealtimeResponseTypedDict,
|
|
)
|
|
from litellm.utils import get_empty_usage


class BedrockRealtimeConfig(BaseRealtimeConfig):
    """Configuration for Bedrock Nova Sonic realtime transformations."""

    def __init__(self):
        # Track session state
        self.prompt_name = str(uuid_lib.uuid4())
        self.content_name = str(uuid_lib.uuid4())
        self.audio_content_name = str(uuid_lib.uuid4())
        self._audio_content_started = False

        # Default inference configuration
        self.max_tokens = 1024
        self.top_p = 0.9
        self.temperature = 0.7

        # Audio output configuration
        self.output_sample_rate_hertz = 24000
        self.output_sample_size_bits = 16
        self.output_channel_count = 1
        self.voice_id = "matthew"
        self.output_encoding = "base64"
        self.output_audio_type = "SPEECH"
        self.output_media_type = "audio/lpcm"

        # Audio input configuration
        self.input_sample_rate_hertz = 16000
        self.input_sample_size_bits = 16
        self.input_channel_count = 1
        self.input_encoding = "base64"
        self.input_audio_type = "SPEECH"
        self.input_media_type = "audio/lpcm"

        # Text configuration
        self.text_media_type = "text/plain"

    def validate_environment(
        self, headers: dict, model: str, api_key: Optional[str] = None
    ) -> dict:
        """Validate environment - no special validation needed for Bedrock."""
        return headers

    def get_complete_url(
        self, api_base: Optional[str], model: str, api_key: Optional[str] = None
    ) -> str:
        """Get complete URL - handled by aws_sdk_bedrock_runtime."""
        return api_base or ""

    def requires_session_configuration(self) -> bool:
        """Bedrock requires session configuration."""
        return True

    def _build_session_start_event(self) -> dict:
        """Build a Bedrock sessionStart event from the current inference settings."""
        return {
            "event": {
                "sessionStart": {
                    "inferenceConfiguration": {
                        "maxTokens": self.max_tokens,
                        "topP": self.top_p,
                        "temperature": self.temperature,
                    }
                }
            }
        }

    def _build_prompt_start_event(self, tools: Optional[List[dict]] = None) -> dict:
        """Build a Bedrock promptStart event, adding tool configuration if tools are provided."""
        prompt_start_config: dict = {
            "promptName": self.prompt_name,
            "textOutputConfiguration": {"mediaType": self.text_media_type},
            "audioOutputConfiguration": {
                "mediaType": self.output_media_type,
                "sampleRateHertz": self.output_sample_rate_hertz,
                "sampleSizeBits": self.output_sample_size_bits,
                "channelCount": self.output_channel_count,
                "voiceId": self.voice_id,
                "encoding": self.output_encoding,
                "audioType": self.output_audio_type,
            },
        }
        if tools:
            prompt_start_config["toolUseOutputConfiguration"] = {
                "mediaType": "application/json"
            }
            prompt_start_config["toolConfiguration"] = {
                "tools": self._transform_tools_to_bedrock_format(tools)
            }
        return {"event": {"promptStart": prompt_start_config}}

    def _build_text_input_events(
        self, text: str, role: str, interactive: bool
    ) -> List[str]:
        """
        Build the contentStart -> textInput -> contentEnd triple for a text block.

        Args:
            text: Text content to send
            role: Bedrock role (e.g. "SYSTEM" or "USER")
            interactive: Whether the content block is interactive

        Returns:
            List of Bedrock format messages (JSON strings)
        """
        text_content_name = str(uuid_lib.uuid4())
        text_content_start = {
            "event": {
                "contentStart": {
                    "promptName": self.prompt_name,
                    "contentName": text_content_name,
                    "type": "TEXT",
                    "interactive": interactive,
                    "role": role,
                    "textInputConfiguration": {"mediaType": self.text_media_type},
                }
            }
        }
        text_input = {
            "event": {
                "textInput": {
                    "promptName": self.prompt_name,
                    "contentName": text_content_name,
                    "content": text,
                }
            }
        }
        text_content_end = {
            "event": {
                "contentEnd": {
                    "promptName": self.prompt_name,
                    "contentName": text_content_name,
                }
            }
        }
        return [
            json.dumps(text_content_start),
            json.dumps(text_input),
            json.dumps(text_content_end),
        ]

    def session_configuration_request(
        self, model: str, tools: Optional[List[dict]] = None
    ) -> str:
        """
        Create the initial session configuration for Bedrock Nova Sonic.

        Args:
            model: Model ID
            tools: Optional list of tool definitions

        Returns:
            JSON string bundling the session start and prompt start events.
        """
        session_start = self._build_session_start_event()
        prompt_start = self._build_prompt_start_event(tools)

        # Bundle both events; this also serves as a marker that the
        # configuration has been sent.
        return json.dumps(
            {"session_start": session_start, "prompt_start": prompt_start}
        )
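
    # Illustrative shape of the bundled payload returned above (values are the
    # defaults from __init__; promptName is a fresh per-session UUID):
    #   {"session_start": {"event": {"sessionStart": {"inferenceConfiguration":
    #     {"maxTokens": 1024, "topP": 0.9, "temperature": 0.7}}}},
    #    "prompt_start": {"event": {"promptStart": {"promptName": "<uuid>", ...}}}}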

    def _transform_tools_to_bedrock_format(self, tools: List[dict]) -> List[dict]:
        """
        Transform OpenAI tool format to Bedrock tool format.

        Args:
            tools: List of OpenAI format tools

        Returns:
            List of Bedrock format tools
        """
        bedrock_tools: List[dict] = []
        for tool in tools:
            if tool.get("type") == "function":
                function = tool.get("function", {})
                bedrock_tool = {
                    "toolSpec": {
                        "name": function.get("name", ""),
                        "description": function.get("description", ""),
                        "inputSchema": {
                            "json": json.dumps(function.get("parameters", {}))
                        },
                    }
                }
                bedrock_tools.append(bedrock_tool)
        return bedrock_tools
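
    # Example of the mapping above (shapes are illustrative, not exhaustive):
    #   OpenAI:  {"type": "function", "function": {"name": "get_weather",
    #             "description": "...", "parameters": {"type": "object"}}}
    #   Bedrock: {"toolSpec": {"name": "get_weather", "description": "...",
    #             "inputSchema": {"json": "{\"type\": \"object\"}"}}}
    # Note: Bedrock expects the parameter schema serialized as a JSON string.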

    def _map_audio_format_to_sample_rate(
        self, audio_format: str, is_output: bool = True
    ) -> int:
        """
        Map an OpenAI audio format to a sample rate.

        Args:
            audio_format: OpenAI audio format (pcm16, g711_ulaw, g711_alaw)
            is_output: Whether this is for output (True) or input (False)

        Returns:
            Sample rate in Hz
        """
        # OpenAI uses 24kHz for output and can vary for input.
        # Bedrock Nova Sonic uses 24kHz for output and 16kHz for input by default.
        if audio_format == "pcm16":
            return 24000 if is_output else 16000
        elif audio_format in ["g711_ulaw", "g711_alaw"]:
            return 8000  # G.711 typically uses 8kHz
        return 24000 if is_output else 16000
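
    # Summary of the mapping above:
    #   pcm16                 -> 24000 Hz (output) / 16000 Hz (input)
    #   g711_ulaw, g711_alaw  ->  8000 Hz
    #   anything else         -> falls back to 24000 / 16000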

    def transform_session_update_event(self, json_message: dict) -> List[str]:
        """
        Transform a session.update event to Bedrock session configuration.

        Args:
            json_message: OpenAI session.update message

        Returns:
            List of Bedrock format messages (JSON strings)
        """
        verbose_logger.debug("Handling session.update")
        messages: List[str] = []

        session_config = json_message.get("session", {})

        # Update inference configuration from session if provided
        if "max_response_output_tokens" in session_config:
            self.max_tokens = session_config["max_response_output_tokens"]
        if "temperature" in session_config:
            self.temperature = session_config["temperature"]

        # Update audio output configuration from session if provided
        if "voice" in session_config:
            self.voice_id = session_config["voice"]
        if "output_audio_format" in session_config:
            output_format = session_config["output_audio_format"]
            self.output_sample_rate_hertz = self._map_audio_format_to_sample_rate(
                output_format, is_output=True
            )

        # Update audio input configuration from session if provided
        if "input_audio_format" in session_config:
            input_format = session_config["input_audio_format"]
            self.input_sample_rate_hertz = self._map_audio_format_to_sample_rate(
                input_format, is_output=False
            )

        # Allow direct override of sample rates if provided (custom extension)
        if "output_sample_rate_hertz" in session_config:
            self.output_sample_rate_hertz = session_config["output_sample_rate_hertz"]
        if "input_sample_rate_hertz" in session_config:
            self.input_sample_rate_hertz = session_config["input_sample_rate_hertz"]

        # Send session start
        messages.append(json.dumps(self._build_session_start_event()))

        # Send prompt start (with tool configuration if tools are provided)
        tools = session_config.get("tools")
        messages.append(json.dumps(self._build_prompt_start_event(tools)))

        # Send the system prompt, if provided, as a non-interactive SYSTEM text block
        instructions = session_config.get("instructions")
        if instructions:
            messages.extend(
                self._build_text_input_events(
                    instructions, role="SYSTEM", interactive=False
                )
            )

        return messages
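
    # For an incoming OpenAI event such as (illustrative, minimal):
    #   {"type": "session.update", "session": {"voice": "matthew",
    #    "instructions": "You are helpful.", "output_audio_format": "pcm16"}}
    # this emits sessionStart and promptStart, then a SYSTEM text block via
    # _build_text_input_events, each as its own JSON string.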

    def transform_input_audio_buffer_append_event(
        self, json_message: dict
    ) -> List[str]:
        """
        Transform an input_audio_buffer.append event to Bedrock audio input.

        Args:
            json_message: OpenAI input_audio_buffer.append message

        Returns:
            List of Bedrock format messages (JSON strings)
        """
        verbose_logger.debug("Handling input_audio_buffer.append")
        messages: List[str] = []

        # Start the audio content block on the first chunk
        if not self._audio_content_started:
            audio_content_start = {
                "event": {
                    "contentStart": {
                        "promptName": self.prompt_name,
                        "contentName": self.audio_content_name,
                        "type": "AUDIO",
                        "interactive": True,
                        "role": "USER",
                        "audioInputConfiguration": {
                            "mediaType": self.input_media_type,
                            "sampleRateHertz": self.input_sample_rate_hertz,
                            "sampleSizeBits": self.input_sample_size_bits,
                            "channelCount": self.input_channel_count,
                            "audioType": self.input_audio_type,
                            "encoding": self.input_encoding,
                        },
                    }
                }
            }
            messages.append(json.dumps(audio_content_start))
            self._audio_content_started = True

        # Send the audio chunk
        audio_data = json_message.get("audio", "")
        audio_event = {
            "event": {
                "audioInput": {
                    "promptName": self.prompt_name,
                    "contentName": self.audio_content_name,
                    "content": audio_data,
                }
            }
        }
        messages.append(json.dumps(audio_event))

        return messages
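
    # Illustrative sequence for the first audio chunk:
    #   1. contentStart (type=AUDIO, role=USER, with the input audio config)
    #   2. audioInput   (base64 audio passed through from the OpenAI event)
    # Later appends emit only step 2 until the buffer is committed.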

    def transform_input_audio_buffer_commit_event(
        self, json_message: dict
    ) -> List[str]:
        """
        Transform an input_audio_buffer.commit event to Bedrock audio content end.

        Args:
            json_message: OpenAI input_audio_buffer.commit message

        Returns:
            List of Bedrock format messages (JSON strings)
        """
        verbose_logger.debug("Handling input_audio_buffer.commit")
        messages: List[str] = []

        if self._audio_content_started:
            audio_content_end = {
                "event": {
                    "contentEnd": {
                        "promptName": self.prompt_name,
                        "contentName": self.audio_content_name,
                    }
                }
            }
            messages.append(json.dumps(audio_content_end))
            self._audio_content_started = False

        return messages

    def transform_conversation_item_create_event(self, json_message: dict) -> List[str]:
        """
        Transform a conversation.item.create event to Bedrock text input or tool result.

        Args:
            json_message: OpenAI conversation.item.create message

        Returns:
            List of Bedrock format messages (JSON strings)
        """
        verbose_logger.debug("Handling conversation.item.create")
        messages: List[str] = []

        item = json_message.get("item", {})
        item_type = item.get("type")

        # Handle tool result
        if item_type == "function_call_output":
            return self.transform_conversation_item_create_tool_result_event(
                json_message
            )

        # Handle regular message: each input_text part becomes an interactive
        # USER text block (contentStart -> textInput -> contentEnd)
        if item_type == "message":
            content = item.get("content", [])
            for content_part in content:
                if content_part.get("type") == "input_text":
                    messages.extend(
                        self._build_text_input_events(
                            content_part.get("text", ""),
                            role="USER",
                            interactive=True,
                        )
                    )

        return messages
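
    # Illustrative input for the message branch above:
    #   {"type": "conversation.item.create", "item": {"type": "message",
    #    "content": [{"type": "input_text", "text": "Hello"}]}}
    # Each input_text part yields three Bedrock events:
    #   contentStart (TEXT, USER, interactive) -> textInput -> contentEnd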

    def transform_response_create_event(self, json_message: dict) -> List[str]:
        """
        Transform a response.create event to Bedrock format.

        Args:
            json_message: OpenAI response.create message

        Returns:
            List of Bedrock format messages (JSON strings)
        """
        verbose_logger.debug("Handling response.create")
        # Bedrock starts generating automatically; no explicit trigger is needed
        return []

    def transform_response_cancel_event(self, json_message: dict) -> List[str]:
        """
        Transform a response.cancel event to Bedrock format.

        Args:
            json_message: OpenAI response.cancel message

        Returns:
            List of Bedrock format messages (JSON strings)
        """
        verbose_logger.debug("Handling response.cancel")
        # No interrupt signal is currently sent to Bedrock for cancellations
        return []

    def transform_realtime_request(
        self,
        message: str,
        model: str,
        session_configuration_request: Optional[str] = None,
    ) -> List[str]:
        """
        Transform an OpenAI realtime request to Bedrock Nova Sonic format.

        Args:
            message: OpenAI format message (JSON string)
            model: Model ID
            session_configuration_request: Previous session config

        Returns:
            List of Bedrock format messages (JSON strings)
        """
        try:
            json_message = json.loads(message)
        except json.JSONDecodeError:
            verbose_logger.warning(f"Invalid JSON message: {message[:200]}")
            return []

        message_type = json_message.get("type")

        # Route to the appropriate transformation method
        if message_type == "session.update":
            return self.transform_session_update_event(json_message)
        elif message_type == "input_audio_buffer.append":
            return self.transform_input_audio_buffer_append_event(json_message)
        elif message_type == "input_audio_buffer.commit":
            return self.transform_input_audio_buffer_commit_event(json_message)
        elif message_type == "conversation.item.create":
            return self.transform_conversation_item_create_event(json_message)
        elif message_type == "response.create":
            return self.transform_response_create_event(json_message)
        elif message_type == "response.cancel":
            return self.transform_response_cancel_event(json_message)
        else:
            verbose_logger.warning(f"Unknown message type: {message_type}")
            return []
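
    # Minimal usage sketch (hypothetical caller; "bedrock_stream" and the model
    # ID are placeholders, not part of this module):
    #
    #   config = BedrockRealtimeConfig()
    #   outbound = config.transform_realtime_request(
    #       message='{"type": "session.update", "session": {}}',
    #       model="amazon.nova-sonic-v1:0",
    #   )
    #   for bedrock_msg in outbound:
    #       await bedrock_stream.send(bedrock_msg)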

    def transform_session_start_event(
        self,
        event: dict,
        model: str,
        logging_obj: LiteLLMLoggingObj,
    ) -> OpenAIRealtimeStreamSessionEvents:
        """
        Transform a Bedrock sessionStart event to OpenAI session.created.

        Args:
            event: Bedrock sessionStart event
            model: Model ID
            logging_obj: Logging object

        Returns:
            OpenAI session.created event
        """
        verbose_logger.debug("Handling sessionStart")

        session = OpenAIRealtimeStreamSession(
            id=logging_obj.litellm_trace_id,
            modalities=["text", "audio"],
        )
        if isinstance(model, str):
            session["model"] = model

        return OpenAIRealtimeStreamSessionEvents(
            type="session.created",
            session=session,
            event_id=str(uuid.uuid4()),
        )

    def transform_content_start_event(
        self,
        event: dict,
        current_response_id: Optional[str],
        current_output_item_id: Optional[str],
        current_conversation_id: Optional[str],
    ) -> tuple[
        List[OpenAIRealtimeEvents],
        Optional[str],
        Optional[str],
        Optional[str],
        Optional[ALL_DELTA_TYPES],
    ]:
        """
        Transform a Bedrock contentStart event to OpenAI response events.

        Args:
            event: Bedrock contentStart event
            current_response_id: Current response ID
            current_output_item_id: Current output item ID
            current_conversation_id: Current conversation ID

        Returns:
            Tuple of (events, response_id, output_item_id, conversation_id, delta_type)
        """
        content_start = event["contentStart"]
        role = content_start.get("role")

        if role != "ASSISTANT":
            return (
                [],
                current_response_id,
                current_output_item_id,
                current_conversation_id,
                None,
            )

        verbose_logger.debug("Handling ASSISTANT contentStart")

        # Initialize IDs if needed
        if not current_response_id:
            current_response_id = f"resp_{uuid.uuid4()}"
        if not current_output_item_id:
            current_output_item_id = f"item_{uuid.uuid4()}"
        if not current_conversation_id:
            current_conversation_id = f"conv_{uuid.uuid4()}"

        # Determine content type
        content_type = content_start.get("type", "TEXT")
        current_delta_type: ALL_DELTA_TYPES = (
            "text" if content_type == "TEXT" else "audio"
        )

        returned_messages: List[OpenAIRealtimeEvents] = []

        # Send response.created
        response_created = OpenAIRealtimeStreamResponseBaseObject(
            type="response.created",
            event_id=f"event_{uuid.uuid4()}",
            response={
                "object": "realtime.response",
                "id": current_response_id,
                "status": "in_progress",
                "output": [],
                "conversation_id": current_conversation_id,
            },
        )
        returned_messages.append(response_created)

        # Send response.output_item.added
        output_item_added = OpenAIRealtimeStreamResponseOutputItemAdded(
            type="response.output_item.added",
            response_id=current_response_id,
            output_index=0,
            item={
                "id": current_output_item_id,
                "object": "realtime.item",
                "type": "message",
                "status": "in_progress",
                "role": "assistant",
                "content": [],
            },
        )
        returned_messages.append(output_item_added)

        # Send response.content_part.added
        content_part_added = OpenAIRealtimeResponseContentPartAdded(
            type="response.content_part.added",
            content_index=0,
            output_index=0,
            event_id=f"event_{uuid.uuid4()}",
            item_id=current_output_item_id,
            part=(
                {"type": "text", "text": ""}
                if current_delta_type == "text"
                else {"type": "audio", "transcript": ""}
            ),
            response_id=current_response_id,
        )
        returned_messages.append(content_part_added)

        return (
            returned_messages,
            current_response_id,
            current_output_item_id,
            current_conversation_id,
            current_delta_type,
        )
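
    # For an ASSISTANT contentStart, the OpenAI-side sequence emitted above is:
    #   response.created -> response.output_item.added -> response.content_part.added
    # with fresh "resp_"/"item_"/"conv_" prefixed UUIDs minted on first use.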

    def transform_text_output_event(
        self,
        event: dict,
        current_output_item_id: Optional[str],
        current_response_id: Optional[str],
        current_delta_chunks: Optional[List[OpenAIRealtimeResponseDelta]],
    ) -> tuple[List[OpenAIRealtimeEvents], Optional[List[OpenAIRealtimeResponseDelta]]]:
        """
        Transform a Bedrock textOutput event to OpenAI response.text.delta.

        Args:
            event: Bedrock textOutput event
            current_output_item_id: Current output item ID
            current_response_id: Current response ID
            current_delta_chunks: Current delta chunks

        Returns:
            Tuple of (events, updated_delta_chunks)
        """
        verbose_logger.debug("Handling textOutput")
        text_content = event["textOutput"].get("content", "")

        if not current_output_item_id or not current_response_id:
            return [], current_delta_chunks

        text_delta = OpenAIRealtimeResponseDelta(
            type="response.text.delta",
            content_index=0,
            event_id=f"event_{uuid.uuid4()}",
            item_id=current_output_item_id,
            output_index=0,
            response_id=current_response_id,
            delta=text_content,
        )

        # Track delta chunks so contentEnd can emit the accumulated text
        if current_delta_chunks is None:
            current_delta_chunks = []
        current_delta_chunks.append(text_delta)

        return [text_delta], current_delta_chunks

    def transform_audio_output_event(
        self,
        event: dict,
        current_output_item_id: Optional[str],
        current_response_id: Optional[str],
    ) -> List[OpenAIRealtimeEvents]:
        """
        Transform a Bedrock audioOutput event to OpenAI response.audio.delta.

        Args:
            event: Bedrock audioOutput event
            current_output_item_id: Current output item ID
            current_response_id: Current response ID

        Returns:
            List of OpenAI events
        """
        verbose_logger.debug("Handling audioOutput")
        audio_content = event["audioOutput"].get("content", "")

        if not current_output_item_id or not current_response_id:
            return []

        audio_delta = OpenAIRealtimeResponseDelta(
            type="response.audio.delta",
            content_index=0,
            event_id=f"event_{uuid.uuid4()}",
            item_id=current_output_item_id,
            output_index=0,
            response_id=current_response_id,
            delta=audio_content,
        )

        return [audio_delta]

    def transform_content_end_event(
        self,
        event: dict,
        current_output_item_id: Optional[str],
        current_response_id: Optional[str],
        current_delta_type: Optional[str],
        current_delta_chunks: Optional[List[OpenAIRealtimeResponseDelta]],
    ) -> tuple[List[OpenAIRealtimeEvents], Optional[List[OpenAIRealtimeResponseDelta]]]:
        """
        Transform a Bedrock contentEnd event to OpenAI response done events.

        Args:
            event: Bedrock contentEnd event
            current_output_item_id: Current output item ID
            current_response_id: Current response ID
            current_delta_type: Current delta type (text or audio)
            current_delta_chunks: Current delta chunks

        Returns:
            Tuple of (events, reset_delta_chunks)
        """
        content_end = event["contentEnd"]
        verbose_logger.debug(f"Handling contentEnd: {content_end}")

        if not current_output_item_id or not current_response_id:
            return [], current_delta_chunks

        returned_messages: List[OpenAIRealtimeEvents] = []

        # Send the appropriate done event based on the delta type
        if current_delta_type == "text":
            # Accumulate the text from all tracked delta chunks
            accumulated_text = ""
            if current_delta_chunks:
                accumulated_text = "".join(
                    [chunk.get("delta", "") for chunk in current_delta_chunks]
                )

            text_done = OpenAIRealtimeResponseTextDone(
                type="response.text.done",
                content_index=0,
                event_id=f"event_{uuid.uuid4()}",
                item_id=current_output_item_id,
                output_index=0,
                response_id=current_response_id,
                text=accumulated_text,
            )
            returned_messages.append(text_done)

            # Send content_part.done
            content_part_done = OpenAIRealtimeContentPartDone(
                type="response.content_part.done",
                content_index=0,
                event_id=f"event_{uuid.uuid4()}",
                item_id=current_output_item_id,
                output_index=0,
                part={"type": "text", "text": accumulated_text},
                response_id=current_response_id,
            )
            returned_messages.append(content_part_done)

        elif current_delta_type == "audio":
            audio_done = OpenAIRealtimeResponseAudioDone(
                type="response.audio.done",
                content_index=0,
                event_id=f"event_{uuid.uuid4()}",
                item_id=current_output_item_id,
                output_index=0,
                response_id=current_response_id,
            )
            returned_messages.append(audio_done)

            # Send content_part.done
            content_part_done = OpenAIRealtimeContentPartDone(
                type="response.content_part.done",
                content_index=0,
                event_id=f"event_{uuid.uuid4()}",
                item_id=current_output_item_id,
                output_index=0,
                part={"type": "audio", "transcript": ""},
                response_id=current_response_id,
            )
            returned_messages.append(content_part_done)

        # Send output_item.done
        output_item_done = OpenAIRealtimeOutputItemDone(
            type="response.output_item.done",
            event_id=f"event_{uuid.uuid4()}",
            output_index=0,
            response_id=current_response_id,
            item={
                "id": current_output_item_id,
                "object": "realtime.item",
                "type": "message",
                "status": "completed",
                "role": "assistant",
                "content": [],
            },
        )
        returned_messages.append(output_item_done)

        # Reset delta chunks
        return returned_messages, None

    def transform_prompt_end_event(
        self,
        event: dict,
        current_response_id: Optional[str],
        current_conversation_id: Optional[str],
    ) -> tuple[
        List[OpenAIRealtimeEvents],
        Optional[str],
        Optional[str],
        Optional[ALL_DELTA_TYPES],
    ]:
        """
        Transform a Bedrock promptEnd event to OpenAI response.done.

        Args:
            event: Bedrock promptEnd event
            current_response_id: Current response ID
            current_conversation_id: Current conversation ID

        Returns:
            Tuple of (events, reset_output_item_id, reset_response_id, reset_delta_type)
        """
        verbose_logger.debug("Handling promptEnd")

        if not current_response_id or not current_conversation_id:
            return [], None, None, None

        usage_obj = get_empty_usage()
        response_done = OpenAIRealtimeDoneEvent(
            type="response.done",
            event_id=f"event_{uuid.uuid4()}",
            response=OpenAIRealtimeResponseDoneObject(
                object="realtime.response",
                id=current_response_id,
                status="completed",
                output=[],
                conversation_id=current_conversation_id,
                usage={
                    "prompt_tokens": usage_obj.prompt_tokens,
                    "completion_tokens": usage_obj.completion_tokens,
                    "total_tokens": usage_obj.total_tokens,
                },
            ),
        )

        # Reset state for the next response
        return [response_done], None, None, None

    def transform_tool_use_event(
        self,
        event: dict,
        current_output_item_id: Optional[str],
        current_response_id: Optional[str],
    ) -> tuple[List[OpenAIRealtimeEvents], str, str]:
        """
        Transform a Bedrock toolUse event to OpenAI format.

        Args:
            event: Bedrock toolUse event
            current_output_item_id: Current output item ID
            current_response_id: Current response ID

        Returns:
            Tuple of (events, tool_call_id, tool_name) for tracking
        """
        verbose_logger.debug("Handling toolUse")
        tool_use = event["toolUse"]

        if not current_output_item_id or not current_response_id:
            return [], "", ""

        # Parse the tool input
        tool_input = {}
        if "input" in tool_use:
            try:
                tool_input = (
                    json.loads(tool_use["input"])
                    if isinstance(tool_use["input"], str)
                    else tool_use["input"]
                )
            except json.JSONDecodeError:
                tool_input = {}

        tool_call_id = tool_use.get("toolUseId", "")
        tool_name = tool_use.get("toolName", "")

        # Create a function call arguments done event.
        # This is a custom event format that matches what clients expect.
        function_call_event: dict[str, Any] = {
            "type": "response.function_call_arguments.done",
            "event_id": f"event_{uuid.uuid4()}",
            "response_id": current_response_id,
            "item_id": current_output_item_id,
            "output_index": 0,
            "call_id": tool_call_id,
            "name": tool_name,
            "arguments": json.dumps(tool_input),
        }

        return (
            [cast(OpenAIRealtimeEvents, function_call_event)],
            tool_call_id,
            tool_name,
        )
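
    # Illustrative shape of the emitted event (field values are examples):
    #   {"type": "response.function_call_arguments.done",
    #    "call_id": "<toolUseId>", "name": "get_weather",
    #    "arguments": "{\"location\": \"Paris\"}", ...}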

    def transform_conversation_item_create_tool_result_event(
        self, json_message: dict
    ) -> List[str]:
        """
        Transform a conversation.item.create event carrying a tool result to Bedrock format.

        Args:
            json_message: OpenAI conversation.item.create message with tool result

        Returns:
            List of Bedrock format messages (JSON strings)
        """
        verbose_logger.debug("Handling conversation.item.create for tool result")
        messages: List[str] = []

        item = json_message.get("item", {})
        if item.get("type") == "function_call_output":
            tool_content_name = str(uuid_lib.uuid4())
            call_id = item.get("call_id", "")
            output = item.get("output", "")

            # Content start for tool result
            tool_content_start = {
                "event": {
                    "contentStart": {
                        "promptName": self.prompt_name,
                        "contentName": tool_content_name,
                        "interactive": False,
                        "type": "TOOL",
                        "role": "TOOL",
                        "toolResultInputConfiguration": {
                            "toolUseId": call_id,
                            "type": "TEXT",
                            "textInputConfiguration": {"mediaType": "text/plain"},
                        },
                    }
                }
            }
            messages.append(json.dumps(tool_content_start))

            # Tool result
            tool_result = {
                "event": {
                    "toolResult": {
                        "promptName": self.prompt_name,
                        "contentName": tool_content_name,
                        "content": output
                        if isinstance(output, str)
                        else json.dumps(output),
                    }
                }
            }
            messages.append(json.dumps(tool_result))

            # Content end
            tool_content_end = {
                "event": {
                    "contentEnd": {
                        "promptName": self.prompt_name,
                        "contentName": tool_content_name,
                    }
                }
            }
            messages.append(json.dumps(tool_content_end))

        return messages

    def transform_realtime_response(
        self,
        message: Union[str, bytes],
        model: str,
        logging_obj: LiteLLMLoggingObj,
        realtime_response_transform_input: RealtimeResponseTransformInput,
    ) -> RealtimeResponseTypedDict:
        """
        Transform a Bedrock Nova Sonic response to OpenAI realtime format.

        Args:
            message: Bedrock format message (JSON string)
            model: Model ID
            logging_obj: Logging object
            realtime_response_transform_input: Current state

        Returns:
            Transformed response with updated state
        """
        try:
            json_message = json.loads(message)
        except json.JSONDecodeError:
            message_preview = (
                message[:200].decode("utf-8", errors="replace")
                if isinstance(message, bytes)
                else message[:200]
            )
            verbose_logger.warning(f"Invalid JSON message: {message_preview}")
            # Return the state unchanged, with no events
            return {
                "response": [],
                "current_output_item_id": realtime_response_transform_input.get(
                    "current_output_item_id"
                ),
                "current_response_id": realtime_response_transform_input.get(
                    "current_response_id"
                ),
                "current_delta_chunks": realtime_response_transform_input.get(
                    "current_delta_chunks"
                ),
                "current_conversation_id": realtime_response_transform_input.get(
                    "current_conversation_id"
                ),
                "current_item_chunks": realtime_response_transform_input.get(
                    "current_item_chunks"
                ),
                "current_delta_type": realtime_response_transform_input.get(
                    "current_delta_type"
                ),
                "session_configuration_request": realtime_response_transform_input.get(
                    "session_configuration_request"
                ),
            }

        # Extract state
        current_output_item_id = realtime_response_transform_input.get(
            "current_output_item_id"
        )
        current_response_id = realtime_response_transform_input.get(
            "current_response_id"
        )
        current_conversation_id = realtime_response_transform_input.get(
            "current_conversation_id"
        )
        current_delta_chunks = realtime_response_transform_input.get(
            "current_delta_chunks"
        )
        current_delta_type = realtime_response_transform_input.get("current_delta_type")
        session_configuration_request = realtime_response_transform_input.get(
            "session_configuration_request"
        )

        returned_messages: List[OpenAIRealtimeEvents] = []

        # Parse Bedrock event
        event = json_message.get("event", {})

        # Route to the appropriate transformation method
        if "sessionStart" in event:
            session_created = self.transform_session_start_event(
                event, model, logging_obj
            )
            returned_messages.append(session_created)
            session_configuration_request = json.dumps({"configured": True})

        elif "contentStart" in event:
            (
                events,
                current_response_id,
                current_output_item_id,
                current_conversation_id,
                current_delta_type,
            ) = self.transform_content_start_event(
                event,
                current_response_id,
                current_output_item_id,
                current_conversation_id,
            )
            returned_messages.extend(events)

        elif "textOutput" in event:
            events, current_delta_chunks = self.transform_text_output_event(
                event,
                current_output_item_id,
                current_response_id,
                current_delta_chunks,
            )
            returned_messages.extend(events)

        elif "audioOutput" in event:
            events = self.transform_audio_output_event(
                event, current_output_item_id, current_response_id
            )
            returned_messages.extend(events)

        elif "contentEnd" in event:
            events, current_delta_chunks = self.transform_content_end_event(
                event,
                current_output_item_id,
                current_response_id,
                current_delta_type,
                current_delta_chunks,
            )
            returned_messages.extend(events)

        elif "toolUse" in event:
            events, tool_call_id, tool_name = self.transform_tool_use_event(
                event, current_output_item_id, current_response_id
            )
            returned_messages.extend(events)
            # Store tool call info for potential use
            verbose_logger.debug(f"Tool use event: {tool_name} (ID: {tool_call_id})")

        elif "promptEnd" in event:
            (
                events,
                current_output_item_id,
                current_response_id,
                current_delta_type,
            ) = self.transform_prompt_end_event(
                event, current_response_id, current_conversation_id
            )
            returned_messages.extend(events)

        return {
            "response": returned_messages,
            "current_output_item_id": current_output_item_id,
            "current_response_id": current_response_id,
            "current_delta_chunks": current_delta_chunks,
            "current_conversation_id": current_conversation_id,
            "current_item_chunks": realtime_response_transform_input.get(
                "current_item_chunks"
            ),
            "current_delta_type": current_delta_type,
            "session_configuration_request": session_configuration_request,
        }
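

# Minimal end-to-end sketch (hypothetical transport objects; the real wiring
# lives in the realtime handler that owns this config, not in this module):
#
#   config = BedrockRealtimeConfig()
#   result = config.transform_realtime_response(
#       bedrock_json,
#       model="amazon.nova-sonic-v1:0",
#       logging_obj=logging_obj,
#       realtime_response_transform_input=state,
#   )
#   for openai_event in result["response"]:
#       await client_ws.send(json.dumps(openai_event))  # hypothetical websocket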