chore: initial snapshot for gitea/github upload

This commit is contained in:
Your Name
2026-03-26 16:04:46 +08:00
commit a699a1ac98
3497 changed files with 1586237 additions and 0 deletions

View File

@@ -0,0 +1,200 @@
"""
Translates from OpenAI's `/v1/audio/transcriptions` to ElevenLabs's `/v1/speech-to-text`
"""
from typing import List, Optional, Union
from httpx import Headers, Response
import litellm
from litellm.litellm_core_utils.audio_utils.utils import process_audio_file
from litellm.llms.base_llm.chat.transformation import BaseLLMException
from litellm.secret_managers.main import get_secret_str
from litellm.types.llms.openai import (
AllMessageValues,
OpenAIAudioTranscriptionOptionalParams,
)
from litellm.types.utils import FileTypes, TranscriptionResponse
from ...base_llm.audio_transcription.transformation import (
AudioTranscriptionRequestData,
BaseAudioTranscriptionConfig,
)
from ..common_utils import ElevenLabsException
class ElevenLabsAudioTranscriptionConfig(BaseAudioTranscriptionConfig):
    """
    Adapter between OpenAI-style audio transcription calls and the
    ElevenLabs ``/v1/speech-to-text`` endpoint.
    """

    @property
    def custom_llm_provider(self) -> str:
        """Return the provider identifier, ``litellm.LlmProviders.ELEVENLABS``."""
        return litellm.LlmProviders.ELEVENLABS.value

    def get_supported_openai_params(
        self, model: str
    ) -> List[OpenAIAudioTranscriptionOptionalParams]:
        """Return the OpenAI transcription parameters this config maps."""
        return ["language", "temperature"]

    def map_openai_params(
        self,
        non_default_params: dict,
        optional_params: dict,
        model: str,
        drop_params: bool,
    ) -> dict:
        """
        Copy the supported OpenAI parameters into ``optional_params``.

        ``language`` is stored under the ElevenLabs name ``language_code``;
        every other supported parameter keeps its name. Keys that are not
        supported are ignored (``drop_params`` is not consulted).

        Returns:
            dict: the same ``optional_params`` object, updated in place.
        """
        supported_params = self.get_supported_openai_params(model)
        for k, v in non_default_params.items():
            if k not in supported_params:
                continue
            if k == "language":
                # Map OpenAI language format to ElevenLabs language_code
                optional_params["language_code"] = v
            else:
                optional_params[k] = v
        return optional_params

    def get_error_class(
        self, error_message: str, status_code: int, headers: Union[dict, Headers]
    ) -> BaseLLMException:
        """Wrap an error message, status code and headers in an ``ElevenLabsException``."""
        return ElevenLabsException(
            message=error_message, status_code=status_code, headers=headers
        )

    def transform_audio_transcription_request(
        self,
        model: str,
        audio_file: FileTypes,
        optional_params: dict,
        litellm_params: dict,
    ) -> AudioTranscriptionRequestData:
        """
        Transforms the audio transcription request for ElevenLabs API.

        The form data always carries ``model_id``; every other value is
        converted with ``str()``. Parameters whose value is ``None`` are
        omitted so the literal text ``"None"`` is never sent as a form field.

        Returns:
            AudioTranscriptionRequestData: Structured data with form data and files
        """
        # Use common utility to process the audio file
        processed_audio = process_audio_file(audio_file)

        # The supported-parameter list does not change during a request, so
        # resolve it once instead of once per loop iteration.
        supported_params = self.get_supported_openai_params(model)

        # Prepare form data
        form_data = {"model_id": model}

        #########################################################
        # Add OpenAI Compatible Parameters
        #########################################################
        for key, value in optional_params.items():
            if key in supported_params and value is not None:
                form_data[key] = str(value)

        #########################################################
        # Add Provider Specific Parameters
        #########################################################
        # NOTE(review): presumably returns the optional params that are not
        # in ``openai_params`` (e.g. ``language_code``) -- confirm against
        # the base class.
        provider_specific_params = self.get_provider_specific_params(
            model=model,
            optional_params=optional_params,
            openai_params=supported_params,
        )
        for key, value in provider_specific_params.items():
            if value is None:
                continue
            form_data[key] = str(value)

        # Prepare files
        files = {
            "file": (
                processed_audio.filename,
                processed_audio.file_content,
                processed_audio.content_type,
            )
        }
        return AudioTranscriptionRequestData(data=form_data, files=files)

    def transform_audio_transcription_response(
        self,
        raw_response: Response,
    ) -> TranscriptionResponse:
        """
        Transforms the raw response from ElevenLabs to the TranscriptionResponse format

        Sets ``task`` to ``"transcribe"``, ``language`` from ``language_code``
        (``"unknown"`` when absent) and, when the payload has ``words``,
        keeps only the entries whose ``type`` is ``"word"``. The full JSON
        payload is stored in ``_hidden_params``.

        Raises:
            ValueError: if the body cannot be parsed or transformed; the
                original exception is attached as ``__cause__``.
        """
        try:
            response_json = raw_response.json()

            # Create TranscriptionResponse object from the main transcript text
            response = TranscriptionResponse(text=response_json.get("text", ""))

            # Add additional metadata matching OpenAI format
            response["task"] = "transcribe"
            response["language"] = response_json.get("language_code", "unknown")

            # Map ElevenLabs words to OpenAI format
            if "words" in response_json:
                # Only include actual words, skip spacing and audio events
                response["words"] = [
                    {
                        "word": word_data.get("text", ""),
                        "start": word_data.get("start", 0),
                        "end": word_data.get("end", 0),
                    }
                    for word_data in response_json["words"]
                    if word_data.get("type") == "word"
                ]

            # Store full response in hidden params
            response._hidden_params = response_json
            return response
        except Exception as e:
            raise ValueError(
                f"Error transforming ElevenLabs response: {str(e)}\nResponse: {raw_response.text}"
            ) from e

    def get_complete_url(
        self,
        api_base: Optional[str],
        api_key: Optional[str],
        model: str,
        optional_params: dict,
        litellm_params: dict,
        stream: Optional[bool] = None,
    ) -> str:
        """
        Build the ElevenLabs speech-to-text URL.

        The base comes from ``api_base``, else the ``ELEVENLABS_API_BASE``
        secret, else ``https://api.elevenlabs.io``. An empty ``api_base`` is
        treated like ``None`` so the result is never a relative URL.
        """
        base_url = (
            api_base
            or get_secret_str("ELEVENLABS_API_BASE")
            or "https://api.elevenlabs.io"
        )
        base_url = base_url.rstrip("/")  # Remove trailing slash if present
        # ElevenLabs speech-to-text endpoint
        return f"{base_url}/v1/speech-to-text"

    def validate_environment(
        self,
        headers: dict,
        model: str,
        messages: List[AllMessageValues],
        optional_params: dict,
        litellm_params: dict,
        api_key: Optional[str] = None,
        api_base: Optional[str] = None,
    ) -> dict:
        """
        Add the ElevenLabs ``xi-api-key`` header to ``headers``.

        The key comes from ``api_key`` or, failing that, the
        ``ELEVENLABS_API_KEY`` secret.

        Returns:
            dict: the same ``headers`` object, updated in place.

        Raises:
            ValueError: if no API key can be resolved.
        """
        api_key = api_key or get_secret_str("ELEVENLABS_API_KEY")
        if api_key is None:
            raise ValueError(
                "ElevenLabs API key is required. Set ELEVENLABS_API_KEY environment variable."
            )
        headers.update({"xi-api-key": api_key})
        return headers

View File

@@ -0,0 +1,5 @@
from litellm.llms.base_llm.chat.transformation import BaseLLMException
class ElevenLabsException(BaseLLMException):
    """ElevenLabs-specific error type; adds nothing on top of ``BaseLLMException``."""

View File

@@ -0,0 +1,330 @@
"""
ElevenLabs Text-to-Speech transformation
Maps OpenAI TTS spec to ElevenLabs TTS API
"""
from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Union
from urllib.parse import urlencode
import httpx
from httpx import Headers
import litellm
from litellm.types.utils import all_litellm_params
from litellm.llms.base_llm.chat.transformation import BaseLLMException
from litellm.llms.base_llm.text_to_speech.transformation import (
BaseTextToSpeechConfig,
TextToSpeechRequestData,
)
from litellm.secret_managers.main import get_secret_str
from ..common_utils import ElevenLabsException
# These two names are only used in annotations in this module. Static type
# checkers get the real classes; at runtime they are bound to ``Any`` so the
# imports below are not executed when the module loads.
if TYPE_CHECKING:
    from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj
    from litellm.types.llms.openai import HttpxBinaryResponseContent
else:
    LiteLLMLoggingObj = Any
    HttpxBinaryResponseContent = Any
class ElevenLabsTextToSpeechConfig(BaseTextToSpeechConfig):
    """
    Configuration for ElevenLabs Text-to-Speech

    Reference: https://elevenlabs.io/docs/api-reference/text-to-speech/convert
    """

    TTS_BASE_URL = "https://api.elevenlabs.io"
    TTS_ENDPOINT_PATH = "/v1/text-to-speech"
    DEFAULT_OUTPUT_FORMAT = "pcm_44100"

    # Voice aliases (looked up lower-cased) mapped to ElevenLabs voice ids
    VOICE_MAPPINGS = {
        "alloy": "21m00Tcm4TlvDq8ikWAM",  # Rachel
        "amber": "5Q0t7uMcjvnagumLfvZi",  # Paul
        "ash": "AZnzlk1XvdvUeBnXmlld",  # Domi
        "august": "D38z5RcWu1voky8WS1ja",  # Fin
        "blue": "2EiwWnXFnvU5JabPnv8n",  # Clyde
        "coral": "9BWtsMINqrJLrRacOk9x",  # Aria
        "lily": "EXAVITQu4vr4xnSDxMaL",  # Sarah
        "onyx": "29vD33N1CtxCmqQRPOHJ",  # Drew
        "sage": "CwhRBWXzGAHq8TQ4Fs17",  # Roger
        "verse": "CYw3kZ02Hs0563khs1Fj",  # Dave
    }

    # Response format mappings from OpenAI to ElevenLabs
    FORMAT_MAPPINGS = {
        "mp3": "mp3_44100_128",
        "pcm": "pcm_44100",
        "opus": "opus_48000_128",
        # ElevenLabs does not support WAV, AAC, or FLAC formats.
    }

    # Internal keys used to hand the query string and voice id from
    # ``map_openai_params`` (written into ``kwargs``) to ``get_complete_url``
    # (read from ``litellm_params``).
    ELEVENLABS_QUERY_PARAMS_KEY = "__elevenlabs_query_params__"
    ELEVENLABS_VOICE_ID_KEY = "__elevenlabs_voice_id__"

    def get_supported_openai_params(self, model: str) -> list:
        """
        ElevenLabs TTS supports these OpenAI parameters
        """
        return ["voice", "response_format", "speed"]

    def _extract_voice_id(self, voice: str) -> str:
        """
        Normalize the provided voice information into an ElevenLabs voice_id.

        Surrounding whitespace is stripped. A name found in
        ``VOICE_MAPPINGS`` (case-insensitive) is replaced by its mapped id;
        anything else is returned as given (stripped).
        """
        normalized_voice = voice.strip()
        mapped_voice = self.VOICE_MAPPINGS.get(normalized_voice.lower())
        return mapped_voice or normalized_voice

    def _resolve_voice_id(
        self,
        voice: Optional[Union[str, Dict[str, Any]]],
        params: Dict[str, Any],
    ) -> str:
        """
        Determine the ElevenLabs voice_id based on provided voice input or parameters.

        Resolution order:
          1. ``voice`` as a non-blank string;
          2. ``voice`` as a dict: first non-blank string under
             ``voice_id``, ``id`` or ``name``;
          3. any other non-``None`` ``voice``, via ``str(voice)``;
          4. ``params["voice_id"]`` (popped from ``params``), used only when
             the steps above produced nothing.

        A blank or whitespace-only ``voice`` string is treated as "not
        provided", so the ``voice_id`` fallback and the error below still
        apply instead of an empty id being returned.

        Raises:
            ValueError: if no non-empty voice id can be determined.
        """
        mapped_voice: Optional[str] = None

        if isinstance(voice, str):
            if voice.strip():
                mapped_voice = self._extract_voice_id(voice)
        elif isinstance(voice, dict):
            for key in ("voice_id", "id", "name"):
                candidate = voice.get(key)
                if isinstance(candidate, str) and candidate.strip():
                    mapped_voice = self._extract_voice_id(candidate)
                    break
        elif voice is not None:
            mapped_voice = self._extract_voice_id(str(voice))

        if not mapped_voice:
            voice_override = params.pop("voice_id", None)
            if isinstance(voice_override, str) and voice_override.strip():
                mapped_voice = self._extract_voice_id(voice_override)

        if not mapped_voice:
            raise ValueError(
                "ElevenLabs voice_id is required. Pass `voice` when calling `litellm.speech()`."
            )
        return mapped_voice

    def map_openai_params(
        self,
        model: str,
        optional_params: Dict,
        voice: Optional[Union[str, Dict]] = None,
        drop_params: bool = False,
        kwargs: Optional[Dict[str, Any]] = None,
    ) -> Tuple[Optional[str], Dict]:
        """
        Map OpenAI parameters to ElevenLabs TTS parameters

        * ``voice`` / ``voice_id`` -> resolved ElevenLabs voice id.
        * ``response_format`` -> ``output_format`` query parameter, translated
          through ``FORMAT_MAPPINGS`` (unknown values pass through unchanged).
        * ``speed`` -> ``voice_settings["speed"]`` when it converts to a float;
          otherwise it is dropped.
        * ``instructions`` is dropped.

        ``optional_params`` and any nested ``voice_settings`` dict are not
        modified. ``kwargs``, when given, IS modified in place: passthrough
        keys are moved into the returned params, and the voice id and query
        parameters are stored under the ``ELEVENLABS_*_KEY`` names.

        Returns:
            Tuple[Optional[str], Dict]: the voice id and the mapped params.

        Raises:
            ValueError: if no voice id can be resolved.
        """
        mapped_params: Dict[str, Any] = {}
        query_params: Dict[str, Any] = {}

        # Work on a copy so we don't mutate the caller's dictionary
        params = dict(optional_params) if optional_params else {}
        passthrough_kwargs: Dict[str, Any] = kwargs if kwargs is not None else {}

        # Extract voice identifier
        mapped_voice = self._resolve_voice_id(voice, params)

        # Response/output format -> query parameter
        response_format = params.pop("response_format", None)
        if isinstance(response_format, str):
            query_params["output_format"] = self.FORMAT_MAPPINGS.get(
                response_format, response_format
            )

        # OpenAI's top-level `speed` is carried inside `voice_settings`.
        speed = params.pop("speed", None)
        if speed is not None:
            speed_value: Optional[float]
            try:
                speed_value = float(speed)
            except (TypeError, ValueError):
                speed_value = None
            if speed_value is not None:
                existing_settings = params.get("voice_settings")
                # `params` is only a shallow copy, so copy the nested dict
                # before writing to it; otherwise the caller's
                # `voice_settings` would be mutated.
                voice_settings = (
                    dict(existing_settings)
                    if isinstance(existing_settings, dict)
                    else {}
                )
                voice_settings["speed"] = speed_value
                params["voice_settings"] = voice_settings

        # Instructions parameter is OpenAI-specific; omit to prevent API errors.
        params.pop("instructions", None)

        self._add_elevenlabs_specific_params(
            mapped_voice=mapped_voice,
            query_params=query_params,
            mapped_params=mapped_params,
            kwargs=passthrough_kwargs,
            remaining_params=params,
        )
        return mapped_voice, mapped_params

    def validate_environment(
        self,
        headers: dict,
        model: str,
        api_key: Optional[str] = None,
        api_base: Optional[str] = None,
    ) -> dict:
        """
        Validate the ElevenLabs environment and set up authentication headers.

        The key is taken from ``api_key``, then ``litellm.api_key``, then the
        ``ELEVENLABS_API_KEY`` secret. ``litellm.openai_key`` is deliberately
        not consulted: it is a credential for a different provider and must
        not be sent to ElevenLabs.

        Returns:
            dict: the same ``headers`` object, updated in place with
            ``xi-api-key`` and a JSON ``Content-Type``.

        Raises:
            ValueError: if no API key can be resolved.
        """
        api_key = (
            api_key
            or litellm.api_key
            or get_secret_str("ELEVENLABS_API_KEY")
        )
        if api_key is None:
            raise ValueError(
                "ElevenLabs API key is required. Set ELEVENLABS_API_KEY environment variable."
            )
        headers.update(
            {
                "xi-api-key": api_key,
                "Content-Type": "application/json",
            }
        )
        return headers

    def get_error_class(
        self, error_message: str, status_code: int, headers: Union[dict, Headers]
    ) -> BaseLLMException:
        """Wrap an error message, status code and headers in an ``ElevenLabsException``."""
        return ElevenLabsException(
            message=error_message, status_code=status_code, headers=headers
        )

    def transform_text_to_speech_request(
        self,
        model: str,
        input: str,
        voice: Optional[str],
        optional_params: Dict,
        litellm_params: Dict,
        headers: dict,
    ) -> TextToSpeechRequestData:
        """
        Build the ElevenLabs TTS request payload.

        The body starts with ``text`` and ``model_id``; non-``None`` entries
        of ``optional_params`` are layered on top, then non-``None`` entries
        of its ``extra_body`` dict (later layers win). ``voice``,
        ``litellm_params`` and ``headers`` are not used here.
        """
        params = dict(optional_params) if optional_params else {}
        extra_body = params.pop("extra_body", None)

        request_body: Dict[str, Any] = {
            "text": input,
            "model_id": model,
        }
        request_body.update(
            {key: value for key, value in params.items() if value is not None}
        )
        if isinstance(extra_body, dict):
            request_body.update(
                {key: value for key, value in extra_body.items() if value is not None}
            )

        return TextToSpeechRequestData(
            dict_body=request_body,
            headers={"Content-Type": "application/json"},
        )

    def _add_elevenlabs_specific_params(
        self,
        mapped_voice: str,
        query_params: Dict[str, Any],
        mapped_params: Dict[str, Any],
        kwargs: Optional[Dict[str, Any]],
        remaining_params: Dict[str, Any],
    ) -> None:
        """
        Merge provider-specific parameters into ``mapped_params`` and record
        the routing data in ``kwargs``.

        Non-``None`` values are merged in this order, later sources
        overriding earlier ones: ``remaining_params``, then
        ``kwargs["extra_body"]``, then every other non-reserved ``kwargs``
        entry. Non-reserved entries that were merged are removed from
        ``kwargs``. Finally the voice id is stored under
        ``ELEVENLABS_VOICE_ID_KEY`` and the query parameters under
        ``ELEVENLABS_QUERY_PARAMS_KEY`` (that key is removed when there are
        no query parameters).
        """
        if kwargs is None:
            kwargs = {}

        for key, value in remaining_params.items():
            if value is None:
                continue
            mapped_params[key] = value

        # Keys that must stay in kwargs and never reach the request body.
        reserved_kwarg_keys = set(all_litellm_params) | {
            self.ELEVENLABS_QUERY_PARAMS_KEY,
            self.ELEVENLABS_VOICE_ID_KEY,
            "voice",
            "model",
            "response_format",
            "output_format",
            "extra_body",
            "user",
        }

        extra_body_from_kwargs = kwargs.pop("extra_body", None)
        if isinstance(extra_body_from_kwargs, dict):
            for key, value in extra_body_from_kwargs.items():
                if value is None:
                    continue
                mapped_params[key] = value

        # Iterate over a snapshot of the keys because entries are popped.
        for key in list(kwargs):
            if key in reserved_kwarg_keys:
                continue
            value = kwargs[key]
            if value is None:
                continue
            mapped_params[key] = value
            kwargs.pop(key, None)

        if query_params:
            kwargs[self.ELEVENLABS_QUERY_PARAMS_KEY] = query_params
        else:
            kwargs.pop(self.ELEVENLABS_QUERY_PARAMS_KEY, None)
        kwargs[self.ELEVENLABS_VOICE_ID_KEY] = mapped_voice

    def transform_text_to_speech_response(
        self,
        model: str,
        raw_response: httpx.Response,
        logging_obj: LiteLLMLoggingObj,
    ) -> "HttpxBinaryResponseContent":
        """
        Wrap ElevenLabs binary audio response.
        """
        # Imported here because the module-level name is bound to ``Any``
        # at runtime.
        from litellm.types.llms.openai import HttpxBinaryResponseContent

        return HttpxBinaryResponseContent(raw_response)

    def get_complete_url(
        self,
        model: str,
        api_base: Optional[str],
        litellm_params: dict,
    ) -> str:
        """
        Construct the ElevenLabs endpoint URL, including path voice_id and query params.

        The base comes from ``api_base``, else the ``ELEVENLABS_API_BASE``
        secret, else ``TTS_BASE_URL``. The voice id and query parameters are
        read from ``litellm_params`` under the ``ELEVENLABS_*_KEY`` names.

        Raises:
            ValueError: if ``litellm_params`` holds no non-blank string voice id.
        """
        # Local import: the module itself only imports ``urlencode``.
        from urllib.parse import quote

        base_url = (
            api_base or get_secret_str("ELEVENLABS_API_BASE") or self.TTS_BASE_URL
        )
        base_url = base_url.rstrip("/")

        voice_id = litellm_params.get(self.ELEVENLABS_VOICE_ID_KEY)
        if not isinstance(voice_id, str) or not voice_id.strip():
            raise ValueError(
                "ElevenLabs voice_id is required. Pass `voice` when calling `litellm.speech()`."
            )

        # The voice id is caller-supplied; percent-encode it so characters
        # such as "/", "?" or "#" cannot change the request path or query.
        url = f"{base_url}{self.TTS_ENDPOINT_PATH}/{quote(voice_id, safe='')}"

        query_params = litellm_params.get(self.ELEVENLABS_QUERY_PARAMS_KEY, {})
        if query_params:
            url = f"{url}?{urlencode(query_params)}"
        return url