chore: initial public snapshot for github upload

This commit is contained in:
Your Name
2026-03-26 20:06:14 +08:00
commit 0e5ecd930e
3497 changed files with 1586236 additions and 0 deletions

View File

@@ -0,0 +1,777 @@
"""
DataDog Integration - sends logs to /api/v2/log
DD Reference API: https://docs.datadoghq.com/api/latest/logs
`async_log_success_event` - used by litellm proxy to send logs to datadog
`log_success_event` - sync version of logging to DataDog, only used on litellm Python SDK, if user opts in to using sync functions
async_log_success_event: will store batch of DD_MAX_BATCH_SIZE in memory and flush to Datadog once it reaches DD_MAX_BATCH_SIZE or every 5 seconds
async_service_failure_hook: Logs failures from Redis, Postgres (Adjacent systems), as 'WARNING' on DataDog
For batching specific details see CustomBatchLogger class
"""
import asyncio
import datetime
import os
import traceback
from datetime import datetime as datetimeObj
from typing import Any, Dict, List, Optional, Union
import httpx
from httpx import Response
import litellm
from litellm._logging import verbose_logger
from litellm._uuid import uuid
from litellm.integrations.custom_batch_logger import CustomBatchLogger
from litellm.integrations.datadog.datadog_mock_client import (
should_use_datadog_mock,
create_mock_datadog_client,
)
from litellm.integrations.datadog.datadog_handler import (
get_datadog_hostname,
get_datadog_service,
get_datadog_source,
get_datadog_tags,
get_datadog_base_url_from_env,
)
from litellm.litellm_core_utils.dd_tracing import tracer
from litellm.llms.custom_httpx.http_handler import (
_get_httpx_client,
get_async_httpx_client,
httpxSpecialProvider,
)
from litellm.types.integrations.base_health_check import IntegrationHealthCheckStatus
from litellm.types.integrations.datadog import (
DD_ERRORS,
DD_MAX_BATCH_SIZE,
DataDogStatus,
DatadogInitParams,
DatadogPayload,
DatadogProxyFailureHookJsonMessage,
)
from litellm.types.services import ServiceLoggerPayload, ServiceTypes
from litellm.types.utils import StandardLoggingPayload
from ..additional_logging_utils import AdditionalLoggingUtils
# NOTE: the max number of logs the DD API accepts per request is DD_MAX_BATCH_SIZE (imported above)
# specify what ServiceTypes are logged as success events to DD. (We don't want to spam DD traces with large number of service types)
DD_LOGGED_SUCCESS_SERVICE_TYPES = [
    ServiceTypes.RESET_BUDGET_JOB,
]
class DataDogLogger(
    CustomBatchLogger,
    AdditionalLoggingUtils,
):
    # Class variables or attributes
    def __init__(
        self,
        **kwargs,
    ):
        """
        Initializes the datadog logger, checks if the correct env variables are set

        Required environment variables (Direct API):
            `DD_API_KEY` - your datadog api key
            `DD_SITE` - your datadog site, example = `"us5.datadoghq.com"`
        Optional environment variables (DataDog Agent):
            `LITELLM_DD_AGENT_HOST` - hostname or IP of DataDog agent, example = `"localhost"`
            `LITELLM_DD_AGENT_PORT` - port of DataDog agent (default: 10518 for logs)

        Note: We use LITELLM_DD_AGENT_HOST instead of DD_AGENT_HOST to avoid conflicts
        with ddtrace which automatically sets DD_AGENT_HOST for APM tracing.

        Raises:
            Exception: re-raised if any part of initialization fails
                (e.g. missing DD_API_KEY/DD_SITE in direct-API mode).
        """
        try:
            verbose_logger.debug("Datadog: in init datadog logger")
            # In mock mode all outbound Datadog API calls are intercepted
            self.is_mock_mode = should_use_datadog_mock()
            if self.is_mock_mode:
                create_mock_datadog_client()
                verbose_logger.debug(
                    "[DATADOG MOCK] Datadog logger initialized in mock mode"
                )
            #########################################################
            # Handle datadog_params set as litellm.datadog_params
            #########################################################
            dict_datadog_params = self._get_datadog_params()
            kwargs.update(dict_datadog_params)
            self.async_client = get_async_httpx_client(
                llm_provider=httpxSpecialProvider.LoggingCallback
            )
            # Configure DataDog endpoint (Agent or Direct API)
            # Use LITELLM_DD_AGENT_HOST to avoid conflicts with ddtrace's DD_AGENT_HOST
            dd_agent_host = os.getenv("LITELLM_DD_AGENT_HOST")
            if dd_agent_host:
                self._configure_dd_agent(dd_agent_host=dd_agent_host)
            else:
                self._configure_dd_direct_api()
            # Optional override for testing
            dd_base_url = get_datadog_base_url_from_env()
            if dd_base_url:
                self.intake_url = f"{dd_base_url}/api/v2/logs"
            self.sync_client = _get_httpx_client()
            # NOTE(review): create_task requires an already-running event loop.
            # It only *schedules* periodic_flush, so flush_lock (assigned on the
            # next line) is in place before the task first executes.
            asyncio.create_task(self.periodic_flush())
            self.flush_lock = asyncio.Lock()
            super().__init__(
                **kwargs, flush_lock=self.flush_lock, batch_size=DD_MAX_BATCH_SIZE
            )
        except Exception as e:
            verbose_logger.exception(
                f"Datadog: Got exception on init Datadog client {str(e)}"
            )
            raise e
    def _get_datadog_params(self) -> Dict:
        """
        Get the datadog_params from litellm.datadog_params

        These are params specific to initializing the DataDogLogger e.g. turn_off_message_logging

        Returns:
            Dict: validated init params (empty dict when litellm.datadog_params is unset).
        """
        dict_datadog_params: Dict = {}
        if litellm.datadog_params is not None:
            if isinstance(litellm.datadog_params, DatadogInitParams):
                dict_datadog_params = litellm.datadog_params.model_dump()
            elif isinstance(litellm.datadog_params, Dict):
                # only allow params that are of DatadogInitParams
                dict_datadog_params = DatadogInitParams(
                    **litellm.datadog_params
                ).model_dump()
        return dict_datadog_params
    def _configure_dd_agent(self, dd_agent_host: str) -> None:
        """
        Configure DataDog Agent for log forwarding

        Sets `self.intake_url` to the agent's logs endpoint and reads the
        (optional, in agent mode) `DD_API_KEY`.

        Args:
            dd_agent_host: Hostname or IP of DataDog agent
        """
        dd_agent_port = os.getenv(
            "LITELLM_DD_AGENT_PORT", "10518"
        )  # default port for logs
        self.intake_url = f"http://{dd_agent_host}:{dd_agent_port}/api/v2/logs"
        self.DD_API_KEY = os.getenv("DD_API_KEY")  # Optional when using agent
        verbose_logger.debug(f"Datadog: Using DD Agent at {self.intake_url}")
    def _configure_dd_direct_api(self) -> None:
        """
        Configure direct DataDog API connection

        Sets `self.DD_API_KEY` and `self.intake_url` from `DD_API_KEY`/`DD_SITE`.

        Raises:
            Exception: If required environment variables are not set
        """
        # NOTE(review): both error messages below are missing the closing quote
        # after `<>` — cosmetic, but worth fixing upstream.
        if os.getenv("DD_API_KEY", None) is None:
            raise Exception("DD_API_KEY is not set, set 'DD_API_KEY=<>")
        if os.getenv("DD_SITE", None) is None:
            raise Exception("DD_SITE is not set in .env, set 'DD_SITE=<>")
        self.DD_API_KEY = os.getenv("DD_API_KEY")
        self.intake_url = f"https://http-intake.logs.{os.getenv('DD_SITE')}/api/v2/logs"
    async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
        """
        Async Log success events to Datadog

        - Creates a Datadog payload
        - Adds the Payload to the in memory logs queue
        - Payload is flushed every 10 seconds or when batch size is greater than 100

        Raises:
            Raises a NON Blocking verbose_logger.exception if an error occurs
        """
        try:
            verbose_logger.debug(
                "Datadog: Logging - Enters logging function for model %s", kwargs
            )
            await self._log_async_event(kwargs, response_obj, start_time, end_time)
        except Exception as e:
            verbose_logger.exception(
                f"Datadog Layer Error - {str(e)}\n{traceback.format_exc()}"
            )
            pass
    async def async_log_failure_event(self, kwargs, response_obj, start_time, end_time):
        # Same queue/flush path as success events; payload status is derived
        # from the standard_logging_object inside create_datadog_logging_payload.
        try:
            verbose_logger.debug(
                "Datadog: Logging - Enters logging function for model %s", kwargs
            )
            await self._log_async_event(kwargs, response_obj, start_time, end_time)
        except Exception as e:
            verbose_logger.exception(
                f"Datadog Layer Error - {str(e)}\n{traceback.format_exc()}"
            )
            pass
    async def async_post_call_failure_hook(
        self,
        request_data: dict,
        original_exception: Exception,
        user_api_key_dict: Any,
        traceback_str: Optional[str] = None,
    ) -> Optional[Any]:
        """
        Log proxy-level failures (e.g. 401 auth, DB connection errors) to Datadog.

        Ensures failures that occur before or outside the LLM completion flow
        (e.g. ConnectError during auth when DB is down) are visible in Datadog
        alongside Prometheus.

        Errors are swallowed (logged via verbose_logger); always returns None.
        """
        try:
            from litellm.litellm_core_utils.litellm_logging import (
                StandardLoggingPayloadSetup,
            )
            from litellm.litellm_core_utils.safe_json_dumps import safe_dumps
            error_information = StandardLoggingPayloadSetup.get_error_information(
                original_exception=original_exception,
                traceback_str=traceback_str,
            )
            # Only treat the error code as an HTTP status when it is purely numeric
            _code = error_information.get("error_code") or ""
            status_code: Optional[int] = None
            if _code and str(_code).strip().isdigit():
                status_code = int(_code)
            # Use project-standard sanitized user context when running in proxy
            user_context: Dict[str, Any] = {}
            try:
                from litellm.proxy.litellm_pre_call_utils import (
                    LiteLLMProxyRequestSetup,
                )
                _meta = (
                    LiteLLMProxyRequestSetup.get_sanitized_user_information_from_key(
                        user_api_key_dict=user_api_key_dict
                    )
                )
                user_context = dict(_meta) if isinstance(_meta, dict) else _meta
            except Exception:
                # Fallback if proxy not available (e.g. SDK-only): minimal safe fields
                if hasattr(user_api_key_dict, "request_route"):
                    user_context["request_route"] = getattr(
                        user_api_key_dict, "request_route", None
                    )
                if hasattr(user_api_key_dict, "team_id"):
                    user_context["team_id"] = getattr(
                        user_api_key_dict, "team_id", None
                    )
                if hasattr(user_api_key_dict, "user_id"):
                    user_context["user_id"] = getattr(
                        user_api_key_dict, "user_id", None
                    )
                if hasattr(user_api_key_dict, "end_user_id"):
                    user_context["end_user_id"] = getattr(
                        user_api_key_dict, "end_user_id", None
                    )
            message_payload: DatadogProxyFailureHookJsonMessage = {
                "exception": error_information.get("error_message")
                or str(original_exception),
                "error_class": error_information.get("error_class")
                or original_exception.__class__.__name__,
                "status_code": status_code,
                "traceback": error_information.get("traceback") or "",
                "user_api_key_dict": user_context,
            }
            dd_payload = DatadogPayload(
                ddsource=get_datadog_source(),
                ddtags=get_datadog_tags(),
                hostname=get_datadog_hostname(),
                message=safe_dumps(message_payload),
                service=get_datadog_service(),
                status=DataDogStatus.ERROR,
            )
            self._add_trace_context_to_payload(dd_payload=dd_payload)
            self.log_queue.append(dd_payload)
            if len(self.log_queue) >= self.batch_size:
                await self.async_send_batch()
        except Exception as e:
            verbose_logger.exception(
                f"Datadog: async_post_call_failure_hook - {str(e)}\n{traceback.format_exc()}"
            )
        return None
    async def async_send_batch(self):
        """
        Sends the in memory logs queue to datadog api

        Logs sent to /api/v2/logs
        DD Ref: https://docs.datadoghq.com/api/latest/logs/

        Raises:
            Raises a NON Blocking verbose_logger.exception if an error occurs
        """
        try:
            if not self.log_queue:
                verbose_logger.exception("Datadog: log_queue does not exist")
                return
            verbose_logger.debug(
                "Datadog - about to flush %s events on %s",
                len(self.log_queue),
                self.intake_url,
            )
            if self.is_mock_mode:
                verbose_logger.debug(
                    "[DATADOG MOCK] Mock mode enabled - API calls will be intercepted"
                )
            response = await self.async_send_compressed_data(self.log_queue)
            # 413 = payload too large; log and drop the batch rather than retry
            if response.status_code == 413:
                verbose_logger.exception(DD_ERRORS.DATADOG_413_ERROR.value)
                return
            response.raise_for_status()
            # DD logs intake returns 202 Accepted on success
            if response.status_code != 202:
                raise Exception(
                    f"Response from datadog API status_code: {response.status_code}, text: {response.text}"
                )
            if self.is_mock_mode:
                verbose_logger.debug(
                    f"[DATADOG MOCK] Batch of {len(self.log_queue)} events successfully mocked"
                )
            else:
                verbose_logger.debug(
                    "Datadog: Response from datadog API status_code: %s, text: %s",
                    response.status_code,
                    response.text,
                )
        except Exception as e:
            verbose_logger.exception(
                f"Datadog Error sending batch API - {str(e)}\n{traceback.format_exc()}"
            )
    def log_success_event(self, kwargs, response_obj, start_time, end_time):
        """
        Sync Log success events to Datadog

        - Creates a Datadog payload
        - instantly logs it on DD API (no batching, unlike the async path)
        """
        try:
            # legacy (v0) payload shape is opt-in via litellm.datadog_use_v1
            if litellm.datadog_use_v1 is True:
                dd_payload = self._create_v0_logging_payload(
                    kwargs=kwargs,
                    response_obj=response_obj,
                    start_time=start_time,
                    end_time=end_time,
                )
            else:
                dd_payload = self.create_datadog_logging_payload(
                    kwargs=kwargs,
                    response_obj=response_obj,
                    start_time=start_time,
                    end_time=end_time,
                )
            # Build headers
            headers = {}
            # Add API key if available (required for direct API, optional for agent)
            if self.DD_API_KEY:
                headers["DD-API-KEY"] = self.DD_API_KEY
            response = self.sync_client.post(
                url=self.intake_url,
                json=dd_payload,  # type: ignore
                headers=headers,
            )
            response.raise_for_status()
            if response.status_code != 202:
                raise Exception(
                    f"Response from datadog API status_code: {response.status_code}, text: {response.text}"
                )
            verbose_logger.debug(
                "Datadog: Response from datadog API status_code: %s, text: %s",
                response.status_code,
                response.text,
            )
        except Exception as e:
            verbose_logger.exception(
                f"Datadog Layer Error - {str(e)}\n{traceback.format_exc()}"
            )
            pass
        pass
    async def _log_async_event(self, kwargs, response_obj, start_time, end_time):
        # Shared helper for async success/failure logging: build payload,
        # enqueue it, and flush early if the batch is full.
        dd_payload = self.create_datadog_logging_payload(
            kwargs=kwargs,
            response_obj=response_obj,
            start_time=start_time,
            end_time=end_time,
        )
        self.log_queue.append(dd_payload)
        verbose_logger.debug(
            f"Datadog, event added to queue. Will flush in {self.flush_interval} seconds..."
        )
        if len(self.log_queue) >= self.batch_size:
            await self.async_send_batch()
    def _create_datadog_logging_payload_helper(
        self,
        standard_logging_object: StandardLoggingPayload,
        status: DataDogStatus,
    ) -> DatadogPayload:
        """Serialize a standard logging payload into a DatadogPayload with the given status."""
        # safe_dumps avoids failures on circular references in the payload
        from litellm.litellm_core_utils.safe_json_dumps import safe_dumps
        json_payload = safe_dumps(standard_logging_object)
        verbose_logger.debug("Datadog: Logger - Logging payload = %s", json_payload)
        dd_payload = DatadogPayload(
            ddsource=get_datadog_source(),
            ddtags=get_datadog_tags(standard_logging_object=standard_logging_object),
            hostname=get_datadog_hostname(),
            message=json_payload,
            service=get_datadog_service(),
            status=status,
        )
        self._add_trace_context_to_payload(dd_payload=dd_payload)
        return dd_payload
    def create_datadog_logging_payload(
        self,
        kwargs: Union[dict, Any],
        response_obj: Any,
        start_time: datetime.datetime,
        end_time: datetime.datetime,
    ) -> DatadogPayload:
        """
        Helper function to create a datadog payload for logging

        Args:
            kwargs (Union[dict, Any]): request kwargs
            response_obj (Any): llm api response
            start_time (datetime.datetime): start time of request
            end_time (datetime.datetime): end time of request

        Returns:
            DatadogPayload: defined in types.py

        Raises:
            ValueError: if kwargs has no "standard_logging_object"
        """
        standard_logging_object: Optional[StandardLoggingPayload] = kwargs.get(
            "standard_logging_object", None
        )
        if standard_logging_object is None:
            raise ValueError("standard_logging_object not found in kwargs")
        # Map litellm request status onto the DD log level
        status = DataDogStatus.INFO
        if standard_logging_object.get("status") == "failure":
            status = DataDogStatus.ERROR
        # Build the initial payload
        self.truncate_standard_logging_payload_content(standard_logging_object)
        dd_payload = self._create_datadog_logging_payload_helper(
            standard_logging_object=standard_logging_object,
            status=status,
        )
        return dd_payload
    async def async_send_compressed_data(self, data: List) -> Response:
        """
        Async helper to send compressed data to datadog self.intake_url

        Datadog recommends using gzip to compress data
        https://docs.datadoghq.com/api/latest/logs/

        "Datadog recommends sending your logs compressed. Add the Content-Encoding: gzip header to the request when sending"
        """
        import gzip
        from litellm.litellm_core_utils.safe_json_dumps import safe_dumps
        compressed_data = gzip.compress(safe_dumps(data).encode("utf-8"))
        # Build headers
        headers = {
            "Content-Encoding": "gzip",
            "Content-Type": "application/json",
        }
        # Add API key if available (required for direct API, optional for agent)
        if self.DD_API_KEY:
            headers["DD-API-KEY"] = self.DD_API_KEY
        response = await self.async_client.post(
            url=self.intake_url,
            data=compressed_data,  # type: ignore
            headers=headers,
        )
        return response
    async def async_service_failure_hook(
        self,
        payload: ServiceLoggerPayload,
        error: Optional[str] = "",
        parent_otel_span: Optional[Any] = None,
        start_time: Optional[Union[datetimeObj, float]] = None,
        end_time: Optional[Union[float, datetimeObj]] = None,
        event_metadata: Optional[dict] = None,
    ):
        """
        Logs failures from Redis, Postgres (Adjacent systems), as 'WARNING' on DataDog

        - example - Redis is failing / erroring, will be logged on DataDog

        Note: only appends to the queue; flushing is left to the periodic flush task.
        """
        try:
            _payload_dict = payload.model_dump()
            _payload_dict.update(event_metadata or {})
            from litellm.litellm_core_utils.safe_json_dumps import safe_dumps
            _dd_message_str = safe_dumps(_payload_dict)
            _dd_payload = DatadogPayload(
                ddsource=get_datadog_source(),
                ddtags=get_datadog_tags(),
                hostname=get_datadog_hostname(),
                message=_dd_message_str,
                service=get_datadog_service(),
                status=DataDogStatus.WARN,
            )
            self.log_queue.append(_dd_payload)
        except Exception as e:
            verbose_logger.exception(
                f"Datadog: Logger - Exception in async_service_failure_hook: {e}"
            )
            pass
    async def async_service_success_hook(
        self,
        payload: ServiceLoggerPayload,
        error: Optional[str] = "",
        parent_otel_span: Optional[Any] = None,
        start_time: Optional[Union[datetimeObj, float]] = None,
        end_time: Optional[Union[float, datetimeObj]] = None,
        event_metadata: Optional[dict] = None,
    ):
        """
        Logs success from Redis, Postgres (Adjacent systems), as 'INFO' on DataDog

        Only service types in DD_LOGGED_SUCCESS_SERVICE_TYPES are logged,
        since logging every service success would be spammy on datadog.
        """
        try:
            # intentionally done. Don't want to log all service types to DD
            if payload.service not in DD_LOGGED_SUCCESS_SERVICE_TYPES:
                return
            _payload_dict = payload.model_dump()
            _payload_dict.update(event_metadata or {})
            from litellm.litellm_core_utils.safe_json_dumps import safe_dumps
            _dd_message_str = safe_dumps(_payload_dict)
            _dd_payload = DatadogPayload(
                ddsource=get_datadog_source(),
                ddtags=get_datadog_tags(),
                hostname=get_datadog_hostname(),
                message=_dd_message_str,
                service=get_datadog_service(),
                status=DataDogStatus.INFO,
            )
            self.log_queue.append(_dd_payload)
        except Exception as e:
            # NOTE(review): message says "async_service_failure_hook" but this is
            # the success hook — misleading log text, fix upstream.
            verbose_logger.exception(
                f"Datadog: Logger - Exception in async_service_failure_hook: {e}"
            )
    def _create_v0_logging_payload(
        self,
        kwargs: Union[dict, Any],
        response_obj: Any,
        start_time: datetime.datetime,
        end_time: datetime.datetime,
    ) -> DatadogPayload:
        """
        Note: This is our V1 Version of DataDog Logging Payload

        (Not Recommended) If you want this to get logged set `litellm.datadog_use_v1 = True`
        """
        litellm_params = kwargs.get("litellm_params", {})
        metadata = (
            litellm_params.get("metadata", {}) or {}
        )  # if litellm_params['metadata'] == None
        messages = kwargs.get("messages")
        optional_params = kwargs.get("optional_params", {})
        call_type = kwargs.get("call_type", "litellm.completion")
        cache_hit = kwargs.get("cache_hit", False)
        usage = response_obj["usage"]
        id = response_obj.get("id", str(uuid.uuid4()))
        usage = dict(usage)
        try:
            # latency in milliseconds
            response_time = (end_time - start_time).total_seconds() * 1000
        except Exception:
            response_time = None
        try:
            response_obj = dict(response_obj)
        except Exception:
            response_obj = response_obj
        # Clean Metadata before logging - never log raw metadata
        # the raw metadata can contain circular references which leads to infinite recursion
        # we clean out all extra litellm metadata params before logging
        clean_metadata = {}
        if isinstance(metadata, dict):
            for key, value in metadata.items():
                # clean litellm metadata before logging
                if key in [
                    "endpoint",
                    "caching_groups",
                    "previous_models",
                ]:
                    continue
                else:
                    clean_metadata[key] = value
        # Build the initial payload
        payload = {
            "id": id,
            "call_type": call_type,
            "cache_hit": cache_hit,
            "start_time": start_time,
            "end_time": end_time,
            "response_time": response_time,
            "model": kwargs.get("model", ""),
            "user": kwargs.get("user", ""),
            "model_parameters": optional_params,
            "spend": kwargs.get("response_cost", 0),
            "messages": messages,
            "response": response_obj,
            "usage": usage,
            "metadata": clean_metadata,
        }
        from litellm.litellm_core_utils.safe_json_dumps import safe_dumps
        json_payload = safe_dumps(payload)
        verbose_logger.debug("Datadog: Logger - Logging payload = %s", json_payload)
        dd_payload = DatadogPayload(
            ddsource=get_datadog_source(),
            ddtags=get_datadog_tags(),
            hostname=get_datadog_hostname(),
            message=json_payload,
            service=get_datadog_service(),
            status=DataDogStatus.INFO,
        )
        return dd_payload
    def _add_trace_context_to_payload(
        self,
        dd_payload: DatadogPayload,
    ) -> None:
        """Attach Datadog APM trace context if one is active."""
        try:
            trace_context = self._get_active_trace_context()
            if trace_context is None:
                return
            # "dd.trace_id"/"dd.span_id" are the reserved DD log attributes
            # used to correlate logs with APM traces
            dd_payload["dd.trace_id"] = trace_context["trace_id"]
            span_id = trace_context.get("span_id")
            if span_id is not None:
                dd_payload["dd.span_id"] = span_id
        except Exception:
            verbose_logger.exception(
                "Datadog: Failed to attach trace context to payload"
            )
    def _get_active_trace_context(self) -> Optional[Dict[str, str]]:
        """
        Return {"trace_id": ..., "span_id": ...} for the active APM span, or None.

        Uses getattr/callable guards because `tracer` may be a no-op stub without
        these methods.
        """
        try:
            current_span = None
            current_span_fn = getattr(tracer, "current_span", None)
            if callable(current_span_fn):
                current_span = current_span_fn()
            if current_span is None:
                # fall back to the root span of the current trace
                current_root_span_fn = getattr(tracer, "current_root_span", None)
                if callable(current_root_span_fn):
                    current_span = current_root_span_fn()
            if current_span is None:
                return None
            trace_id = getattr(current_span, "trace_id", None)
            if trace_id is None:
                return None
            span_id = getattr(current_span, "span_id", None)
            trace_context: Dict[str, str] = {"trace_id": str(trace_id)}
            if span_id is not None:
                trace_context["span_id"] = str(span_id)
            return trace_context
        except Exception:
            verbose_logger.exception(
                "Datadog: Failed to retrieve active trace context from tracer"
            )
            return None
    async def async_health_check(self) -> IntegrationHealthCheckStatus:
        """
        Check if the service is healthy

        Sends a single dummy log payload to the intake URL; "healthy" iff the
        response status is 2xx.
        """
        from litellm.litellm_core_utils.litellm_logging import (
            create_dummy_standard_logging_payload,
        )
        standard_logging_object = create_dummy_standard_logging_payload()
        dd_payload = self._create_datadog_logging_payload_helper(
            standard_logging_object=standard_logging_object,
            status=DataDogStatus.INFO,
        )
        log_queue = [dd_payload]
        response = await self.async_send_compressed_data(log_queue)
        try:
            response.raise_for_status()
            return IntegrationHealthCheckStatus(
                status="healthy",
                error_message=None,
            )
        except httpx.HTTPStatusError as e:
            # surface the API's error body for diagnosis
            return IntegrationHealthCheckStatus(
                status="unhealthy",
                error_message=e.response.text,
            )
        except Exception as e:
            return IntegrationHealthCheckStatus(
                status="unhealthy",
                error_message=str(e),
            )
    async def get_request_response_payload(
        self,
        request_id: str,
        start_time_utc: Optional[datetimeObj],
        end_time_utc: Optional[datetimeObj],
    ) -> Optional[dict]:
        """Not implemented for the DataDog logger; present to satisfy AdditionalLoggingUtils."""
        pass

View File

@@ -0,0 +1,216 @@
import asyncio
import os
import time
from datetime import datetime
from typing import Dict, List, Optional, Tuple
from litellm._logging import verbose_logger
from litellm.integrations.custom_batch_logger import CustomBatchLogger
from litellm.llms.custom_httpx.http_handler import (
get_async_httpx_client,
httpxSpecialProvider,
)
from litellm.types.integrations.datadog_cost_management import (
DatadogFOCUSCostEntry,
)
from litellm.types.utils import StandardLoggingPayload
class DatadogCostManagementLogger(CustomBatchLogger):
    """
    Uploads aggregated LLM spend to Datadog Cloud Cost Management.

    Batches `StandardLoggingPayload` objects from successful requests,
    aggregates `response_cost` by (provider, model, UTC day, tags) into
    FOCUS-format entries, and PUTs them to Datadog's custom-costs endpoint.

    Required environment variables:
        `DD_API_KEY` - Datadog API key
        `DD_APP_KEY` - Datadog application key
    Optional environment variables:
        `DD_SITE` - Datadog site (default: `datadoghq.com`)
    """
    def __init__(self, **kwargs):
        """
        Read Datadog credentials from the environment, create the async HTTP
        client, and start the periodic flush task.

        Note: must be constructed while an asyncio event loop is running,
        since `asyncio.create_task` is used for the periodic flush.
        """
        self.dd_api_key = os.getenv("DD_API_KEY")
        self.dd_app_key = os.getenv("DD_APP_KEY")
        self.dd_site = os.getenv("DD_SITE", "datadoghq.com")
        if not self.dd_api_key or not self.dd_app_key:
            verbose_logger.warning(
                "Datadog Cost Management: DD_API_KEY and DD_APP_KEY are required. Integration will not work."
            )
        self.upload_url = f"https://api.{self.dd_site}/api/v2/cost/custom_costs"
        self.async_client = get_async_httpx_client(
            llm_provider=httpxSpecialProvider.LoggingCallback
        )
        # Initialize lock and start periodic flush task. create_task only
        # schedules the coroutine, so flush_lock is set before it first runs.
        self.flush_lock = asyncio.Lock()
        asyncio.create_task(self.periodic_flush())
        # Check if flush_lock is already in kwargs to avoid double passing (unlikely but safe)
        if "flush_lock" not in kwargs:
            kwargs["flush_lock"] = self.flush_lock
        super().__init__(**kwargs)
    async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
        """
        Queue the request's standard logging payload for cost upload.

        Only payloads with a positive `response_cost` are queued; the queue
        is flushed once `batch_size` is reached. Errors are logged and never
        raised to the caller.
        """
        try:
            standard_logging_object: Optional[StandardLoggingPayload] = kwargs.get(
                "standard_logging_object", None
            )
            if standard_logging_object is None:
                return
            # Only log if there is a cost associated.
            # `response_cost` may be present but None - coalesce to 0 so the
            # comparison below cannot raise a TypeError.
            response_cost = standard_logging_object.get("response_cost") or 0
            if response_cost > 0:
                self.log_queue.append(standard_logging_object)
                if len(self.log_queue) >= self.batch_size:
                    await self.async_send_batch()
        except Exception as e:
            verbose_logger.exception(
                f"Datadog Cost Management: Error in async_log_success_event: {str(e)}"
            )
    async def async_send_batch(self):
        """
        Aggregate the queued payloads and upload them to Datadog.

        Errors are logged and never raised. The queue itself is cleared by
        `CustomBatchLogger.flush_queue`, not here.
        """
        if not self.log_queue:
            return
        try:
            # Aggregate costs from the batch
            aggregated_entries = self._aggregate_costs(self.log_queue)
            if not aggregated_entries:
                return
            # Send to Datadog
            await self._upload_to_datadog(aggregated_entries)
        except Exception as e:
            verbose_logger.exception(
                f"Datadog Cost Management: Error in async_send_batch: {str(e)}"
            )
    def _aggregate_costs(
        self, logs: List[StandardLoggingPayload]
    ) -> List[DatadogFOCUSCostEntry]:
        """
        Aggregate costs by (Provider, Model, UTC date, Tags).

        Each group becomes one FOCUS entry with daily granularity
        (`ChargePeriodStart` == `ChargePeriodEnd`). Logs with a missing, None
        or zero cost are skipped; per-log failures are logged and aggregation
        continues with the remaining logs.

        Returns:
            List[DatadogFOCUSCostEntry]: one entry per aggregation group.
        """
        from datetime import timezone
        aggregator: Dict[
            Tuple[str, str, str, Tuple[Tuple[str, str], ...]], DatadogFOCUSCostEntry
        ] = {}
        for log in logs:
            try:
                # Extract keys for aggregation
                provider = log.get("custom_llm_provider") or "unknown"
                model = log.get("model") or "unknown"
                # `response_cost` may be missing or None - treat both as 0 so
                # the `+=` below can never see a None
                cost = log.get("response_cost") or 0
                if cost == 0:
                    continue
                # "ChargePeriod" is the UTC day of the request. Use an aware
                # datetime so the daily bucket does not depend on the server's
                # local timezone (a naive fromtimestamp() would).
                ts = log.get("startTime") or time.time()
                dt = datetime.fromtimestamp(ts, tz=timezone.utc)
                date_str = dt.strftime("%Y-%m-%d")
                # Daily granularity: ChargePeriodStart == ChargePeriodEnd.
                # Group by provider + model + day + tags so entries with
                # different tag sets are not merged together.
                tags = self._extract_tags(log)
                tags_key = tuple(sorted(tags.items())) if tags else ()
                key = (provider, model, date_str, tags_key)
                if key not in aggregator:
                    aggregator[key] = {
                        "ProviderName": provider,
                        "ChargeDescription": f"LLM Usage for {model}",
                        "ChargePeriodStart": date_str,
                        "ChargePeriodEnd": date_str,
                        "BilledCost": 0.0,
                        "BillingCurrency": "USD",
                        "Tags": tags if tags else None,
                    }
                aggregator[key]["BilledCost"] += cost
            except Exception as e:
                verbose_logger.warning(
                    f"Error processing log for cost aggregation: {e}"
                )
                continue
        return list(aggregator.values())
    def _extract_tags(self, log: StandardLoggingPayload) -> Dict[str, str]:
        """
        Build the tag dict attached to each cost entry.

        Always includes env/service/host/pod tags; adds user, team and
        model_group tags when present in the payload metadata.
        """
        from litellm.integrations.datadog.datadog_handler import (
            get_datadog_env,
            get_datadog_hostname,
            get_datadog_pod_name,
            get_datadog_service,
        )
        tags = {
            "env": get_datadog_env(),
            "service": get_datadog_service(),
            "host": get_datadog_hostname(),
            "pod_name": get_datadog_pod_name(),
        }
        # Add metadata as tags
        metadata = log.get("metadata", {})
        if metadata:
            # Add user info
            if metadata.get("user_api_key_alias"):
                tags["user"] = str(metadata["user_api_key_alias"])
            # Add Team Tag
            team_tag = (
                metadata.get("user_api_key_team_alias")
                or metadata.get("team_alias")  # type: ignore
                or metadata.get("user_api_key_team_id")
                or metadata.get("team_id")  # type: ignore
            )
            if team_tag:
                tags["team"] = str(team_tag)
            # model_group is not in StandardLoggingMetadata TypedDict, so we need to access it via dict.get()
            model_group = metadata.get("model_group")  # type: ignore[misc]
            if model_group:
                tags["model_group"] = str(model_group)
        return tags
    async def _upload_to_datadog(self, payload: List[Dict]):
        """
        PUT the aggregated FOCUS entries to the Datadog custom-costs API.

        No-ops when the API/app keys are missing; raises on non-2xx responses
        (callers catch and log).
        """
        if not self.dd_api_key or not self.dd_app_key:
            return
        headers = {
            "Content-Type": "application/json",
            "DD-API-KEY": self.dd_api_key,
            "DD-APPLICATION-KEY": self.dd_app_key,
        }
        # The API endpoint expects a list of objects directly in the body (file content behavior)
        from litellm.litellm_core_utils.safe_json_dumps import safe_dumps
        data_json = safe_dumps(payload)
        response = await self.async_client.put(
            self.upload_url, content=data_json, headers=headers
        )
        response.raise_for_status()
        verbose_logger.debug(
            f"Datadog Cost Management: Uploaded {len(payload)} cost entries. Status: {response.status_code}"
        )

View File

@@ -0,0 +1,69 @@
"""Shared helpers for Datadog integrations."""
from __future__ import annotations
import os
from typing import List, Optional
from litellm.types.utils import StandardLoggingPayload
def get_datadog_source() -> str:
    """Log source reported to Datadog: env ``DD_SOURCE``, defaulting to ``"litellm"``."""
    configured_source = os.environ.get("DD_SOURCE")
    return configured_source if configured_source is not None else "litellm"
def get_datadog_service() -> str:
    """Service name reported to Datadog: env ``DD_SERVICE``, defaulting to ``"litellm-server"``."""
    configured_service = os.environ.get("DD_SERVICE")
    return configured_service if configured_service is not None else "litellm-server"
def get_datadog_hostname() -> str:
    """Hostname tag value from the ``HOSTNAME`` env var; empty string when unset."""
    hostname = os.environ.get("HOSTNAME")
    return hostname if hostname is not None else ""
def get_datadog_base_url_from_env() -> Optional[str]:
    """
    Optional base-URL override from the ``DD_BASE_URL`` env var.

    Useful for testing or custom endpoints; returns ``None`` when unset.
    """
    return os.environ.get("DD_BASE_URL", None)
def get_datadog_env() -> str:
    """Deployment-environment tag: env ``DD_ENV``, defaulting to ``"unknown"``."""
    env_name = os.environ.get("DD_ENV")
    return env_name if env_name is not None else "unknown"
def get_datadog_pod_name() -> str:
    """Pod-name tag: env ``POD_NAME``, defaulting to ``"unknown"``."""
    pod = os.environ.get("POD_NAME")
    return pod if pod is not None else "unknown"
def get_datadog_tags(
    standard_logging_object: Optional[StandardLoggingPayload] = None,
) -> str:
    """
    Assemble the comma-separated ``key:value`` tag string sent to Datadog.

    Always includes env/service/version/HOSTNAME/POD_NAME tags; when a
    standard logging payload is supplied, its request tags and a team tag
    (first non-empty of the usual team metadata keys) are appended.
    """
    tags: List[str] = [
        f"env:{get_datadog_env()}",
        f"service:{get_datadog_service()}",
        f"version:{os.getenv('DD_VERSION', 'unknown')}",
        f"HOSTNAME:{get_datadog_hostname()}",
        f"POD_NAME:{get_datadog_pod_name()}",
    ]
    if standard_logging_object:
        for request_tag in standard_logging_object.get("request_tags", []) or []:
            tags.append(f"request_tag:{request_tag}")
        # Add Team Tag
        metadata = standard_logging_object.get("metadata", {}) or {}
        team_tag = (
            metadata.get("user_api_key_team_alias")
            or metadata.get("team_alias")
            or metadata.get("user_api_key_team_id")
            or metadata.get("team_id")
        )
        if team_tag:
            tags.append(f"team:{team_tag}")
    return ",".join(tags)

View File

@@ -0,0 +1,856 @@
"""
Implements logging integration with Datadog's LLM Observability Service
API Reference: https://docs.datadoghq.com/llm_observability/setup/api/?tab=example#api-standards
"""
import asyncio
import json
import os
from litellm._uuid import uuid
from datetime import datetime
from typing import Any, Dict, List, Literal, Optional, Union
import httpx
import litellm
from litellm._logging import verbose_logger
from litellm.integrations.custom_batch_logger import CustomBatchLogger
from litellm.integrations.datadog.datadog_mock_client import (
should_use_datadog_mock,
create_mock_datadog_client,
)
from litellm.integrations.datadog.datadog_handler import (
get_datadog_service,
get_datadog_tags,
get_datadog_base_url_from_env,
)
from litellm.litellm_core_utils.dd_tracing import tracer
from litellm.litellm_core_utils.prompt_templates.common_utils import (
handle_any_messages_to_chat_completion_str_messages_conversion,
)
from litellm.llms.custom_httpx.http_handler import (
get_async_httpx_client,
httpxSpecialProvider,
)
from litellm.types.integrations.datadog_llm_obs import *
from litellm.types.utils import (
CallTypes,
StandardLoggingGuardrailInformation,
StandardLoggingPayload,
StandardLoggingPayloadErrorInformation,
)
class DataDogLLMObsLogger(CustomBatchLogger):
    def __init__(self, **kwargs):
        """
        Initialize the DataDog LLM Observability logger.

        Endpoint selection:
        - Agent mode (`LITELLM_DD_AGENT_HOST` set): no DD_API_KEY/DD_SITE required.
        - Direct API mode: requires `DD_API_KEY` and `DD_SITE`.
        - `DD_BASE_URL` (if set) overrides the intake URL, mainly for testing.

        Raises:
            Exception: if required env vars are missing in direct-API mode,
                or any other part of initialization fails (re-raised).
        """
        try:
            verbose_logger.debug("DataDogLLMObs: Initializing logger")
            # In mock mode all outbound Datadog API calls are intercepted
            self.is_mock_mode = should_use_datadog_mock()
            if self.is_mock_mode:
                create_mock_datadog_client()
                verbose_logger.debug(
                    "[DATADOG MOCK] DataDogLLMObs logger initialized in mock mode"
                )
            # Configure DataDog endpoint (Agent or Direct API)
            # Use LITELLM_DD_AGENT_HOST to avoid conflicts with ddtrace's DD_AGENT_HOST
            # Check for agent mode FIRST - agent mode doesn't require DD_API_KEY or DD_SITE
            dd_agent_host = os.getenv("LITELLM_DD_AGENT_HOST")
            self.async_client = get_async_httpx_client(
                llm_provider=httpxSpecialProvider.LoggingCallback
            )
            self.DD_API_KEY = os.getenv("DD_API_KEY")
            if dd_agent_host:
                self._configure_dd_agent(dd_agent_host=dd_agent_host)
            else:
                # Only require DD_API_KEY and DD_SITE for direct API mode
                if os.getenv("DD_API_KEY", None) is None:
                    raise Exception("DD_API_KEY is not set, set 'DD_API_KEY=<>'")
                if os.getenv("DD_SITE", None) is None:
                    # NOTE(review): "sit" typo in this user-facing message; fix upstream
                    raise Exception(
                        "DD_SITE is not set, set 'DD_SITE=<>', example sit = `us5.datadoghq.com`"
                    )
                self._configure_dd_direct_api()
            # Optional override for testing
            dd_base_url = get_datadog_base_url_from_env()
            if dd_base_url:
                self.intake_url = f"{dd_base_url}/api/intake/llm-obs/v1/trace/spans"
            # NOTE(review): create_task requires a running event loop; it only
            # schedules periodic_flush, so flush_lock (next line) is assigned
            # before the task first executes.
            asyncio.create_task(self.periodic_flush())
            self.flush_lock = asyncio.Lock()
            self.log_queue: List[LLMObsPayload] = []
            #########################################################
            # Handle datadog_llm_observability_params set as litellm.datadog_llm_observability_params
            #########################################################
            dict_datadog_llm_obs_params = self._get_datadog_llm_obs_params()
            kwargs.update(dict_datadog_llm_obs_params)
            CustomBatchLogger.__init__(self, **kwargs, flush_lock=self.flush_lock)
        except Exception as e:
            verbose_logger.exception(f"DataDogLLMObs: Error initializing - {str(e)}")
            raise e
def _configure_dd_agent(self, dd_agent_host: str):
"""
Configure the Datadog logger to send traces to the Agent.
"""
# When using the Agent, LLM Observability Intake does NOT require the API Key
# Reference: https://docs.datadoghq.com/llm_observability/setup/sdk/#agent-setup
# Use specific port for LLM Obs (Trace Agent) to avoid conflict with Logs Agent (10518)
agent_port = os.getenv("LITELLM_DD_LLM_OBS_PORT", "8126")
self.DD_SITE = "localhost" # Not used for URL construction in agent mode
self.intake_url = (
f"http://{dd_agent_host}:{agent_port}/api/intake/llm-obs/v1/trace/spans"
)
verbose_logger.debug(f"DataDogLLMObs: Using DD Agent at {self.intake_url}")
def _configure_dd_direct_api(self):
"""
Configure the Datadog logger to send traces directly to the Datadog API.
"""
if not self.DD_API_KEY:
raise Exception("DD_API_KEY is not set, set 'DD_API_KEY=<>'")
self.DD_SITE = os.getenv("DD_SITE")
if not self.DD_SITE:
raise Exception(
"DD_SITE is not set, set 'DD_SITE=<>', example site = `us5.datadoghq.com`"
)
self.intake_url = (
f"https://api.{self.DD_SITE}/api/intake/llm-obs/v1/trace/spans"
)
def _get_datadog_llm_obs_params(self) -> Dict:
"""
Get the datadog_llm_observability_params from litellm.datadog_llm_observability_params
These are params specific to initializing the DataDogLLMObsLogger e.g. turn_off_message_logging
"""
dict_datadog_llm_obs_params: Dict = {}
if litellm.datadog_llm_observability_params is not None:
if isinstance(
litellm.datadog_llm_observability_params, DatadogLLMObsInitParams
):
dict_datadog_llm_obs_params = (
litellm.datadog_llm_observability_params.model_dump()
)
elif isinstance(litellm.datadog_llm_observability_params, Dict):
# only allow params that are of DatadogLLMObsInitParams
dict_datadog_llm_obs_params = DatadogLLMObsInitParams(
**litellm.datadog_llm_observability_params
).model_dump()
return dict_datadog_llm_obs_params
async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
try:
verbose_logger.debug(
f"DataDogLLMObs: Logging success event for model {kwargs.get('model', 'unknown')}"
)
payload = self.create_llm_obs_payload(kwargs, start_time, end_time)
verbose_logger.debug(f"DataDogLLMObs: Payload: {payload}")
self.log_queue.append(payload)
if len(self.log_queue) >= self.batch_size:
await self.async_send_batch()
except Exception as e:
verbose_logger.exception(
f"DataDogLLMObs: Error logging success event - {str(e)}"
)
async def async_log_failure_event(self, kwargs, response_obj, start_time, end_time):
try:
verbose_logger.debug(
f"DataDogLLMObs: Logging failure event for model {kwargs.get('model', 'unknown')}"
)
payload = self.create_llm_obs_payload(kwargs, start_time, end_time)
verbose_logger.debug(f"DataDogLLMObs: Payload: {payload}")
self.log_queue.append(payload)
if len(self.log_queue) >= self.batch_size:
await self.async_send_batch()
except Exception as e:
verbose_logger.exception(
f"DataDogLLMObs: Error logging failure event - {str(e)}"
)
async def async_send_batch(self):
try:
if not self.log_queue:
return
verbose_logger.debug(
f"DataDogLLMObs: Flushing {len(self.log_queue)} events"
)
if self.is_mock_mode:
verbose_logger.debug(
"[DATADOG MOCK] Mock mode enabled - API calls will be intercepted"
)
# Prepare the payload
payload = {
"data": DDIntakePayload(
type="span",
attributes=DDSpanAttributes(
ml_app=get_datadog_service(),
tags=[get_datadog_tags()],
spans=self.log_queue,
),
),
}
# serialize datetime objects - for budget reset time in spend metrics
from litellm.litellm_core_utils.safe_json_dumps import safe_dumps
try:
verbose_logger.debug("payload %s", safe_dumps(payload))
except Exception as debug_error:
verbose_logger.debug(
"payload serialization failed: %s", str(debug_error)
)
json_payload = safe_dumps(payload)
headers = {"Content-Type": "application/json"}
if self.DD_API_KEY:
headers["DD-API-KEY"] = self.DD_API_KEY
response = await self.async_client.post(
url=self.intake_url,
content=json_payload,
headers=headers,
)
if response.status_code != 202:
raise Exception(
f"DataDogLLMObs: Unexpected response - status_code: {response.status_code}, text: {response.text}"
)
if self.is_mock_mode:
verbose_logger.debug(
f"[DATADOG MOCK] Batch of {len(self.log_queue)} events successfully mocked"
)
else:
verbose_logger.debug(
f"DataDogLLMObs: Successfully sent batch - status_code: {response.status_code}"
)
self.log_queue.clear()
except httpx.HTTPStatusError as e:
verbose_logger.exception(
f"DataDogLLMObs: Error sending batch - {e.response.text}"
)
except Exception as e:
verbose_logger.exception(f"DataDogLLMObs: Error sending batch - {str(e)}")
def create_llm_obs_payload(
self, kwargs: Dict, start_time: datetime, end_time: datetime
) -> LLMObsPayload:
standard_logging_payload: Optional[StandardLoggingPayload] = kwargs.get(
"standard_logging_object"
)
if standard_logging_payload is None:
raise Exception("DataDogLLMObs: standard_logging_object is not set")
messages = standard_logging_payload["messages"]
messages = self._ensure_string_content(messages=messages)
metadata = kwargs.get("litellm_params", {}).get("metadata", {})
input_meta = InputMeta(
messages=handle_any_messages_to_chat_completion_str_messages_conversion(
messages
)
)
output_meta = OutputMeta(
messages=self._get_response_messages(
standard_logging_payload=standard_logging_payload,
call_type=standard_logging_payload.get("call_type"),
)
)
error_info = self._assemble_error_info(standard_logging_payload)
metadata_parent_id: Optional[str] = None
if isinstance(metadata, dict):
metadata_parent_id = metadata.get("parent_id")
meta = Meta(
kind=self._get_datadog_span_kind(
standard_logging_payload.get("call_type"), metadata_parent_id
),
input=input_meta,
output=output_meta,
metadata=self._get_dd_llm_obs_payload_metadata(standard_logging_payload),
error=error_info,
)
# Calculate metrics (you may need to adjust these based on available data)
metrics = LLMMetrics(
input_tokens=float(standard_logging_payload.get("prompt_tokens", 0)),
output_tokens=float(standard_logging_payload.get("completion_tokens", 0)),
total_tokens=float(standard_logging_payload.get("total_tokens", 0)),
total_cost=float(standard_logging_payload.get("response_cost", 0)),
time_to_first_token=self._get_time_to_first_token_seconds(
standard_logging_payload
),
)
payload: LLMObsPayload = LLMObsPayload(
parent_id=metadata_parent_id if metadata_parent_id else "undefined",
trace_id=standard_logging_payload.get("trace_id", str(uuid.uuid4())),
span_id=metadata.get("span_id", str(uuid.uuid4())),
name=metadata.get("name", "litellm_llm_call"),
meta=meta,
start_ns=int(start_time.timestamp() * 1e9),
duration=int((end_time - start_time).total_seconds() * 1e9),
metrics=metrics,
status="error" if error_info else "ok",
tags=[get_datadog_tags(standard_logging_object=standard_logging_payload)],
)
apm_trace_id = self._get_apm_trace_id()
if apm_trace_id is not None:
payload["apm_id"] = apm_trace_id
return payload
def _get_apm_trace_id(self) -> Optional[str]:
"""Retrieve the current APM trace ID if available."""
try:
current_span_fn = getattr(tracer, "current_span", None)
if callable(current_span_fn):
current_span = current_span_fn()
if current_span is not None:
trace_id = getattr(current_span, "trace_id", None)
if trace_id is not None:
return str(trace_id)
except Exception:
pass
return None
def _assemble_error_info(
self, standard_logging_payload: StandardLoggingPayload
) -> Optional[DDLLMObsError]:
"""
Assemble error information for failure cases according to DD LLM Obs API spec
"""
# Handle error information for failure cases according to DD LLM Obs API spec
error_info: Optional[DDLLMObsError] = None
if standard_logging_payload.get("status") == "failure":
# Try to get structured error information first
error_information: Optional[
StandardLoggingPayloadErrorInformation
] = standard_logging_payload.get("error_information")
if error_information:
error_info = DDLLMObsError(
message=error_information.get("error_message")
or standard_logging_payload.get("error_str")
or "Unknown error",
type=error_information.get("error_class"),
stack=error_information.get("traceback"),
)
return error_info
def _get_time_to_first_token_seconds(
self, standard_logging_payload: StandardLoggingPayload
) -> float:
"""
Get the time to first token in seconds
CompletionStartTime - StartTime = Time to first token
For non streaming calls, CompletionStartTime is time we get the response back
"""
start_time: Optional[float] = standard_logging_payload.get("startTime")
completion_start_time: Optional[float] = standard_logging_payload.get(
"completionStartTime"
)
end_time: Optional[float] = standard_logging_payload.get("endTime")
if completion_start_time is not None and start_time is not None:
return completion_start_time - start_time
elif end_time is not None and start_time is not None:
return end_time - start_time
else:
return 0.0
def _get_response_messages(
self, standard_logging_payload: StandardLoggingPayload, call_type: Optional[str]
) -> List[Any]:
"""
Get the messages from the response object
for now this handles logging /chat/completions responses
"""
response_obj = standard_logging_payload.get("response")
if response_obj is None:
return []
# edge case: handle response_obj is a string representation of a dict
if isinstance(response_obj, str):
try:
import ast
response_obj = ast.literal_eval(response_obj)
except (ValueError, SyntaxError):
try:
# fallback to json parsing
response_obj = json.loads(str(response_obj))
except json.JSONDecodeError:
return []
if call_type in [
CallTypes.completion.value,
CallTypes.acompletion.value,
CallTypes.text_completion.value,
CallTypes.atext_completion.value,
CallTypes.generate_content.value,
CallTypes.agenerate_content.value,
CallTypes.generate_content_stream.value,
CallTypes.agenerate_content_stream.value,
CallTypes.anthropic_messages.value,
]:
try:
# Safely extract message from response_obj, handle failure cases
if isinstance(response_obj, dict) and "choices" in response_obj:
choices = response_obj["choices"]
if choices and len(choices) > 0 and "message" in choices[0]:
return [choices[0]["message"]]
return []
except (KeyError, IndexError, TypeError):
# In case of any error accessing the response structure, return empty list
return []
return []
    def _get_datadog_span_kind(
        self, call_type: Optional[str], parent_id: Optional[str] = None
    ) -> Literal["llm", "tool", "task", "embedding", "retrieval"]:
        """
        Map liteLLM call_type to appropriate DataDog LLM Observability span kind.

        Available DataDog span kinds: "llm", "tool", "task", "embedding", "retrieval"
        see: https://docs.datadoghq.com/ja/llm_observability/terms/

        Args:
            call_type: litellm CallTypes value for the request (may be None).
            parent_id: span parent id from request metadata, if any.

        Returns:
            The span kind string for the DD LLM Obs intake; defaults to "llm".
        """
        # Non llm/workflow/agent kinds cannot be root spans, so fallback to "llm" when parent metadata is missing
        if call_type is None or parent_id is None:
            return "llm"

        # Embedding operations
        if call_type in [CallTypes.embedding.value, CallTypes.aembedding.value]:
            return "embedding"

        # LLM completion operations
        if call_type in [
            CallTypes.completion.value,
            CallTypes.acompletion.value,
            CallTypes.text_completion.value,
            CallTypes.atext_completion.value,
            CallTypes.generate_content.value,
            CallTypes.agenerate_content.value,
            CallTypes.generate_content_stream.value,
            CallTypes.agenerate_content_stream.value,
            CallTypes.anthropic_messages.value,
            CallTypes.responses.value,
            CallTypes.aresponses.value,
        ]:
            return "llm"

        # Tool operations
        if call_type in [CallTypes.call_mcp_tool.value]:
            return "tool"

        # Retrieval operations (read-only lookups of stored objects)
        if call_type in [
            CallTypes.get_assistants.value,
            CallTypes.aget_assistants.value,
            CallTypes.get_thread.value,
            CallTypes.aget_thread.value,
            CallTypes.get_messages.value,
            CallTypes.aget_messages.value,
            CallTypes.afile_retrieve.value,
            CallTypes.file_retrieve.value,
            CallTypes.afile_list.value,
            CallTypes.file_list.value,
            CallTypes.afile_content.value,
            CallTypes.file_content.value,
            CallTypes.retrieve_batch.value,
            CallTypes.aretrieve_batch.value,
            CallTypes.retrieve_fine_tuning_job.value,
            CallTypes.aretrieve_fine_tuning_job.value,
            CallTypes.alist_input_items.value,
        ]:
            return "retrieval"

        # Task operations (batch, fine-tuning, file operations, etc.)
        if call_type in [
            CallTypes.create_batch.value,
            CallTypes.acreate_batch.value,
            CallTypes.create_fine_tuning_job.value,
            CallTypes.acreate_fine_tuning_job.value,
            CallTypes.cancel_fine_tuning_job.value,
            CallTypes.acancel_fine_tuning_job.value,
            CallTypes.list_fine_tuning_jobs.value,
            CallTypes.alist_fine_tuning_jobs.value,
            CallTypes.create_assistants.value,
            CallTypes.acreate_assistants.value,
            CallTypes.delete_assistant.value,
            CallTypes.adelete_assistant.value,
            CallTypes.create_thread.value,
            CallTypes.acreate_thread.value,
            CallTypes.add_message.value,
            CallTypes.a_add_message.value,
            CallTypes.run_thread.value,
            CallTypes.arun_thread.value,
            CallTypes.run_thread_stream.value,
            CallTypes.arun_thread_stream.value,
            CallTypes.file_delete.value,
            CallTypes.afile_delete.value,
            CallTypes.create_file.value,
            CallTypes.acreate_file.value,
            CallTypes.image_generation.value,
            CallTypes.aimage_generation.value,
            CallTypes.image_edit.value,
            CallTypes.aimage_edit.value,
            CallTypes.moderation.value,
            CallTypes.amoderation.value,
            CallTypes.transcription.value,
            CallTypes.atranscription.value,
            CallTypes.speech.value,
            CallTypes.aspeech.value,
            CallTypes.rerank.value,
            CallTypes.arerank.value,
        ]:
            return "task"

        # Default fallback for unknown or passthrough operations
        return "llm"
def _ensure_string_content(
self, messages: Optional[Union[str, List[Any], Dict[Any, Any]]]
) -> List[Any]:
if messages is None:
return []
if isinstance(messages, str):
return [messages]
elif isinstance(messages, list):
return [message for message in messages]
elif isinstance(messages, dict):
return [str(messages.get("content", ""))]
return []
    def _get_dd_llm_obs_payload_metadata(
        self, standard_logging_payload: StandardLoggingPayload
    ) -> Dict[str, Any]:
        """
        Fields to track in DD LLM Observability metadata from litellm standard logging payload.

        Combines request identifiers, cache info, guardrail info, latency and
        spend metrics, flattened tool-call pairs, and the payload's own
        metadata dict (merged last, so it can override earlier keys).
        """
        _metadata: Dict[str, Any] = {
            "model_name": standard_logging_payload.get("model", "unknown"),
            "model_provider": standard_logging_payload.get(
                "custom_llm_provider", "unknown"
            ),
            "id": standard_logging_payload.get("id", "unknown"),
            "trace_id": standard_logging_payload.get("trace_id", "unknown"),
            "cache_hit": standard_logging_payload.get("cache_hit", "unknown"),
            "cache_key": standard_logging_payload.get("cache_key", "unknown"),
            "saved_cache_cost": standard_logging_payload.get("saved_cache_cost", 0),
            "guardrail_information": standard_logging_payload.get(
                "guardrail_information", None
            ),
            "is_streamed_request": self._get_stream_value_from_payload(
                standard_logging_payload
            ),
        }
        #########################################################
        # Add latency metrics to metadata
        #########################################################
        latency_metrics = self._get_latency_metrics(standard_logging_payload)
        _metadata.update({"latency_metrics": dict(latency_metrics)})

        #########################################################
        # Add spend metrics to metadata
        #########################################################
        spend_metrics = self._get_spend_metrics(standard_logging_payload)
        _metadata.update({"spend_metrics": dict(spend_metrics)})

        ## extract tool calls and add to metadata
        tool_call_metadata = self._extract_tool_call_metadata(standard_logging_payload)
        _metadata.update(tool_call_metadata)

        # Merge the request's own standard-logging metadata last (empty dict when absent)
        _standard_logging_metadata: dict = (
            dict(standard_logging_payload.get("metadata", {})) or {}
        )
        _metadata.update(_standard_logging_metadata)
        return _metadata
    def _get_latency_metrics(
        self, standard_logging_payload: StandardLoggingPayload
    ) -> DDLLMObsLatencyMetrics:
        """
        Get the latency metrics from the standard logging payload.

        Collects (all in milliseconds, each only when available/positive):
        time-to-first-token, litellm overhead, and total guardrail overhead.
        """
        latency_metrics: DDLLMObsLatencyMetrics = DDLLMObsLatencyMetrics()
        # Add latency metrics to metadata
        # Time to first token (convert from seconds to milliseconds for consistency)
        time_to_first_token_seconds = self._get_time_to_first_token_seconds(
            standard_logging_payload
        )
        if time_to_first_token_seconds > 0:
            latency_metrics["time_to_first_token_ms"] = (
                time_to_first_token_seconds * 1000
            )

        # LiteLLM overhead time (already reported in ms by hidden_params)
        hidden_params = standard_logging_payload.get("hidden_params", {})
        litellm_overhead_ms = hidden_params.get("litellm_overhead_time_ms")
        if litellm_overhead_ms is not None:
            latency_metrics["litellm_overhead_time_ms"] = litellm_overhead_ms

        # Guardrail overhead latency: sum the duration of every guardrail run
        guardrail_info: Optional[
            list[StandardLoggingGuardrailInformation]
        ] = standard_logging_payload.get("guardrail_information")
        if guardrail_info is not None:
            total_duration = 0.0
            for info in guardrail_info:
                _guardrail_duration_seconds: Optional[float] = info.get("duration")
                if _guardrail_duration_seconds is not None:
                    total_duration += float(_guardrail_duration_seconds)
            if total_duration > 0:
                # Convert from seconds to milliseconds for consistency
                latency_metrics["guardrail_overhead_time_ms"] = total_duration * 1000

        return latency_metrics
def _get_stream_value_from_payload(
self, standard_logging_payload: StandardLoggingPayload
) -> bool:
"""
Extract the stream value from standard logging payload.
The stream field in StandardLoggingPayload is only set to True for completed streaming responses.
For non-streaming requests, it's None. The original stream parameter is in model_parameters.
Returns:
bool: True if this was a streaming request, False otherwise
"""
# Check top-level stream field first (only True for completed streaming)
stream_value = standard_logging_payload.get("stream")
if stream_value is True:
return True
# Fallback to model_parameters.stream for original request parameters
model_params = standard_logging_payload.get("model_parameters", {})
if isinstance(model_params, dict):
stream_value = model_params.get("stream")
if stream_value is True:
return True
# Default to False for non-streaming requests
return False
    def _get_spend_metrics(
        self, standard_logging_payload: StandardLoggingPayload
    ) -> DDLLMObsSpendMetrics:
        """
        Get the spend metrics from the standard logging payload.

        Reports response cost plus, when present in metadata: API-key max
        budget, API-key spend, and the budget reset timestamp (normalized to
        an ISO-8601 string for JSON serialization).
        """
        spend_metrics: DDLLMObsSpendMetrics = DDLLMObsSpendMetrics()
        # send response cost
        spend_metrics["response_cost"] = standard_logging_payload.get(
            "response_cost", 0.0
        )

        # Get budget information from metadata
        metadata = standard_logging_payload.get("metadata", {})

        # API key max budget
        user_api_key_max_budget = metadata.get("user_api_key_max_budget")
        if user_api_key_max_budget is not None:
            spend_metrics["user_api_key_max_budget"] = float(user_api_key_max_budget)

        # API key spend (best-effort float conversion; bad values are logged and skipped)
        user_api_key_spend = metadata.get("user_api_key_spend")
        if user_api_key_spend is not None:
            try:
                spend_metrics["user_api_key_spend"] = float(user_api_key_spend)
            except (ValueError, TypeError):
                verbose_logger.debug(
                    f"Invalid user_api_key_spend value: {user_api_key_spend}"
                )

        # API key budget reset datetime
        user_api_key_budget_reset_at = metadata.get("user_api_key_budget_reset_at")
        if user_api_key_budget_reset_at is not None:
            try:
                from datetime import datetime, timezone

                budget_reset_at = None
                if isinstance(user_api_key_budget_reset_at, str):
                    # Handle ISO format strings that might have 'Z' suffix
                    iso_string = user_api_key_budget_reset_at.replace("Z", "+00:00")
                    budget_reset_at = datetime.fromisoformat(iso_string)
                elif isinstance(user_api_key_budget_reset_at, datetime):
                    budget_reset_at = user_api_key_budget_reset_at

                if budget_reset_at is not None:
                    # Preserve timezone info if already present; assume UTC otherwise
                    if budget_reset_at.tzinfo is None:
                        budget_reset_at = budget_reset_at.replace(tzinfo=timezone.utc)

                    # Convert to ISO string format for JSON serialization
                    # This prevents circular reference issues and ensures proper timezone representation
                    iso_string = budget_reset_at.isoformat()
                    spend_metrics["user_api_key_budget_reset_at"] = iso_string

                    # Debug logging to verify the conversion
                    verbose_logger.debug(
                        f"Converted budget_reset_at to ISO format: {iso_string}"
                    )
            except Exception as e:
                verbose_logger.debug(f"Error processing budget reset datetime: {e}")
                verbose_logger.debug(f"Original value: {user_api_key_budget_reset_at}")

        return spend_metrics
def _process_input_messages_preserving_tool_calls(
self, messages: List[Any]
) -> List[Dict[str, Any]]:
"""
Process input messages while preserving tool_calls and tool message types.
This bypasses the lossy string conversion when tool calls are present,
allowing complex nested tool_calls objects to be preserved for Datadog.
"""
processed = []
for msg in messages:
if isinstance(msg, dict):
# Preserve messages with tool_calls or tool role as-is
if "tool_calls" in msg or msg.get("role") == "tool":
processed.append(msg)
else:
# For regular messages, still apply string conversion
converted = (
handle_any_messages_to_chat_completion_str_messages_conversion(
[msg]
)
)
processed.extend(converted)
else:
# For non-dict messages, apply string conversion
converted = (
handle_any_messages_to_chat_completion_str_messages_conversion(
[msg]
)
)
processed.extend(converted)
return processed
@staticmethod
def _tool_calls_kv_pair(tool_calls: List[Dict[str, Any]]) -> Dict[str, Any]:
"""
Extract tool call information into key-value pairs for Datadog metadata.
Similar to OpenTelemetry's implementation but adapted for Datadog's format.
"""
kv_pairs: Dict[str, Any] = {}
for idx, tool_call in enumerate(tool_calls):
try:
# Extract tool call ID
tool_id = tool_call.get("id")
if tool_id:
kv_pairs[f"tool_calls.{idx}.id"] = tool_id
# Extract tool call type
tool_type = tool_call.get("type")
if tool_type:
kv_pairs[f"tool_calls.{idx}.type"] = tool_type
# Extract function information
function = tool_call.get("function")
if function:
function_name = function.get("name")
if function_name:
kv_pairs[f"tool_calls.{idx}.function.name"] = function_name
function_arguments = function.get("arguments")
if function_arguments:
# Store arguments as JSON string for Datadog
if isinstance(function_arguments, str):
kv_pairs[
f"tool_calls.{idx}.function.arguments"
] = function_arguments
else:
import json
kv_pairs[
f"tool_calls.{idx}.function.arguments"
] = json.dumps(function_arguments)
except (KeyError, TypeError, ValueError) as e:
verbose_logger.debug(
f"DataDogLLMObs: Error processing tool call {idx}: {str(e)}"
)
continue
return kv_pairs
    def _extract_tool_call_metadata(
        self, standard_logging_payload: StandardLoggingPayload
    ) -> Dict[str, Any]:
        """
        Extract tool call information from both input messages and response for Datadog metadata.

        Keys from request messages are prefixed "input_", keys from the
        response are prefixed "output_". Any extraction error is swallowed
        (debug-logged) so logging never breaks the request path.
        """
        tool_call_metadata: Dict[str, Any] = {}
        try:
            # Extract tool calls from input messages
            messages = standard_logging_payload.get("messages", [])
            if messages and isinstance(messages, list):
                for message in messages:
                    if isinstance(message, dict) and "tool_calls" in message:
                        tool_calls = message.get("tool_calls")
                        if tool_calls:
                            input_tool_calls_kv = self._tool_calls_kv_pair(tool_calls)
                            # Prefix with "input_" to distinguish from response tool calls
                            for key, value in input_tool_calls_kv.items():
                                tool_call_metadata[f"input_{key}"] = value

            # Extract tool calls from response (chat-completions shape: choices[].message.tool_calls)
            response_obj = standard_logging_payload.get("response")
            if response_obj and isinstance(response_obj, dict):
                choices = response_obj.get("choices", [])
                for choice in choices:
                    if isinstance(choice, dict):
                        message = choice.get("message")
                        if message and isinstance(message, dict):
                            tool_calls = message.get("tool_calls")
                            if tool_calls:
                                response_tool_calls_kv = self._tool_calls_kv_pair(
                                    tool_calls
                                )
                                # Prefix with "output_" to distinguish from input tool calls
                                for key, value in response_tool_calls_kv.items():
                                    tool_call_metadata[f"output_{key}"] = value
        except Exception as e:
            verbose_logger.debug(
                f"DataDogLLMObs: Error extracting tool call metadata: {str(e)}"
            )
        return tool_call_metadata

# --- end of DataDogLLMObsLogger module; next file in snapshot: Datadog metrics logger ---
import asyncio
import gzip
import os
import time
from datetime import datetime
from typing import List, Optional, Union
from litellm._logging import verbose_logger
from litellm.integrations.custom_batch_logger import CustomBatchLogger
from litellm.integrations.datadog.datadog_handler import (
get_datadog_env,
get_datadog_hostname,
get_datadog_pod_name,
get_datadog_service,
)
from litellm.litellm_core_utils.safe_json_dumps import safe_dumps
from litellm.llms.custom_httpx.http_handler import (
get_async_httpx_client,
httpxSpecialProvider,
)
from litellm.types.integrations.base_health_check import IntegrationHealthCheckStatus
from litellm.types.integrations.datadog_metrics import (
DatadogMetricPoint,
DatadogMetricSeries,
DatadogMetricsPayload,
)
from litellm.types.utils import StandardLoggingPayload
class DatadogMetricsLogger(CustomBatchLogger):
def __init__(self, start_periodic_flush: bool = True, **kwargs):
self.dd_api_key = os.getenv("DD_API_KEY")
self.dd_app_key = os.getenv("DD_APP_KEY")
self.dd_site = os.getenv("DD_SITE", "datadoghq.com")
if not self.dd_api_key:
verbose_logger.warning(
"Datadog Metrics: DD_API_KEY is required. Integration will not work."
)
self.upload_url = f"https://api.{self.dd_site}/api/v2/series"
self.async_client = get_async_httpx_client(
llm_provider=httpxSpecialProvider.LoggingCallback
)
# Initialize lock
self.flush_lock = asyncio.Lock()
# Only set flush_lock if not already provided by caller
if "flush_lock" not in kwargs:
kwargs["flush_lock"] = self.flush_lock
# Send metrics more quickly to datadog (every 5 seconds)
if "flush_interval" not in kwargs:
kwargs["flush_interval"] = 5
super().__init__(**kwargs)
# Start periodic flush task only if instructed
if start_periodic_flush:
asyncio.create_task(self.periodic_flush())
def _extract_tags(
self,
log: StandardLoggingPayload,
status_code: Optional[Union[str, int]] = None,
) -> List[str]:
"""
Builds the list of tags for a Datadog metric point
"""
# Base tags
tags = [
f"env:{get_datadog_env()}",
f"service:{get_datadog_service()}",
f"version:{os.getenv('DD_VERSION', 'unknown')}",
f"HOSTNAME:{get_datadog_hostname()}",
f"POD_NAME:{get_datadog_pod_name()}",
]
# Add metric-specific tags
if provider := log.get("custom_llm_provider"):
tags.append(f"provider:{provider}")
if model := log.get("model"):
tags.append(f"model_name:{model}")
if model_group := log.get("model_group"):
tags.append(f"model_group:{model_group}")
if status_code is not None:
tags.append(f"status_code:{status_code}")
# Extract team tag
metadata = log.get("metadata", {}) or {}
team_tag = (
metadata.get("user_api_key_team_alias")
or metadata.get("team_alias") # type: ignore
or metadata.get("user_api_key_team_id")
or metadata.get("team_id") # type: ignore
)
if team_tag:
tags.append(f"team:{team_tag}")
return tags
    def _add_metrics_from_log(
        self,
        log: StandardLoggingPayload,
        kwargs: dict,
        status_code: Union[str, int] = "200",
    ):
        """
        Extracts latencies and appends Datadog metric series to the queue.

        Emits up to three series: end-to-end latency, provider-only latency,
        and a request count — all timestamped at the request's end time.
        NOTE(review): assumes start_time/end_time/api_call_start_time in
        kwargs are comparable datetimes (same naive/aware kind) — confirm
        against the litellm logging callsite.
        """
        tags = self._extract_tags(log, status_code=status_code)

        # We record metrics with the end_time as the timestamp for the point
        end_time_dt = kwargs.get("end_time") or datetime.now()
        timestamp = int(end_time_dt.timestamp())

        # 1. Total Request Latency Metric (End to End)
        start_time_dt = kwargs.get("start_time")
        if start_time_dt and end_time_dt:
            total_duration = (end_time_dt - start_time_dt).total_seconds()
            series_total_latency: DatadogMetricSeries = {
                "metric": "litellm.request.total_latency",
                "type": 3,  # gauge
                "points": [{"timestamp": timestamp, "value": total_duration}],
                "tags": tags,
            }
            self.log_queue.append(series_total_latency)

        # 2. LLM API Latency Metric (Provider alone)
        api_call_start_time = kwargs.get("api_call_start_time")
        if api_call_start_time and end_time_dt:
            llm_api_duration = (end_time_dt - api_call_start_time).total_seconds()
            series_llm_latency: DatadogMetricSeries = {
                "metric": "litellm.llm_api.latency",
                "type": 3,  # gauge
                "points": [{"timestamp": timestamp, "value": llm_api_duration}],
                "tags": tags,
            }
            self.log_queue.append(series_llm_latency)

        # 3. Request Count / Status Code (count metrics need an interval)
        series_count: DatadogMetricSeries = {
            "metric": "litellm.llm_api.request_count",
            "type": 1,  # count
            "points": [{"timestamp": timestamp, "value": 1.0}],
            "tags": tags,
            "interval": self.flush_interval,
        }
        self.log_queue.append(series_count)
async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
try:
standard_logging_object: Optional[StandardLoggingPayload] = kwargs.get(
"standard_logging_object", None
)
if standard_logging_object is None:
return
self._add_metrics_from_log(
log=standard_logging_object, kwargs=kwargs, status_code="200"
)
if len(self.log_queue) >= self.batch_size:
await self.flush_queue()
except Exception as e:
verbose_logger.exception(
f"Datadog Metrics: Error in async_log_success_event: {str(e)}"
)
async def async_log_failure_event(self, kwargs, response_obj, start_time, end_time):
try:
standard_logging_object: Optional[StandardLoggingPayload] = kwargs.get(
"standard_logging_object", None
)
if standard_logging_object is None:
return
# Extract status code from error information
status_code = "500" # default
error_information = (
standard_logging_object.get("error_information", {}) or {}
)
error_code = error_information.get("error_code") # type: ignore
if error_code is not None:
status_code = str(error_code)
self._add_metrics_from_log(
log=standard_logging_object, kwargs=kwargs, status_code=status_code
)
if len(self.log_queue) >= self.batch_size:
await self.flush_queue()
except Exception as e:
verbose_logger.exception(
f"Datadog Metrics: Error in async_log_failure_event: {str(e)}"
)
    async def async_send_batch(self):
        """
        Upload every queued metric series to Datadog in a single payload.

        NOTE(review): the queue is intentionally NOT cleared here — the base
        class's flush path is expected to clear it after a successful send;
        confirm against CustomBatchLogger.flush_queue(). Upload errors are
        re-raised so the caller can decide whether to retain the batch.
        """
        if not self.log_queue:
            return

        batch = self.log_queue.copy()
        payload_data: DatadogMetricsPayload = {"series": batch}

        try:
            await self._upload_to_datadog(payload_data)
        except Exception as e:
            verbose_logger.exception(
                f"Datadog Metrics: Error in async_send_batch: {str(e)}"
            )
            raise
async def _upload_to_datadog(self, payload: DatadogMetricsPayload):
if not self.dd_api_key:
return
headers = {
"Content-Type": "application/json",
"DD-API-KEY": self.dd_api_key,
}
if self.dd_app_key:
headers["DD-APPLICATION-KEY"] = self.dd_app_key
json_data = safe_dumps(payload)
compressed_data = gzip.compress(json_data.encode("utf-8"))
headers["Content-Encoding"] = "gzip"
response = await self.async_client.post(
self.upload_url, content=compressed_data, headers=headers # type: ignore
)
response.raise_for_status()
verbose_logger.debug(
f"Datadog Metrics: Uploaded {len(payload['series'])} metric points. Status: {response.status_code}"
)
async def async_health_check(self) -> IntegrationHealthCheckStatus:
"""
Check if the service is healthy
"""
try:
# Send a test metric point to Datadog
test_metric_point: DatadogMetricPoint = {
"timestamp": int(time.time()),
"value": 1.0,
}
test_metric_series: DatadogMetricSeries = {
"metric": "litellm.health_check",
"type": 3, # Gauge
"points": [test_metric_point],
"tags": ["env:health_check"],
}
payload_data: DatadogMetricsPayload = {"series": [test_metric_series]}
await self._upload_to_datadog(payload_data)
return IntegrationHealthCheckStatus(
status="healthy",
error_message=None,
)
except Exception as e:
return IntegrationHealthCheckStatus(
status="unhealthy",
error_message=str(e),
)
async def get_request_response_payload(
self,
request_id: str,
start_time_utc: Optional[datetime],
end_time_utc: Optional[datetime],
) -> Optional[dict]:
pass

# --- end of Datadog metrics logger module; next file in snapshot: Datadog mock client ---
"""
Mock client for Datadog integration testing.
This module intercepts Datadog API calls and returns successful mock responses,
allowing full code execution without making actual network calls.
Usage:
Set DATADOG_MOCK=true in environment variables or config to enable mock mode.
"""
from litellm.integrations.mock_client_factory import (
MockClientConfig,
create_mock_client_factory,
)
# Create mock client using factory
# The factory returns two callables:
#   - create_mock_datadog_client(): patches HTTP clients so Datadog calls are intercepted
#   - should_use_datadog_mock(): reads the env_var to decide whether mock mode is on
_config = MockClientConfig(
    name="DATADOG",
    env_var="DATADOG_MOCK",
    default_latency_ms=100,
    # Datadog intake endpoints acknowledge accepted payloads with HTTP 202
    default_status_code=202,
    default_json_data={"status": "ok"},
    # NOTE(review): ".datadoghq.com" already substring-matches "datadoghq.com"
    # hosts — presumably both forms are kept for exact-match semantics in the
    # factory; confirm against MockClientConfig.url_matchers.
    url_matchers=[
        ".datadoghq.com",
        "datadoghq.com",
    ],
    patch_async_handler=True,
    patch_sync_client=True,
)
create_mock_datadog_client, should_use_datadog_mock = create_mock_client_factory(
    _config
)