"""
Datadog Metrics integration for LiteLLM.

Batches per-request latency and request-count metric series and ships
them (gzip-compressed) to the Datadog v2 ``/api/v2/series`` endpoint.
"""

import asyncio
import gzip
import os
import time
from datetime import datetime
from typing import List, Optional, Union

from litellm._logging import verbose_logger
from litellm.integrations.custom_batch_logger import CustomBatchLogger
from litellm.integrations.datadog.datadog_handler import (
    get_datadog_env,
    get_datadog_hostname,
    get_datadog_pod_name,
    get_datadog_service,
)
from litellm.litellm_core_utils.safe_json_dumps import safe_dumps
from litellm.llms.custom_httpx.http_handler import (
    get_async_httpx_client,
    httpxSpecialProvider,
)
from litellm.types.integrations.base_health_check import IntegrationHealthCheckStatus
from litellm.types.integrations.datadog_metrics import (
    DatadogMetricPoint,
    DatadogMetricSeries,
    DatadogMetricsPayload,
)
from litellm.types.utils import StandardLoggingPayload


class DatadogMetricsLogger(CustomBatchLogger):
    """Batch logger that emits LiteLLM request metrics to Datadog.

    Reads configuration from the environment:
      - ``DD_API_KEY`` (required for uploads; without it the logger is a no-op)
      - ``DD_APP_KEY`` (optional)
      - ``DD_SITE``    (defaults to ``datadoghq.com``)
    """

    def __init__(self, start_periodic_flush: bool = True, **kwargs):
        """Initialize the metrics logger.

        Args:
            start_periodic_flush: When True, schedule ``periodic_flush`` as a
                background task on the currently running event loop.
            **kwargs: Forwarded to ``CustomBatchLogger`` (e.g. ``batch_size``,
                ``flush_interval``, ``flush_lock``).
        """
        self.dd_api_key = os.getenv("DD_API_KEY")
        self.dd_app_key = os.getenv("DD_APP_KEY")
        self.dd_site = os.getenv("DD_SITE", "datadoghq.com")

        if not self.dd_api_key:
            # Deliberately do not raise: _upload_to_datadog() no-ops when the
            # key is missing, so a misconfigured env degrades gracefully
            # instead of breaking the caller.
            verbose_logger.warning(
                "Datadog Metrics: DD_API_KEY is required. Integration will not work."
            )

        self.upload_url = f"https://api.{self.dd_site}/api/v2/series"
        self.async_client = get_async_httpx_client(
            llm_provider=httpxSpecialProvider.LoggingCallback
        )

        # Initialize lock
        self.flush_lock = asyncio.Lock()
        # Only set flush_lock if not already provided by caller
        if "flush_lock" not in kwargs:
            kwargs["flush_lock"] = self.flush_lock
        # Send metrics more quickly to datadog (every 5 seconds)
        if "flush_interval" not in kwargs:
            kwargs["flush_interval"] = 5
        super().__init__(**kwargs)

        # Start periodic flush task only if instructed.
        # BUGFIX: asyncio.create_task() raises RuntimeError when there is no
        # running event loop (e.g. the logger is constructed from synchronous
        # startup code). Log and continue instead of crashing the constructor.
        if start_periodic_flush:
            try:
                asyncio.create_task(self.periodic_flush())
            except RuntimeError:
                verbose_logger.warning(
                    "Datadog Metrics: no running event loop; "
                    "periodic flush task was not started."
                )

    def _extract_tags(
        self,
        log: StandardLoggingPayload,
        status_code: Optional[Union[str, int]] = None,
    ) -> List[str]:
        """
        Builds the list of tags for a Datadog metric point
        """
        # Base tags.
        # NOTE(review): Datadog tag keys are conventionally lowercase;
        # HOSTNAME/POD_NAME are kept uppercase to avoid breaking any
        # existing dashboards keyed on these names — confirm before changing.
        tags = [
            f"env:{get_datadog_env()}",
            f"service:{get_datadog_service()}",
            f"version:{os.getenv('DD_VERSION', 'unknown')}",
            f"HOSTNAME:{get_datadog_hostname()}",
            f"POD_NAME:{get_datadog_pod_name()}",
        ]

        # Add metric-specific tags
        if provider := log.get("custom_llm_provider"):
            tags.append(f"provider:{provider}")
        if model := log.get("model"):
            tags.append(f"model_name:{model}")
        if model_group := log.get("model_group"):
            tags.append(f"model_group:{model_group}")
        if status_code is not None:
            tags.append(f"status_code:{status_code}")

        # Extract team tag — prefer the key-level alias, then fall back to ids.
        metadata = log.get("metadata", {}) or {}
        team_tag = (
            metadata.get("user_api_key_team_alias")
            or metadata.get("team_alias")  # type: ignore
            or metadata.get("user_api_key_team_id")
            or metadata.get("team_id")  # type: ignore
        )
        if team_tag:
            tags.append(f"team:{team_tag}")

        return tags

    def _add_metrics_from_log(
        self,
        log: StandardLoggingPayload,
        kwargs: dict,
        status_code: Union[str, int] = "200",
    ):
        """
        Extracts latencies and appends Datadog metric series to the queue

        Args:
            log: the standard logging payload for the request.
            kwargs: raw litellm callback kwargs; ``start_time``/``end_time``
                and ``api_call_start_time`` are read from here.
                NOTE(review): assumes these are datetime objects — confirm
                against the callback contract.
            status_code: HTTP-style status tagged onto the count series.
        """
        tags = self._extract_tags(log, status_code=status_code)

        # We record metrics with the end_time as the timestamp for the point
        end_time_dt = kwargs.get("end_time") or datetime.now()
        timestamp = int(end_time_dt.timestamp())

        # 1. Total Request Latency Metric (End to End)
        start_time_dt = kwargs.get("start_time")
        if start_time_dt and end_time_dt:
            total_duration = (end_time_dt - start_time_dt).total_seconds()
            series_total_latency: DatadogMetricSeries = {
                "metric": "litellm.request.total_latency",
                "type": 3,  # gauge
                "points": [{"timestamp": timestamp, "value": total_duration}],
                "tags": tags,
            }
            self.log_queue.append(series_total_latency)

        # 2. LLM API Latency Metric (Provider alone)
        api_call_start_time = kwargs.get("api_call_start_time")
        if api_call_start_time and end_time_dt:
            llm_api_duration = (end_time_dt - api_call_start_time).total_seconds()
            series_llm_latency: DatadogMetricSeries = {
                "metric": "litellm.llm_api.latency",
                "type": 3,  # gauge
                "points": [{"timestamp": timestamp, "value": llm_api_duration}],
                "tags": tags,
            }
            self.log_queue.append(series_llm_latency)

        # 3. Request Count / Status Code — count metrics need an interval so
        # Datadog can normalize them to a per-second rate.
        series_count: DatadogMetricSeries = {
            "metric": "litellm.llm_api.request_count",
            "type": 1,  # count
            "points": [{"timestamp": timestamp, "value": 1.0}],
            "tags": tags,
            "interval": self.flush_interval,
        }
        self.log_queue.append(series_count)

    async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
        """Queue metrics for a successful request; flush if the batch is full."""
        try:
            standard_logging_object: Optional[StandardLoggingPayload] = kwargs.get(
                "standard_logging_object", None
            )
            if standard_logging_object is None:
                return

            self._add_metrics_from_log(
                log=standard_logging_object, kwargs=kwargs, status_code="200"
            )

            if len(self.log_queue) >= self.batch_size:
                await self.flush_queue()
        except Exception as e:
            # Never let metrics emission break the request path.
            verbose_logger.exception(
                f"Datadog Metrics: Error in async_log_success_event: {str(e)}"
            )

    async def async_log_failure_event(self, kwargs, response_obj, start_time, end_time):
        """Queue metrics for a failed request, tagged with the error's status code."""
        try:
            standard_logging_object: Optional[StandardLoggingPayload] = kwargs.get(
                "standard_logging_object", None
            )
            if standard_logging_object is None:
                return

            # Extract status code from error information
            status_code = "500"  # default
            error_information = (
                standard_logging_object.get("error_information", {}) or {}
            )
            error_code = error_information.get("error_code")  # type: ignore
            if error_code is not None:
                status_code = str(error_code)

            self._add_metrics_from_log(
                log=standard_logging_object, kwargs=kwargs, status_code=status_code
            )

            if len(self.log_queue) >= self.batch_size:
                await self.flush_queue()
        except Exception as e:
            # Never let metrics emission break the request path.
            verbose_logger.exception(
                f"Datadog Metrics: Error in async_log_failure_event: {str(e)}"
            )

    async def async_send_batch(self):
        """Upload the queued metric series to Datadog.

        Queue clearing is handled by the base class's flush machinery;
        this method only snapshots and uploads. Re-raises on failure so
        the caller can observe the error.
        """
        if not self.log_queue:
            return

        batch = self.log_queue.copy()
        payload_data: DatadogMetricsPayload = {"series": batch}

        try:
            await self._upload_to_datadog(payload_data)
        except Exception as e:
            verbose_logger.exception(
                f"Datadog Metrics: Error in async_send_batch: {str(e)}"
            )
            raise

    async def _upload_to_datadog(self, payload: DatadogMetricsPayload):
        """POST a gzip-compressed series payload to the Datadog v2 API.

        No-op when DD_API_KEY is unset. Raises ``httpx.HTTPStatusError``
        on a non-2xx response.
        """
        if not self.dd_api_key:
            return

        headers = {
            "Content-Type": "application/json",
            "DD-API-KEY": self.dd_api_key,
        }
        if self.dd_app_key:
            headers["DD-APPLICATION-KEY"] = self.dd_app_key

        json_data = safe_dumps(payload)
        compressed_data = gzip.compress(json_data.encode("utf-8"))
        headers["Content-Encoding"] = "gzip"

        response = await self.async_client.post(
            self.upload_url, content=compressed_data, headers=headers  # type: ignore
        )
        response.raise_for_status()
        verbose_logger.debug(
            f"Datadog Metrics: Uploaded {len(payload['series'])} metric points. Status: {response.status_code}"
        )

    async def async_health_check(self) -> IntegrationHealthCheckStatus:
        """
        Check if the service is healthy
        """
        try:
            # Send a test metric point to Datadog
            test_metric_point: DatadogMetricPoint = {
                "timestamp": int(time.time()),
                "value": 1.0,
            }
            test_metric_series: DatadogMetricSeries = {
                "metric": "litellm.health_check",
                "type": 3,  # Gauge
                "points": [test_metric_point],
                "tags": ["env:health_check"],
            }
            payload_data: DatadogMetricsPayload = {"series": [test_metric_series]}
            await self._upload_to_datadog(payload_data)
            return IntegrationHealthCheckStatus(
                status="healthy",
                error_message=None,
            )
        except Exception as e:
            return IntegrationHealthCheckStatus(
                status="unhealthy",
                error_message=str(e),
            )

    async def get_request_response_payload(
        self,
        request_id: str,
        start_time_utc: Optional[datetime],
        end_time_utc: Optional[datetime],
    ) -> Optional[dict]:
        """Not supported for the metrics integration; always returns None."""
        return None