chore: initial snapshot for gitea/github upload

This commit is contained in:
Your Name
2026-03-26 16:04:46 +08:00
commit a699a1ac98
3497 changed files with 1586237 additions and 0 deletions

View File

@@ -0,0 +1,422 @@
import os
from datetime import datetime
from typing import TYPE_CHECKING, Any, List, Optional, cast
import litellm
from litellm._logging import verbose_logger
from litellm.constants import CLOUDZERO_EXPORT_INTERVAL_MINUTES
from litellm.integrations.custom_logger import CustomLogger
if TYPE_CHECKING:
from apscheduler.schedulers.asyncio import AsyncIOScheduler
else:
AsyncIOScheduler = Any
class CloudZeroLogger(CustomLogger):
    """
    CloudZero Logger for exporting LiteLLM usage data to CloudZero AnyCost API.

    Environment Variables:
        CLOUDZERO_API_KEY: CloudZero API key for authentication
        CLOUDZERO_CONNECTION_ID: CloudZero connection ID for data submission
        CLOUDZERO_TIMEZONE: Timezone for date handling (default: UTC)
    """

    def __init__(
        self,
        api_key: Optional[str] = None,
        connection_id: Optional[str] = None,
        timezone: Optional[str] = None,
        **kwargs,
    ):
        """Initialize CloudZero logger with configuration from parameters or environment variables."""
        super().__init__(**kwargs)
        # Get configuration from parameters first, fall back to environment variables
        self.api_key = api_key or os.getenv("CLOUDZERO_API_KEY")
        self.connection_id = connection_id or os.getenv("CLOUDZERO_CONNECTION_ID")
        self.timezone = timezone or os.getenv("CLOUDZERO_TIMEZONE", "UTC")
        verbose_logger.debug(
            f"CloudZero Logger initialized with connection ID: {self.connection_id}, timezone: {self.timezone}"
        )

    async def initialize_cloudzero_export_job(self):
        """
        Handler for initializing CloudZero export job.

        Runs when CloudZero logger starts up.

        - If redis cache is available, we use the pod lock manager to acquire a lock and export the data.
          Ensures only one pod exports the data at a time.
        - If redis cache is not available, we export the data directly.
        """
        from litellm.constants import (
            CLOUDZERO_EXPORT_USAGE_DATA_JOB_NAME,
        )
        from litellm.proxy.proxy_server import proxy_logging_obj

        pod_lock_manager = proxy_logging_obj.db_spend_update_writer.pod_lock_manager

        # if using redis, ensure only one pod exports the data at a time
        if pod_lock_manager and pod_lock_manager.redis_cache:
            if await pod_lock_manager.acquire_lock(
                cronjob_id=CLOUDZERO_EXPORT_USAGE_DATA_JOB_NAME
            ):
                try:
                    await self._hourly_usage_data_export()
                finally:
                    # Always release the lock, even if the export raised.
                    await pod_lock_manager.release_lock(
                        cronjob_id=CLOUDZERO_EXPORT_USAGE_DATA_JOB_NAME
                    )
        else:
            # if not using redis, export the data directly
            await self._hourly_usage_data_export()

    async def _hourly_usage_data_export(self):
        """
        Exports recent usage data to CloudZero.

        The export window spans the last TWO export intervals (not one).
        This mitigates the possibility of missing spend if an interval is
        skipped due to a restart in an ephemeral environment.
        """
        from datetime import timedelta, timezone

        from litellm.constants import CLOUDZERO_MAX_FETCHED_DATA_RECORDS

        current_time_utc = datetime.now(timezone.utc)
        one_hour_ago_utc = current_time_utc - timedelta(
            minutes=CLOUDZERO_EXPORT_INTERVAL_MINUTES * 2
        )
        await self.export_usage_data(
            limit=CLOUDZERO_MAX_FETCHED_DATA_RECORDS,
            operation="replace_hourly",
            start_time_utc=one_hour_ago_utc,
            end_time_utc=current_time_utc,
        )

    async def export_usage_data(
        self,
        limit: Optional[int] = None,
        operation: str = "replace_hourly",
        start_time_utc: Optional[datetime] = None,
        end_time_utc: Optional[datetime] = None,
    ):
        """
        Exports the usage data to CloudZero.

        - Reads data from the DB
        - Transforms the data to the CloudZero format
        - Sends the data to CloudZero

        Args:
            limit: Optional limit on number of records to export
            operation: CloudZero operation type ("replace_hourly" or "sum")
            start_time_utc: Optional lower bound (inclusive) on record update time
            end_time_utc: Optional upper bound (inclusive) on record update time

        Raises:
            ValueError: If the CloudZero API key or connection ID is not configured.
            Exception: Re-raises any database/transform/transmission failure after logging.
        """
        from litellm.integrations.cloudzero.cz_stream_api import CloudZeroStreamer
        from litellm.integrations.cloudzero.database import LiteLLMDatabase
        from litellm.integrations.cloudzero.transform import CBFTransformer

        try:
            verbose_logger.debug("CloudZero Logger: Starting usage data export")

            # Validate required configuration
            if not self.api_key or not self.connection_id:
                raise ValueError(
                    "CloudZero configuration missing. Please set CLOUDZERO_API_KEY and CLOUDZERO_CONNECTION_ID environment variables."
                )

            # Initialize database connection and load data
            database = LiteLLMDatabase()
            verbose_logger.debug("CloudZero Logger: Loading usage data from database")
            data = await database.get_usage_data(
                limit=limit, start_time_utc=start_time_utc, end_time_utc=end_time_utc
            )
            if data.is_empty():
                verbose_logger.debug("CloudZero Logger: No usage data found to export")
                return

            verbose_logger.debug(f"CloudZero Logger: Processing {len(data)} records")

            # Transform data to CloudZero CBF format
            transformer = CBFTransformer()
            cbf_data = transformer.transform(data)
            if cbf_data.is_empty():
                verbose_logger.warning(
                    "CloudZero Logger: No valid data after transformation"
                )
                return

            # Send data to CloudZero
            streamer = CloudZeroStreamer(
                api_key=self.api_key,
                connection_id=self.connection_id,
                user_timezone=self.timezone,
            )
            verbose_logger.debug(
                f"CloudZero Logger: Transmitting {len(cbf_data)} records to CloudZero"
            )
            # NOTE(review): send_batched performs synchronous HTTP I/O inside an
            # async method, which blocks the event loop — confirm acceptable for
            # the export job's cadence.
            streamer.send_batched(cbf_data, operation=operation)
            verbose_logger.debug(
                f"CloudZero Logger: Successfully exported {len(cbf_data)} records to CloudZero"
            )
        except Exception as e:
            verbose_logger.error(
                f"CloudZero Logger: Error exporting usage data: {str(e)}"
            )
            raise

    async def dry_run_export_usage_data(self, limit: Optional[int] = 10000):
        """
        Returns the data that would be exported to CloudZero without actually sending it.

        Args:
            limit: Limit number of records to display (default: 10000)

        Returns:
            dict: Contains usage_data, cbf_data, and summary statistics
        """
        from litellm.integrations.cloudzero.database import LiteLLMDatabase
        from litellm.integrations.cloudzero.transform import CBFTransformer

        try:
            verbose_logger.debug("CloudZero Logger: Starting dry run export")

            # Initialize database connection and load data
            database = LiteLLMDatabase()
            verbose_logger.debug("CloudZero Logger: Loading usage data for dry run")
            data = await database.get_usage_data(limit=limit)
            if data.is_empty():
                verbose_logger.warning("CloudZero Dry Run: No usage data found")
                return {
                    "usage_data": [],
                    "cbf_data": [],
                    "summary": {
                        "total_records": 0,
                        "total_cost": 0,
                        "total_tokens": 0,
                        "unique_accounts": 0,
                        "unique_services": 0,
                    },
                }

            verbose_logger.debug(
                f"CloudZero Dry Run: Processing {len(data)} records..."
            )

            # Convert usage data to dict format for response
            usage_data_sample = data.head(50).to_dicts()  # Return first 50 rows

            # Transform data to CloudZero CBF format
            transformer = CBFTransformer()
            cbf_data = transformer.transform(data)
            if cbf_data.is_empty():
                verbose_logger.warning(
                    "CloudZero Dry Run: No valid data after transformation"
                )
                # Summarize from the raw sample since no CBF rows survived.
                return {
                    "usage_data": usage_data_sample,
                    "cbf_data": [],
                    "summary": {
                        "total_records": len(usage_data_sample),
                        "total_cost": sum(
                            row.get("spend", 0) for row in usage_data_sample
                        ),
                        "total_tokens": sum(
                            row.get("prompt_tokens", 0)
                            + row.get("completion_tokens", 0)
                            for row in usage_data_sample
                        ),
                        "unique_accounts": 0,
                        "unique_services": 0,
                    },
                }

            # Convert CBF data to dict format for response
            cbf_data_dict = cbf_data.to_dicts()

            # Calculate summary statistics
            total_cost = sum(record.get("cost/cost", 0) for record in cbf_data_dict)
            unique_accounts = len(
                set(
                    record.get("resource/account", "")
                    for record in cbf_data_dict
                    if record.get("resource/account")
                )
            )
            unique_services = len(
                set(
                    record.get("resource/service", "")
                    for record in cbf_data_dict
                    if record.get("resource/service")
                )
            )
            total_tokens = sum(
                record.get("usage/amount", 0) for record in cbf_data_dict
            )
            verbose_logger.debug(
                f"CloudZero Logger: Dry run completed for {len(cbf_data)} records"
            )
            return {
                "usage_data": usage_data_sample,
                "cbf_data": cbf_data_dict,
                "summary": {
                    "total_records": len(cbf_data_dict),
                    "total_cost": total_cost,
                    "total_tokens": total_tokens,
                    "unique_accounts": unique_accounts,
                    "unique_services": unique_services,
                },
            }
        except Exception as e:
            # Fix: previously this logged the same error twice; log once and re-raise.
            verbose_logger.error(f"CloudZero Logger: Error in dry run export: {str(e)}")
            raise

    def _display_cbf_data_on_screen(self, cbf_data):
        """Display CBF transformed data in a formatted table on screen."""
        from rich.box import SIMPLE
        from rich.console import Console
        from rich.table import Table

        console = Console()
        if cbf_data.is_empty():
            console.print("[yellow]No CBF data to display[/yellow]")
            return
        console.print(
            f"\n[bold green]💰 CloudZero CBF Transformed Data ({len(cbf_data)} records)[/bold green]"
        )

        # Convert to dicts for easier processing
        records = cbf_data.to_dicts()

        # Create main CBF table
        cbf_table = Table(
            show_header=True, header_style="bold cyan", box=SIMPLE, padding=(0, 1)
        )
        cbf_table.add_column("time/usage_start", style="blue", no_wrap=False)
        cbf_table.add_column("cost/cost", style="green", justify="right", no_wrap=False)
        cbf_table.add_column(
            "entity_type", style="magenta", justify="right", no_wrap=False
        )
        cbf_table.add_column(
            "entity_id", style="magenta", justify="right", no_wrap=False
        )
        cbf_table.add_column("team_id", style="cyan", no_wrap=False)
        cbf_table.add_column("team_alias", style="cyan", no_wrap=False)
        cbf_table.add_column("user_email", style="cyan", no_wrap=False)
        cbf_table.add_column("api_key_alias", style="yellow", no_wrap=False)
        cbf_table.add_column(
            "usage/amount", style="yellow", justify="right", no_wrap=False
        )
        cbf_table.add_column("resource/id", style="magenta", no_wrap=False)
        cbf_table.add_column("resource/service", style="cyan", no_wrap=False)
        cbf_table.add_column("resource/account", style="white", no_wrap=False)
        cbf_table.add_column("resource/region", style="dim", no_wrap=False)

        for record in records:
            # Use proper CBF field names
            time_usage_start = str(record.get("time/usage_start", "N/A"))
            cost_cost = str(record.get("cost/cost", 0))
            usage_amount = str(record.get("usage/amount", 0))
            resource_id = str(record.get("resource/id", "N/A"))
            resource_service = str(record.get("resource/service", "N/A"))
            resource_account = str(record.get("resource/account", "N/A"))
            resource_region = str(record.get("resource/region", "N/A"))
            entity_type = str(record.get("entity_type", "N/A"))
            entity_id = str(record.get("entity_id", "N/A"))
            team_id = str(record.get("resource/tag:team_id", "N/A"))
            team_alias = str(record.get("resource/tag:team_alias", "N/A"))
            user_email = str(record.get("resource/tag:user_email", "N/A"))
            api_key_alias = str(record.get("resource/tag:api_key_alias", "N/A"))
            cbf_table.add_row(
                time_usage_start,
                cost_cost,
                entity_type,
                entity_id,
                team_id,
                team_alias,
                user_email,
                api_key_alias,
                usage_amount,
                resource_id,
                resource_service,
                resource_account,
                resource_region,
            )
        console.print(cbf_table)

        # Show summary statistics
        total_cost = sum(record.get("cost/cost", 0) for record in records)
        unique_accounts = len(
            set(
                record.get("resource/account", "")
                for record in records
                if record.get("resource/account")
            )
        )
        unique_services = len(
            set(
                record.get("resource/service", "")
                for record in records
                if record.get("resource/service")
            )
        )
        # Count total tokens from usage metrics
        total_tokens = sum(record.get("usage/amount", 0) for record in records)
        console.print("\n[bold blue]📊 CBF Summary[/bold blue]")
        console.print(f"  Records: {len(records):,}")
        console.print(f"  Total Cost: ${total_cost:.2f}")
        console.print(f"  Total Tokens: {total_tokens:,}")
        console.print(f"  Unique Accounts: {unique_accounts}")
        console.print(f"  Unique Services: {unique_services}")
        console.print(
            "\n[dim]💡 This is the CloudZero CBF format ready for AnyCost ingestion[/dim]"
        )

    @staticmethod
    async def init_cloudzero_background_job(scheduler: AsyncIOScheduler):
        """
        Initialize the CloudZero background job.

        Starts the background job that exports the usage data to CloudZero
        every CLOUDZERO_EXPORT_INTERVAL_MINUTES minutes.
        """
        from litellm.constants import CLOUDZERO_EXPORT_INTERVAL_MINUTES
        from litellm.integrations.custom_logger import CustomLogger

        # Find the initialized CloudZero logger instance(s) registered with the
        # callback manager so the export job runs with loaded credentials.
        # (Fix: these were previously misnamed "prometheus_loggers" — copy-paste
        # from the Prometheus integration.)
        cloudzero_loggers: List[
            CustomLogger
        ] = litellm.logging_callback_manager.get_custom_loggers_for_type(
            callback_type=CloudZeroLogger
        )
        verbose_logger.debug("found %s cloudzero loggers", len(cloudzero_loggers))
        if len(cloudzero_loggers) > 0:
            cloudzero_logger = cast(CloudZeroLogger, cloudzero_loggers[0])
            # Lazy %-style args so the message is only formatted when debug is on.
            verbose_logger.debug(
                "Scheduling CloudZero usage export job to run every %s minutes",
                CLOUDZERO_EXPORT_INTERVAL_MINUTES,
            )
            scheduler.add_job(
                cloudzero_logger.initialize_cloudzero_export_job,
                "interval",
                minutes=CLOUDZERO_EXPORT_INTERVAL_MINUTES,
            )

View File

@@ -0,0 +1,161 @@
# Copyright 2025 CloudZero
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# CHANGELOG: 2025-01-19 - Initial CZRN module for CloudZero Resource Names (erik.peterson)
"""CloudZero Resource Names (CZRN) generation and validation for LiteLLM resources."""
import re
from enum import Enum
from typing import Any, cast
import litellm
class CZEntityType(str, Enum):
    """Entity types that CloudZero spend can be attributed to (currently only teams)."""

    TEAM = "team"
class CZRNGenerator:
    """Generate CloudZero Resource Names (CZRNs) for LiteLLM resources."""

    # CZRN shape: czrn:service:provider:region:owner-account:resource-type:local-id
    # The final group intentionally accepts any characters (models may contain '/', '|', etc.).
    CZRN_REGEX = re.compile(
        r"^czrn:([a-z0-9-]+):([a-zA-Z0-9-]+):([a-z0-9-]+):([a-z0-9-]+):([a-z0-9-]+):(.+)$"
    )

    def __init__(self):
        """Initialize CZRN generator."""
        pass

    def create_from_litellm_data(self, row: dict[str, Any]) -> str:
        """Create a CZRN from a LiteLLM daily spend row.

        CZRN format: czrn:<service-type>:<provider>:<region>:<owner-account-id>:<resource-type>:<cloud-local-id>

        LiteLLM mapping:
        - service-type: 'litellm' (the service managing the LLM calls)
        - provider: the custom_llm_provider (e.g. 'openai', 'anthropic', 'azure')
        - region: 'cross-region' (LiteLLM operates across regions)
        - owner-account-id: the team_id
        - resource-type: 'llm-usage' (represents LLM usage/inference)
        - cloud-local-id: the model name
        """
        raw_provider = row.get("custom_llm_provider", "unknown")
        return self.create_from_components(
            service_type="litellm",
            provider=self._normalize_provider(raw_provider),
            region="cross-region",
            # The owning team becomes the owner account id.
            owner_account_id=self._normalize_component(row.get("team_id", "unknown")),
            resource_type="llm-usage",
            # The model alone identifies the resource; the owning entity is
            # already captured in owner_account_id.
            cloud_local_id=row.get("model", "unknown"),
        )

    def create_from_components(
        self,
        service_type: str,
        provider: str,
        region: str,
        owner_account_id: str,
        resource_type: str,
        cloud_local_id: str,
    ) -> str:
        """Create a CZRN from individual components.

        All components except cloud_local_id are normalized to CZRN-safe
        characters; cloud_local_id may legitimately contain pipes and other
        characters, so it is left untouched.

        Raises:
            ValueError: If the assembled CZRN fails validation.
        """
        svc = self._normalize_component(service_type, allow_uppercase=True)
        prov = self._normalize_component(provider)
        reg = self._normalize_component(region)
        owner = self._normalize_component(owner_account_id)
        rtype = self._normalize_component(resource_type)

        czrn = f"czrn:{svc}:{prov}:{reg}:{owner}:{rtype}:{cloud_local_id}"
        if not self.is_valid(czrn):
            raise ValueError(f"Generated CZRN is invalid: {czrn}")
        return czrn

    def is_valid(self, czrn: str) -> bool:
        """Validate a CZRN string against the standard format."""
        return self.CZRN_REGEX.match(czrn) is not None

    def extract_components(self, czrn: str) -> tuple[str, str, str, str, str, str]:
        """Extract all components from a CZRN.

        Returns:
            (service_type, provider, region, owner_account_id, resource_type, cloud_local_id)

        Raises:
            ValueError: If the string is not a valid CZRN.
        """
        parsed = self.CZRN_REGEX.match(czrn)
        if parsed is None:
            raise ValueError(f"Invalid CZRN format: {czrn}")
        return cast(tuple[str, str, str, str, str, str], parsed.groups())

    def _normalize_provider(self, provider: str) -> str:
        """Normalize provider names to standard CZRN format."""
        # Known LiteLLM providers mapped to their CZRN-standard names.
        provider_map = {
            litellm.LlmProviders.AZURE.value: "azure",
            litellm.LlmProviders.AZURE_AI.value: "azure",
            litellm.LlmProviders.ANTHROPIC.value: "anthropic",
            litellm.LlmProviders.BEDROCK.value: "aws",
            litellm.LlmProviders.VERTEX_AI.value: "gcp",
            litellm.LlmProviders.GEMINI.value: "google",
            litellm.LlmProviders.COHERE.value: "cohere",
            litellm.LlmProviders.HUGGINGFACE.value: "huggingface",
            litellm.LlmProviders.REPLICATE.value: "replicate",
            litellm.LlmProviders.TOGETHER_AI.value: "together-ai",
        }
        key = provider.lower().replace("_", "-")
        # Unmapped providers fall back to their normalized name unchanged.
        return provider_map.get(key, key)

    def _normalize_component(
        self, component: str, allow_uppercase: bool = False
    ) -> str:
        """Normalize a CZRN component to meet format requirements."""
        if not component:
            return "unknown"
        text = component if allow_uppercase else component.lower()
        # Replace invalid characters with hyphens, collapse runs, trim the ends.
        text = re.sub(r"[^a-zA-Z0-9-]", "-", text)
        text = re.sub(r"-+", "-", text).strip("-")
        return text or "unknown"

View File

@@ -0,0 +1,278 @@
# Copyright 2025 CloudZero
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# CHANGELOG: 2025-01-19 - Added pathlib for filesystem operations (erik.peterson)
# CHANGELOG: 2025-01-19 - Migrated from pandas to polars and requests to httpx (erik.peterson)
# CHANGELOG: 2025-01-19 - Initial output module for CSV and CloudZero API (erik.peterson)
"""Output modules for writing CBF data to various destinations."""
import zoneinfo
from datetime import datetime, timezone
from typing import Any, Optional, Union
import httpx
import polars as pl
from rich.console import Console
class CloudZeroStreamer:
    """Stream CBF data to CloudZero AnyCost API with proper batching and timezone handling."""

    def __init__(
        self, api_key: str, connection_id: str, user_timezone: Optional[str] = None
    ):
        """Initialize CloudZero streamer with credentials.

        Args:
            api_key: CloudZero API key, sent as a bearer token.
            connection_id: AnyCost connection ID billing drops are posted to.
            user_timezone: IANA timezone name used to interpret naive
                timestamps; unknown names fall back to UTC with a warning.
        """
        self.api_key = api_key
        self.connection_id = connection_id
        self.base_url = "https://api.cloudzero.com"
        self.console = Console()

        # Set timezone - default to UTC
        self.user_timezone: Union[zoneinfo.ZoneInfo, timezone]
        if user_timezone:
            try:
                self.user_timezone = zoneinfo.ZoneInfo(user_timezone)
            except zoneinfo.ZoneInfoNotFoundError:
                self.console.print(
                    f"[yellow]Warning: Unknown timezone '{user_timezone}', using UTC[/yellow]"
                )
                self.user_timezone = timezone.utc
        else:
            self.user_timezone = timezone.utc

    def send_batched(
        self, data: pl.DataFrame, operation: str = "replace_hourly"
    ) -> None:
        """Send CBF data in daily batches to CloudZero AnyCost API."""
        if data.is_empty():
            self.console.print("[yellow]No data to send to CloudZero[/yellow]")
            return

        # Group data by date and send each day as a batch
        daily_batches = self._group_by_date(data)
        if not daily_batches:
            self.console.print("[yellow]No valid daily batches to send[/yellow]")
            return
        self.console.print(
            f"[blue]Sending {len(daily_batches)} daily batch(es) with operation '{operation}'[/blue]"
        )
        for batch_date, batch_data in daily_batches.items():
            self._send_daily_batch(batch_date, batch_data, operation)

    def _group_by_date(self, data: pl.DataFrame) -> dict[str, pl.DataFrame]:
        """Group data by UTC calendar date keyed as 'YYYY-MM-DD'.

        Rows with missing or unparseable 'time/usage_start' values are skipped
        with a warning rather than aborting the whole export.
        """
        daily_batches: dict[str, list[dict[str, Any]]] = {}

        # Ensure we have the required columns
        if "time/usage_start" not in data.columns:
            self.console.print(
                "[red]Error: Missing 'time/usage_start' column for date grouping[/red]"
            )
            return {}

        timestamp_str: Optional[str] = None
        for row in data.iter_rows(named=True):
            try:
                # Parse the timestamp and convert to UTC
                timestamp_str = row.get("time/usage_start")
                if not timestamp_str:
                    continue
                dt = self._parse_and_convert_timestamp(timestamp_str)
                batch_date = dt.strftime("%Y-%m-%d")
                if batch_date not in daily_batches:
                    daily_batches[batch_date] = []
                daily_batches[batch_date].append(row)
            except Exception as e:
                self.console.print(
                    f"[yellow]Warning: Could not process timestamp '{timestamp_str}': {e}[/yellow]"
                )
                continue

        # Convert lists back to DataFrames
        return {
            date_key: pl.DataFrame(records)
            for date_key, records in daily_batches.items()
            if records
        }

    def _parse_and_convert_timestamp(self, timestamp_str: str) -> datetime:
        """Parse an ISO 8601 timestamp string and convert it to UTC.

        Offset-aware inputs (including a trailing 'Z') keep their own offset;
        naive inputs are assumed to be in the configured user timezone. The
        result is always converted to UTC.

        (Fix: replaces a hard-coded whitelist of whole-hour UTC offsets that
        was redundant — the fallback already handled any aware timestamp — and
        silently excluded half-hour offsets like '+05:30'.)

        Raises:
            ValueError: If the string cannot be parsed as ISO 8601.
        """
        try:
            if timestamp_str.endswith("Z"):
                # fromisoformat() rejects a literal 'Z' suffix on older Python
                # versions, so normalize it to an explicit UTC offset.
                dt = datetime.fromisoformat(timestamp_str.replace("Z", "+00:00"))
            else:
                dt = datetime.fromisoformat(timestamp_str)
                if dt.tzinfo is None:
                    # Assume user timezone if no timezone info
                    dt = dt.replace(tzinfo=self.user_timezone)
            # Convert to UTC
            return dt.astimezone(timezone.utc)
        except ValueError as e:
            raise ValueError(
                f"Could not parse timestamp '{timestamp_str}': {e}"
            ) from e

    def _send_daily_batch(
        self, batch_date: str, batch_data: pl.DataFrame, operation: str
    ) -> None:
        """Send a single daily batch to CloudZero API.

        Raises:
            httpx.RequestError: On network failure.
            httpx.HTTPStatusError: On a non-2xx API response.
        """
        if batch_data.is_empty():
            return
        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
        }
        # Use the correct API endpoint format from documentation
        url = f"{self.base_url}/v2/connections/billing/anycost/{self.connection_id}/billing_drops"

        # Prepare the batch payload according to AnyCost API format
        payload = self._prepare_batch_payload(batch_date, batch_data, operation)
        try:
            with httpx.Client(timeout=30.0) as client:
                self.console.print(
                    f"[blue]Sending batch for {batch_date} ({len(batch_data)} records)[/blue]"
                )
                response = client.post(url, headers=headers, json=payload)
                response.raise_for_status()
                self.console.print(
                    f"[green]✓ Successfully sent batch for {batch_date} ({len(batch_data)} records)[/green]"
                )
        except httpx.RequestError as e:
            self.console.print(
                f"[red]✗ Network error sending batch for {batch_date}: {e}[/red]"
            )
            raise
        except httpx.HTTPStatusError as e:
            self.console.print(
                f"[red]✗ HTTP error sending batch for {batch_date}: {e.response.status_code} {e.response.text}[/red]"
            )
            raise

    def _prepare_batch_payload(
        self, batch_date: str, batch_data: pl.DataFrame, operation: str
    ) -> dict[str, Any]:
        """Prepare batch payload according to CloudZero AnyCost API format."""
        # Convert batch_date to month for the API (YYYY-MM format)
        try:
            date_obj = datetime.strptime(batch_date, "%Y-%m-%d")
            month_str = date_obj.strftime("%Y-%m")
        except ValueError:
            # Fallback to current month
            month_str = datetime.now().strftime("%Y-%m")

        # Convert DataFrame rows to API format
        data_records = []
        for row in batch_data.iter_rows(named=True):
            record = self._convert_cbf_to_api_format(row)
            if record:
                data_records.append(record)
        payload = {"month": month_str, "operation": operation, "data": data_records}
        return payload

    def _convert_cbf_to_api_format(
        self, row: dict[str, Any]
    ) -> Optional[dict[str, Any]]:
        """Convert CBF row to CloudZero API format - keeping CBF field names as CloudZero expects them.

        Returns None (after a warning) if the row cannot be converted, so a
        single bad row does not abort the batch.
        """
        try:
            # CloudZero expects CBF format field names directly, not converted names
            api_record = {}

            # Copy all CBF fields, converting numeric values to strings as required by CloudZero
            for key, value in row.items():
                if value is not None:
                    # CloudZero requires numeric values to be strings, but NOT in scientific notation
                    if isinstance(value, (int, float)):
                        if isinstance(value, float):
                            # Fixed-point with trailing zeros stripped avoids
                            # scientific notation for small costs.
                            api_record[key] = f"{value:.10f}".rstrip("0").rstrip(".")
                        else:
                            api_record[key] = str(value)
                    else:
                        api_record[key] = value

            # Ensure timestamp is in UTC format
            if "time/usage_start" in api_record:
                api_record["time/usage_start"] = self._ensure_utc_timestamp(
                    api_record["time/usage_start"]
                )
            return api_record
        except Exception as e:
            self.console.print(
                f"[yellow]Warning: Could not convert record to API format: {e}[/yellow]"
            )
            return None

    def _ensure_utc_timestamp(self, timestamp_str: str) -> str:
        """Ensure timestamp is an ISO 8601 UTC string ('Z'-suffixed) for the API."""
        if not timestamp_str:
            return datetime.now(timezone.utc).isoformat()
        try:
            dt = self._parse_and_convert_timestamp(timestamp_str)
            return dt.isoformat().replace("+00:00", "Z")
        except Exception:
            # Fallback to current time in UTC
            return datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")

View File

@@ -0,0 +1,101 @@
# Copyright 2025 CloudZero
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# CHANGELOG: 2025-01-19 - Refactored to use daily spend tables for proper CBF mapping (erik.peterson)
# CHANGELOG: 2025-01-19 - Migrated from pandas to polars for database operations (erik.peterson)
# CHANGELOG: 2025-01-19 - Initial database module for LiteLLM data extraction (erik.peterson)
"""Database connection and data extraction for LiteLLM."""
from datetime import datetime
from typing import Any, Optional, List
import polars as pl
class LiteLLMDatabase:
    """Handle LiteLLM PostgreSQL database connections and queries."""

    def _ensure_prisma_client(self):
        """Ensure prisma client is available.

        Raises:
            Exception: If no database is connected to the proxy.
        """
        # Fix: the docstring previously sat after this import, so it was a bare
        # statement, not a docstring. Import is lazy to avoid a circular import
        # with the proxy server.
        from litellm.proxy.proxy_server import prisma_client

        if prisma_client is None:
            raise Exception(
                "Database not connected. Connect a database to your proxy - https://docs.litellm.ai/docs/simple_proxy#managing-auth---virtual-keys"
            )
        return prisma_client

    async def get_usage_data(
        self,
        limit: Optional[int] = None,
        start_time_utc: Optional[datetime] = None,
        end_time_utc: Optional[datetime] = None,
    ) -> pl.DataFrame:
        """Retrieve usage data from LiteLLM daily user spend table.

        Args:
            limit: Optional maximum number of rows to return.
            start_time_utc: Only include rows updated at or after this time.
            end_time_utc: Only include rows updated at or before this time.

        Returns:
            A polars DataFrame of daily spend rows joined with key/team/user info.

        Raises:
            ValueError: If ``limit`` cannot be coerced to an integer.
            Exception: If the database is not connected or the query fails.
        """
        client = self._ensure_prisma_client()

        # Query to get user spend data with team information. Use parameter binding to
        # avoid SQL injection from user-supplied timestamps or limits.
        query = """
            SELECT
                dus.id,
                dus.date,
                dus.user_id,
                dus.api_key,
                dus.model,
                dus.model_group,
                dus.custom_llm_provider,
                dus.prompt_tokens,
                dus.completion_tokens,
                dus.spend,
                dus.api_requests,
                dus.successful_requests,
                dus.failed_requests,
                dus.cache_creation_input_tokens,
                dus.cache_read_input_tokens,
                dus.created_at,
                dus.updated_at,
                vt.team_id,
                vt.key_alias as api_key_alias,
                tt.team_alias,
                ut.user_email as user_email
            FROM "LiteLLM_DailyUserSpend" dus
            LEFT JOIN "LiteLLM_VerificationToken" vt ON dus.api_key = vt.token
            LEFT JOIN "LiteLLM_TeamTable" tt ON vt.team_id = tt.team_id
            LEFT JOIN "LiteLLM_UserTable" ut ON dus.user_id = ut.user_id
            WHERE ($1::timestamptz IS NULL OR dus.updated_at >= $1::timestamptz)
              AND ($2::timestamptz IS NULL OR dus.updated_at <= $2::timestamptz)
            ORDER BY dus.date DESC, dus.created_at DESC
        """
        params: List[Any] = [
            start_time_utc,
            end_time_utc,
        ]
        if limit is not None:
            try:
                params.append(int(limit))
            except (TypeError, ValueError) as e:
                raise ValueError("limit must be an integer") from e
            query += " LIMIT $3"
        try:
            db_response = await client.db.query_raw(query, *params)
            # Convert the response to polars DataFrame with full schema inference
            # This prevents schema mismatch errors when data types vary across rows
            return pl.DataFrame(db_response, infer_schema_length=None)
        except Exception as e:
            raise Exception(f"Error retrieving usage data: {str(e)}") from e

View File

@@ -0,0 +1,223 @@
# Copyright 2025 CloudZero
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# CHANGELOG: 2025-01-19 - Updated CBF transformation for daily spend tables and proper CloudZero mapping (erik.peterson)
# CHANGELOG: 2025-01-19 - Migrated from pandas to polars for data transformation (erik.peterson)
# CHANGELOG: 2025-01-19 - Initial CBF transformation module (erik.peterson)
"""Transform LiteLLM data to CloudZero AnyCost CBF format."""
from datetime import datetime
from typing import Any, Optional
import polars as pl
from ...types.integrations.cloudzero import CBFRecord
from .cz_resource_names import CZEntityType, CZRNGenerator
class CBFTransformer:
"""Transform LiteLLM usage data to CloudZero Billing Format (CBF)."""
def __init__(self):
    """Initialize transformer with CZRN generator."""
    # The CZRN generator builds the CloudZero resource IDs embedded in each CBF record.
    self.czrn_generator = CZRNGenerator()
def transform(self, data: pl.DataFrame) -> pl.DataFrame:
    """Transform LiteLLM data to CBF format, dropping records with zero successful_requests or invalid CZRNs."""
    if data.is_empty():
        return pl.DataFrame()

    total_rows = len(data)

    # Keep only rows that recorded at least one successful request.
    if "successful_requests" in data.columns:
        usable = data.filter(pl.col("successful_requests") > 0)
    else:
        usable = data
    dropped_zero_success = total_rows - len(usable)

    usable_count = len(usable)
    transformed = []
    invalid_czrn_count = 0
    for record in usable.iter_rows(named=True):
        try:
            # Only rows whose CZRN generation succeeds make it into the output.
            transformed.append(self._create_cbf_record(record))
        except Exception:
            # Best-effort: drop the bad row and keep transforming the rest.
            invalid_czrn_count += 1

    # Report drop counts and the final tally to the operator.
    from rich.console import Console

    console = Console()
    if dropped_zero_success > 0:
        console.print(
            f"[yellow]⚠️ Dropped {dropped_zero_success:,} of {total_rows:,} records with zero successful_requests[/yellow]"
        )
    if invalid_czrn_count > 0:
        console.print(
            f"[yellow]⚠️ Dropped {invalid_czrn_count:,} of {usable_count:,} filtered records due to invalid CZRNs[/yellow]"
        )
    if len(transformed) > 0:
        console.print(
            f"[green]✓ Successfully transformed {len(transformed):,} records[/green]"
        )
    return pl.DataFrame(transformed)
def _create_cbf_record(self, row: dict[str, Any]) -> CBFRecord:
"""Create a single CBF record from LiteLLM daily spend row."""
# Parse date (daily spend tables use date strings like '2025-04-19')
usage_date = self._parse_date(row.get("date"))
# Calculate total tokens
prompt_tokens = int(row.get("prompt_tokens", 0))
completion_tokens = int(row.get("completion_tokens", 0))
total_tokens = prompt_tokens + completion_tokens
# Create CloudZero Resource Name (CZRN) as resource_id
resource_id = self.czrn_generator.create_from_litellm_data(row)
# Build dimensions for CloudZero
model = str(row.get("model", ""))
api_key_hash = str(row.get("api_key", ""))[
:8
] # First 8 chars for identification
# Handle team information with fallbacks
team_id = row.get("team_id")
team_alias = row.get("team_alias")
user_email = row.get("user_email")
# Use team_alias if available, otherwise team_id, otherwise fallback to 'unknown'
entity_id = (
str(team_alias) if team_alias else (str(team_id) if team_id else "unknown")
)
# Get alias fields if they exist
api_key_alias = row.get("api_key_alias")
organization_alias = row.get("organization_alias")
project_alias = row.get("project_alias")
user_alias = row.get("user_alias")
dimensions = {
"entity_type": CZEntityType.TEAM.value,
"entity_id": entity_id,
"team_alias": str(team_alias) if team_alias else "unknown",
"model": model,
"model_group": str(row.get("model_group", "")),
"provider": str(row.get("custom_llm_provider", "")),
"api_key_prefix": api_key_hash,
"api_key_alias": str(row.get("api_key_alias", "")),
"user_email": str(user_email) if user_email else "",
"api_requests": str(row.get("api_requests", 0)),
"successful_requests": str(row.get("successful_requests", 0)),
"failed_requests": str(row.get("failed_requests", 0)),
"cache_creation_tokens": str(row.get("cache_creation_input_tokens", 0)),
"cache_read_tokens": str(row.get("cache_read_input_tokens", 0)),
"organization_alias": str(organization_alias) if organization_alias else "",
"project_alias": str(project_alias) if project_alias else "",
"user_alias": str(user_alias) if user_alias else "",
}
# Extract CZRN components to populate corresponding CBF columns
czrn_components = self.czrn_generator.extract_components(resource_id)
(
service_type,
provider,
region,
owner_account_id,
resource_type,
cloud_local_id,
) = czrn_components
# Build resource/account as concat of api_key_alias and api_key_prefix
resource_account = (
f"{api_key_alias}|{api_key_hash}" if api_key_alias else api_key_hash
)
# CloudZero CBF format with proper column names
cbf_record = {
# Required CBF fields
"time/usage_start": usage_date.isoformat()
if usage_date
else None, # Required: ISO-formatted UTC datetime
"cost/cost": float(row.get("spend", 0.0)), # Required: billed cost
"resource/id": resource_id, # CZRN (CloudZero Resource Name)
# Usage metrics for token consumption
"usage/amount": total_tokens, # Numeric value of tokens consumed
"usage/units": "tokens", # Description of token units
# CBF fields - updated per LIT-1907
"resource/service": str(row.get("model_group", "")), # Send model_group
"resource/account": resource_account, # Send api_key_alias|api_key_prefix
"resource/region": region, # Maps to CZRN region (cross-region)
"resource/usage_family": str(
row.get("custom_llm_provider", "")
), # Send provider
# Action field
"action/operation": str(team_id) if team_id else "", # Send team_id
# Line item details
"lineitem/type": "Usage", # Standard usage line item
}
# Add CZRN components that don't have direct CBF column mappings as resource tags
cbf_record["resource/tag:provider"] = provider # CZRN provider component
cbf_record[
"resource/tag:model"
] = cloud_local_id # CZRN cloud-local-id component (model)
# Add resource tags for all dimensions (using resource/tag:<key> format)
for key, value in dimensions.items():
if (
value and value != "N/A" and value != "unknown"
): # Only add meaningful tags
cbf_record[f"resource/tag:{key}"] = str(value)
# Add token breakdown as resource tags for analysis (excluding total_tokens per LIT-1907)
if prompt_tokens > 0:
cbf_record["resource/tag:prompt_tokens"] = str(prompt_tokens)
if completion_tokens > 0:
cbf_record["resource/tag:completion_tokens"] = str(completion_tokens)
return CBFRecord(cbf_record)
def _parse_date(self, date_str) -> Optional[datetime]:
"""Parse date string from daily spend tables (e.g., '2025-04-19')."""
if date_str is None:
return None
if isinstance(date_str, datetime):
return date_str
if isinstance(date_str, str):
try:
# Parse date string and set to midnight UTC for daily aggregation
return pl.Series([date_str]).str.to_datetime("%Y-%m-%d").item()
except Exception:
try:
# Fallback: try ISO format parsing
return pl.Series([date_str]).str.to_datetime().item()
except Exception:
return None
return None