chore: initial public snapshot for github upload
@@ -0,0 +1,832 @@
# Start tracing memory allocations
import asyncio
import gc
import json
import os
import sys
import tracemalloc
from collections import Counter
from typing import Any, Dict, List, Optional, Tuple

from fastapi import APIRouter, Depends, HTTPException, Query

from litellm import get_secret_str
from litellm._logging import verbose_proxy_logger
from litellm.constants import PYTHON_GC_THRESHOLD
from litellm.proxy._types import UserAPIKeyAuth
from litellm.proxy.auth.user_api_key_auth import user_api_key_auth

router = APIRouter()


# Configure garbage collection thresholds from environment variables
def configure_gc_thresholds():
    """Configure Python garbage collection thresholds from environment variables."""
    gc_threshold_env = PYTHON_GC_THRESHOLD
    if gc_threshold_env:
        try:
            # Parse threshold string like "1000,50,50"
            thresholds = [int(x.strip()) for x in gc_threshold_env.split(",")]
            if len(thresholds) == 3:
                gc.set_threshold(*thresholds)
                verbose_proxy_logger.info(f"GC thresholds set to: {thresholds}")
            else:
                verbose_proxy_logger.warning(
                    f"GC threshold not set: {gc_threshold_env}. Expected format: 'gen0,gen1,gen2'"
                )
        except ValueError as e:
            verbose_proxy_logger.warning(
                f"Failed to parse GC threshold: {gc_threshold_env}. Error: {e}"
            )

    # Log current thresholds
    current_thresholds = gc.get_threshold()
    verbose_proxy_logger.info(
        f"Current GC thresholds: gen0={current_thresholds[0]}, gen1={current_thresholds[1]}, gen2={current_thresholds[2]}"
    )


# Initialize GC configuration
configure_gc_thresholds()
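
# Example of driving configure_gc_thresholds() via the environment (illustrative;
# assumes the PYTHON_GC_THRESHOLD constant in litellm.constants is populated from
# a PYTHON_GC_THRESHOLD env var, format "gen0,gen1,gen2"):
#
#   export PYTHON_GC_THRESHOLD="1000,50,50"
#   litellm --config /path/to/config.yaml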


@router.get("/debug/asyncio-tasks")
async def get_active_tasks_stats():
    """
    Returns:
        total_active_tasks: int
        by_name: { coroutine_name: count }
    """
    MAX_TASKS_TO_CHECK = 5000
    # Gather all tasks in this event loop (including this endpoint's own task).
    all_tasks = asyncio.all_tasks()

    # Filter out tasks that are already done.
    active_tasks = [t for t in all_tasks if not t.done()]

    # Count how many active tasks exist, grouped by coroutine function name.
    counter: Counter = Counter()
    for idx, task in enumerate(active_tasks):
        # reasonable max circuit breaker
        if idx >= MAX_TASKS_TO_CHECK:
            break
        coro = task.get_coro()
        # Derive a human-readable name from the coroutine:
        name = (
            getattr(coro, "__qualname__", None)
            or getattr(coro, "__name__", None)
            or repr(coro)
        )
        counter[name] += 1

    return {
        "total_active_tasks": len(active_tasks),
        "by_name": dict(counter),
    }
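
# Illustrative response shape for GET /debug/asyncio-tasks (task names and
# counts will vary with workload):
#   {"total_active_tasks": 42, "by_name": {"Server.serve": 1, "user_api_key_auth": 3}}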


if os.environ.get("LITELLM_PROFILE", "false").lower() == "true":
    try:
        import objgraph  # type: ignore

        print("growth of objects")  # noqa
        objgraph.show_growth()
        print("\n\nMost common types")  # noqa
        objgraph.show_most_common_types()
        roots = objgraph.get_leaking_objects()
        print("\n\nLeaking objects")  # noqa
        objgraph.show_most_common_types(objects=roots)
    except ImportError:
        raise ImportError(
            "objgraph not found. Please install objgraph to use this feature."
        )

tracemalloc.start(10)
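
# Note: tracemalloc.start(10) stores up to 10 frames per allocation traceback.
# Deeper traces make the /memory-usage output easier to attribute to a call
# site, at the cost of extra tracing overhead.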


@router.get("/memory-usage", include_in_schema=False)
async def memory_usage():
    # Take a snapshot of the current memory usage
    snapshot = tracemalloc.take_snapshot()
    top_stats = snapshot.statistics("lineno")
    verbose_proxy_logger.debug("TOP STATS: %s", top_stats)

    # Get the top 50 memory usage lines
    top_50 = top_stats[:50]
    result = []
    for stat in top_50:
        # Traceback.format() returns a list of lines; join them so the output
        # is a readable string rather than a list repr
        formatted_traceback = "\n".join(stat.traceback.format(limit=10))
        result.append(f"{formatted_traceback}: {stat.size / 1024:.1f} KiB")

    return {"top_50_memory_usage": result}


@router.get("/memory-usage-in-mem-cache", include_in_schema=False)
async def memory_usage_in_mem_cache(
    _: UserAPIKeyAuth = Depends(user_api_key_auth),
):
    """
    Returns the number of items in each in-memory cache on the proxy server:

    1. user_api_key_cache
    2. router_cache (llm_router)
    3. proxy_logging_cache (internal_usage_cache)
    """
    from litellm.proxy.proxy_server import (
        llm_router,
        proxy_logging_obj,
        user_api_key_cache,
    )

    if llm_router is None:
        num_items_in_llm_router_cache = 0
    else:
        num_items_in_llm_router_cache = len(
            llm_router.cache.in_memory_cache.cache_dict
        ) + len(llm_router.cache.in_memory_cache.ttl_dict)

    num_items_in_user_api_key_cache = len(
        user_api_key_cache.in_memory_cache.cache_dict
    ) + len(user_api_key_cache.in_memory_cache.ttl_dict)

    num_items_in_proxy_logging_obj_cache = len(
        proxy_logging_obj.internal_usage_cache.dual_cache.in_memory_cache.cache_dict
    ) + len(proxy_logging_obj.internal_usage_cache.dual_cache.in_memory_cache.ttl_dict)

    return {
        "num_items_in_user_api_key_cache": num_items_in_user_api_key_cache,
        "num_items_in_llm_router_cache": num_items_in_llm_router_cache,
        "num_items_in_proxy_logging_obj_cache": num_items_in_proxy_logging_obj_cache,
    }
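
# Illustrative response (each counter sums cache_dict and ttl_dict entries, so a
# key cached with a TTL contributes to both dicts):
#   {"num_items_in_user_api_key_cache": 210, "num_items_in_llm_router_cache": 0, ...}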


@router.get("/memory-usage-in-mem-cache-items", include_in_schema=False)
async def memory_usage_in_mem_cache_items(
    _: UserAPIKeyAuth = Depends(user_api_key_auth),
):
    """
    Returns the raw contents of each in-memory cache on the proxy server:

    1. user_api_key_cache
    2. router_cache (llm_router)
    3. proxy_logging_cache (internal_usage_cache)
    """
    from litellm.proxy.proxy_server import (
        llm_router,
        proxy_logging_obj,
        user_api_key_cache,
    )

    if llm_router is None:
        llm_router_in_memory_cache_dict = {}
        llm_router_in_memory_ttl_dict = {}
    else:
        llm_router_in_memory_cache_dict = llm_router.cache.in_memory_cache.cache_dict
        llm_router_in_memory_ttl_dict = llm_router.cache.in_memory_cache.ttl_dict

    return {
        "user_api_key_cache": user_api_key_cache.in_memory_cache.cache_dict,
        "user_api_key_ttl": user_api_key_cache.in_memory_cache.ttl_dict,
        "llm_router_cache": llm_router_in_memory_cache_dict,
        "llm_router_ttl": llm_router_in_memory_ttl_dict,
        "proxy_logging_obj_cache": proxy_logging_obj.internal_usage_cache.dual_cache.in_memory_cache.cache_dict,
        "proxy_logging_obj_ttl": proxy_logging_obj.internal_usage_cache.dual_cache.in_memory_cache.ttl_dict,
    }


@router.get("/debug/memory/summary", include_in_schema=False)
async def get_memory_summary(
    _: UserAPIKeyAuth = Depends(user_api_key_auth),
) -> Dict[str, Any]:
    """
    Get simplified memory usage summary for the proxy.

    Returns:
        - worker_pid: Process ID
        - status: Overall health based on memory usage
        - memory: Process memory usage and RAM info
        - caches: Cache item counts and descriptions
        - garbage_collector: GC status and pending object counts

    Example usage:
        curl http://localhost:4000/debug/memory/summary -H "Authorization: Bearer sk-1234"

    For detailed analysis, call GET /debug/memory/details
    For cache management, use the cache management endpoints
    """
    from litellm.proxy.proxy_server import (
        llm_router,
        proxy_logging_obj,
        user_api_key_cache,
    )

    # Get process memory info
    process_memory = {}
    health_status = "healthy"

    try:
        import psutil

        process = psutil.Process()
        memory_info = process.memory_info()
        memory_mb = memory_info.rss / (1024 * 1024)
        memory_percent = process.memory_percent()

        process_memory = {
            "summary": f"{memory_mb:.1f} MB ({memory_percent:.1f}% of system memory)",
            "ram_usage_mb": round(memory_mb, 2),
            "system_memory_percent": round(memory_percent, 2),
        }

        # Check memory health status
        if memory_percent > 80:
            health_status = "critical"
        elif memory_percent > 60:
            health_status = "warning"
        else:
            health_status = "healthy"

    except ImportError:
        process_memory[
            "error"
        ] = "Install psutil for memory monitoring: pip install psutil"
    except Exception as e:
        process_memory["error"] = str(e)

    # Get cache information
    caches: Dict[str, Any] = {}
    total_cache_items = 0

    try:
        # User API key cache
        user_cache_items = len(user_api_key_cache.in_memory_cache.cache_dict)
        total_cache_items += user_cache_items
        caches["user_api_keys"] = {
            "count": user_cache_items,
            "count_readable": f"{user_cache_items:,}",
            "what_it_stores": "Validated API keys for faster authentication",
        }

        # Router cache
        if llm_router is not None:
            router_cache_items = len(llm_router.cache.in_memory_cache.cache_dict)
            total_cache_items += router_cache_items
            caches["llm_responses"] = {
                "count": router_cache_items,
                "count_readable": f"{router_cache_items:,}",
                "what_it_stores": "LLM responses for identical requests",
            }

        # Proxy logging cache
        logging_cache_items = len(
            proxy_logging_obj.internal_usage_cache.dual_cache.in_memory_cache.cache_dict
        )
        total_cache_items += logging_cache_items
        caches["usage_tracking"] = {
            "count": logging_cache_items,
            "count_readable": f"{logging_cache_items:,}",
            "what_it_stores": "Usage metrics before database write",
        }

    except Exception as e:
        caches["error"] = str(e)

    # Get garbage collector stats
    gc_enabled = gc.isenabled()
    objects_pending = gc.get_count()[0]
    uncollectable = len(gc.garbage)

    gc_info = {
        "status": "enabled" if gc_enabled else "disabled",
        "objects_awaiting_collection": objects_pending,
    }

    # Add warning if garbage collection issues detected
    if uncollectable > 0:
        gc_info[
            "warning"
        ] = f"{uncollectable} uncollectable objects (possible memory leak)"

    return {
        "worker_pid": os.getpid(),
        "status": health_status,
        "memory": process_memory,
        "caches": {
            "total_items": total_cache_items,
            "breakdown": caches,
        },
        "garbage_collector": gc_info,
    }


def _get_gc_statistics() -> Dict[str, Any]:
    """Get garbage collector statistics."""
    return {
        "enabled": gc.isenabled(),
        "thresholds": {
            "generation_0": gc.get_threshold()[0],
            "generation_1": gc.get_threshold()[1],
            "generation_2": gc.get_threshold()[2],
            "explanation": "Number of allocations before automatic collection for each generation",
        },
        "current_counts": {
            "generation_0": gc.get_count()[0],
            "generation_1": gc.get_count()[1],
            "generation_2": gc.get_count()[2],
            "explanation": "Current number of allocated objects in each generation",
        },
        "collection_history": [
            {
                "generation": i,
                "total_collections": stat["collections"],
                "total_collected": stat["collected"],
                "uncollectable": stat["uncollectable"],
            }
            for i, stat in enumerate(gc.get_stats())
        ],
    }


def _get_object_type_counts(top_n: int) -> Tuple[int, List[Dict[str, Any]]]:
    """Count objects by type and return total count and top N types."""
    type_counts: Counter = Counter()
    total_objects = 0

    for obj in gc.get_objects():
        total_objects += 1
        obj_type = type(obj).__name__
        type_counts[obj_type] += 1

    top_object_types = [
        {"type": obj_type, "count": count, "count_readable": f"{count:,}"}
        for obj_type, count in type_counts.most_common(top_n)
    ]

    return total_objects, top_object_types
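
# Example (illustrative) of exercising the helper directly in a REPL:
#   >>> total, top = _get_object_type_counts(top_n=3)
#   >>> top[0]
#   {'type': 'dict', 'count': 184023, 'count_readable': '184,023'}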


def _get_uncollectable_objects_info() -> Dict[str, Any]:
    """Get information about uncollectable objects (potential memory leaks)."""
    uncollectable = gc.garbage
    return {
        "count": len(uncollectable),
        "sample_types": [type(obj).__name__ for obj in uncollectable[:10]],
        "warning": "If count > 0, you may have reference cycles preventing garbage collection"
        if len(uncollectable) > 0
        else None,
    }
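
# Note: since Python 3.4 (PEP 442), objects with __del__ methods are collectable,
# so gc.garbage is normally empty unless gc.set_debug(gc.DEBUG_SAVEALL) is enabled.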


def _get_cache_memory_stats(
    user_api_key_cache, llm_router, proxy_logging_obj, redis_usage_cache
) -> Dict[str, Any]:
    """Calculate memory usage for all caches."""
    cache_stats: Dict[str, Any] = {}
    try:
        # User API key cache
        user_cache_size = sys.getsizeof(user_api_key_cache.in_memory_cache.cache_dict)
        user_ttl_size = sys.getsizeof(user_api_key_cache.in_memory_cache.ttl_dict)
        cache_stats["user_api_key_cache"] = {
            "num_items": len(user_api_key_cache.in_memory_cache.cache_dict),
            "cache_dict_size_bytes": user_cache_size,
            "ttl_dict_size_bytes": user_ttl_size,
            "total_size_mb": round(
                (user_cache_size + user_ttl_size) / (1024 * 1024), 2
            ),
        }

        # Router cache
        if llm_router is not None:
            router_cache_size = sys.getsizeof(
                llm_router.cache.in_memory_cache.cache_dict
            )
            router_ttl_size = sys.getsizeof(llm_router.cache.in_memory_cache.ttl_dict)
            cache_stats["llm_router_cache"] = {
                "num_items": len(llm_router.cache.in_memory_cache.cache_dict),
                "cache_dict_size_bytes": router_cache_size,
                "ttl_dict_size_bytes": router_ttl_size,
                "total_size_mb": round(
                    (router_cache_size + router_ttl_size) / (1024 * 1024), 2
                ),
            }

        # Proxy logging cache
        logging_cache_size = sys.getsizeof(
            proxy_logging_obj.internal_usage_cache.dual_cache.in_memory_cache.cache_dict
        )
        logging_ttl_size = sys.getsizeof(
            proxy_logging_obj.internal_usage_cache.dual_cache.in_memory_cache.ttl_dict
        )
        cache_stats["proxy_logging_cache"] = {
            "num_items": len(
                proxy_logging_obj.internal_usage_cache.dual_cache.in_memory_cache.cache_dict
            ),
            "cache_dict_size_bytes": logging_cache_size,
            "ttl_dict_size_bytes": logging_ttl_size,
            "total_size_mb": round(
                (logging_cache_size + logging_ttl_size) / (1024 * 1024), 2
            ),
        }

        # Redis cache info
        if redis_usage_cache is not None:
            cache_stats["redis_usage_cache"] = {
                "enabled": True,
                "cache_type": type(redis_usage_cache).__name__,
            }
            # Try to get Redis connection pool info if available
            try:
                if (
                    hasattr(redis_usage_cache, "redis_client")
                    and redis_usage_cache.redis_client
                ):
                    if hasattr(redis_usage_cache.redis_client, "connection_pool"):
                        pool_info = redis_usage_cache.redis_client.connection_pool  # type: ignore
                        cache_stats["redis_usage_cache"]["connection_pool"] = {
                            "max_connections": pool_info.max_connections
                            if hasattr(pool_info, "max_connections")
                            else None,
                            "connection_class": pool_info.connection_class.__name__
                            if hasattr(pool_info, "connection_class")
                            else None,
                        }
            except Exception as e:
                verbose_proxy_logger.debug(f"Error getting Redis pool info: {e}")
        else:
            cache_stats["redis_usage_cache"] = {"enabled": False}

    except Exception as e:
        verbose_proxy_logger.debug(f"Error calculating cache stats: {e}")
        cache_stats["error"] = str(e)

    return cache_stats
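
# Caveat: sys.getsizeof() on a dict measures only the dict structure itself, not
# the keys and values it references, so the *_size_mb figures above understate
# the true footprint of large caches.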


def _get_router_memory_stats(llm_router) -> Dict[str, Any]:
    """Get memory usage statistics for LiteLLM router."""
    litellm_router_memory: Dict[str, Any] = {}
    try:
        if llm_router is not None:
            # Model list memory size
            if hasattr(llm_router, "model_list") and llm_router.model_list:
                model_list_size = sys.getsizeof(llm_router.model_list)
                litellm_router_memory["model_list"] = {
                    "num_models": len(llm_router.model_list),
                    "size_bytes": model_list_size,
                    "size_mb": round(model_list_size / (1024 * 1024), 4),
                }

            # Model names set
            if hasattr(llm_router, "model_names") and llm_router.model_names:
                model_names_size = sys.getsizeof(llm_router.model_names)
                litellm_router_memory["model_names_set"] = {
                    "num_model_groups": len(llm_router.model_names),
                    "size_bytes": model_names_size,
                    "size_mb": round(model_names_size / (1024 * 1024), 4),
                }

            # Deployment names list
            if hasattr(llm_router, "deployment_names") and llm_router.deployment_names:
                deployment_names_size = sys.getsizeof(llm_router.deployment_names)
                litellm_router_memory["deployment_names"] = {
                    "num_deployments": len(llm_router.deployment_names),
                    "size_bytes": deployment_names_size,
                    "size_mb": round(deployment_names_size / (1024 * 1024), 4),
                }

            # Deployment latency map
            if (
                hasattr(llm_router, "deployment_latency_map")
                and llm_router.deployment_latency_map
            ):
                latency_map_size = sys.getsizeof(llm_router.deployment_latency_map)
                litellm_router_memory["deployment_latency_map"] = {
                    "num_tracked_deployments": len(llm_router.deployment_latency_map),
                    "size_bytes": latency_map_size,
                    "size_mb": round(latency_map_size / (1024 * 1024), 4),
                }

            # Fallback configuration
            if hasattr(llm_router, "fallbacks") and llm_router.fallbacks:
                fallbacks_size = sys.getsizeof(llm_router.fallbacks)
                litellm_router_memory["fallbacks"] = {
                    "num_fallback_configs": len(llm_router.fallbacks),
                    "size_bytes": fallbacks_size,
                    "size_mb": round(fallbacks_size / (1024 * 1024), 4),
                }

            # Total router object size
            router_obj_size = sys.getsizeof(llm_router)
            litellm_router_memory["router_object"] = {
                "size_bytes": router_obj_size,
                "size_mb": round(router_obj_size / (1024 * 1024), 4),
            }

        else:
            litellm_router_memory = {"note": "Router not initialized"}
    except Exception as e:
        verbose_proxy_logger.debug(f"Error getting router memory info: {e}")
        litellm_router_memory = {"error": str(e)}

    return litellm_router_memory


def _get_process_memory_info(
    worker_pid: int, include_process_info: bool
) -> Optional[Dict[str, Any]]:
    """Get process-level memory information using psutil."""
    if not include_process_info:
        return None

    try:
        import psutil

        process = psutil.Process()
        memory_info = process.memory_info()
        ram_usage_mb = round(memory_info.rss / (1024 * 1024), 2)
        virtual_memory_mb = round(memory_info.vms / (1024 * 1024), 2)
        memory_percent = round(process.memory_percent(), 2)

        return {
            "pid": worker_pid,
            "summary": f"Worker PID {worker_pid} using {ram_usage_mb:.1f} MB of RAM ({memory_percent:.1f}% of system memory)",
            "ram_usage": {
                "megabytes": ram_usage_mb,
                "description": "Actual physical RAM used by this process",
            },
            "virtual_memory": {
                "megabytes": virtual_memory_mb,
                "description": "Total virtual memory allocated (includes swapped memory)",
            },
            "system_memory_percent": {
                "percent": memory_percent,
                "description": "Percentage of total system RAM being used",
            },
            "open_file_handles": {
                "count": process.num_fds()
                if hasattr(process, "num_fds")
                else "N/A (Windows)",
                "description": "Number of open file descriptors/handles",
            },
            "threads": {
                "count": process.num_threads(),
                "description": "Number of active threads in this process",
            },
        }
    except ImportError:
        return {
            "pid": worker_pid,
            "error": "psutil not installed. Install with: pip install psutil",
        }
    except Exception as e:
        verbose_proxy_logger.debug(f"Error getting process info: {e}")
        return {"pid": worker_pid, "error": str(e)}


@router.get("/debug/memory/details", include_in_schema=False)
async def get_memory_details(
    _: UserAPIKeyAuth = Depends(user_api_key_auth),
    top_n: int = Query(20, description="Number of top object types to return"),
    include_process_info: bool = Query(True, description="Include process memory info"),
) -> Dict[str, Any]:
    """
    Get detailed memory diagnostics for deep debugging.

    Returns:
        - worker_pid: Process ID
        - process_memory: RAM usage, virtual memory, file handles, threads
        - garbage_collector: GC thresholds, counts, collection history
        - objects: Total tracked objects and top object types
        - uncollectable: Objects that can't be garbage collected (potential leaks)
        - cache_memory: Memory usage of user_api_key, router, and logging caches
        - router_memory: Memory usage of router components (model_list, deployment_names, etc.)

    Query Parameters:
        - top_n: Number of top object types to return (default: 20)
        - include_process_info: Include process-level memory info using psutil (default: true)

    Example usage:
        curl "http://localhost:4000/debug/memory/details?top_n=30" -H "Authorization: Bearer sk-1234"

    All memory sizes are reported in both bytes and MB.
    """
    from litellm.proxy.proxy_server import (
        llm_router,
        proxy_logging_obj,
        redis_usage_cache,
        user_api_key_cache,
    )

    worker_pid = os.getpid()

    # Collect all diagnostics using helper functions
    gc_stats = _get_gc_statistics()
    total_objects, top_object_types = _get_object_type_counts(top_n)
    uncollectable_info = _get_uncollectable_objects_info()
    cache_stats = _get_cache_memory_stats(
        user_api_key_cache, llm_router, proxy_logging_obj, redis_usage_cache
    )
    litellm_router_memory = _get_router_memory_stats(llm_router)
    process_info = _get_process_memory_info(worker_pid, include_process_info)

    return {
        "worker_pid": worker_pid,
        "process_memory": process_info,
        "garbage_collector": gc_stats,
        "objects": {
            "total_tracked": total_objects,
            "total_tracked_readable": f"{total_objects:,}",
            "top_types": top_object_types,
        },
        "uncollectable": uncollectable_info,
        "cache_memory": cache_stats,
        "router_memory": litellm_router_memory,
    }


@router.post("/debug/memory/gc/configure", include_in_schema=False)
async def configure_gc_thresholds_endpoint(
    _: UserAPIKeyAuth = Depends(user_api_key_auth),
    generation_0: int = Query(700, description="Generation 0 threshold (default: 700)"),
    generation_1: int = Query(10, description="Generation 1 threshold (default: 10)"),
    generation_2: int = Query(10, description="Generation 2 threshold (default: 10)"),
) -> Dict[str, Any]:
    """
    Configure Python garbage collection thresholds.

    Lower thresholds mean more frequent GC cycles (less memory, more CPU overhead).
    Higher thresholds mean less frequent GC cycles (more memory, less CPU overhead).

    Returns:
        - message: Confirmation message
        - previous_thresholds: Old threshold values
        - new_thresholds: New threshold values
        - objects_awaiting_collection: Current object count in gen-0
        - tip: Hint about when the next collection will occur

    Query Parameters:
        - generation_0: Number of allocations before gen-0 collection (default: 700)
        - generation_1: Number of gen-0 collections before gen-1 collection (default: 10)
        - generation_2: Number of gen-1 collections before gen-2 collection (default: 10)

    Example for more aggressive collection:
        curl -X POST "http://localhost:4000/debug/memory/gc/configure?generation_0=500" -H "Authorization: Bearer sk-1234"

    Example for less aggressive collection:
        curl -X POST "http://localhost:4000/debug/memory/gc/configure?generation_0=1000" -H "Authorization: Bearer sk-1234"

    Monitor memory usage with GET /debug/memory/summary after changes.
    """
    # Get current thresholds for logging
    old_thresholds = gc.get_threshold()

    # Set new thresholds with error handling
    try:
        gc.set_threshold(generation_0, generation_1, generation_2)
        verbose_proxy_logger.info(
            f"GC thresholds updated from {old_thresholds} to "
            f"({generation_0}, {generation_1}, {generation_2})"
        )
    except Exception as e:
        verbose_proxy_logger.error(f"Failed to set GC thresholds: {e}")
        raise HTTPException(
            status_code=500, detail=f"Failed to set GC thresholds: {str(e)}"
        )

    # Get current object count to show immediate impact
    current_count = gc.get_count()[0]

    return {
        "message": "GC thresholds updated",
        "previous_thresholds": f"{old_thresholds[0]}, {old_thresholds[1]}, {old_thresholds[2]}",
        "new_thresholds": f"{generation_0}, {generation_1}, {generation_2}",
        "objects_awaiting_collection": current_count,
        # current_count can already exceed the new threshold; clamp at 0 so the
        # tip never reports a negative number of allocations
        "tip": f"Next collection will run after {max(generation_0 - current_count, 0)} more allocations",
    }


@router.get("/otel-spans", include_in_schema=False)
async def get_otel_spans():
    from litellm.proxy.proxy_server import open_telemetry_logger

    if open_telemetry_logger is None:
        return {
            "otel_spans": [],
            "spans_grouped_by_parent": {},
            "most_recent_parent": None,
        }

    otel_exporter = open_telemetry_logger.OTEL_EXPORTER
    if hasattr(otel_exporter, "get_finished_spans"):
        recorded_spans = otel_exporter.get_finished_spans()  # type: ignore
    else:
        recorded_spans = []

    print("Spans: ", recorded_spans)  # noqa

    most_recent_parent = None
    most_recent_start_time = 0  # span.start_time is in ns since the epoch
    spans_grouped_by_parent = {}
    for span in recorded_spans:
        if span.parent is not None:
            parent_trace_id = span.parent.trace_id
            if parent_trace_id not in spans_grouped_by_parent:
                spans_grouped_by_parent[parent_trace_id] = []
            spans_grouped_by_parent[parent_trace_id].append(span.name)

            # check time of span
            if span.start_time > most_recent_start_time:
                most_recent_parent = parent_trace_id
                most_recent_start_time = span.start_time

    # these are otel spans - get the span name
    span_names = [span.name for span in recorded_spans]
    return {
        "otel_spans": span_names,
        "spans_grouped_by_parent": spans_grouped_by_parent,
        "most_recent_parent": most_recent_parent,
    }
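
# Note: get_finished_spans() is provided by in-memory/test exporters such as the
# OpenTelemetry SDK's InMemorySpanExporter; most production exporters do not
# implement it, in which case this endpoint reports no spans.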


# Helper functions for debugging
def init_verbose_loggers():
    try:
        worker_config = get_secret_str("WORKER_CONFIG")
        if worker_config is None:
            return
        # if it's a file path, the config is loaded elsewhere;
        # otherwise assume it's a JSON string
        if os.path.isfile(worker_config):
            return
        _settings = json.loads(worker_config)
        if not isinstance(_settings, dict):
            return

        debug = _settings.get("debug", None)
        detailed_debug = _settings.get("detailed_debug", None)
        if debug is True:  # this needs to be first, so users can see Router init debug logs
            import logging

            from litellm._logging import (
                verbose_logger,
                verbose_proxy_logger,
                verbose_router_logger,
            )

            # this must ALWAYS remain logging.INFO, DO NOT MODIFY THIS
            verbose_logger.setLevel(level=logging.INFO)  # sets package logs to info
            verbose_router_logger.setLevel(
                level=logging.INFO
            )  # set router logs to info
            verbose_proxy_logger.setLevel(level=logging.INFO)  # set proxy logs to info
        if detailed_debug is True:
            import logging

            from litellm._logging import (
                verbose_logger,
                verbose_proxy_logger,
                verbose_router_logger,
            )

            verbose_logger.setLevel(level=logging.DEBUG)  # set package log to debug
            verbose_router_logger.setLevel(
                level=logging.DEBUG
            )  # set router logs to debug
            verbose_proxy_logger.setLevel(
                level=logging.DEBUG
            )  # set proxy logs to debug
        elif debug is False and detailed_debug is False:
            # users can control proxy debugging using env variable = 'LITELLM_LOG'
            # (os.environ.get never returns None here, so no None check is needed)
            litellm_log_setting = os.environ.get("LITELLM_LOG", "")
            if litellm_log_setting.upper() == "INFO":
                import logging

                from litellm._logging import (
                    verbose_proxy_logger,
                    verbose_router_logger,
                )

                # this must ALWAYS remain logging.INFO, DO NOT MODIFY THIS
                verbose_router_logger.setLevel(
                    level=logging.INFO
                )  # set router logs to info
                verbose_proxy_logger.setLevel(
                    level=logging.INFO
                )  # set proxy logs to info
            elif litellm_log_setting.upper() == "DEBUG":
                import logging

                from litellm._logging import (
                    verbose_proxy_logger,
                    verbose_router_logger,
                )

                verbose_router_logger.setLevel(
                    level=logging.DEBUG
                )  # set router logs to debug
                verbose_proxy_logger.setLevel(
                    level=logging.DEBUG
                )  # set proxy logs to debug
    except Exception as e:
        import logging

        logging.warning(f"Failed to init verbose loggers: {str(e)}")