chore: initial snapshot for gitea/github upload

Your Name
2026-03-26 16:04:46 +08:00
commit a699a1ac98
3497 changed files with 1586237 additions and 0 deletions

@@ -0,0 +1,110 @@
"""
Nvidia NIM endpoint: https://docs.api.nvidia.com/nim/reference/databricks-dbrx-instruct-infer
This is OpenAI compatible
This file only contains param mapping logic
API calling is done using the OpenAI SDK with an api_base
"""
from litellm.llms.openai.chat.gpt_transformation import OpenAIGPTConfig
class NvidiaNimConfig(OpenAIGPTConfig):
"""
Reference: https://docs.api.nvidia.com/nim/reference/databricks-dbrx-instruct-infer
The class `NvidiaNimConfig` provides configuration for the Nvidia NIM's Chat Completions API interface. Below are the parameters:
"""
def get_supported_openai_params(self, model: str) -> list:
"""
Get the supported OpenAI params for the given model
Updated on July 5th, 2024 - based on https://docs.api.nvidia.com/nim/reference
"""
if model in [
"google/recurrentgemma-2b",
"google/gemma-2-27b-it",
"google/gemma-2-9b-it",
"gemma-2-9b-it",
]:
return ["stream", "temperature", "top_p", "max_tokens", "stop", "seed"]
elif model == "nvidia/nemotron-4-340b-instruct":
return [
"stream",
"temperature",
"top_p",
"max_tokens",
"max_completion_tokens",
]
elif model == "nvidia/nemotron-4-340b-reward":
return [
"stream",
]
elif model in ["google/codegemma-1.1-7b"]:
# most params - but no 'seed' :(
return [
"stream",
"temperature",
"top_p",
"frequency_penalty",
"presence_penalty",
"max_tokens",
"max_completion_tokens",
"stop",
]
else:
            # Default case - the vast majority of Nvidia NIM models fall here, e.g.:
# "upstage/solar-10.7b-instruct",
# "snowflake/arctic",
# "seallms/seallm-7b-v2.5",
# "nvidia/llama3-chatqa-1.5-8b",
# "nvidia/llama3-chatqa-1.5-70b",
# "mistralai/mistral-large",
# "mistralai/mixtral-8x22b-instruct-v0.1",
# "mistralai/mixtral-8x7b-instruct-v0.1",
# "mistralai/mistral-7b-instruct-v0.3",
# "mistralai/mistral-7b-instruct-v0.2",
# "mistralai/codestral-22b-instruct-v0.1",
# "microsoft/phi-3-small-8k-instruct",
# "microsoft/phi-3-small-128k-instruct",
# "microsoft/phi-3-mini-4k-instruct",
# "microsoft/phi-3-mini-128k-instruct",
# "microsoft/phi-3-medium-4k-instruct",
# "microsoft/phi-3-medium-128k-instruct",
# "meta/llama3-70b-instruct",
# "meta/llama3-8b-instruct",
# "meta/llama2-70b",
# "meta/codellama-70b",
return [
"stream",
"temperature",
"top_p",
"frequency_penalty",
"presence_penalty",
"max_tokens",
"max_completion_tokens",
"stop",
"seed",
"tools",
"tool_choice",
"parallel_tool_calls",
"response_format",
            ]

    def map_openai_params(
self,
non_default_params: dict,
optional_params: dict,
model: str,
drop_params: bool,
) -> dict:
supported_openai_params = self.get_supported_openai_params(model=model)
for param, value in non_default_params.items():
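            # Nvidia NIM expects max_tokens; translate OpenAI's newer
            # max_completion_tokens alias to it unconditionally.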
if param == "max_completion_tokens":
optional_params["max_tokens"] = value
elif param in supported_openai_params:
optional_params[param] = value
return optional_params
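
A minimal usage sketch (an illustration added here, not part of the original file; it assumes NvidiaNimConfig is importable from this module):

config = NvidiaNimConfig()
mapped = config.map_openai_params(
    non_default_params={"max_completion_tokens": 256, "temperature": 0.2},
    optional_params={},
    model="meta/llama3-70b-instruct",  # falls into the default case
    drop_params=False,
)
# mapped == {"max_tokens": 256, "temperature": 0.2}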

@@ -0,0 +1,82 @@
"""
Nvidia NIM embeddings endpoint: https://docs.api.nvidia.com/nim/reference/nvidia-nv-embedqa-e5-v5-infer
This is OpenAI compatible
This file only contains param mapping logic
API calling is done using the OpenAI SDK with an api_base
"""
import types
from typing import Optional
class NvidiaNimEmbeddingConfig:
"""
Reference: https://docs.api.nvidia.com/nim/reference/nvidia-nv-embedqa-e5-v5-infer
"""
# OpenAI params
encoding_format: Optional[str] = None
user: Optional[str] = None
# Nvidia NIM params
input_type: Optional[str] = None
    truncate: Optional[str] = None

    def __init__(
self,
encoding_format: Optional[str] = None,
user: Optional[str] = None,
input_type: Optional[str] = None,
truncate: Optional[str] = None,
) -> None:
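        # Note: non-None constructor arguments are stored as *class* attributes
        # (not instance attributes), so they act as shared defaults.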
locals_ = locals().copy()
for key, value in locals_.items():
if key != "self" and value is not None:
                setattr(self.__class__, key, value)

    @classmethod
def get_config(cls):
return {
k: v
for k, v in cls.__dict__.items()
if not k.startswith("__")
and not isinstance(
v,
(
types.FunctionType,
types.BuiltinFunctionType,
classmethod,
staticmethod,
),
)
and v is not None
        }

    def get_supported_openai_params(
self,
):
return ["encoding_format", "user", "dimensions"]
def map_openai_params(
self,
non_default_params: dict,
optional_params: dict,
kwargs: Optional[dict] = None,
):
if "extra_body" not in optional_params:
optional_params["extra_body"] = {}
for k, v in non_default_params.items():
if k == "input_type":
optional_params["extra_body"].update({"input_type": v})
elif k == "truncate":
optional_params["extra_body"].update({"truncate": v})
else:
optional_params[k] = v
if kwargs is not None:
# pass kwargs in extra_body
optional_params["extra_body"].update(kwargs)
return optional_params
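
A minimal sketch of the embedding param mapping (illustrative, not part of the original file): OpenAI params stay top-level while Nvidia-specific params are routed into extra_body.

config = NvidiaNimEmbeddingConfig()
mapped = config.map_openai_params(
    non_default_params={"encoding_format": "float", "input_type": "query"},
    optional_params={},
)
# mapped == {"extra_body": {"input_type": "query"}, "encoding_format": "float"}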

@@ -0,0 +1,27 @@
"""
Common utilities for NVIDIA NIM rerank provider.
"""
def get_nvidia_nim_rerank_config(model: str):
"""
Get the appropriate NVIDIA NIM rerank config based on the model.
Args:
model: The model string (e.g., "nvidia/llama-3.2-nv-rerankqa-1b-v2" or "ranking/nvidia/llama-3.2-nv-rerankqa-1b-v2")
Returns:
NvidiaNimRankingConfig if model starts with "ranking/", else NvidiaNimRerankConfig
Example:
- "ranking/nvidia/llama-3.2-nv-rerankqa-1b-v2" -> NvidiaNimRankingConfig
- "nvidia/llama-3.2-nv-rerankqa-1b-v2" -> NvidiaNimRerankConfig
"""
from litellm.llms.nvidia_nim.rerank.ranking_transformation import (
NvidiaNimRankingConfig,
)
from litellm.llms.nvidia_nim.rerank.transformation import NvidiaNimRerankConfig
if model.startswith("ranking/"):
return NvidiaNimRankingConfig()
return NvidiaNimRerankConfig()
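
Dispatch sketch (illustrative, assuming litellm is installed): the "ranking/" prefix selects the /v1/ranking endpoint config; everything else gets the /v1/retrieval reranking config.

config = get_nvidia_nim_rerank_config("ranking/nvidia/llama-3.2-nv-rerankqa-1b-v2")
# -> NvidiaNimRankingConfig (targets {api_base}/v1/ranking)
config = get_nvidia_nim_rerank_config("nvidia/llama-3.2-nv-rerankqa-1b-v2")
# -> NvidiaNimRerankConfig (targets {api_base}/v1/retrieval/{model}/reranking)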

@@ -0,0 +1,78 @@
"""
Transformation for NVIDIA NIM Ranking models that use /v1/ranking endpoint.
Use this by passing "nvidia_nim/ranking/<model>" to force the /v1/ranking endpoint.
Reference: https://build.nvidia.com/nvidia/llama-3_2-nv-rerankqa-1b-v2/deploy
"""
from typing import Dict, Optional
from litellm.llms.nvidia_nim.rerank.transformation import NvidiaNimRerankConfig
class NvidiaNimRankingConfig(NvidiaNimRerankConfig):
"""
Configuration for NVIDIA NIM models that use the /v1/ranking endpoint.
Example:
curl -X "POST" 'https://ai.api.nvidia.com/v1/ranking' \
-H 'Accept: application/json' \
-H 'Content-Type: application/json' \
-d '{
"model": "nvidia/llama-3.2-nv-rerankqa-1b-v2",
"query": {"text": "which way did the traveler go?"},
"passages": [{"text": "..."}, {"text": "..."}],
"truncate": "END"
}'
"""
def _get_clean_model_name(self, model: str) -> str:
"""Strip 'nvidia_nim/' and 'ranking/' prefixes from model name."""
# First strip nvidia_nim/ prefix if present
if model.startswith("nvidia_nim/"):
model = model[len("nvidia_nim/") :]
# Then strip ranking/ prefix if present
if model.startswith("ranking/"):
model = model[len("ranking/") :]
        return model

    def get_complete_url(
self,
api_base: Optional[str],
model: str,
optional_params: Optional[dict] = None,
) -> str:
"""
Construct the Nvidia NIM ranking URL.
Format: {api_base}/v1/ranking
"""
if not api_base:
api_base = self.DEFAULT_NIM_RERANK_API_BASE
api_base = api_base.rstrip("/")
if api_base.endswith("/ranking"):
return api_base
if api_base.endswith("/v1"):
api_base = api_base[:-3]
return f"{api_base}/v1/ranking"
def transform_rerank_request(
self,
model: str,
optional_rerank_params: Dict,
headers: dict,
) -> dict:
"""
Transform request, using clean model name without 'ranking/' prefix.
"""
clean_model = self._get_clean_model_name(model)
return super().transform_rerank_request(
model=clean_model,
optional_rerank_params=optional_rerank_params,
headers=headers,
)
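
A short sketch of how this config cleans model names and builds the URL (illustrative, not from the source):

config = NvidiaNimRankingConfig()
config._get_clean_model_name("nvidia_nim/ranking/nvidia/llama-3.2-nv-rerankqa-1b-v2")
# -> "nvidia/llama-3.2-nv-rerankqa-1b-v2"
config.get_complete_url(api_base=None, model="ranking/nvidia/llama-3.2-nv-rerankqa-1b-v2")
# -> "https://ai.api.nvidia.com/v1/ranking"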

@@ -0,0 +1,341 @@
from typing import Any, Dict, List, Literal, Optional, Union

import httpx
from typing_extensions import Required, TypedDict

import litellm
from litellm._uuid import uuid
from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj
from litellm.llms.base_llm.chat.transformation import BaseLLMException
from litellm.llms.base_llm.rerank.transformation import BaseRerankConfig
from litellm.secret_managers.main import get_secret_str
from litellm.types.rerank import (
RerankBilledUnits,
RerankResponse,
RerankResponseMeta,
RerankResponseResult,
)


class NvidiaNimQueryObject(TypedDict):
    text: Required[str]


class NvidiaNimPassageObject(TypedDict):
    text: Required[str]


class NvidiaNimRerankRequest(TypedDict, total=False):
model: Required[str]
query: Required[NvidiaNimQueryObject]
passages: Required[List[NvidiaNimPassageObject]]
truncate: Literal["NONE", "END"]
    top_k: int


class NvidiaNimRankingResult(TypedDict):
index: Required[int]
    logit: Required[float]


class NvidiaNimRerankResponse(TypedDict):
    rankings: Required[List[NvidiaNimRankingResult]]


class NvidiaNimRerankConfig(BaseRerankConfig):
"""
Reference: https://docs.api.nvidia.com/nim/reference/nvidia-llama-3_2-nv-rerankqa-1b-v2-infer
Nvidia NIM rerank API uses a different format:
- query is an object with 'text' field
- documents are called 'passages' and have 'text' field
"""
DEFAULT_NIM_RERANK_API_BASE = "https://ai.api.nvidia.com"
def __init__(self) -> None:
        pass

    def _get_clean_model_name(self, model: str) -> str:
"""Strip 'nvidia_nim/' prefix from model name if present."""
if model.startswith("nvidia_nim/"):
return model[len("nvidia_nim/") :]
        return model

    def get_complete_url(
self,
api_base: Optional[str],
model: str,
optional_params: Optional[dict] = None,
) -> str:
"""
Construct the Nvidia NIM rerank URL.
Format: {api_base}/v1/retrieval/{model}/reranking
If the user provides a full URL (e.g., {api_base}/v1/retrieval/{model}/reranking),
it will be used as-is.
"""
if not api_base:
api_base = self.DEFAULT_NIM_RERANK_API_BASE
api_base = api_base.rstrip("/")
# Check if user already provided the full URL with /retrieval/ path
if "/retrieval/" in api_base:
return api_base
# Ensure we don't have duplicate /v1
if api_base.endswith("/v1"):
api_base = api_base[:-3]
# Strip nvidia_nim/ prefix from model name if present
clean_model = self._get_clean_model_name(model)
return f"{api_base}/v1/retrieval/{clean_model}/reranking"
def get_supported_cohere_rerank_params(self, model: str) -> list:
"""
Nvidia NIM supports these rerank parameters.
"""
return [
"query",
"documents",
"top_n",
        ]

    def map_cohere_rerank_params(
self,
non_default_params: Optional[dict],
model: str,
drop_params: bool,
query: str,
documents: List[Union[str, Dict[str, Any]]],
custom_llm_provider: Optional[str] = None,
top_n: Optional[int] = None,
rank_fields: Optional[List[str]] = None,
return_documents: Optional[bool] = True,
max_chunks_per_doc: Optional[int] = None,
max_tokens_per_doc: Optional[int] = None,
) -> Dict:
"""
Map Cohere/OpenAI rerank params to Nvidia NIM format.
Parameter mapping:
- top_n (Cohere) -> top_k (Nvidia)
Nvidia NIM specific params (passed through as-is from non_default_params):
- truncate: How to truncate input if too long (NONE, END)
"""
optional_nvidia_nim_rerank_params: Dict[str, Any] = {
"query": query,
"documents": documents,
}
# Map Cohere's top_n to Nvidia's top_k
if top_n is not None:
optional_nvidia_nim_rerank_params["top_k"] = top_n
# Pass through Nvidia-specific params from non_default_params
if non_default_params:
optional_nvidia_nim_rerank_params.update(non_default_params)
        return dict(optional_nvidia_nim_rerank_params)

    def validate_environment(
self,
headers: dict,
model: str,
api_key: Optional[str] = None,
optional_params: Optional[dict] = None,
) -> dict:
"""
Validate that the Nvidia NIM API key is present.
"""
if api_key is None:
api_key = get_secret_str("NVIDIA_NIM_API_KEY") or litellm.api_key
if api_key is None:
raise ValueError(
"Nvidia NIM API key is required. Please set 'NVIDIA_NIM_API_KEY' in your environment"
)
default_headers = {
"Authorization": f"Bearer {api_key}",
"accept": "application/json",
"content-type": "application/json",
}
        # User-supplied headers take precedence over the defaults, including
        # Authorization (the merge below overrides it as well).
        if "Authorization" in headers:
            default_headers["Authorization"] = headers["Authorization"]
        return {**default_headers, **headers}

    def transform_rerank_request(
self,
model: str,
optional_rerank_params: Dict,
headers: dict,
) -> dict:
"""
Transform request to Nvidia NIM format.
Nvidia NIM expects:
- query as {text: "..."}
- documents as passages: [{text: "..."}, ...]
- Optional: truncate (NONE or END), top_k
Note: optional_rerank_params may contain provider-specific params like 'top_k' and 'truncate'
that aren't in the OptionalRerankParams TypedDict but are passed through at runtime.
The mapping from Cohere's 'top_n' to Nvidia's 'top_k' already happened in map_cohere_rerank_params.
"""
if "query" not in optional_rerank_params:
raise ValueError("query is required for Nvidia NIM rerank")
if "documents" not in optional_rerank_params:
raise ValueError("documents is required for Nvidia NIM rerank")
query = optional_rerank_params["query"]
documents = optional_rerank_params["documents"]
# Transform query to object format
query_obj: NvidiaNimQueryObject = {"text": query}
# Transform documents to passages format
passages: List[NvidiaNimPassageObject] = []
for doc in documents:
if isinstance(doc, str):
passages.append({"text": doc})
elif isinstance(doc, dict):
# If document is already a dict, check if it has 'text' field
if "text" in doc:
passages.append({"text": doc["text"]})
else:
# Otherwise, stringify the dict
import json
passages.append({"text": json.dumps(doc)})
else:
passages.append({"text": str(doc)})
# Strip nvidia_nim/ prefix from model name if present
clean_model = self._get_clean_model_name(model)
# Note: URL path uses underscores (llama-3_2) but JSON body uses periods (llama-3.2)
# Convert underscores back to periods for the model field in request body
model_for_body = clean_model.replace("_", ".")
# Build request using TypedDict
request_data: NvidiaNimRerankRequest = {
"model": model_for_body,
"query": query_obj,
"passages": passages,
}
# Add optional top_k parameter if provided (already mapped from top_n in map_cohere_rerank_params)
if "top_k" in optional_rerank_params and optional_rerank_params.get("top_k") is not None: # type: ignore
request_data["top_k"] = optional_rerank_params.get("top_k") # type: ignore
# Add Nvidia-specific truncate parameter if provided
# This is passed through from non_default_params, not in base OptionalRerankParams
if "truncate" in optional_rerank_params and optional_rerank_params.get("truncate") is not None: # type: ignore
truncate_value = optional_rerank_params.get("truncate") # type: ignore
if truncate_value in ["NONE", "END"]:
request_data["truncate"] = truncate_value # type: ignore
        return dict(request_data)

    def transform_rerank_response(
self,
model: str,
raw_response: httpx.Response,
model_response: RerankResponse,
logging_obj: LiteLLMLoggingObj,
api_key: Optional[str] = None,
request_data: dict = {},
optional_params: dict = {},
litellm_params: dict = {},
) -> RerankResponse:
"""
Transform Nvidia NIM rerank response to LiteLLM format.
Nvidia NIM returns (NvidiaNimRerankResponse):
{
"rankings": [
{
"index": 0,
"logit": 0.123
}
]
}
LiteLLM expects (RerankResponse):
{
"results": [
{
"index": 0,
"relevance_score": 0.123,
"document": {"text": "..."} # optional
}
]
}
"""
try:
raw_response_json = raw_response.json()
except Exception:
raise BaseLLMException(
status_code=raw_response.status_code,
message=raw_response.text,
headers=raw_response.headers,
)
# Parse as NvidiaNimRerankResponse
nvidia_response: NvidiaNimRerankResponse = raw_response_json
# Transform Nvidia NIM response to LiteLLM format
results: List[RerankResponseResult] = []
rankings = nvidia_response.get("rankings", [])
# Get original documents from request if we need to include them
original_passages: List[NvidiaNimPassageObject] = request_data.get(
"passages", []
)
for ranking in rankings:
result_item: RerankResponseResult = {
"index": ranking["index"],
"relevance_score": ranking["logit"],
}
# Include document if it was in the original request
index: int = ranking["index"]
if index < len(original_passages):
result_item["document"] = {"text": original_passages[index]["text"]} # type: ignore
results.append(result_item)
# Construct metadata with billed_units
# Nvidia NIM uses "usage" field with "total_tokens"
usage = raw_response_json.get("usage", {})
total_tokens = usage.get("total_tokens", 0)
billed_units: RerankBilledUnits = {
"total_tokens": total_tokens if total_tokens > 0 else len(results)
}
meta: RerankResponseMeta = {"billed_units": billed_units}
return RerankResponse(
id=raw_response_json.get("id") or str(uuid.uuid4()),
results=results,
meta=meta,
        )

    def get_error_class(
self, error_message: str, status_code: int, headers: Union[dict, httpx.Headers]
) -> BaseLLMException:
return BaseLLMException(
status_code=status_code,
message=error_message,
headers=headers,
)
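
A worked request-transformation sketch (illustrative, not part of the original file), showing Cohere-style inputs becoming the NIM ranking body:

config = NvidiaNimRerankConfig()
body = config.transform_rerank_request(
    model="nvidia_nim/nvidia/llama-3_2-nv-rerankqa-1b-v2",
    optional_rerank_params={
        "query": "which way did the traveler go?",
        "documents": ["the traveler went north", {"text": "the sky was clear"}],
        "top_k": 1,
        "truncate": "END",
    },
    headers={},
)
# body == {
#     "model": "nvidia/llama-3.2-nv-rerankqa-1b-v2",  # underscores -> periods for the body
#     "query": {"text": "which way did the traveler go?"},
#     "passages": [{"text": "the traveler went north"}, {"text": "the sky was clear"}],
#     "top_k": 1,
#     "truncate": "END",
# }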