#### OCR Endpoints ##### import json from typing import Any, Dict, Optional, cast import orjson from fastapi import APIRouter, Depends, Request, Response, UploadFile from fastapi.responses import ORJSONResponse from litellm._logging import verbose_proxy_logger from litellm.ocr.main import convert_file_document_to_url_document, get_mime_type from litellm.proxy._types import * from litellm.proxy.auth.user_api_key_auth import UserAPIKeyAuth, user_api_key_auth from litellm.proxy.common_request_processing import ProxyBaseLLMRequestProcessing router = APIRouter() def _build_document_from_upload( file_content: bytes, filename: Optional[str], content_type: Optional[str], ) -> Dict[str, str]: """ Convert uploaded file bytes into a Mistral-format document dict with base64 data URI. Delegates to convert_file_document_to_url_document after resolving MIME type from the upload's content_type header or filename. """ mime_type = content_type.split(";")[0].strip() if content_type else None if not mime_type or mime_type == "application/octet-stream": if filename: mime_type = get_mime_type(filename) return convert_file_document_to_url_document( { "type": "file", "file": file_content, "mime_type": mime_type or "application/octet-stream", } ) async def _parse_multipart_form(request: Request) -> Dict[str, Any]: """ Extract OCR data from a multipart form request. Uses the cached form if already parsed by auth middleware, otherwise parses the form from the request. Returns: A dict with 'document', 'model', and any other OCR params. """ try: form = await request.form() except Exception as e: raise ValueError( f"Failed to parse multipart form data: {str(e)}. " "When using curl with --form/-F, do NOT set the Content-Type header " "manually — curl will set it automatically with the required boundary." ) uploaded_file = form.get("file") # request.form() may return either a FastAPI or Starlette UploadFile # depending on middleware; check both via isinstance (FastAPI's UploadFile # is a subclass of Starlette's) and fall back to duck-type check. if uploaded_file is None or ( not isinstance(uploaded_file, UploadFile) and not hasattr(uploaded_file, "read") ): raise ValueError( "Multipart OCR request must include a 'file' field with the document to process" ) uploaded_file = cast(UploadFile, uploaded_file) # Seek to start in case the file was already partially read by middleware await uploaded_file.seek(0) file_content = await uploaded_file.read() if not file_content: raise ValueError("Uploaded file is empty") document = _build_document_from_upload( file_content=file_content, filename=uploaded_file.filename, content_type=uploaded_file.content_type, ) data: Dict[str, Any] = {"document": document} for field_name, field_value in form.items(): if field_name in ("file", "document"): continue # Try to parse JSON values (e.g. pages=[0,1,2]) if isinstance(field_value, str): try: data[field_name] = json.loads(field_value) except (json.JSONDecodeError, ValueError): data[field_name] = field_value else: data[field_name] = field_value verbose_proxy_logger.debug( f"OCR multipart form request parsed - model: {data.get('model')}, " f"document_type: {document['type']}, " f"filename: {uploaded_file.filename}" ) return data async def _parse_ocr_request(request: Request) -> Dict[str, Any]: """ Parse an OCR request, supporting both JSON and multipart form data. JSON body (existing behavior): { "model": "mistral/mistral-ocr-latest", "document": {"type": "document_url", "document_url": "https://..."} } Multipart form data (new): - file: the uploaded file - model: model name (form field) - Any other OCR params as form fields (pages, include_image_base64, etc.) Returns: A dict suitable for passing to the OCR processing pipeline. """ content_type = request.headers.get("content-type", "") if "multipart/form-data" in content_type.lower(): return await _parse_multipart_form(request) # --- JSON body (existing behavior) --- try: body = await request.body() except RuntimeError: # Body stream was consumed by auth middleware (e.g., form parsing). body = b"" if not body: # The body may be empty because the auth middleware already parsed # it as form data (e.g., _read_request_body called request.form()). # Check if form data is available. if getattr(request, "_form", None) is not None: verbose_proxy_logger.debug( "OCR request body is empty but form data is available from middleware — " "processing as multipart form." ) return await _parse_multipart_form(request) raise ValueError( "Empty request body. For file uploads, use multipart/form-data content type " "with a file field. When using curl with --form/-F, do NOT set the Content-Type " "header manually." ) try: data = orjson.loads(body) except orjson.JSONDecodeError as e: raise ValueError( f"Invalid JSON in request body: {e}. " "Ensure the request body is valid JSON with Content-Type: application/json, " "or use multipart/form-data for file uploads." ) # Security: reject type="file" documents received via JSON. # The "file" document type is designed for local SDK usage where the # caller and the process share a filesystem. In the proxy context the # caller is remote, so allowing a file-path string would let an # authenticated user read arbitrary files from the server's filesystem. # File uploads must go through multipart/form-data instead. doc = data.get("document") if isinstance(data, dict) else None if isinstance(doc, dict) and doc.get("type") == "file": raise ValueError( "document type 'file' is not supported through the JSON API. " "To upload a local file, use multipart/form-data with a 'file' field. " "For JSON requests, use 'document_url' or 'image_url' document types." ) return data @router.post( "/v1/ocr", dependencies=[Depends(user_api_key_auth)], response_class=ORJSONResponse, tags=["ocr"], ) @router.post( "/ocr", dependencies=[Depends(user_api_key_auth)], response_class=ORJSONResponse, tags=["ocr"], ) async def ocr( request: Request, fastapi_response: Response, user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth), ): """ OCR endpoint for extracting text from documents and images. Supports two input modes: **1. JSON body** (Mistral OCR API compatible): ```bash curl -X POST "http://localhost:4000/v1/ocr" \ -H "Authorization: Bearer sk-1234" \ -H "Content-Type: application/json" \ -d '{ "model": "mistral-ocr", "document": { "type": "document_url", "document_url": "https://arxiv.org/pdf/2201.04234" } }' ``` **2. Multipart form file upload**: ```bash curl -X POST "http://localhost:4000/v1/ocr" \ -H "Authorization: Bearer sk-1234" \ -F "model=mistral-ocr" \ -F "file=@document.pdf" ``` """ from litellm.proxy.proxy_server import ( general_settings, llm_router, proxy_config, proxy_logging_obj, select_data_generator, user_api_base, user_max_tokens, user_model, user_request_timeout, user_temperature, version, ) data: dict = {} try: # Parse request body (JSON or multipart form) data = await _parse_ocr_request(request) # Process request using ProxyBaseLLMRequestProcessing processor = ProxyBaseLLMRequestProcessing(data=data) return await processor.base_process_llm_request( request=request, fastapi_response=fastapi_response, user_api_key_dict=user_api_key_dict, route_type="aocr", proxy_logging_obj=proxy_logging_obj, llm_router=llm_router, general_settings=general_settings, proxy_config=proxy_config, select_data_generator=select_data_generator, model=None, user_model=user_model, user_temperature=user_temperature, user_request_timeout=user_request_timeout, user_max_tokens=user_max_tokens, user_api_base=user_api_base, version=version, ) except Exception as e: processor = ProxyBaseLLMRequestProcessing(data=data) raise await processor._handle_llm_api_exception( e=e, user_api_key_dict=user_api_key_dict, proxy_logging_obj=proxy_logging_obj, version=version, )