chore: initial snapshot for gitea/github upload

This commit is contained in:
Your Name
2026-03-26 16:04:46 +08:00
commit a699a1ac98
3497 changed files with 1586237 additions and 0 deletions

View File

@@ -0,0 +1,9 @@
"""
File parsers for RAG ingestion.
Provides text extraction utilities for various file formats.
"""
from .pdf_parser import extract_text_from_pdf
__all__ = ["extract_text_from_pdf"]

View File

@@ -0,0 +1,76 @@
"""
PDF text extraction utilities.
Provides text extraction from PDF files using pypdf or PyPDF2.
"""
from typing import Optional
from litellm._logging import verbose_logger
def extract_text_from_pdf(file_content: bytes) -> Optional[str]:
"""
Extract text from PDF using pypdf if available.
Args:
file_content: Raw PDF bytes
Returns:
Extracted text or None if extraction fails
"""
try:
from io import BytesIO
# Try pypdf first (most common)
try:
from pypdf import PdfReader as PypdfReader
pdf_file = BytesIO(file_content)
reader = PypdfReader(pdf_file)
text_parts = []
for page in reader.pages:
text = page.extract_text()
if text:
text_parts.append(text)
if text_parts:
extracted_text = "\n\n".join(text_parts)
verbose_logger.debug(
f"Extracted {len(extracted_text)} characters from PDF using pypdf"
)
return extracted_text
except ImportError:
verbose_logger.debug("pypdf not available, trying PyPDF2")
# Fallback to PyPDF2
try:
from PyPDF2 import PdfReader as PyPDF2Reader
pdf_file = BytesIO(file_content)
reader = PyPDF2Reader(pdf_file)
text_parts = []
for page in reader.pages:
text = page.extract_text()
if text:
text_parts.append(text)
if text_parts:
extracted_text = "\n\n".join(text_parts)
verbose_logger.debug(
f"Extracted {len(extracted_text)} characters from PDF using PyPDF2"
)
return extracted_text
except ImportError:
verbose_logger.debug(
"PyPDF2 not available, PDF extraction requires OCR or pypdf/PyPDF2 library"
)
except Exception as e:
verbose_logger.debug(f"PDF text extraction failed: {e}")
return None