docling by existential-birds/beagle
npx skills add https://github.com/existential-birds/beagle --skill docling

Docling 是一个文档解析库,能够将 PDF、Word 文档、PowerPoint、图像及其他格式转换为具有高级版式理解能力的结构化数据。
基本文档转换:
from docling.document_converter import DocumentConverter
source = "https://arxiv.org/pdf/2408.09869" # URL、路径或 BytesIO
converter = DocumentConverter()
result = converter.convert(source)
print(result.document.export_to_markdown())
文档转换的主要入口点。支持多种输入格式和转换选项。
from docling.document_converter import DocumentConverter
from docling.datamodel.base_models import InputFormat
from docling.document_converter import PdfFormatOption
from docling.datamodel.pipeline_options import PdfPipelineOptions
# 基础转换器(启用所有格式)
converter = DocumentConverter()
# 限制格式
converter = DocumentConverter(
allowed_formats=[InputFormat.PDF, InputFormat.DOCX]
)
# 自定义流水线选项
pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = True
pipeline_options.do_table_structure = True
converter = DocumentConverter(
format_options={
InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
}
)
广告位招租
在这里展示您的产品或服务
触达数万 AI 开发者,精准高效
所有转换操作都会返回一个包含以下内容的 ConversionResult:
document:解析后的 DoclingDocument
status:ConversionStatus.SUCCESS、PARTIAL_SUCCESS 或 FAILURE
errors:转换过程中遇到的错误列表
input:源文档的信息

result = converter.convert("document.pdf")
if result.status == ConversionStatus.SUCCESS:
markdown = result.document.export_to_markdown()
html = result.document.export_to_html()
data = result.document.export_to_dict()
export_to_markdown() 或 save_as_markdown()
export_to_html() 或 save_as_html()
export_to_dict() 或 save_as_json()(注意:没有 export_to_json() 方法)
export_to_text() 或 export_to_markdown(strict_text=True) 或 save_as_markdown(strict_text=True)
export_to_doctags() 或 save_as_doctags()

from docling.document_converter import DocumentConverter
converter = DocumentConverter()
result = converter.convert("document.pdf")
# 导出为不同格式
markdown = result.document.export_to_markdown()
html = result.document.export_to_html()
json_data = result.document.export_to_dict()
# 或直接保存到文件
result.document.save_as_markdown("output.md")
result.document.save_as_html("output.html")
result.document.save_as_json("output.json")
有关 convert_all() 的详细信息,请参阅 references/batch.md。
converter = DocumentConverter()
result = converter.convert("https://example.com/document.pdf")
from io import BytesIO
from docling.datamodel.base_models import DocumentStream
with open("document.pdf", "rb") as f:
buf = BytesIO(f.read())
source = DocumentStream(name="document.pdf", stream=buf)
result = converter.convert(source)
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
# 配置 PDF 特定选项
pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = True
pipeline_options.ocr_options.lang = ["en", "es"]
pipeline_options.do_table_structure = True
pipeline_options.generate_page_images = True
converter = DocumentConverter(
format_options={
InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
}
)
converter = DocumentConverter()
# 限制文件大小(字节)和页数
result = converter.convert(
"large_document.pdf",
max_file_size=20_971_520, # 20 MB
max_num_pages=100
)
有关 RAG 集成,请参阅 references/chunking.md。
DoclingDocument 是一个表示已解析内容的 Pydantic 模型:
# 访问文档结构
doc = result.document
# 内容项(列表)
doc.texts # TextItem 实例(段落、标题等)
doc.tables # TableItem 实例
doc.pictures # PictureItem 实例
doc.key_value_items # 键值对
# 结构(树节点)
doc.body # 主要内容层次结构
doc.furniture # 页眉、页脚、页码
doc.groups # 列表、章节、部分
# 按阅读顺序迭代所有元素
for item, level in doc.iterate_items():
print(f"{' ' * level}{item.label}: {item.text[:50]}")
from docling.datamodel.pipeline_options import (
PdfPipelineOptions,
EasyOcrOptions,
TesseractOcrOptions,
TesseractCliOcrOptions,
OcrMacOptions,
RapidOcrOptions
)
# EasyOCR(默认)
pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = True
pipeline_options.ocr_options = EasyOcrOptions(lang=["en", "de"])
# Tesseract
pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = True
pipeline_options.ocr_options = TesseractOcrOptions(lang=["eng", "deu"])
# RapidOCR
pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = True
pipeline_options.ocr_options = RapidOcrOptions()
from docling.datamodel.pipeline_options import (
PdfPipelineOptions,
TableFormerMode
)
pipeline_options = PdfPipelineOptions()
pipeline_options.do_table_structure = True
# 使用单元格匹配(映射到 PDF 单元格)
pipeline_options.table_structure_options.do_cell_matching = True
# 或使用预测的单元格
pipeline_options.table_structure_options.do_cell_matching = False
# 选择准确度模式
pipeline_options.table_structure_options.mode = TableFormerMode.ACCURATE
pipeline_options = PdfPipelineOptions()
pipeline_options.generate_page_images = True # 导出带图像的 HTML 时需要
# 导出嵌入图像
result.document.save_as_html(
"output.html",
image_mode=ImageRefMode.EMBEDDED
)
from docling.datamodel.base_models import ConversionStatus
result = converter.convert("document.pdf")
if result.status == ConversionStatus.SUCCESS:
print("转换成功")
elif result.status == ConversionStatus.PARTIAL_SUCCESS:
print("部分转换成功:")
for error in result.errors:
print(f" {error.error_message}")
else: # FAILURE
print("转换失败:")
for error in result.errors:
print(f" {error.error_message}")
带错误处理的批量处理:
# 出错时继续处理
results = converter.convert_all(
["doc1.pdf", "doc2.pdf", "doc3.pdf"],
raises_on_error=False
)
for result in results:
if result.status == ConversionStatus.SUCCESS:
result.document.save_as_markdown(f"{result.input.file.stem}.md")
else:
print(f"失败:{result.input.file}")
# 基本转换
docling document.pdf
# 转换为特定输出
docling --to markdown document.pdf
# 使用自定义模型路径
docling --artifacts-path /path/to/models document.pdf
# 使用 VLM 流水线
docling --pipeline vlm --vlm-model granite_docling document.pdf
DocumentConverter:主要转换类
ConversionResult:包含文档和状态的转换结果
DoclingDocument:统一的文档表示(Pydantic 模型)
InputFormat:支持的输入格式枚举
ConversionStatus:SUCCESS、PARTIAL_SUCCESS、FAILURE
PdfPipelineOptions:PDF 流水线配置
ImageRefMode:EMBEDDED、REFERENCED、PLACEHOLDER

from docling.document_converter import DocumentConverter
from langchain_text_splitters import MarkdownTextSplitter
converter = DocumentConverter()
result = converter.convert("document.pdf")
markdown = result.document.export_to_markdown()
splitter = MarkdownTextSplitter(chunk_size=1000)
chunks = splitter.split_text(markdown)
from docling.document_converter import DocumentConverter
from docling.chunking import HybridChunker
from llama_index.core import Document
converter = DocumentConverter()
result = converter.convert("document.pdf")
chunker = HybridChunker()
chunks = list(chunker.chunk(result.document))
documents = [
Document(text=chunk.text, metadata=chunk.meta.export_json_dict())
for chunk in chunks
]
每周安装量
244
代码仓库
GitHub 星标
40
首次出现
2026年1月20日
安全审计
安装于
opencode: 216
gemini-cli: 213
codex: 209
github-copilot: 198
cursor: 194
amp: 178
Docling is a document parsing library that converts PDFs, Word documents, PowerPoint, images, and other formats into structured data with advanced layout understanding.
Basic document conversion:
from docling.document_converter import DocumentConverter
source = "https://arxiv.org/pdf/2408.09869" # URL, Path, or BytesIO
converter = DocumentConverter()
result = converter.convert(source)
print(result.document.export_to_markdown())
The main entry point for document conversion. Supports various input formats and conversion options.
from docling.document_converter import DocumentConverter
from docling.datamodel.base_models import InputFormat
from docling.document_converter import PdfFormatOption
from docling.datamodel.pipeline_options import PdfPipelineOptions
# Basic converter (all formats enabled)
converter = DocumentConverter()
# Restricted formats
converter = DocumentConverter(
allowed_formats=[InputFormat.PDF, InputFormat.DOCX]
)
# Custom pipeline options
pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = True
pipeline_options.do_table_structure = True
converter = DocumentConverter(
format_options={
InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
}
)
All conversion operations return a ConversionResult containing:
document: The parsed DoclingDocument
status: ConversionStatus.SUCCESS, PARTIAL_SUCCESS, or FAILURE
errors: List of errors encountered during conversion
input: Information about the source document
result = converter.convert("document.pdf")
if result.status == ConversionStatus.SUCCESS:
    markdown = result.document.export_to_markdown()
    html = result.document.export_to_html()
    data = result.document.export_to_dict()
export_to_markdown() or save_as_markdown()
export_to_html() or save_as_html()
export_to_dict() or save_as_json() (note: no export_to_json() method)
export_to_text() or export_to_markdown(strict_text=True) or save_as_markdown(strict_text=True)
export_to_doctags() or save_as_doctags()

from docling.document_converter import DocumentConverter
converter = DocumentConverter()
result = converter.convert("document.pdf")
# Export to different formats
markdown = result.document.export_to_markdown()
html = result.document.export_to_html()
json_data = result.document.export_to_dict()
# Or save directly to file
result.document.save_as_markdown("output.md")
result.document.save_as_html("output.html")
result.document.save_as_json("output.json")
See references/batch.md for details on convert_all().
converter = DocumentConverter()
result = converter.convert("https://example.com/document.pdf")
from io import BytesIO
from docling.datamodel.base_models import DocumentStream
with open("document.pdf", "rb") as f:
buf = BytesIO(f.read())
source = DocumentStream(name="document.pdf", stream=buf)
result = converter.convert(source)
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
# Configure PDF-specific options
pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = True
pipeline_options.ocr_options.lang = ["en", "es"]
pipeline_options.do_table_structure = True
pipeline_options.generate_page_images = True
converter = DocumentConverter(
format_options={
InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
}
)
converter = DocumentConverter()
# Limit file size (bytes) and page count
result = converter.convert(
"large_document.pdf",
max_file_size=20_971_520, # 20 MB
max_num_pages=100
)
See references/chunking.md for RAG integration.
The DoclingDocument is a Pydantic model representing parsed content:
# Access document structure
doc = result.document
# Content items (lists)
doc.texts # TextItem instances (paragraphs, headings, etc.)
doc.tables # TableItem instances
doc.pictures # PictureItem instances
doc.key_value_items # Key-value pairs
# Structure (tree nodes)
doc.body # Main content hierarchy
doc.furniture # Headers, footers, page numbers
doc.groups # Lists, chapters, sections
# Iterate all elements in reading order
for item, level in doc.iterate_items():
print(f"{' ' * level}{item.label}: {item.text[:50]}")
from docling.datamodel.pipeline_options import (
PdfPipelineOptions,
EasyOcrOptions,
TesseractOcrOptions,
TesseractCliOcrOptions,
OcrMacOptions,
RapidOcrOptions
)
# EasyOCR (default)
pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = True
pipeline_options.ocr_options = EasyOcrOptions(lang=["en", "de"])
# Tesseract
pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = True
pipeline_options.ocr_options = TesseractOcrOptions(lang=["eng", "deu"])
# RapidOCR
pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = True
pipeline_options.ocr_options = RapidOcrOptions()
from docling.datamodel.pipeline_options import (
PdfPipelineOptions,
TableFormerMode
)
pipeline_options = PdfPipelineOptions()
pipeline_options.do_table_structure = True
# Use cell matching (map to PDF cells)
pipeline_options.table_structure_options.do_cell_matching = True
# Or use predicted cells
pipeline_options.table_structure_options.do_cell_matching = False
# Choose accuracy mode
pipeline_options.table_structure_options.mode = TableFormerMode.ACCURATE
pipeline_options = PdfPipelineOptions()
pipeline_options.generate_page_images = True # Needed for HTML export with images
# Export with embedded images
result.document.save_as_html(
"output.html",
image_mode=ImageRefMode.EMBEDDED
)
from docling.datamodel.base_models import ConversionStatus
result = converter.convert("document.pdf")
if result.status == ConversionStatus.SUCCESS:
print("Conversion successful")
elif result.status == ConversionStatus.PARTIAL_SUCCESS:
print("Partial conversion:")
for error in result.errors:
print(f" {error.error_message}")
else: # FAILURE
print("Conversion failed:")
for error in result.errors:
print(f" {error.error_message}")
For batch processing with error handling:
# Continue processing on errors
results = converter.convert_all(
["doc1.pdf", "doc2.pdf", "doc3.pdf"],
raises_on_error=False
)
for result in results:
if result.status == ConversionStatus.SUCCESS:
result.document.save_as_markdown(f"{result.input.file.stem}.md")
else:
print(f"Failed: {result.input.file}")
# Basic conversion
docling document.pdf
# Convert to specific output
docling --to markdown document.pdf
# With custom model path
docling --artifacts-path /path/to/models document.pdf
# Using VLM pipeline
docling --pipeline vlm --vlm-model granite_docling document.pdf
DocumentConverter: Main conversion class
ConversionResult: Result of conversion with document and status
DoclingDocument: Unified document representation (Pydantic model)
InputFormat: Enum of supported input formats
ConversionStatus: SUCCESS, PARTIAL_SUCCESS, FAILURE
PdfPipelineOptions: Configuration for PDF pipeline
ImageRefMode: EMBEDDED, REFERENCED, PLACEHOLDER

from docling.document_converter import DocumentConverter
from langchain_text_splitters import MarkdownTextSplitter
converter = DocumentConverter()
result = converter.convert("document.pdf")
markdown = result.document.export_to_markdown()
splitter = MarkdownTextSplitter(chunk_size=1000)
chunks = splitter.split_text(markdown)
from docling.document_converter import DocumentConverter
from docling.chunking import HybridChunker
from llama_index.core import Document
converter = DocumentConverter()
result = converter.convert("document.pdf")
chunker = HybridChunker()
chunks = list(chunker.chunk(result.document))
documents = [
Document(text=chunk.text, metadata=chunk.meta.export_json_dict())
for chunk in chunks
]
Weekly Installs
244
Repository
GitHub Stars
40
First Seen
Jan 20, 2026
Security Audits
Gen Agent Trust Hub: Pass · Socket: Pass · Snyk: Warn
Installed on
opencode: 216
gemini-cli: 213
codex: 209
github-copilot: 198
cursor: 194
amp: 178
iOS 26 FoundationModels 设备端大模型集成指南:隐私优先的AI开发
1,100 周安装
Web Browser Skill:极简CDP工具,自动化网页导航、JS执行与Cookie处理
240 周安装
Claude Code 上下文压缩恢复工具 - 自动恢复工作状态,加载知识库,总结工作进度
241 周安装
阿里云AI语音实时TTS测试技能:最小可行性验证与兼容性探测指南
241 周安装
阿里云DNS CLI测试指南:alicloud-network-dns-cli-test 安装与使用教程
241 周安装
阿里云身份核验Cloudauth API使用指南:SDK集成与OpenAPI调用教程
241 周安装
阿里云SLS日志查询测试:配置指南与性能验证 | 云监控技能
241 周安装
export_to_doctags() or save_as_doctags()