pdf-to-docx by claude-office-skills/skills
npx skills add https://github.com/claude-office-skills/skills --skill pdf-to-docx
此技能支持使用 pdf2docx 将 PDF 转换为可编辑的 Word 文档。pdf2docx 是一个 Python 库,能够保留布局、表格、图像和文本格式。与基于 OCR 的解决方案不同,pdf2docx 提取的是 PDF 的原始内容,以实现精确转换。
示例提示:
from pdf2docx import Converter
# Basic conversion: open the PDF, write the DOCX, then close the converter.
cv = Converter('input.pdf')
cv.convert('output.docx')
cv.close()

# Or use a context manager so the converter is closed on exit.
# NOTE(review): this requires Converter to implement the context-manager
# protocol — confirm for the installed pdf2docx version.
with Converter('input.pdf') as cv:
    cv.convert('output.docx')
from pdf2docx import Converter
cv = Converter('input.pdf')
# The calls below are alternatives; each one targets the same 'output.docx'.
# Whole document
cv.convert('output.docx')
# A page range selected with start/end (0-indexed)
cv.convert('output.docx', start=0, end=5)
# A single page
cv.convert('output.docx', pages=[0])
# Several specific pages
cv.convert('output.docx', pages=[0, 2, 4])
cv.close()
广告位招租
在这里展示您的产品或服务
触达数万 AI 开发者,精准高效
from pdf2docx import Converter
cv = Converter('input.pdf')
# Single convert() call with every option spelled out explicitly.
# NOTE(review): confirm each keyword (in particular `password`) is accepted by
# convert() in the installed pdf2docx version.
cv.convert(
'output.docx',
start=0, # Start page (0-indexed)
end=None, # End page (None = last page)
pages=None, # Specific pages list
password=None, # PDF password (if encrypted)
min_section_height=20.0, # Minimum height of a section
connected_border_tolerance=0.5, # Border detection tolerance
line_overlap_threshold=0.9, # Line merging threshold
line_break_width_ratio=0.5, # Line break detection
line_break_free_space_ratio=0.1,
line_separate_threshold=5, # Vertical line separation
new_paragraph_free_space_ratio=0.85,
float_image_ignorable_gap=5,
page_margin_factor_top=0.5,
page_margin_factor_bottom=0.5,
)
cv.close()
# Works best with native PDFs
cv = Converter('native_pdf.pdf')
cv.convert('output.docx')
cv.close()

# For scanned PDFs, run OCR first:
# pdf2docx works best with native text PDFs, so consider using
# pytesseract or PaddleOCR beforehand.
import pytesseract
from pdf2image import convert_from_path

# Convert PDF pages to images
images = convert_from_path('scanned.pdf')
# OCR each page and concatenate the recognised text in page order
text = ''.join(pytesseract.image_to_string(img) for img in images)
# Then create a Word document from the text
from pdf2docx import Converter
import os
def pdf_to_word(pdf_path, output_path=None, pages=None):
    """Convert a PDF file to a Word document.

    Args:
        pdf_path: Path of the source PDF.
        output_path: Destination ``.docx`` path. When ``None`` it is derived
            from ``pdf_path`` by swapping the final extension for ``.docx``.
        pages: Optional list of page indexes forwarded to ``convert``; a
            falsy value results in a plain ``convert(output_path)`` call.

    Returns:
        The path the document was written to.
    """
    if output_path is None:
        # splitext only touches the final extension: '.pdf' inside a directory
        # name is left alone, and '.PDF' or a missing extension still yields a
        # path that differs from the input (str.replace would have returned
        # the input path unchanged in those cases).
        output_path = os.path.splitext(pdf_path)[0] + '.docx'

    cv = Converter(pdf_path)
    try:
        if pages:
            cv.convert(output_path, pages=pages)
        else:
            cv.convert(output_path)
    finally:
        # Close the converter even when the conversion fails.
        cv.close()
    return output_path
# 用法
result = pdf_to_word('document.pdf')
print(f"Created: {result}")
from pdf2docx import Converter
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor
def convert_single(pdf_path, output_dir):
    """Convert one PDF to a Word document placed in ``output_dir``.

    Args:
        pdf_path: ``pathlib.Path`` of the source PDF.
        output_dir: ``pathlib.Path`` of the directory receiving the ``.docx``.

    Returns:
        ``"Success: <name>"`` on success, otherwise ``"Error: <name> - <exc>"``.
        Failures are reported through the return value rather than raised, so
        one bad file does not abort a batch.
    """
    output_path = output_dir / pdf_path.with_suffix('.docx').name
    try:
        cv = Converter(str(pdf_path))
        try:
            cv.convert(str(output_path))
        finally:
            # Close the converter even when convert() raises.
            cv.close()
    except Exception as e:
        return f"Error: {pdf_path.name} - {e}"
    return f"Success: {pdf_path.name}"
def batch_convert(input_dir, output_dir, max_workers=4):
    """Convert every ``*.pdf`` directly inside ``input_dir`` to Word.

    Each file is handed to ``convert_single`` on a thread pool and the
    resulting status strings are printed in submission order.

    Args:
        input_dir: Directory searched (non-recursively) for ``*.pdf`` files.
        output_dir: Directory receiving the ``.docx`` files; created if needed.
        max_workers: Size of the thread pool.
    """
    input_path = Path(input_dir)
    output_path = Path(output_dir)
    # parents=True so a nested output directory whose parent does not exist
    # yet is created instead of raising FileNotFoundError.
    output_path.mkdir(parents=True, exist_ok=True)
    pdf_files = list(input_path.glob('*.pdf'))
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [
            executor.submit(convert_single, pdf, output_path)
            for pdf in pdf_files
        ]
        for future in futures:
            print(future.result())
batch_convert('./pdfs', './word_docs')
from pdf2docx import Converter
def analyze_pdf(pdf_path):
    """Print a per-page structural summary of a PDF before converting it.

    For every page in ``cv.pages`` the page number, size and block count are
    printed, followed by one line per block: a 50-character preview for
    blocks exposing ``text``, or a marker for blocks exposing ``image``.

    Args:
        pdf_path: Path of the PDF to inspect.
    """
    cv = Converter(pdf_path)
    try:
        # NOTE(review): pages are read straight after construction, without a
        # parse/convert call — confirm the installed pdf2docx populates
        # ``cv.pages`` and ``page.blocks`` at this point.
        for i, page in enumerate(cv.pages):
            print(f"Page {i+1}:")
            print(f" Size: {page.width} x {page.height}")
            print(f" Blocks: {len(page.blocks)}")
            for block in page.blocks:
                if hasattr(block, 'text'):
                    print(f" Text block: {block.text[:50]}...")
                elif hasattr(block, 'image'):
                    print(" Image block")
    finally:
        # Close the converter even if inspection fails part-way.
        cv.close()
analyze_pdf('document.pdf')
from pdf2docx import Converter
def convert_with_progress(pdf_path, output_path):
    """Convert a PDF to Word and report progress on stdout.

    Args:
        pdf_path: Path of the source PDF.
        output_path: Destination ``.docx`` path.
    """
    cv = Converter(pdf_path)
    try:
        # NOTE(review): confirm ``cv.pages`` is populated before conversion in
        # the installed pdf2docx version; the count is only used for display.
        total_pages = len(cv.pages)
        print(f"Converting {total_pages} pages...")
        # Convert the whole document in one call. Converting page by page
        # into the same output_path rewrites that file on every iteration,
        # leaving only the last page in the result.
        cv.convert(output_path)
        print(f"Progress: {100.0:.1f}%")
    finally:
        # Close the converter even when the conversion fails.
        cv.close()
    print("Conversion complete!")
from pdf2docx import Converter
from docx import Document
def extract_tables_to_word(pdf_path, output_path):
    """Copy only the tables of a PDF into a new Word document.

    The PDF is first converted in full to a temporary ``.docx``; every table
    in that document's ``tables`` collection is then copied, as plain cell
    text, into a fresh document saved at ``output_path``.

    Args:
        pdf_path: Path of the source PDF.
        output_path: Destination ``.docx`` path for the tables-only document.
    """
    # Function-scope imports keep this block self-contained.
    import os
    import tempfile

    # A private temporary directory avoids clobbering a fixed
    # 'temp_full.docx' in the working directory (or colliding with a
    # concurrent call) and is removed even when a later step fails.
    with tempfile.TemporaryDirectory() as tmp_dir:
        temp_path = os.path.join(tmp_dir, 'temp_full.docx')

        # First do the full conversion.
        cv = Converter(pdf_path)
        try:
            cv.convert(temp_path)
        finally:
            cv.close()

        # Open the intermediate document and copy its tables.
        doc = Document(temp_path)
        new_doc = Document()
        for table in doc.tables:
            new_table = new_doc.add_table(rows=0, cols=len(table.columns))
            for row in table.rows:
                new_row = new_table.add_row()
                for i, cell in enumerate(row.cells):
                    new_row.cells[i].text = cell.text
            new_doc.add_paragraph()  # spacing between tables
        new_doc.save(output_path)
from pdf2docx import Converter
import os
def convert_contract(pdf_path):
    """Convert a contract PDF to an editable Word document.

    The output is written alongside the input as ``<base>_editable.docx``.
    The page count, output path and output size (KB) are printed.

    Args:
        pdf_path: Path of the source PDF.

    Returns:
        The path of the generated ``.docx`` file.
    """
    base_name = os.path.splitext(pdf_path)[0]
    output_path = f"{base_name}_editable.docx"

    cv = Converter(pdf_path)
    try:
        # NOTE(review): confirm ``cv.pages`` is populated before conversion in
        # the installed pdf2docx version; the count is only used for display.
        page_count = len(cv.pages)
        print(f"Processing {page_count} pages...")
        # Convert all pages.
        cv.convert(output_path)
    finally:
        # Close the converter even when the conversion fails.
        cv.close()

    print(f"Created: {output_path}")
    print(f"File size: {os.path.getsize(output_path) / 1024:.1f} KB")
    return output_path
# 用法
result = convert_contract('contract.pdf')
from pdf2docx import Converter
def convert_selected_pages(pdf_path, page_ranges, output_path):
    """Convert specific page ranges to Word.

    Args:
        pdf_path: Path of the source PDF.
        page_ranges: List of 1-based, inclusive ``(start, end)`` tuples, e.g.
            ``[(1, 3), (5, 7)]`` for pages 1-3 and 5-7.
        output_path: Destination ``.docx`` path.

    Returns:
        ``output_path``.

    Raises:
        ValueError: If a range starts below page 1 or ends before it starts.
    """
    # Build the 0-based index list first so that bad input is rejected before
    # the PDF is opened.
    all_pages = []
    for start, end in page_ranges:
        if start < 1 or end < start:
            # start < 1 would yield negative indexes and end < start an empty
            # range; both used to pass silently.
            raise ValueError(f"Invalid page range: ({start}, {end})")
        all_pages.extend(range(start - 1, end))  # convert to 0-indexed

    cv = Converter(pdf_path)
    try:
        cv.convert(output_path, pages=all_pages)
    finally:
        # Close the converter even when the conversion fails.
        cv.close()
    print(f"Converted pages: {page_ranges}")
    return output_path
# 转换第 1-5 页和第 10-15 页
convert_selected_pages(
'long_document.pdf',
[(1, 5), (10, 15)],
'selected_pages.docx'
)
from pdf2docx import Converter
from docx import Document
def _apply_placeholders(paragraph, replacements):
    """Apply each applicable ``old -> new`` substitution to ``paragraph.text``."""
    original = paragraph.text
    updated = original
    for old, new in replacements.items():
        # 'Date:' is still contained in 'Date: [DATE]', so a paragraph that is
        # visited twice would otherwise get the placeholder appended again.
        if old in updated and new not in updated:
            updated = updated.replace(old, new)
    if updated != original:
        # NOTE(review): assigning paragraph.text is understood to collapse the
        # paragraph into a single run, dropping run-level formatting — confirm
        # this is acceptable for templates.
        paragraph.text = updated


def pdf_to_template(pdf_path, output_path):
    """Convert a PDF report to a Word template with placeholders.

    The PDF is converted to ``output_path``; known field labels in body
    paragraphs and table cells are then replaced by placeholder text and the
    document is saved back to the same path.

    Args:
        pdf_path: Path of the source PDF.
        output_path: Destination ``.docx`` path of the template.
    """
    # Convert PDF to Word.
    cv = Converter(pdf_path)
    try:
        cv.convert(output_path)
    finally:
        # Close the converter even when the conversion fails.
        cv.close()

    # Re-open the result and insert the placeholder fields.
    doc = Document(output_path)
    replacements = {
        'Company Name': '[COMPANY_NAME]',
        'Date:': 'Date: [DATE]',
        'Prepared by:': 'Prepared by: [AUTHOR]',
    }

    for para in doc.paragraphs:
        _apply_placeholders(para, replacements)

    # Tables are handled paragraph by paragraph so multi-paragraph cells keep
    # their structure instead of being flattened by a cell.text assignment.
    # NOTE(review): row.cells is understood to yield a merged cell once per
    # spanned column; the guard in the helper keeps repeat visits harmless.
    for table in doc.tables:
        for row in table.rows:
            for cell in row.cells:
                for para in cell.paragraphs:
                    _apply_placeholders(para, replacements)

    doc.save(output_path)
    print(f"Template created: {output_path}")
pdf_to_template('annual_report.pdf', 'report_template.docx')
from pdf2docx import Converter
from pathlib import Path
import json
def process_invoices(input_folder, output_folder):
    """Convert PDF invoices to editable Word documents.

    Every ``*.pdf`` directly inside ``input_folder`` is converted into
    ``output_folder``. A per-file result list is written to
    ``conversion_log.json`` in the output folder and a summary is printed.

    Args:
        input_folder: Directory searched (non-recursively) for ``*.pdf`` files.
        output_folder: Directory receiving the ``.docx`` files and the log;
            created if needed.

    Returns:
        A list of dicts, one per input file, with ``file`` and ``status``
        keys plus ``output`` (on success) or ``error`` (on failure).
    """
    input_path = Path(input_folder)
    output_path = Path(output_folder)
    # parents=True so a nested output folder is created rather than failing.
    output_path.mkdir(parents=True, exist_ok=True)

    results = []
    for pdf_file in input_path.glob('*.pdf'):
        output_file = output_path / pdf_file.with_suffix('.docx').name
        try:
            cv = Converter(str(pdf_file))
            try:
                cv.convert(str(output_file))
            finally:
                # Close the converter even when convert() raises.
                cv.close()
        except Exception as e:
            # Best-effort batch: record the failure and move on.
            results.append({
                'file': pdf_file.name,
                'status': 'error',
                'error': str(e)
            })
        else:
            results.append({
                'file': pdf_file.name,
                'status': 'success',
                'output': str(output_file)
            })

    # Save the results log.
    with open(output_path / 'conversion_log.json', 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=2)

    # Summary.
    success = sum(1 for r in results if r['status'] == 'success')
    print(f"Converted {success}/{len(results)} files")
    return results
results = process_invoices('./invoices_pdf', './invoices_word')
pip install pdf2docx
# 用于图像处理
pip install Pillow
每周安装量
36
仓库
GitHub 星标数
5
首次出现
6 天前
安全审计
安装于
claude-code: 28
gemini-cli: 18
github-copilot: 18
codex: 18
amp: 18
cline: 18
This skill enables conversion from PDF to editable Word documents using pdf2docx - a Python library that preserves layout, tables, images, and text formatting. Unlike OCR-based solutions, pdf2docx extracts native PDF content for accurate conversion.
Example prompts:
from pdf2docx import Converter
# Basic conversion: open the PDF, write the DOCX, then close the converter.
cv = Converter('input.pdf')
cv.convert('output.docx')
cv.close()

# Or use a context manager so the converter is closed on exit.
# NOTE(review): this requires Converter to implement the context-manager
# protocol — confirm for the installed pdf2docx version.
with Converter('input.pdf') as cv:
    cv.convert('output.docx')
from pdf2docx import Converter
cv = Converter('input.pdf')
# The calls below are alternatives; each one targets the same 'output.docx'.
# Whole document
cv.convert('output.docx')
# A page range selected with start/end (0-indexed)
cv.convert('output.docx', start=0, end=5)
# A single page
cv.convert('output.docx', pages=[0])
# Several specific pages
cv.convert('output.docx', pages=[0, 2, 4])
cv.close()
from pdf2docx import Converter
cv = Converter('input.pdf')
# Single convert() call with every option spelled out explicitly.
# NOTE(review): confirm each keyword (in particular `password`) is accepted by
# convert() in the installed pdf2docx version.
cv.convert(
'output.docx',
start=0, # Start page (0-indexed)
end=None, # End page (None = last page)
pages=None, # Specific pages list
password=None, # PDF password (if encrypted)
min_section_height=20.0, # Minimum height of a section
connected_border_tolerance=0.5, # Border detection tolerance
line_overlap_threshold=0.9, # Line merging threshold
line_break_width_ratio=0.5, # Line break detection
line_break_free_space_ratio=0.1,
line_separate_threshold=5, # Vertical line separation
new_paragraph_free_space_ratio=0.85,
float_image_ignorable_gap=5,
page_margin_factor_top=0.5,
page_margin_factor_bottom=0.5,
)
cv.close()
# Works best with native PDFs
cv = Converter('native_pdf.pdf')
cv.convert('output.docx')
cv.close()

# For scanned PDFs, run OCR first:
# pdf2docx works best with native text PDFs, so consider using
# pytesseract or PaddleOCR beforehand.
import pytesseract
from pdf2image import convert_from_path

# Convert PDF pages to images
images = convert_from_path('scanned.pdf')
# OCR each page and concatenate the recognised text in page order
text = ''.join(pytesseract.image_to_string(img) for img in images)
# Then create a Word document from the text
from pdf2docx import Converter
import os
def pdf_to_word(pdf_path, output_path=None, pages=None):
    """Convert a PDF file to a Word document.

    Args:
        pdf_path: Path of the source PDF.
        output_path: Destination ``.docx`` path. When ``None`` it is derived
            from ``pdf_path`` by swapping the final extension for ``.docx``.
        pages: Optional list of page indexes forwarded to ``convert``; a
            falsy value results in a plain ``convert(output_path)`` call.

    Returns:
        The path the document was written to.
    """
    if output_path is None:
        # splitext only touches the final extension: '.pdf' inside a directory
        # name is left alone, and '.PDF' or a missing extension still yields a
        # path that differs from the input (str.replace would have returned
        # the input path unchanged in those cases).
        output_path = os.path.splitext(pdf_path)[0] + '.docx'

    cv = Converter(pdf_path)
    try:
        if pages:
            cv.convert(output_path, pages=pages)
        else:
            cv.convert(output_path)
    finally:
        # Close the converter even when the conversion fails.
        cv.close()
    return output_path
# Usage
result = pdf_to_word('document.pdf')
print(f"Created: {result}")
from pdf2docx import Converter
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor
def convert_single(pdf_path, output_dir):
    """Convert one PDF to a Word document placed in ``output_dir``.

    Args:
        pdf_path: ``pathlib.Path`` of the source PDF.
        output_dir: ``pathlib.Path`` of the directory receiving the ``.docx``.

    Returns:
        ``"Success: <name>"`` on success, otherwise ``"Error: <name> - <exc>"``.
        Failures are reported through the return value rather than raised, so
        one bad file does not abort a batch.
    """
    output_path = output_dir / pdf_path.with_suffix('.docx').name
    try:
        cv = Converter(str(pdf_path))
        try:
            cv.convert(str(output_path))
        finally:
            # Close the converter even when convert() raises.
            cv.close()
    except Exception as e:
        return f"Error: {pdf_path.name} - {e}"
    return f"Success: {pdf_path.name}"
def batch_convert(input_dir, output_dir, max_workers=4):
    """Convert every ``*.pdf`` directly inside ``input_dir`` to Word.

    Each file is handed to ``convert_single`` on a thread pool and the
    resulting status strings are printed in submission order.

    Args:
        input_dir: Directory searched (non-recursively) for ``*.pdf`` files.
        output_dir: Directory receiving the ``.docx`` files; created if needed.
        max_workers: Size of the thread pool.
    """
    input_path = Path(input_dir)
    output_path = Path(output_dir)
    # parents=True so a nested output directory whose parent does not exist
    # yet is created instead of raising FileNotFoundError.
    output_path.mkdir(parents=True, exist_ok=True)
    pdf_files = list(input_path.glob('*.pdf'))
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [
            executor.submit(convert_single, pdf, output_path)
            for pdf in pdf_files
        ]
        for future in futures:
            print(future.result())
batch_convert('./pdfs', './word_docs')
from pdf2docx import Converter
def analyze_pdf(pdf_path):
    """Print a per-page structural summary of a PDF before converting it.

    For every page in ``cv.pages`` the page number, size and block count are
    printed, followed by one line per block: a 50-character preview for
    blocks exposing ``text``, or a marker for blocks exposing ``image``.

    Args:
        pdf_path: Path of the PDF to inspect.
    """
    cv = Converter(pdf_path)
    try:
        # NOTE(review): pages are read straight after construction, without a
        # parse/convert call — confirm the installed pdf2docx populates
        # ``cv.pages`` and ``page.blocks`` at this point.
        for i, page in enumerate(cv.pages):
            print(f"Page {i+1}:")
            print(f" Size: {page.width} x {page.height}")
            print(f" Blocks: {len(page.blocks)}")
            for block in page.blocks:
                if hasattr(block, 'text'):
                    print(f" Text block: {block.text[:50]}...")
                elif hasattr(block, 'image'):
                    print(" Image block")
    finally:
        # Close the converter even if inspection fails part-way.
        cv.close()
analyze_pdf('document.pdf')
from pdf2docx import Converter
def convert_with_progress(pdf_path, output_path):
    """Convert a PDF to Word and report progress on stdout.

    Args:
        pdf_path: Path of the source PDF.
        output_path: Destination ``.docx`` path.
    """
    cv = Converter(pdf_path)
    try:
        # NOTE(review): confirm ``cv.pages`` is populated before conversion in
        # the installed pdf2docx version; the count is only used for display.
        total_pages = len(cv.pages)
        print(f"Converting {total_pages} pages...")
        # Convert the whole document in one call. Converting page by page
        # into the same output_path rewrites that file on every iteration,
        # leaving only the last page in the result.
        cv.convert(output_path)
        print(f"Progress: {100.0:.1f}%")
    finally:
        # Close the converter even when the conversion fails.
        cv.close()
    print("Conversion complete!")
from pdf2docx import Converter
from docx import Document
def extract_tables_to_word(pdf_path, output_path):
    """Copy only the tables of a PDF into a new Word document.

    The PDF is first converted in full to a temporary ``.docx``; every table
    in that document's ``tables`` collection is then copied, as plain cell
    text, into a fresh document saved at ``output_path``.

    Args:
        pdf_path: Path of the source PDF.
        output_path: Destination ``.docx`` path for the tables-only document.
    """
    # Function-scope imports keep this block self-contained.
    import os
    import tempfile

    # A private temporary directory avoids clobbering a fixed
    # 'temp_full.docx' in the working directory (or colliding with a
    # concurrent call) and is removed even when a later step fails.
    with tempfile.TemporaryDirectory() as tmp_dir:
        temp_path = os.path.join(tmp_dir, 'temp_full.docx')

        # First do the full conversion.
        cv = Converter(pdf_path)
        try:
            cv.convert(temp_path)
        finally:
            cv.close()

        # Open the intermediate document and copy its tables.
        doc = Document(temp_path)
        new_doc = Document()
        for table in doc.tables:
            new_table = new_doc.add_table(rows=0, cols=len(table.columns))
            for row in table.rows:
                new_row = new_table.add_row()
                for i, cell in enumerate(row.cells):
                    new_row.cells[i].text = cell.text
            new_doc.add_paragraph()  # spacing between tables
        new_doc.save(output_path)
from pdf2docx import Converter
import os
def convert_contract(pdf_path):
    """Convert a contract PDF to an editable Word document.

    The output is written alongside the input as ``<base>_editable.docx``.
    The page count, output path and output size (KB) are printed.

    Args:
        pdf_path: Path of the source PDF.

    Returns:
        The path of the generated ``.docx`` file.
    """
    base_name = os.path.splitext(pdf_path)[0]
    output_path = f"{base_name}_editable.docx"

    cv = Converter(pdf_path)
    try:
        # NOTE(review): confirm ``cv.pages`` is populated before conversion in
        # the installed pdf2docx version; the count is only used for display.
        page_count = len(cv.pages)
        print(f"Processing {page_count} pages...")
        # Convert all pages.
        cv.convert(output_path)
    finally:
        # Close the converter even when the conversion fails.
        cv.close()

    print(f"Created: {output_path}")
    print(f"File size: {os.path.getsize(output_path) / 1024:.1f} KB")
    return output_path
# Usage
result = convert_contract('contract.pdf')
from pdf2docx import Converter
def convert_selected_pages(pdf_path, page_ranges, output_path):
    """Convert specific page ranges to Word.

    Args:
        pdf_path: Path of the source PDF.
        page_ranges: List of 1-based, inclusive ``(start, end)`` tuples, e.g.
            ``[(1, 3), (5, 7)]`` for pages 1-3 and 5-7.
        output_path: Destination ``.docx`` path.

    Returns:
        ``output_path``.

    Raises:
        ValueError: If a range starts below page 1 or ends before it starts.
    """
    # Build the 0-based index list first so that bad input is rejected before
    # the PDF is opened.
    all_pages = []
    for start, end in page_ranges:
        if start < 1 or end < start:
            # start < 1 would yield negative indexes and end < start an empty
            # range; both used to pass silently.
            raise ValueError(f"Invalid page range: ({start}, {end})")
        all_pages.extend(range(start - 1, end))  # convert to 0-indexed

    cv = Converter(pdf_path)
    try:
        cv.convert(output_path, pages=all_pages)
    finally:
        # Close the converter even when the conversion fails.
        cv.close()
    print(f"Converted pages: {page_ranges}")
    return output_path
# Convert pages 1-5 and 10-15
convert_selected_pages(
'long_document.pdf',
[(1, 5), (10, 15)],
'selected_pages.docx'
)
from pdf2docx import Converter
from docx import Document
def _apply_placeholders(paragraph, replacements):
    """Apply each applicable ``old -> new`` substitution to ``paragraph.text``."""
    original = paragraph.text
    updated = original
    for old, new in replacements.items():
        # 'Date:' is still contained in 'Date: [DATE]', so a paragraph that is
        # visited twice would otherwise get the placeholder appended again.
        if old in updated and new not in updated:
            updated = updated.replace(old, new)
    if updated != original:
        # NOTE(review): assigning paragraph.text is understood to collapse the
        # paragraph into a single run, dropping run-level formatting — confirm
        # this is acceptable for templates.
        paragraph.text = updated


def pdf_to_template(pdf_path, output_path):
    """Convert a PDF report to a Word template with placeholders.

    The PDF is converted to ``output_path``; known field labels in body
    paragraphs and table cells are then replaced by placeholder text and the
    document is saved back to the same path.

    Args:
        pdf_path: Path of the source PDF.
        output_path: Destination ``.docx`` path of the template.
    """
    # Convert PDF to Word.
    cv = Converter(pdf_path)
    try:
        cv.convert(output_path)
    finally:
        # Close the converter even when the conversion fails.
        cv.close()

    # Re-open the result and insert the placeholder fields.
    doc = Document(output_path)
    replacements = {
        'Company Name': '[COMPANY_NAME]',
        'Date:': 'Date: [DATE]',
        'Prepared by:': 'Prepared by: [AUTHOR]',
    }

    for para in doc.paragraphs:
        _apply_placeholders(para, replacements)

    # Tables are handled paragraph by paragraph so multi-paragraph cells keep
    # their structure instead of being flattened by a cell.text assignment.
    # NOTE(review): row.cells is understood to yield a merged cell once per
    # spanned column; the guard in the helper keeps repeat visits harmless.
    for table in doc.tables:
        for row in table.rows:
            for cell in row.cells:
                for para in cell.paragraphs:
                    _apply_placeholders(para, replacements)

    doc.save(output_path)
    print(f"Template created: {output_path}")
pdf_to_template('annual_report.pdf', 'report_template.docx')
from pdf2docx import Converter
from pathlib import Path
import json
def process_invoices(input_folder, output_folder):
    """Convert PDF invoices to editable Word documents.

    Every ``*.pdf`` directly inside ``input_folder`` is converted into
    ``output_folder``. A per-file result list is written to
    ``conversion_log.json`` in the output folder and a summary is printed.

    Args:
        input_folder: Directory searched (non-recursively) for ``*.pdf`` files.
        output_folder: Directory receiving the ``.docx`` files and the log;
            created if needed.

    Returns:
        A list of dicts, one per input file, with ``file`` and ``status``
        keys plus ``output`` (on success) or ``error`` (on failure).
    """
    input_path = Path(input_folder)
    output_path = Path(output_folder)
    # parents=True so a nested output folder is created rather than failing.
    output_path.mkdir(parents=True, exist_ok=True)

    results = []
    for pdf_file in input_path.glob('*.pdf'):
        output_file = output_path / pdf_file.with_suffix('.docx').name
        try:
            cv = Converter(str(pdf_file))
            try:
                cv.convert(str(output_file))
            finally:
                # Close the converter even when convert() raises.
                cv.close()
        except Exception as e:
            # Best-effort batch: record the failure and move on.
            results.append({
                'file': pdf_file.name,
                'status': 'error',
                'error': str(e)
            })
        else:
            results.append({
                'file': pdf_file.name,
                'status': 'success',
                'output': str(output_file)
            })

    # Save the results log.
    with open(output_path / 'conversion_log.json', 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=2)

    # Summary.
    success = sum(1 for r in results if r['status'] == 'success')
    print(f"Converted {success}/{len(results)} files")
    return results
results = process_invoices('./invoices_pdf', './invoices_word')
pip install pdf2docx
# For image handling
pip install Pillow
Weekly Installs
36
Repository
GitHub Stars
5
First Seen
6 days ago
Security Audits
Gen Agent Trust Hub: Pass · Socket: Pass · Snyk: Pass
Installed on
claude-code: 28
gemini-cli: 18
github-copilot: 18
codex: 18
amp: 18
cline: 18
xdrop 文件传输脚本:Bun 环境下安全上传下载工具,支持加密分享
20,700 周安装