PDF转Word文档工具 - 使用pdf2docx库实现高精度转换，保留布局和格式

pdf-to-docx by claude-office-skills/skills

382 周安装量

26 GitHub Stars

GitHub

安装命令

npx skills add https://github.com/claude-office-skills/skills --skill pdf-to-docx

办公软件 Python Web框架自动化

🇨🇳中文介绍

PDF 转 Word 技能

概述

此技能支持使用 pdf2docx 将 PDF 转换为可编辑的 Word 文档。pdf2docx 是一个 Python 库，能够保留布局、表格、图像和文本格式。与基于 OCR 的解决方案不同，pdf2docx 提取的是 PDF 的原始内容，以实现精确转换。

使用方法

提供您想要转换的 PDF 文件
可选地指定页面或转换选项
我将把它转换为可编辑的 Word 文档

示例提示：

"将此 PDF 报告转换为可编辑的 Word 文档"
"将此 PDF 的第 1-5 页转换为 Word 格式"
"将此扫描文档提取为可编辑文本"
"将此 PDF 合同转换为 Word 以便编辑"

领域知识

pdf2docx 基础

from contextlib import closing

from pdf2docx import Converter

# Basic conversion: close the converter even if convert() raises
cv = Converter('input.pdf')
try:
    cv.convert('output.docx')
finally:
    cv.close()

# Or scope the converter with a `with` block. contextlib.closing() only
# relies on Converter.close(), so it works whether or not Converter
# implements the context-manager protocol itself.
with closing(Converter('input.pdf')) as cv:
    cv.convert('output.docx')

转换选项

from pdf2docx import Converter

cv = Converter('input.pdf')

# Whole document
cv.convert('output.docx')

# Page range selected with start/end (0-based indices)
# NOTE(review): `end` looks exclusive, i.e. this would cover pages 0-4 — confirm against the pdf2docx docs
cv.convert('output.docx', start=0, end=5)

# Single page
cv.convert('output.docx', pages=[0])

# Several specific pages
cv.convert('output.docx', pages=[0, 2, 4])

# Every call above writes to the same 'output.docx'; use distinct names to keep each result
cv.close()

广告位招租

在这里展示您的产品或服务

触达数万 AI 开发者，精准高效

联系我们

处理不同类型的 PDF

原生 PDF（基于文本）

# Native (text-based) PDFs give the best results
cv = Converter('native_pdf.pdf')
cv.convert('output.docx')
cv.close()

扫描的 PDF（基于图像）

# For scanned PDFs, run OCR first:
# pdf2docx works best with native text PDFs.
# Consider using pytesseract or PaddleOCR beforehand.

import pytesseract
from pdf2image import convert_from_path

# Render the PDF pages as images
images = convert_from_path('scanned.pdf')

# OCR every page; str.join builds the result in one pass instead of
# re-allocating the string on each `+=`
text = ''.join(pytesseract.image_to_string(img) for img in images)

# Then create the Word document from `text`

from pdf2docx import Converter
import os

def pdf_to_word(pdf_path, output_path=None, pages=None):
    """Convert a PDF file to a Word document.

    Args:
        pdf_path: Path of the source PDF.
        output_path: Destination ``.docx`` path. When omitted, it is derived
            from ``pdf_path`` by swapping the file extension for ``.docx``.
        pages: Optional list of page indices to convert. A falsy value
            (``None`` or an empty list) converts the whole document.

    Returns:
        The path of the written Word document.
    """
    if output_path is None:
        # Swap only the trailing extension. str.replace('.pdf', '.docx')
        # would also rewrite '.pdf' inside directory names and would leave
        # 'FILE.PDF' untouched, making the output overwrite the input.
        output_path = os.path.splitext(pdf_path)[0] + '.docx'

    cv = Converter(pdf_path)
    try:
        if pages:
            cv.convert(output_path, pages=pages)
        else:
            cv.convert(output_path)
    finally:
        # Release the PDF handle even when the conversion fails.
        cv.close()

    return output_path

# Usage: no output_path is passed, so the function derives it from the input name
result = pdf_to_word('document.pdf')
print(f"Created: {result}")

from pdf2docx import Converter
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor

def convert_single(pdf_path, output_dir):
    """Convert one PDF into a Word document placed in ``output_dir``.

    Args:
        pdf_path: ``pathlib.Path`` of the source PDF.
        output_dir: ``pathlib.Path`` of the directory receiving the ``.docx``.

    Returns:
        ``"Success: <file name>"`` when the conversion worked, otherwise
        ``"Error: <file name> - <reason>"``. Failures are reported instead of
        raised so that one bad file does not abort a batch.
    """
    output_path = output_dir / pdf_path.with_suffix('.docx').name

    try:
        cv = Converter(str(pdf_path))
        try:
            cv.convert(str(output_path))
        finally:
            # Always release the PDF handle, even when conversion fails.
            cv.close()
    except Exception as e:
        return f"Error: {pdf_path.name} - {e}"
    return f"Success: {pdf_path.name}"

def batch_convert(input_dir, output_dir, max_workers=4):
    """Convert every ``*.pdf`` directly inside ``input_dir`` to Word.

    Conversions are submitted to a thread pool and one status line per file
    is printed, in submission order.

    Args:
        input_dir: Directory that is searched (non-recursively) for PDFs.
        output_dir: Directory receiving the ``.docx`` files. It is created,
            including missing parent directories, when it does not exist.
        max_workers: Maximum number of worker threads.
    """
    source_dir = Path(input_dir)
    target_dir = Path(output_dir)
    # parents=True: a nested target such as 'out/word' must not fail just
    # because 'out' does not exist yet.
    target_dir.mkdir(parents=True, exist_ok=True)

    pdf_files = list(source_dir.glob('*.pdf'))

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [
            executor.submit(convert_single, pdf, target_dir)
            for pdf in pdf_files
        ]

        # Waiting in submission order keeps the printed report stable.
        for future in futures:
            print(future.result())

# Example run: convert the PDFs found in ./pdfs into ./word_docs
batch_convert('./pdfs', './word_docs')

from pdf2docx import Converter

def analyze_pdf(pdf_path):
    """Print a per-page summary of a PDF's structure.

    For each page this prints the 1-based page number, the page size and
    the number of blocks, followed by one line per block: the first 50
    characters for blocks exposing ``text``, or a marker for blocks
    exposing ``image``. Other blocks are counted but not listed.

    Args:
        pdf_path: Path of the PDF to inspect.
    """
    cv = Converter(pdf_path)
    try:
        # NOTE(review): assumes cv.pages is populated and every page exposes
        # width/height/blocks without a prior parse step — confirm against
        # the installed pdf2docx version.
        for number, page in enumerate(cv.pages, start=1):
            print(f"Page {number}:")
            print(f"  Size: {page.width} x {page.height}")
            print(f"  Blocks: {len(page.blocks)}")

            for block in page.blocks:
                if hasattr(block, 'text'):
                    print(f"    Text block: {block.text[:50]}...")
                elif hasattr(block, 'image'):
                    print(f"    Image block")
    finally:
        # Release the PDF handle even when the inspection fails midway.
        cv.close()

# Example run: print the page and block summary of document.pdf
analyze_pdf('document.pdf')

检查 PDF 类型：原生 PDF 比扫描的 PDF 转换效果更好
先预览：在完全转换前先用几页测试
处理表格：复杂的表格可能需要手动调整
图像质量：图像以原始分辨率提取
字体处理：某些字体可能会被替换为系统默认字体

带进度显示的转换

from pdf2docx import Converter

def convert_with_progress(pdf_path, output_path):
    """Convert a whole PDF to Word and report progress on stdout.

    The document is converted with a single ``convert`` call. Converting
    page by page into the same ``output_path`` would overwrite the file on
    every iteration and leave only the last page in the result, so only the
    start and the completion of the conversion are reported here.

    Args:
        pdf_path: Path of the source PDF.
        output_path: Destination ``.docx`` path.
    """
    cv = Converter(pdf_path)
    try:
        # NOTE(review): assumes cv.pages already lists every page before the
        # conversion starts — confirm against the installed pdf2docx version.
        total_pages = len(cv.pages)
        print(f"Converting {total_pages} pages...")

        cv.convert(output_path)
        print(f"Progress: {100.0:.1f}%")
    finally:
        # Release the PDF handle even when the conversion fails.
        cv.close()
    print("Conversion complete!")

from pdf2docx import Converter
from docx import Document

def extract_tables_to_word(pdf_path, output_path):
    """Write only the tables of a PDF into a new Word document.

    The PDF is first converted in full to a scratch ``.docx``; every table
    found in it is then copied, as plain cell text, into a fresh document
    that is saved to ``output_path``. An empty paragraph separates
    consecutive tables.

    Args:
        pdf_path: Path of the source PDF.
        output_path: Destination ``.docx`` path for the extracted tables.
    """
    # Local imports keep this snippet self-contained.
    import os
    import tempfile

    # A private scratch directory avoids clashes on a fixed file name in the
    # working directory and is removed even when a later step fails.
    with tempfile.TemporaryDirectory() as scratch_dir:
        temp_path = os.path.join(scratch_dir, 'temp_full.docx')

        cv = Converter(pdf_path)
        try:
            cv.convert(temp_path)
        finally:
            cv.close()

        doc = Document(temp_path)
        new_doc = Document()

        for table in doc.tables:
            # Start with zero rows and append one row per source row.
            new_table = new_doc.add_table(rows=0, cols=len(table.columns))

            for row in table.rows:
                new_row = new_table.add_row()
                for i, cell in enumerate(row.cells):
                    new_row.cells[i].text = cell.text

            new_doc.add_paragraph()  # spacing between tables

        new_doc.save(output_path)

示例 1：合同转换

from pdf2docx import Converter
import os

def convert_contract(pdf_path):
    """Convert a contract PDF to an editable Word document.

    The result is written next to the input as ``<name>_editable.docx``.
    The page count, the output path and the output size in KB are printed.

    Args:
        pdf_path: Path of the contract PDF.

    Returns:
        The path of the written Word document.
    """
    base_name = os.path.splitext(pdf_path)[0]
    output_path = f"{base_name}_editable.docx"

    cv = Converter(pdf_path)
    try:
        # NOTE(review): assumes cv.pages already lists every page before the
        # conversion starts — confirm against the installed pdf2docx version.
        page_count = len(cv.pages)
        print(f"Processing {page_count} pages...")

        cv.convert(output_path)
    finally:
        # Release the PDF handle even when the conversion fails.
        cv.close()

    print(f"Created: {output_path}")
    print(f"File size: {os.path.getsize(output_path) / 1024:.1f} KB")

    return output_path

# Usage: the output path is derived from the input name with an "_editable.docx" suffix
result = convert_contract('contract.pdf')

示例 2：选择性页面转换

from pdf2docx import Converter

def convert_selected_pages(pdf_path, page_ranges, output_path):
    """Convert selected page ranges of a PDF to Word.

    Args:
        pdf_path: Path of the source PDF.
        page_ranges: List of ``(first, last)`` tuples using 1-based,
            inclusive page numbers, e.g. ``[(1, 3), (5, 7)]`` for pages
            1-3 and 5-7.
        output_path: Destination ``.docx`` path.

    Returns:
        ``output_path``.

    Raises:
        ValueError: If a range starts below page 1 or ends before it starts.
    """
    # Validate before opening the PDF: a start of 0 would otherwise turn
    # into the index -1 and silently select an unintended page.
    all_pages = []
    for start, end in page_ranges:
        if start < 1 or end < start:
            raise ValueError(f"Invalid page range: ({start}, {end})")
        all_pages.extend(range(start - 1, end))  # to 0-based indices

    cv = Converter(pdf_path)
    try:
        cv.convert(output_path, pages=all_pages)
    finally:
        # Release the PDF handle even when the conversion fails.
        cv.close()

    print(f"Converted pages: {page_ranges}")
    return output_path

# Convert pages 1-5 and 10-15 (1-based, inclusive ranges)
convert_selected_pages(
    'long_document.pdf',
    [(1, 5), (10, 15)],
    'selected_pages.docx'
)

示例 3：PDF 报告转换为可编辑模板

from pdf2docx import Converter
from docx import Document

def _fill_placeholders(text, replacements):
    """Return ``text`` with each label in ``replacements`` swapped for its placeholder."""
    for old, new in replacements.items():
        # Skip text that already holds the placeholder: 'Date: [DATE]' still
        # contains 'Date:', so replacing again would duplicate the marker.
        if old in text and new not in text:
            text = text.replace(old, new)
    return text


def pdf_to_template(pdf_path, output_path):
    """Convert a PDF report to a Word template with placeholder fields.

    The PDF is converted to ``output_path``; the resulting document is then
    reopened, a fixed set of common labels in body paragraphs and table
    cells is replaced by bracketed placeholders, and the file is saved in
    place.

    Args:
        pdf_path: Path of the source PDF report.
        output_path: Destination ``.docx`` path of the template.
    """
    cv = Converter(pdf_path)
    try:
        cv.convert(output_path)
    finally:
        # Release the PDF handle even when the conversion fails.
        cv.close()

    doc = Document(output_path)

    # Label text -> text carrying the placeholder
    replacements = {
        'Company Name': '[COMPANY_NAME]',
        'Date:': 'Date: [DATE]',
        'Prepared by:': 'Prepared by: [AUTHOR]',
    }

    for para in doc.paragraphs:
        updated = _fill_placeholders(para.text, replacements)
        # Assign only on change so untouched paragraphs are left as they are.
        if updated != para.text:
            para.text = updated

    # Apply the same replacements inside tables.
    # NOTE(review): a merged cell may be yielded more than once by
    # row.cells; the idempotent helper keeps such revisits harmless.
    for table in doc.tables:
        for row in table.rows:
            for cell in row.cells:
                updated = _fill_placeholders(cell.text, replacements)
                if updated != cell.text:
                    cell.text = updated

    doc.save(output_path)
    print(f"Template created: {output_path}")

# Example run: turn annual_report.pdf into report_template.docx
pdf_to_template('annual_report.pdf', 'report_template.docx')

示例 4：批量发票处理

from pdf2docx import Converter
from pathlib import Path
import json

def process_invoices(input_folder, output_folder):
    """Convert every PDF invoice in a folder to an editable Word document.

    Each ``*.pdf`` directly inside ``input_folder`` is converted into
    ``output_folder``. A failing file is recorded and does not stop the
    run. The per-file results are also written to ``conversion_log.json``
    in ``output_folder`` and a one-line summary is printed.

    Args:
        input_folder: Folder that is searched (non-recursively) for PDFs.
        output_folder: Folder receiving the ``.docx`` files and the log. It
            is created, including missing parents, when it does not exist.

    Returns:
        A list with one dict per PDF: ``file`` and ``status`` plus either
        ``output`` (on success) or ``error`` (on failure).
    """
    input_path = Path(input_folder)
    output_path = Path(output_folder)
    # parents=True: a nested output folder must not fail the whole run.
    output_path.mkdir(parents=True, exist_ok=True)

    results = []

    for pdf_file in input_path.glob('*.pdf'):
        output_file = output_path / pdf_file.with_suffix('.docx').name

        try:
            cv = Converter(str(pdf_file))
            try:
                cv.convert(str(output_file))
            finally:
                # Always release the PDF handle, even on failure.
                cv.close()
        except Exception as e:
            # Best effort: record the failure and continue with the next file.
            results.append({
                'file': pdf_file.name,
                'status': 'error',
                'error': str(e)
            })
        else:
            results.append({
                'file': pdf_file.name,
                'status': 'success',
                'output': str(output_file)
            })

    # Save the results log
    with open(output_path / 'conversion_log.json', 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=2)

    # Summary
    success = sum(1 for r in results if r['status'] == 'success')
    print(f"Converted {success}/{len(results)} files")

    return results

# Example run: convert ./invoices_pdf into ./invoices_word and keep the per-file results
results = process_invoices('./invoices_pdf', './invoices_word')

扫描的 PDF 需要 OCR 预处理
复杂的布局可能无法完美转换
某些字体可能不可用
水印会包含在转换中
受保护/加密的 PDF 需要密码

pip install pdf2docx

# 用于图像处理
pip install Pillow

🇺🇸English

PDF to Word Skill

Overview

This skill enables conversion from PDF to editable Word documents using pdf2docx - a Python library that preserves layout, tables, images, and text formatting. Unlike OCR-based solutions, pdf2docx extracts native PDF content for accurate conversion.

How to Use

Provide the PDF file you want to convert
Optionally specify pages or conversion options
I'll convert it to an editable Word document

Example prompts:

"Convert this PDF report to an editable Word document"
"Turn pages 1-5 of this PDF into Word format"
"Extract this scanned document as editable text"
"Convert this PDF contract to Word for editing"

Domain Knowledge

pdf2docx Fundamentals

from contextlib import closing

from pdf2docx import Converter

# Basic conversion: close the converter even if convert() raises
cv = Converter('input.pdf')
try:
    cv.convert('output.docx')
finally:
    cv.close()

# Or scope the converter with a `with` block. contextlib.closing() only
# relies on Converter.close(), so it works whether or not Converter
# implements the context-manager protocol itself.
with closing(Converter('input.pdf')) as cv:
    cv.convert('output.docx')

Conversion Options

from pdf2docx import Converter

cv = Converter('input.pdf')

# Full document
cv.convert('output.docx')

# Page range selected with start/end (0-indexed)
# NOTE(review): `end` looks exclusive, i.e. this would cover pages 0-4 — confirm against the pdf2docx docs
cv.convert('output.docx', start=0, end=5)

# Single page
cv.convert('output.docx', pages=[0])

# Multiple specific pages
cv.convert('output.docx', pages=[0, 2, 4])

# Every call above writes to the same 'output.docx'; use distinct names to keep each result
cv.close()

Advanced Options

from pdf2docx import Converter

# The password is given to the Converter constructor, which opens the PDF;
# convert() only takes the page selection and the layout settings.
cv = Converter('input.pdf', password=None)  # PDF password if encrypted

try:
    cv.convert(
        'output.docx',
        start=0,                    # Start page (0-indexed)
        end=None,                   # End page (None = last page)
        pages=None,                 # Specific pages list
        min_section_height=20.0,    # Minimum height for section
        connected_border_tolerance=0.5,  # Border detection tolerance
        line_overlap_threshold=0.9, # Line merging threshold
        line_break_width_ratio=0.5, # Line break detection
        line_break_free_space_ratio=0.1,
        line_separate_threshold=5,  # Vertical line separation
        new_paragraph_free_space_ratio=0.85,
        float_image_ignorable_gap=5,
        page_margin_factor_top=0.5,
        page_margin_factor_bottom=0.5,
    )
finally:
    # Release the PDF handle even when the conversion fails.
    cv.close()

Handling Different PDF Types

Native PDFs (Text-based)

# Native (text-based) PDFs give the best results: no OCR step is involved
cv = Converter('native_pdf.pdf')
cv.convert('output.docx')
cv.close()

Scanned PDFs (Image-based)

# For scanned PDFs, run OCR first:
# pdf2docx works best with native text PDFs.
# Consider using pytesseract or PaddleOCR beforehand.

import pytesseract
from pdf2image import convert_from_path

# Render the PDF pages as images
images = convert_from_path('scanned.pdf')

# OCR every page; str.join builds the result in one pass instead of
# re-allocating the string on each `+=`
text = ''.join(pytesseract.image_to_string(img) for img in images)

# Then create the Word document from `text`

Python Integration

from pdf2docx import Converter
import os

def pdf_to_word(pdf_path, output_path=None, pages=None):
    """Convert a PDF file to a Word document.

    Args:
        pdf_path: Path of the source PDF.
        output_path: Destination ``.docx`` path. When omitted, it is derived
            from ``pdf_path`` by swapping the file extension for ``.docx``.
        pages: Optional list of page indices to convert. A falsy value
            (``None`` or an empty list) converts the whole document.

    Returns:
        The path of the written Word document.
    """
    if output_path is None:
        # Swap only the trailing extension. str.replace('.pdf', '.docx')
        # would also rewrite '.pdf' inside directory names and would leave
        # 'FILE.PDF' untouched, making the output overwrite the input.
        output_path = os.path.splitext(pdf_path)[0] + '.docx'

    cv = Converter(pdf_path)
    try:
        if pages:
            cv.convert(output_path, pages=pages)
        else:
            cv.convert(output_path)
    finally:
        # Release the PDF handle even when the conversion fails.
        cv.close()

    return output_path

# Usage: no output_path is passed, so the function derives it from the input name
result = pdf_to_word('document.pdf')
print(f"Created: {result}")

Batch Conversion

from pdf2docx import Converter
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor

def convert_single(pdf_path, output_dir):
    """Convert one PDF into a Word document placed in ``output_dir``.

    Args:
        pdf_path: ``pathlib.Path`` of the source PDF.
        output_dir: ``pathlib.Path`` of the directory receiving the ``.docx``.

    Returns:
        ``"Success: <file name>"`` when the conversion worked, otherwise
        ``"Error: <file name> - <reason>"``. Failures are reported instead of
        raised so that one bad file does not abort a batch.
    """
    output_path = output_dir / pdf_path.with_suffix('.docx').name

    try:
        cv = Converter(str(pdf_path))
        try:
            cv.convert(str(output_path))
        finally:
            # Always release the PDF handle, even when conversion fails.
            cv.close()
    except Exception as e:
        return f"Error: {pdf_path.name} - {e}"
    return f"Success: {pdf_path.name}"

def batch_convert(input_dir, output_dir, max_workers=4):
    """Convert every ``*.pdf`` directly inside ``input_dir`` to Word.

    Conversions are submitted to a thread pool and one status line per file
    is printed, in submission order.

    Args:
        input_dir: Directory that is searched (non-recursively) for PDFs.
        output_dir: Directory receiving the ``.docx`` files. It is created,
            including missing parent directories, when it does not exist.
        max_workers: Maximum number of worker threads.
    """
    source_dir = Path(input_dir)
    target_dir = Path(output_dir)
    # parents=True: a nested target such as 'out/word' must not fail just
    # because 'out' does not exist yet.
    target_dir.mkdir(parents=True, exist_ok=True)

    pdf_files = list(source_dir.glob('*.pdf'))

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [
            executor.submit(convert_single, pdf, target_dir)
            for pdf in pdf_files
        ]

        # Waiting in submission order keeps the printed report stable.
        for future in futures:
            print(future.result())

# Example run: convert the PDFs found in ./pdfs into ./word_docs
batch_convert('./pdfs', './word_docs')

Parsing PDF Structure

from pdf2docx import Converter

def analyze_pdf(pdf_path):
    """Print a per-page summary of a PDF's structure.

    For each page this prints the 1-based page number, the page size and
    the number of blocks, followed by one line per block: the first 50
    characters for blocks exposing ``text``, or a marker for blocks
    exposing ``image``. Other blocks are counted but not listed.

    Args:
        pdf_path: Path of the PDF to inspect.
    """
    cv = Converter(pdf_path)
    try:
        # NOTE(review): assumes cv.pages is populated and every page exposes
        # width/height/blocks without a prior parse step — confirm against
        # the installed pdf2docx version.
        for number, page in enumerate(cv.pages, start=1):
            print(f"Page {number}:")
            print(f"  Size: {page.width} x {page.height}")
            print(f"  Blocks: {len(page.blocks)}")

            for block in page.blocks:
                if hasattr(block, 'text'):
                    print(f"    Text block: {block.text[:50]}...")
                elif hasattr(block, 'image'):
                    print(f"    Image block")
    finally:
        # Release the PDF handle even when the inspection fails midway.
        cv.close()

# Example run: print the page and block summary of document.pdf
analyze_pdf('document.pdf')

Best Practices

Check PDF Type: Native PDFs convert better than scanned ones
Preview First: Test with a few pages before running the full conversion
Handle Tables: Complex tables may need manual adjustment
Image Quality: Images are extracted at their original resolution
Font Handling: Some fonts may be substituted with system defaults

Common Patterns

Convert with Progress

from pdf2docx import Converter

def convert_with_progress(pdf_path, output_path):
    """Convert a whole PDF to Word and report progress on stdout.

    The document is converted with a single ``convert`` call. Converting
    page by page into the same ``output_path`` would overwrite the file on
    every iteration and leave only the last page in the result, so only the
    start and the completion of the conversion are reported here.

    Args:
        pdf_path: Path of the source PDF.
        output_path: Destination ``.docx`` path.
    """
    cv = Converter(pdf_path)
    try:
        # NOTE(review): assumes cv.pages already lists every page before the
        # conversion starts — confirm against the installed pdf2docx version.
        total_pages = len(cv.pages)
        print(f"Converting {total_pages} pages...")

        cv.convert(output_path)
        print(f"Progress: {100.0:.1f}%")
    finally:
        # Release the PDF handle even when the conversion fails.
        cv.close()
    print("Conversion complete!")

Extract Tables Only

from pdf2docx import Converter
from docx import Document

def extract_tables_to_word(pdf_path, output_path):
    """Write only the tables of a PDF into a new Word document.

    The PDF is first converted in full to a scratch ``.docx``; every table
    found in it is then copied, as plain cell text, into a fresh document
    that is saved to ``output_path``. An empty paragraph separates
    consecutive tables.

    Args:
        pdf_path: Path of the source PDF.
        output_path: Destination ``.docx`` path for the extracted tables.
    """
    # Local imports keep this snippet self-contained.
    import os
    import tempfile

    # A private scratch directory avoids clashes on a fixed file name in the
    # working directory and is removed even when a later step fails.
    with tempfile.TemporaryDirectory() as scratch_dir:
        temp_path = os.path.join(scratch_dir, 'temp_full.docx')

        cv = Converter(pdf_path)
        try:
            cv.convert(temp_path)
        finally:
            cv.close()

        doc = Document(temp_path)
        new_doc = Document()

        for table in doc.tables:
            # Start with zero rows and append one row per source row.
            new_table = new_doc.add_table(rows=0, cols=len(table.columns))

            for row in table.rows:
                new_row = new_table.add_row()
                for i, cell in enumerate(row.cells):
                    new_row.cells[i].text = cell.text

            new_doc.add_paragraph()  # spacing between tables

        new_doc.save(output_path)

Examples

Example 1: Contract Conversion

from pdf2docx import Converter
import os

def convert_contract(pdf_path):
    """Convert a contract PDF to an editable Word document.

    The result is written next to the input as ``<name>_editable.docx``.
    The page count, the output path and the output size in KB are printed.

    Args:
        pdf_path: Path of the contract PDF.

    Returns:
        The path of the written Word document.
    """
    base_name = os.path.splitext(pdf_path)[0]
    output_path = f"{base_name}_editable.docx"

    cv = Converter(pdf_path)
    try:
        # NOTE(review): assumes cv.pages already lists every page before the
        # conversion starts — confirm against the installed pdf2docx version.
        page_count = len(cv.pages)
        print(f"Processing {page_count} pages...")

        cv.convert(output_path)
    finally:
        # Release the PDF handle even when the conversion fails.
        cv.close()

    print(f"Created: {output_path}")
    print(f"File size: {os.path.getsize(output_path) / 1024:.1f} KB")

    return output_path

# Usage: the output path is derived from the input name with an "_editable.docx" suffix
result = convert_contract('contract.pdf')

Example 2: Selective Page Conversion

from pdf2docx import Converter

def convert_selected_pages(pdf_path, page_ranges, output_path):
    """Convert selected page ranges of a PDF to Word.

    Args:
        pdf_path: Path of the source PDF.
        page_ranges: List of ``(first, last)`` tuples using 1-based,
            inclusive page numbers, e.g. ``[(1, 3), (5, 7)]`` for pages
            1-3 and 5-7.
        output_path: Destination ``.docx`` path.

    Returns:
        ``output_path``.

    Raises:
        ValueError: If a range starts below page 1 or ends before it starts.
    """
    # Validate before opening the PDF: a start of 0 would otherwise turn
    # into the index -1 and silently select an unintended page.
    all_pages = []
    for start, end in page_ranges:
        if start < 1 or end < start:
            raise ValueError(f"Invalid page range: ({start}, {end})")
        all_pages.extend(range(start - 1, end))  # to 0-based indices

    cv = Converter(pdf_path)
    try:
        cv.convert(output_path, pages=all_pages)
    finally:
        # Release the PDF handle even when the conversion fails.
        cv.close()

    print(f"Converted pages: {page_ranges}")
    return output_path

# Convert pages 1-5 and 10-15 (1-based, inclusive ranges)
convert_selected_pages(
    'long_document.pdf',
    [(1, 5), (10, 15)],
    'selected_pages.docx'
)

Example 3: PDF Report to Editable Template

from pdf2docx import Converter
from docx import Document

def _fill_placeholders(text, replacements):
    """Return ``text`` with each label in ``replacements`` swapped for its placeholder."""
    for old, new in replacements.items():
        # Skip text that already holds the placeholder: 'Date: [DATE]' still
        # contains 'Date:', so replacing again would duplicate the marker.
        if old in text and new not in text:
            text = text.replace(old, new)
    return text


def pdf_to_template(pdf_path, output_path):
    """Convert a PDF report to a Word template with placeholder fields.

    The PDF is converted to ``output_path``; the resulting document is then
    reopened, a fixed set of common labels in body paragraphs and table
    cells is replaced by bracketed placeholders, and the file is saved in
    place.

    Args:
        pdf_path: Path of the source PDF report.
        output_path: Destination ``.docx`` path of the template.
    """
    cv = Converter(pdf_path)
    try:
        cv.convert(output_path)
    finally:
        # Release the PDF handle even when the conversion fails.
        cv.close()

    doc = Document(output_path)

    # Label text -> text carrying the placeholder
    replacements = {
        'Company Name': '[COMPANY_NAME]',
        'Date:': 'Date: [DATE]',
        'Prepared by:': 'Prepared by: [AUTHOR]',
    }

    for para in doc.paragraphs:
        updated = _fill_placeholders(para.text, replacements)
        # Assign only on change so untouched paragraphs are left as they are.
        if updated != para.text:
            para.text = updated

    # Apply the same replacements inside tables.
    # NOTE(review): a merged cell may be yielded more than once by
    # row.cells; the idempotent helper keeps such revisits harmless.
    for table in doc.tables:
        for row in table.rows:
            for cell in row.cells:
                updated = _fill_placeholders(cell.text, replacements)
                if updated != cell.text:
                    cell.text = updated

    doc.save(output_path)
    print(f"Template created: {output_path}")

# Example run: turn annual_report.pdf into report_template.docx
pdf_to_template('annual_report.pdf', 'report_template.docx')

Example 4: Bulk Invoice Processing

from pdf2docx import Converter
from pathlib import Path
import json

def process_invoices(input_folder, output_folder):
    """Convert every PDF invoice in a folder to an editable Word document.

    Each ``*.pdf`` directly inside ``input_folder`` is converted into
    ``output_folder``. A failing file is recorded and does not stop the
    run. The per-file results are also written to ``conversion_log.json``
    in ``output_folder`` and a one-line summary is printed.

    Args:
        input_folder: Folder that is searched (non-recursively) for PDFs.
        output_folder: Folder receiving the ``.docx`` files and the log. It
            is created, including missing parents, when it does not exist.

    Returns:
        A list with one dict per PDF: ``file`` and ``status`` plus either
        ``output`` (on success) or ``error`` (on failure).
    """
    input_path = Path(input_folder)
    output_path = Path(output_folder)
    # parents=True: a nested output folder must not fail the whole run.
    output_path.mkdir(parents=True, exist_ok=True)

    results = []

    for pdf_file in input_path.glob('*.pdf'):
        output_file = output_path / pdf_file.with_suffix('.docx').name

        try:
            cv = Converter(str(pdf_file))
            try:
                cv.convert(str(output_file))
            finally:
                # Always release the PDF handle, even on failure.
                cv.close()
        except Exception as e:
            # Best effort: record the failure and continue with the next file.
            results.append({
                'file': pdf_file.name,
                'status': 'error',
                'error': str(e)
            })
        else:
            results.append({
                'file': pdf_file.name,
                'status': 'success',
                'output': str(output_file)
            })

    # Save the results log
    with open(output_path / 'conversion_log.json', 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=2)

    # Summary
    success = sum(1 for r in results if r['status'] == 'success')
    print(f"Converted {success}/{len(results)} files")

    return results

# Example run: convert ./invoices_pdf into ./invoices_word and keep the per-file results
results = process_invoices('./invoices_pdf', './invoices_word')

Limitations

Scanned PDFs require OCR preprocessing
Complex layouts may not convert perfectly
Some fonts may not be available
Watermarks are included in conversion
Protected/encrypted PDFs need a password

Installation

pip install pdf2docx

# For image handling
pip install Pillow

Resources

Weekly Installs

Repository

claude-office-skills/skills

GitHub Stars

First Seen

6 days ago

Security Audits

Gen Agent Trust Hub: Pass · Socket: Pass · Snyk: Pass

Installed on

claude-code: 28

gemini-cli: 18

github-copilot: 18

codex: 18

amp: 18

cline: 18

xdrop 文件传输脚本：Bun 环境下安全上传下载工具，支持加密分享

20,700 周安装