PaddleOCR 光学字符识别教程 - 多语言文本识别、表格提取与文档版面分析

OCR with PaddleOCR by amnadtaowsoam/cerebraskills

2 GitHub Stars

GitHub

安装命令

npx skills add https://github.com/amnadtaowsoam/cerebraskills --skill 'OCR with PaddleOCR'

AI/机器学习 · 自动化 · 数据处理

🇨🇳中文介绍

使用 PaddleOCR 进行光学字符识别

概述

PaddleOCR 是一个功能强大的开源 OCR 工具包，支持多语言文本识别、表格识别和文档版面分析。本技能涵盖了各种文档处理场景的实现模式。

先决条件

Python 3.8+ : PaddlePaddle 和 PaddleOCR 所需
PaddlePaddle : 深度学习框架（CPU 或 GPU 版本）
OpenCV : 用于图像预处理和操作
NumPy : 用于数组操作
图像预处理 : 了解图像增强技术
深度学习基础 : 了解神经网络和模型推理

关键概念

检测模型 : 使用 DBNet 定位图像中的文本区域
识别模型 : 使用 CRNN 识别文本内容
方向分类器 : 确定文本方向（0°、90°、180°、270°）
多语言支持 : 支持 80 多种语言的特定模型
表格识别 : 用于提取结构化表格数据的专用模型
文档版面分析 : 识别文档结构（标题、段落、表格、图像）
GPU 加速 : 支持 CUDA 以实现更快的推理
模型量化 : INT8 量化，用于在边缘设备上部署

实现指南

安装

# CPU version
pip install paddlepaddle paddleocr

# GPU version (CUDA 11.2)
pip install paddlepaddle-gpu paddleocr

# GPU version (CUDA 11.8)
# The -f index below only supplies the paddlepaddle-gpu wheel, so paddleocr
# is installed in a separate step (the original command never installed it).
pip install paddlepaddle-gpu==2.5.2.post118 -f https://www.paddlepaddle.org.cn/whl/linux/mkl/avx/stable.html
pip install paddleocr

广告位招租

在这里展示您的产品或服务

触达数万 AI 开发者，精准高效

联系我们

相关 Skills

FlyClaw：零登录航班聚合查询工具，Python实现多源航班信息与价格搜索

4,000,000 周安装

find-skills 技能搜索工具 - Vercel Labs 开源智能体技能包管理器

812,900 周安装

Azure RBAC 权限管理工具：查找最小角色、创建自定义角色与自动化分配

117,000 周安装

React 组合模式指南：Vercel 组件架构最佳实践，提升代码可维护性

109,600 周安装

from paddleocr import PaddleOCR

# Supported languages: 'ch', 'en', 'korean', 'japan', 'chinese_cht', 'ta', 'te', 'ka', 'ca', 'hi'
# NOTE(review): 'th' (used below) is not in the list above — confirm that the
# installed PaddleOCR release ships a Thai model before relying on lang='th'.

# English
ocr_en = PaddleOCR(use_angle_cls=True, lang='en')

# Chinese
ocr_ch = PaddleOCR(use_angle_cls=True, lang='ch')

# Thai
ocr_th = PaddleOCR(use_angle_cls=True, lang='th')

# Korean
ocr_kr = PaddleOCR(use_angle_cls=True, lang='korean')

# Custom language model: the det/rec/cls model directories are taken from
# local paths instead of the defaults selected by `lang`.
ocr_custom = PaddleOCR(
    use_angle_cls=True,
    lang='en',
    det_model_dir='./custom_det/',
    rec_model_dir='./custom_rec/',
    cls_model_dir='./custom_cls/'
)

from paddleocr import PaddleOCR
import cv2

# Initialize with table recognition.
# NOTE(review): the loop below only prints plain text lines; structured table
# output (HTML) in this guide comes from PPStructure. Confirm that `table=True`
# changes what PaddleOCR.ocr() returns on the installed version.
ocr = PaddleOCR(
    use_angle_cls=True,
    lang='en',
    table=True,  # Enable table recognition
    show_log=True
)

# Read image with table; cv2.imread can return None for an unreadable file,
# so fail early with a clear message instead of inside ocr().
image = cv2.imread('table.png')
if image is None:
    raise ValueError("加载图像失败: table.png")

# Perform table OCR
result = ocr.ocr(image, cls=True)

# Extract table data.
# NOTE(review): a page with no detections is reportedly returned as None by
# some PaddleOCR releases — `or []` keeps the loops safe either way.
for res in result or []:
    for bbox, (text, confidence) in res or []:
        print(f"文本: {text}, 置信度: {confidence:.2f}")

from paddleocr import PPStructure

# Initialize structure analysis
table_engine = PPStructure(show_log=True)

# Analyze document layout
image_path = 'document.png'
result = table_engine(image_path)

# Process layout results
for region in result:
    print(f"类型: {region['type']}")

    # NOTE(review): the 'score' field may be missing from region dicts on some
    # PP-Structure releases — confirm; it is printed only when present so the
    # loop does not die with a KeyError.
    if 'score' in region:
        print(f"置信度: {region['score']:.2f}")

    if region['type'] == 'table':
        # Extract table HTML
        html = region['res']['html']
        print(f"表格 HTML: {html}")
    elif region['type'] == 'text':
        # Extract text
        for text_line in region['res']:
            print(f"文本: {text_line['text']}")

import glob
import json
import os

import cv2  # was missing: the loop below calls cv2.imread
from paddleocr import PaddleOCR

# Initialize OCR once and reuse it for every image
ocr = PaddleOCR(use_angle_cls=True, lang='en')

# Process multiple images (sorted so the output order is reproducible)
image_dir = 'documents/'
image_files = sorted(glob.glob(os.path.join(image_dir, '*.png')))

results = []
for image_file in image_files:
    image = cv2.imread(image_file)
    if image is None:
        # An unreadable file yields None; skip it instead of passing None to ocr()
        print(f"跳过无法读取的图像: {image_file}")
        continue
    results.append({
        'file': image_file,
        'result': ocr.ocr(image, cls=True),
    })

# Save results
with open('ocr_results.json', 'w', encoding='utf-8') as f:
    json.dump(results, f, ensure_ascii=False, indent=2)

# Prepare training data
# Data format: image_path, text_content
# NOTE(review): confirm the exact label-file delimiter against the PaddleOCR training docs.

# The leading "!" runs each command through the shell from a Jupyter/IPython
# cell; drop it when typing the command in a terminal. The tools/ and configs/
# paths are relative, so presumably these run from a PaddleOCR source checkout.
# ./your_model/best_accuracy is a placeholder used by all three commands.

# Train custom detection model
!python tools/train.py -c configs/det/ch_PP-OCRv4/ch_PP-OCRv4_det.yml -o Global.pretrained_model=./your_model/best_accuracy

# Train custom recognition model
!python tools/train.py -c configs/rec/ch_PP-OCRv4/ch_PP-OCRv4_rec.yml -o Global.pretrained_model=./your_model/best_accuracy

# Export model for inference (only the detection config is exported here)
!python tools/export_model.py -c configs/det/ch_PP-OCRv4/ch_PP-OCRv4_det.yml -o Global.pretrained_model=./your_model/best_accuracy Global.save_inference_dir=./inference/det

from paddleocr import PaddleOCR
import cv2

# Create the engine, load document.png with OpenCV and run OCR on the array.
# NOTE: the image is not checked here; cv2.imread can return None for an
# unreadable path (safe_ocr in this guide shows the check).
ocr = PaddleOCR(use_angle_cls=True, lang='en')
image = cv2.imread('document.png')
result = ocr.ocr(image, cls=True)

# 提取结构化结果
def extract_text_results(result):
    """Flatten raw OCR output into a list of per-line dictionaries.

    Args:
        result: Nested OCR output: an iterable of pages, each page an
            iterable of ``[bbox, (text, confidence)]`` entries where
            ``bbox`` is a sequence of ``(x, y)`` corner points. ``None``
            (for the whole result or for a single page) means "no text".

    Returns:
        A list of dicts with the keys ``'text'``, ``'confidence'``,
        ``'bbox'`` (the ``x1``/``y1``/``x2``/``y2`` extents of the corner
        points) and ``'points'`` (the corner points as given).
    """
    extracted = []

    # NOTE(review): some PaddleOCR releases reportedly return a None page when
    # nothing is detected — confirm; `or []` keeps iteration safe either way.
    for page in result or []:
        for bbox, (text, confidence) in page or []:
            # Axis-aligned extents of the (possibly rotated) quadrilateral
            xs = [point[0] for point in bbox]
            ys = [point[1] for point in bbox]

            extracted.append({
                'text': text,
                'confidence': confidence,
                'bbox': {
                    'x1': min(xs),
                    'y1': min(ys),
                    'x2': max(xs),
                    'y2': max(ys),
                },
                'points': bbox,
            })

    return extracted

# Flatten the raw OCR output into per-line dictionaries
structured_results = extract_text_results(result)

# Order the lines top to bottom by the upper edge (y1) of each box
sorted_results = list(structured_results)
sorted_results.sort(key=lambda entry: entry['bbox']['y1'])

# Print every line followed by its confidence
for item in sorted_results:
    print(f"{item['text']} (置信度: {item['confidence']:.2f})")

import cv2
import numpy as np
from paddleocr import PaddleOCR

# Create the engine, load document.png with OpenCV and run OCR on the array.
# NOTE: the image is not checked here; cv2.imread can return None for an
# unreadable path (safe_ocr in this guide shows the check).
ocr = PaddleOCR(use_angle_cls=True, lang='en')
image = cv2.imread('document.png')
result = ocr.ocr(image, cls=True)

# 绘制边界框
def draw_ocr_results(image, result):
    """Return a copy of ``image`` annotated with OCR boxes and labels.

    Args:
        image: Image array; it is copied and never modified in place.
        result: Nested OCR output: an iterable of pages, each page an
            iterable of ``[bbox, (text, confidence)]`` entries. ``None``
            (for the whole result or for a single page) means "no text".

    Returns:
        The annotated copy. Entries with confidence above 0.9 are drawn
        with color (0, 255, 0); all others with (0, 165, 255).
    """
    image_copy = image.copy()

    # NOTE(review): some PaddleOCR releases reportedly return a None page when
    # nothing is detected — confirm; `or []` keeps iteration safe either way.
    for page in result or []:
        for bbox, (text, confidence) in page or []:
            # polylines needs integer vertex coordinates
            points = np.array(bbox, dtype=np.int32)

            # Draw bounding box
            color = (0, 255, 0) if confidence > 0.9 else (0, 165, 255)
            cv2.polylines(image_copy, [points], True, color, 2)

            # Draw the label 10 px above the first corner of the box
            x, y = bbox[0]
            cv2.putText(
                image_copy,
                f"{text} ({confidence:.2f})",
                (int(x), int(y - 10)),
                cv2.FONT_HERSHEY_SIMPLEX,
                0.5,
                color,
                1
            )

    return image_copy

# Visualize
result_image = draw_ocr_results(image, result)
# cv2.imwrite reports failure through its boolean return value, so check it
# instead of silently continuing without an output file.
if not cv2.imwrite('ocr_result.png', result_image):
    raise OSError("无法写入 ocr_result.png")

import cv2
import numpy as np
from paddleocr import PaddleOCR

def preprocess_image(image_path):
    """Load an image and binarize it for OCR.

    Pipeline: read -> grayscale -> non-local-means denoise -> Gaussian
    adaptive threshold.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The thresholded single-channel image from ``cv2.adaptiveThreshold``.

    Raises:
        ValueError: If the image cannot be loaded from ``image_path``.
    """
    image = cv2.imread(image_path)
    # Same check as safe_ocr: fail with a clear message here rather than with
    # an opaque OpenCV error inside cvtColor.
    if image is None:
        raise ValueError(f"加载图像失败: {image_path}")

    # Convert to grayscale
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Apply denoising
    denoised = cv2.fastNlMeansDenoising(gray, None, 10, 7, 21)

    # Apply adaptive thresholding; the output maximum is 255
    binary = cv2.adaptiveThreshold(
        denoised, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY, 11, 2
    )

    return binary

# Use preprocessed image
ocr = PaddleOCR(use_angle_cls=True, lang='en')
# preprocess_image returns the thresholded image, which is handed to ocr()
# as an array rather than as a file path
processed_image = preprocess_image('document.png')
result = ocr.ocr(processed_image, cls=True)

from paddleocr import PaddleOCR
import cv2

def safe_ocr(image_path, ocr):
    """Run OCR on one image file without letting any error escape.

    Args:
        image_path: Path of the image to read with ``cv2.imread``.
        ocr: Engine object exposing ``ocr(image, cls=True)``.

    Returns:
        Whatever ``ocr.ocr`` returns, or ``None`` when the image could not be
        loaded or any exception was raised (the error is printed).
    """
    try:
        loaded = cv2.imread(image_path)
        if loaded is not None:
            return ocr.ocr(loaded, cls=True)
        # Route the load failure through the same reporting path as other errors
        raise ValueError(f"加载图像失败: {image_path}")
    except Exception as exc:
        print(f"OCR 错误: {exc}")
    return None

# Initialize OCR
ocr = PaddleOCR(use_angle_cls=True, lang='en')

# Process with error handling
result = safe_ocr('document.png', ocr)
# safe_ocr returns None after printing the error, so a falsy result means
# there is nothing to process
if result:
    # Process results
    pass

🇺🇸English

OCR with PaddleOCR

Overview

PaddleOCR is a powerful, open-source OCR toolkit that supports multi-language text recognition, table recognition, and document layout analysis. This skill covers implementation patterns for various document processing scenarios.

Prerequisites

Python 3.8+ : Required for PaddlePaddle and PaddleOCR
PaddlePaddle : Deep learning framework (CPU or GPU version)
OpenCV : For image preprocessing and manipulation
NumPy : For array operations
Image Preprocessing : Understanding of image enhancement techniques
Deep Learning Basics : Knowledge of neural networks and model inference

Key Concepts

Detection Model : Locates text regions in images using DBNet
Recognition Model : Identifies text content using CRNN
Direction Classifier : Determines text orientation (0°, 90°, 180°, 270°)
Multi-language Support : Supports 80+ languages with specific models
Table Recognition : Specialized models for extracting structured table data
Document Layout Analysis : Identifies document structure (headers, paragraphs, tables, images)
GPU Acceleration : CUDA support for faster inference
Model Quantization : INT8 quantization for deployment on edge devices

Implementation Guide

Installation

# CPU version
pip install paddlepaddle paddleocr

# GPU version (CUDA 11.2)
pip install paddlepaddle-gpu paddleocr

# GPU version (CUDA 11.8)
# The -f index below only supplies the paddlepaddle-gpu wheel, so paddleocr
# is installed in a separate step (the original command never installed it).
pip install paddlepaddle-gpu==2.5.2.post118 -f https://www.paddlepaddle.org.cn/whl/linux/mkl/avx/stable.html
pip install paddleocr

Basic Text Recognition

from paddleocr import PaddleOCR
import cv2

# Initialize PaddleOCR
ocr = PaddleOCR(use_angle_cls=True, lang='en')

# Read image; cv2.imread can return None for an unreadable file, so fail
# early with a clear message instead of inside ocr().
image_path = 'document.png'
image = cv2.imread(image_path)
if image is None:
    raise ValueError(f"Failed to load image: {image_path}")

# Perform OCR
result = ocr.ocr(image, cls=True)

# Extract text.
# NOTE(review): a page with no detections is reportedly returned as None by
# some PaddleOCR releases — `or []` keeps the loops safe either way.
for res in result or []:
    for line in res or []:
        print(line[1][0])  # Text content

Multi-language OCR

from paddleocr import PaddleOCR

# Supported languages: 'ch', 'en', 'korean', 'japan', 'chinese_cht', 'ta', 'te', 'ka', 'ca', 'hi'
# NOTE(review): 'th' (used below) is not in the list above — confirm that the
# installed PaddleOCR release ships a Thai model before relying on lang='th'.

# English
ocr_en = PaddleOCR(use_angle_cls=True, lang='en')

# Chinese
ocr_ch = PaddleOCR(use_angle_cls=True, lang='ch')

# Thai
ocr_th = PaddleOCR(use_angle_cls=True, lang='th')

# Korean
ocr_kr = PaddleOCR(use_angle_cls=True, lang='korean')

# Custom language model: the det/rec/cls model directories are taken from
# local paths instead of the defaults selected by `lang`.
ocr_custom = PaddleOCR(
    use_angle_cls=True,
    lang='en',
    det_model_dir='./custom_det/',
    rec_model_dir='./custom_rec/',
    cls_model_dir='./custom_cls/'
)

Table Recognition

from paddleocr import PaddleOCR
import cv2

# Initialize with table recognition.
# NOTE(review): the loop below only prints plain text lines; structured table
# output (HTML) in this guide comes from PPStructure. Confirm that `table=True`
# changes what PaddleOCR.ocr() returns on the installed version.
ocr = PaddleOCR(
    use_angle_cls=True,
    lang='en',
    table=True,  # Enable table recognition
    show_log=True
)

# Read image with table; cv2.imread can return None for an unreadable file,
# so fail early with a clear message instead of inside ocr().
image = cv2.imread('table.png')
if image is None:
    raise ValueError("Failed to load image: table.png")

# Perform table OCR
result = ocr.ocr(image, cls=True)

# Extract table data.
# NOTE(review): a page with no detections is reportedly returned as None by
# some PaddleOCR releases — `or []` keeps the loops safe either way.
for res in result or []:
    for bbox, (text, confidence) in res or []:
        print(f"Text: {text}, Confidence: {confidence:.2f}")

Document Layout Analysis

from paddleocr import PPStructure

# Initialize structure analysis
table_engine = PPStructure(show_log=True)

# Analyze document layout
image_path = 'document.png'
result = table_engine(image_path)

# Process layout results
for region in result:
    print(f"Type: {region['type']}")

    # NOTE(review): the 'score' field may be missing from region dicts on some
    # PP-Structure releases — confirm; it is printed only when present so the
    # loop does not die with a KeyError.
    if 'score' in region:
        print(f"Confidence: {region['score']:.2f}")

    if region['type'] == 'table':
        # Extract table HTML
        html = region['res']['html']
        print(f"Table HTML: {html}")
    elif region['type'] == 'text':
        # Extract text
        for text_line in region['res']:
            print(f"Text: {text_line['text']}")

Batch Processing

import glob
import json
import os

import cv2  # was missing: the loop below calls cv2.imread
from paddleocr import PaddleOCR

# Initialize OCR once and reuse it for every image
ocr = PaddleOCR(use_angle_cls=True, lang='en')

# Process multiple images (sorted so the output order is reproducible)
image_dir = 'documents/'
image_files = sorted(glob.glob(os.path.join(image_dir, '*.png')))

results = []
for image_file in image_files:
    image = cv2.imread(image_file)
    if image is None:
        # An unreadable file yields None; skip it instead of passing None to ocr()
        print(f"Skipping unreadable image: {image_file}")
        continue
    results.append({
        'file': image_file,
        'result': ocr.ocr(image, cls=True),
    })

# Save results
with open('ocr_results.json', 'w', encoding='utf-8') as f:
    json.dump(results, f, ensure_ascii=False, indent=2)

GPU Acceleration

from paddleocr import PaddleOCR

# Initialize with GPU support
# NOTE(review): MKLDNN (oneDNN) is a CPU acceleration backend — presumably it
# has no effect while use_gpu=True; confirm and drop whichever flag is unused.
ocr = PaddleOCR(
    use_angle_cls=True,
    lang='en',
    use_gpu=True,  # Enable GPU
    gpu_mem=500,  # GPU memory in MB
    enable_mkldnn=True  # Enable MKLDNN acceleration
)

Custom Model Training

# Prepare training data
# Data format: image_path, text_content
# NOTE(review): confirm the exact label-file delimiter against the PaddleOCR training docs.

# The leading "!" runs each command through the shell from a Jupyter/IPython
# cell; drop it when typing the command in a terminal. The tools/ and configs/
# paths are relative, so presumably these run from a PaddleOCR source checkout.
# ./your_model/best_accuracy is a placeholder used by all three commands.

# Train custom detection model
!python tools/train.py -c configs/det/ch_PP-OCRv4/ch_PP-OCRv4_det.yml -o Global.pretrained_model=./your_model/best_accuracy

# Train custom recognition model
!python tools/train.py -c configs/rec/ch_PP-OCRv4/ch_PP-OCRv4_rec.yml -o Global.pretrained_model=./your_model/best_accuracy

# Export model for inference (only the detection config is exported here)
!python tools/export_model.py -c configs/det/ch_PP-OCRv4/ch_PP-OCRv4_det.yml -o Global.pretrained_model=./your_model/best_accuracy Global.save_inference_dir=./inference/det

Result Processing

from paddleocr import PaddleOCR
import cv2

# Create the engine, load document.png with OpenCV and run OCR on the array.
# NOTE: the image is not checked here; cv2.imread can return None for an
# unreadable path (safe_ocr in this guide shows the check).
ocr = PaddleOCR(use_angle_cls=True, lang='en')
image = cv2.imread('document.png')
result = ocr.ocr(image, cls=True)

# Extract structured results
def extract_text_results(result):
    """Flatten raw OCR output into a list of per-line dictionaries.

    Args:
        result: Nested OCR output: an iterable of pages, each page an
            iterable of ``[bbox, (text, confidence)]`` entries where
            ``bbox`` is a sequence of ``(x, y)`` corner points. ``None``
            (for the whole result or for a single page) means "no text".

    Returns:
        A list of dicts with the keys ``'text'``, ``'confidence'``,
        ``'bbox'`` (the ``x1``/``y1``/``x2``/``y2`` extents of the corner
        points) and ``'points'`` (the corner points as given).
    """
    extracted = []

    # NOTE(review): some PaddleOCR releases reportedly return a None page when
    # nothing is detected — confirm; `or []` keeps iteration safe either way.
    for page in result or []:
        for bbox, (text, confidence) in page or []:
            # Axis-aligned extents of the (possibly rotated) quadrilateral
            xs = [point[0] for point in bbox]
            ys = [point[1] for point in bbox]

            extracted.append({
                'text': text,
                'confidence': confidence,
                'bbox': {
                    'x1': min(xs),
                    'y1': min(ys),
                    'x2': max(xs),
                    'y2': max(ys),
                },
                'points': bbox,
            })

    return extracted

# Flatten the raw OCR output into per-line dictionaries
structured_results = extract_text_results(result)

# Order the lines top to bottom by the upper edge (y1) of each box
sorted_results = list(structured_results)
sorted_results.sort(key=lambda entry: entry['bbox']['y1'])

# Print every line followed by its confidence
for item in sorted_results:
    print(f"{item['text']} (confidence: {item['confidence']:.2f})")

Visualization

import cv2
import numpy as np
from paddleocr import PaddleOCR

# Create the engine, load document.png with OpenCV and run OCR on the array.
# NOTE: the image is not checked here; cv2.imread can return None for an
# unreadable path (safe_ocr in this guide shows the check).
ocr = PaddleOCR(use_angle_cls=True, lang='en')
image = cv2.imread('document.png')
result = ocr.ocr(image, cls=True)

# Draw bounding boxes
def draw_ocr_results(image, result):
    """Return a copy of ``image`` annotated with OCR boxes and labels.

    Args:
        image: Image array; it is copied and never modified in place.
        result: Nested OCR output: an iterable of pages, each page an
            iterable of ``[bbox, (text, confidence)]`` entries. ``None``
            (for the whole result or for a single page) means "no text".

    Returns:
        The annotated copy. Entries with confidence above 0.9 are drawn
        with color (0, 255, 0); all others with (0, 165, 255).
    """
    image_copy = image.copy()

    # NOTE(review): some PaddleOCR releases reportedly return a None page when
    # nothing is detected — confirm; `or []` keeps iteration safe either way.
    for page in result or []:
        for bbox, (text, confidence) in page or []:
            # polylines needs integer vertex coordinates
            points = np.array(bbox, dtype=np.int32)

            # Draw bounding box
            color = (0, 255, 0) if confidence > 0.9 else (0, 165, 255)
            cv2.polylines(image_copy, [points], True, color, 2)

            # Draw the label 10 px above the first corner of the box
            x, y = bbox[0]
            cv2.putText(
                image_copy,
                f"{text} ({confidence:.2f})",
                (int(x), int(y - 10)),
                cv2.FONT_HERSHEY_SIMPLEX,
                0.5,
                color,
                1
            )

    return image_copy

# Visualize
result_image = draw_ocr_results(image, result)
# cv2.imwrite reports failure through its boolean return value, so check it
# instead of silently continuing without an output file.
if not cv2.imwrite('ocr_result.png', result_image):
    raise OSError("Failed to write ocr_result.png")

Best Practices

Performance Optimization

# Use appropriate model size
# PP-OCRv4: Best accuracy, slower
# PP-OCRv4-mobile: Good accuracy, faster
# PP-OCRv4-server: Best accuracy for server deployment
# NOTE(review): the constructor below does not select any of these variants;
# confirm how the model size is chosen on the installed version.

# NOTE(review): TensorRT and FP16 are presumably GPU inference options, but
# use_gpu=True is not set here — confirm these flags take effect as written.
ocr = PaddleOCR(
    use_angle_cls=True,
    lang='en',
    det_algorithm='DB',  # Detection algorithm
    rec_algorithm='CRNN',  # Recognition algorithm
    use_tensorrt=True,  # Enable TensorRT for faster inference
    precision='fp16'  # Use FP16 for faster inference
)

Image Preprocessing

import cv2
import numpy as np
from paddleocr import PaddleOCR

def preprocess_image(image_path):
    """Load an image and binarize it for OCR.

    Pipeline: read -> grayscale -> non-local-means denoise -> Gaussian
    adaptive threshold.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The thresholded single-channel image from ``cv2.adaptiveThreshold``.

    Raises:
        ValueError: If the image cannot be loaded from ``image_path``.
    """
    image = cv2.imread(image_path)
    # Same check as safe_ocr: fail with a clear message here rather than with
    # an opaque OpenCV error inside cvtColor.
    if image is None:
        raise ValueError(f"Failed to load image: {image_path}")

    # Convert to grayscale
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Apply denoising
    denoised = cv2.fastNlMeansDenoising(gray, None, 10, 7, 21)

    # Apply adaptive thresholding; the output maximum is 255
    binary = cv2.adaptiveThreshold(
        denoised, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY, 11, 2
    )

    return binary

# Use preprocessed image
ocr = PaddleOCR(use_angle_cls=True, lang='en')
# preprocess_image returns the thresholded image, which is handed to ocr()
# as an array rather than as a file path
processed_image = preprocess_image('document.png')
result = ocr.ocr(processed_image, cls=True)

Error Handling

from paddleocr import PaddleOCR
import cv2

def safe_ocr(image_path, ocr):
    """Run OCR on one image file without letting any error escape.

    Args:
        image_path: Path of the image to read with ``cv2.imread``.
        ocr: Engine object exposing ``ocr(image, cls=True)``.

    Returns:
        Whatever ``ocr.ocr`` returns, or ``None`` when the image could not be
        loaded or any exception was raised (the error is printed).
    """
    try:
        loaded = cv2.imread(image_path)
        if loaded is not None:
            return ocr.ocr(loaded, cls=True)
        # Route the load failure through the same reporting path as other errors
        raise ValueError(f"Failed to load image: {image_path}")
    except Exception as exc:
        print(f"OCR error: {exc}")
    return None

# Initialize OCR
ocr = PaddleOCR(use_angle_cls=True, lang='en')

# Process with error handling
result = safe_ocr('document.png', ocr)
# safe_ocr returns None after printing the error, so a falsy result means
# there is nothing to process
if result:
    # Process results
    pass

Confidence Thresholding

def filter_by_confidence(result, threshold=0.8):
    """Keep only the OCR entries whose confidence reaches ``threshold``.

    Args:
        result: Nested OCR output: an iterable of pages, each page an
            iterable of ``[bbox, (text, confidence)]`` entries. ``None``
            (for the whole result or for a single page) means "no text".
        threshold: Minimum confidence, inclusive. Defaults to 0.8.

    Returns:
        A flat list of dicts with the keys ``'text'``, ``'confidence'`` and
        ``'bbox'``, in input order.
    """
    filtered = []

    # NOTE(review): some PaddleOCR releases reportedly return a None page when
    # nothing is detected — confirm; `or []` keeps iteration safe either way.
    for page in result or []:
        for bbox, (text, confidence) in page or []:
            if confidence >= threshold:
                filtered.append({
                    'text': text,
                    'confidence': confidence,
                    'bbox': bbox,
                })

    return filtered

# Filter low-confidence results
# `result` is the raw ocr.ocr(...) output from an earlier snippet; 0.8 is also
# the function's default threshold.
high_confidence_results = filter_by_confidence(result, threshold=0.8)

Related Skills

Image Preprocessing - Image enhancement for better OCR accuracy
Document Parsing - Structured data extraction from documents
OCR with Tesseract - Alternative OCR engine
PDF Processing - PDF-specific processing techniques
Document Ingestion Pipeline - Document loading workflows

Additional Resources

Weekly Installs

Repository

amnadtaowsoam/cerebraskills

GitHub Stars

First Seen

Jan 1, 1970

Security Audits

Gen Agent Trust Hub: Pass · Socket: Pass · Snyk: Pass

PaddleOCR 光学字符识别教程 - 多语言文本识别、表格提取与文档版面分析

🇨🇳中文介绍

使用 PaddleOCR 进行光学字符识别

概述

先决条件

关键概念

实现指南

安装

相关 Skills

基本文本识别

多语言 OCR

表格识别

文档版面分析

批量处理

GPU 加速

自定义模型训练

结果处理

可视化

最佳实践

性能优化

图像预处理

错误处理

置信度阈值过滤

相关技能

额外资源