OCR with PaddleOCR by amnadtaowsoam/cerebraskills
npx skills add https://github.com/amnadtaowsoam/cerebraskills --skill 'OCR with PaddleOCR'
PaddleOCR 是一个功能强大的开源 OCR 工具包,支持多语言文本识别、表格识别和文档版面分析。本技能涵盖了各种文档处理场景的实现模式。
# CPU 版本
pip install paddlepaddle paddleocr
# GPU 版本 (CUDA 11.2)
pip install paddlepaddle-gpu paddleocr
# GPU 版本 (CUDA 11.8)
pip install paddlepaddle-gpu==2.5.2.post118 -f https://www.paddlepaddle.org.cn/whl/linux/mkl/avx/stable.html
广告位招租
在这里展示您的产品或服务
触达数万 AI 开发者,精准高效
from paddleocr import PaddleOCR
import cv2

# Initialize PaddleOCR with the text-angle classifier enabled.
ocr = PaddleOCR(use_angle_cls=True, lang='en')

# Read the image.
image_path = 'document.png'
image = cv2.imread(image_path)

# Run OCR.
result = ocr.ocr(image, cls=True)

# Print the recognized text. `result` holds one entry per input image; an
# entry is None when no text was detected, so skip those instead of iterating.
for res in result:
    if not res:
        continue
    for line in res:
        print(line[1][0])  # recognized text
from paddleocr import PaddleOCR
# Supported languages: 'ch', 'en', 'korean', 'japan', 'chinese_cht', 'ta', 'te', 'ka', 'ca', 'hi'
# NOTE(review): 'th' (used below) is not in the list above — confirm that the
# installed PaddleOCR version ships a Thai model before relying on it.
# English
ocr_en = PaddleOCR(use_angle_cls=True, lang='en')
# Chinese
ocr_ch = PaddleOCR(use_angle_cls=True, lang='ch')
# Thai
ocr_th = PaddleOCR(use_angle_cls=True, lang='th')
# Korean
ocr_kr = PaddleOCR(use_angle_cls=True, lang='korean')
# Custom models: point detection, recognition and angle classification at
# local model directories
ocr_custom = PaddleOCR(
use_angle_cls=True,
lang='en',
det_model_dir='./custom_det/',
rec_model_dir='./custom_rec/',
cls_model_dir='./custom_cls/'
)
from paddleocr import PaddleOCR
import cv2

# Initialize the OCR engine with the table flag set.
# NOTE(review): the loop below only reads plain (box, (text, confidence))
# lines from ocr.ocr(); table cell/HTML structure presumably requires
# PPStructure — confirm what `table=True` changes for this call.
ocr = PaddleOCR(
    use_angle_cls=True,
    lang='en',
    table=True,  # table recognition flag
    show_log=True,
)

# Read the image that contains the table.
image = cv2.imread('table.png')

# Run OCR on it.
result = ocr.ocr(image, cls=True)

# Print every recognized line with its confidence. An entry of `result` is
# None when no text was detected in that image, so skip it.
for res in result:
    if not res:
        continue
    for _bbox, (text, confidence) in res:
        print(f"文本: {text}, 置信度: {confidence:.2f}")
from paddleocr import PPStructure

# Initialize the structure (layout) analysis engine.
table_engine = PPStructure(show_log=True)

# Analyze the document layout.
image_path = 'document.png'
result = table_engine(image_path)

# Walk the detected regions and print what each one contains.
# NOTE(review): assumes every region dict carries a 'score' key — confirm
# against the installed PaddleOCR version.
for region in result:
    print(f"类型: {region['type']}")
    print(f"置信度: {region['score']:.2f}")
    if region['type'] == 'table':
        # Table regions expose their reconstructed markup under res['html'].
        html = region['res']['html']
        print(f"表格 HTML: {html}")
    elif region['type'] == 'text':
        # Text regions carry a list of recognized lines.
        for text_line in region['res']:
            print(f"文本: {text_line['text']}")
import glob
import json
import os

import cv2  # cv2.imread is used to load each image
from paddleocr import PaddleOCR

# Initialize OCR once and reuse it for every file.
ocr = PaddleOCR(use_angle_cls=True, lang='en')

# Collect the PNG files to process.
image_dir = 'documents/'
image_files = glob.glob(os.path.join(image_dir, '*.png'))

results = []
for image_file in image_files:
    image = cv2.imread(image_file)
    if image is None:
        # cv2.imread signals an unreadable file by returning None; report it
        # and move on so one bad file does not abort the whole batch.
        print(f"加载图像失败: {image_file}")
        continue
    result = ocr.ocr(image, cls=True)
    results.append({
        'file': image_file,
        'result': result,
    })

# Save all results as UTF-8 JSON (non-ASCII text is written unescaped).
with open('ocr_results.json', 'w', encoding='utf-8') as f:
    json.dump(results, f, ensure_ascii=False, indent=2)
from paddleocr import PaddleOCR
# Initialize with GPU support enabled.
# NOTE(review): MKLDNN is a CPU acceleration path; it presumably has no effect
# while use_gpu=True — confirm against the PaddleOCR inference options.
ocr = PaddleOCR(
use_angle_cls=True,
lang='en',
use_gpu=True, # enable GPU
gpu_mem=500, # GPU memory, in MB
enable_mkldnn=True # enable MKLDNN acceleration
)
# Prepare training data
# Data format: image_path, text_content
# NOTE(review): the leading "!" is notebook shell syntax and tools/ and
# configs/ are relative paths — presumably these commands are run from a
# PaddleOCR repository checkout; confirm.
# Train a custom detection model
!python tools/train.py -c configs/det/ch_PP-OCRv4/ch_PP-OCRv4_det.yml -o Global.pretrained_model=./your_model/best_accuracy
# Train a custom recognition model
!python tools/train.py -c configs/rec/ch_PP-OCRv4/ch_PP-OCRv4_rec.yml -o Global.pretrained_model=./your_model/best_accuracy
# Export the detection model for inference
!python tools/export_model.py -c configs/det/ch_PP-OCRv4/ch_PP-OCRv4_det.yml -o Global.pretrained_model=./your_model/best_accuracy Global.save_inference_dir=./inference/det
from paddleocr import PaddleOCR
import cv2
# Load the image and run OCR with angle classification enabled
ocr = PaddleOCR(use_angle_cls=True, lang='en')
image = cv2.imread('document.png')
result = ocr.ocr(image, cls=True)
# 提取结构化结果
def extract_text_results(result):
    """Flatten an OCR result into a list of per-line dictionaries.

    Args:
        result: Iterable of pages. Each page is either falsy (``None`` or
            empty, meaning no text was found) or a sequence of
            ``(points, (text, confidence))`` entries, where ``points`` is a
            sequence of ``(x, y)`` corner coordinates. ``None`` is accepted
            and treated as "no pages".

    Returns:
        A list of dicts with the keys ``text``, ``confidence``, ``bbox``
        (axis-aligned ``x1``/``y1``/``x2``/``y2`` enclosing ``points``) and
        ``points`` (the original corner sequence, unmodified).
    """
    extracted = []
    for page in result or []:
        # Pages without detected text are None; iterating them would raise.
        if not page:
            continue
        for points, (text, confidence) in page:
            xs = [point[0] for point in points]
            ys = [point[1] for point in points]
            extracted.append({
                'text': text,
                'confidence': confidence,
                'bbox': {
                    'x1': min(xs),
                    'y1': min(ys),
                    'x2': max(xs),
                    'y2': max(ys),
                },
                'points': points,
            })
    return extracted
# Get the structured results.
structured_results = extract_text_results(result)

# Sort by the top edge of each box (top to bottom).
sorted_results = sorted(structured_results, key=lambda x: x['bbox']['y1'])

# Print each line with its confidence.
for item in sorted_results:
    print(f"{item['text']} (置信度: {item['confidence']:.2f})")
import cv2
import numpy as np
from paddleocr import PaddleOCR
# Load the image and run OCR; both the image and the result are drawn below
ocr = PaddleOCR(use_angle_cls=True, lang='en')
image = cv2.imread('document.png')
result = ocr.ocr(image, cls=True)
# 绘制边界框
def draw_ocr_results(image, result):
    """Return a copy of ``image`` annotated with OCR boxes and labels.

    Args:
        image: Image array; it is copied, never modified in place.
        result: Iterable of pages. Each page is either falsy (``None`` or
            empty, meaning no text was found) or a sequence of
            ``(points, (text, confidence))`` entries. ``None`` is accepted
            and treated as "no pages".

    Returns:
        The annotated copy. Each text line gets its polygon outlined and a
        ``"text (confidence)"`` label placed 10 px above its first corner.
    """
    image_copy = image.copy()
    for page in result or []:
        # Pages without detected text are None; iterating them would raise.
        if not page:
            continue
        for bbox, (text, confidence) in page:
            # cv2.polylines needs integer coordinates.
            points = np.array(bbox, dtype=np.int32)
            # One color above 0.9 confidence, another otherwise
            # (presumably green/orange for a BGR image as loaded by cv2.imread).
            color = (0, 255, 0) if confidence > 0.9 else (0, 165, 255)
            cv2.polylines(image_copy, [points], True, color, 2)
            # NOTE(review): Hershey fonts presumably cannot render non-ASCII
            # text — confirm if labels contain such characters.
            x, y = bbox[0]
            cv2.putText(
                image_copy,
                f"{text} ({confidence:.2f})",
                (int(x), int(y - 10)),
                cv2.FONT_HERSHEY_SIMPLEX,
                0.5,
                color,
                1,
            )
    return image_copy
# Visualize: draw the OCR overlay and write the annotated image to disk
result_image = draw_ocr_results(image, result)
cv2.imwrite('ocr_result.png', result_image)
# Use an appropriate model size
# PP-OCRv4: best accuracy, slower
# PP-OCRv4-mobile: good accuracy, faster
# PP-OCRv4-server: best accuracy for server deployment
# NOTE(review): TensorRT and FP16 presumably require a GPU build of Paddle
# with TensorRT available — confirm before enabling on CPU-only hosts.
ocr = PaddleOCR(
use_angle_cls=True,
lang='en',
det_algorithm='DB', # detection algorithm
rec_algorithm='CRNN', # recognition algorithm
use_tensorrt=True, # enable TensorRT for faster inference
precision='fp16' # use FP16 for faster inference
)
import cv2
import numpy as np
from paddleocr import PaddleOCR
def preprocess_image(image_path):
    """Load an image and binarize it for OCR.

    The image is converted to grayscale, denoised with non-local means and
    then binarized with Gaussian adaptive thresholding.

    Args:
        image_path: Path of the image file to load.

    Returns:
        The thresholded single-channel image.

    Raises:
        ValueError: If the image cannot be loaded.
    """
    image = cv2.imread(image_path)
    # cv2.imread returns None for a missing/unreadable file; fail here with a
    # clear message instead of a cryptic error inside cvtColor.
    if image is None:
        raise ValueError(f"加载图像失败: {image_path}")
    # Convert to grayscale.
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # Denoise (filter strength 10, template window 7, search window 21).
    denoised = cv2.fastNlMeansDenoising(gray, None, 10, 7, 21)
    # Adaptive threshold (max value 255, block size 11, constant 2).
    binary = cv2.adaptiveThreshold(
        denoised, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY, 11, 2,
    )
    return binary
# Run OCR on the image returned by preprocess_image instead of the raw file
ocr = PaddleOCR(use_angle_cls=True, lang='en')
processed_image = preprocess_image('document.png')
result = ocr.ocr(processed_image, cls=True)
from paddleocr import PaddleOCR
import cv2
def safe_ocr(image_path, ocr):
    """Run OCR on an image file, returning ``None`` instead of raising.

    Args:
        image_path: Path of the image to load with ``cv2.imread``.
        ocr: Object exposing ``ocr(image, cls=True)``.

    Returns:
        Whatever ``ocr.ocr`` returns, or ``None`` when loading the image or
        running OCR raised an exception (the error is printed).
    """
    try:
        image = cv2.imread(image_path)
        # cv2.imread reports an unreadable file by returning None.
        if image is None:
            raise ValueError(f"加载图像失败: {image_path}")
        return ocr.ocr(image, cls=True)
    except Exception as e:
        # Deliberate best-effort boundary: report the problem and signal
        # failure to the caller with None.
        print(f"OCR 错误: {e}")
        return None
# Initialize OCR.
ocr = PaddleOCR(use_angle_cls=True, lang='en')

# Process with error handling; safe_ocr returns None when anything failed.
result = safe_ocr('document.png', ocr)
if result:
    # Process results
    pass
def filter_by_confidence(result, threshold=0.8):
    """Keep only OCR lines whose confidence is at least ``threshold``.

    Args:
        result: Iterable of pages. Each page is either falsy (``None`` or
            empty, meaning no text was found) or a sequence of
            ``(bbox, (text, confidence))`` entries. ``None`` is accepted and
            treated as "no pages".
        threshold: Minimum confidence (inclusive) for a line to be kept.

    Returns:
        A flat list of dicts with the keys ``text``, ``confidence`` and
        ``bbox``, in the order the lines appear in ``result``.
    """
    filtered = []
    for page in result or []:
        # Pages without detected text are None; iterating them would raise.
        if not page:
            continue
        for bbox, (text, confidence) in page:
            if confidence >= threshold:
                filtered.append({
                    'text': text,
                    'confidence': confidence,
                    'bbox': bbox,
                })
    return filtered
# Filter out low-confidence results: keep lines with confidence >= 0.8
high_confidence_results = filter_by_confidence(result, threshold=0.8)
每周安装次数
0
仓库
GitHub 星标数
1
首次出现
1970年1月1日
安全审计
PaddleOCR is a powerful, open-source OCR toolkit that supports multi-language text recognition, table recognition, and document layout analysis. This skill covers implementation patterns for various document processing scenarios.
# CPU version
pip install paddlepaddle paddleocr
# GPU version (CUDA 11.2)
pip install paddlepaddle-gpu paddleocr
# GPU version (CUDA 11.8)
pip install paddlepaddle-gpu==2.5.2.post118 -f https://www.paddlepaddle.org.cn/whl/linux/mkl/avx/stable.html
from paddleocr import PaddleOCR
import cv2

# Initialize PaddleOCR with the text-angle classifier enabled.
ocr = PaddleOCR(use_angle_cls=True, lang='en')

# Read the image.
image_path = 'document.png'
image = cv2.imread(image_path)

# Run OCR.
result = ocr.ocr(image, cls=True)

# Print the recognized text. `result` holds one entry per input image; an
# entry is None when no text was detected, so skip those instead of iterating.
for res in result:
    if not res:
        continue
    for line in res:
        print(line[1][0])  # recognized text
from paddleocr import PaddleOCR
# Supported languages: 'ch', 'en', 'korean', 'japan', 'chinese_cht', 'ta', 'te', 'ka', 'ca', 'hi'
# NOTE(review): 'th' (used below) is not in the list above — confirm that the
# installed PaddleOCR version ships a Thai model before relying on it.
# English
ocr_en = PaddleOCR(use_angle_cls=True, lang='en')
# Chinese
ocr_ch = PaddleOCR(use_angle_cls=True, lang='ch')
# Thai
ocr_th = PaddleOCR(use_angle_cls=True, lang='th')
# Korean
ocr_kr = PaddleOCR(use_angle_cls=True, lang='korean')
# Custom models: point detection, recognition and angle classification at
# local model directories
ocr_custom = PaddleOCR(
use_angle_cls=True,
lang='en',
det_model_dir='./custom_det/',
rec_model_dir='./custom_rec/',
cls_model_dir='./custom_cls/'
)
from paddleocr import PaddleOCR
import cv2

# Initialize the OCR engine with the table flag set.
# NOTE(review): the loop below only reads plain (box, (text, confidence))
# lines from ocr.ocr(); table cell/HTML structure presumably requires
# PPStructure — confirm what `table=True` changes for this call.
ocr = PaddleOCR(
    use_angle_cls=True,
    lang='en',
    table=True,  # table recognition flag
    show_log=True,
)

# Read the image that contains the table.
image = cv2.imread('table.png')

# Run OCR on it.
result = ocr.ocr(image, cls=True)

# Print every recognized line with its confidence. An entry of `result` is
# None when no text was detected in that image, so skip it.
for res in result:
    if not res:
        continue
    for _bbox, (text, confidence) in res:
        print(f"Text: {text}, Confidence: {confidence:.2f}")
from paddleocr import PPStructure

# Initialize the structure (layout) analysis engine.
table_engine = PPStructure(show_log=True)

# Analyze the document layout.
image_path = 'document.png'
result = table_engine(image_path)

# Walk the detected regions and print what each one contains.
# NOTE(review): assumes every region dict carries a 'score' key — confirm
# against the installed PaddleOCR version.
for region in result:
    print(f"Type: {region['type']}")
    print(f"Confidence: {region['score']:.2f}")
    if region['type'] == 'table':
        # Table regions expose their reconstructed markup under res['html'].
        html = region['res']['html']
        print(f"Table HTML: {html}")
    elif region['type'] == 'text':
        # Text regions carry a list of recognized lines.
        for text_line in region['res']:
            print(f"Text: {text_line['text']}")
import glob
import json
import os

import cv2  # cv2.imread is used to load each image
from paddleocr import PaddleOCR

# Initialize OCR once and reuse it for every file.
ocr = PaddleOCR(use_angle_cls=True, lang='en')

# Collect the PNG files to process.
image_dir = 'documents/'
image_files = glob.glob(os.path.join(image_dir, '*.png'))

results = []
for image_file in image_files:
    image = cv2.imread(image_file)
    if image is None:
        # cv2.imread signals an unreadable file by returning None; report it
        # and move on so one bad file does not abort the whole batch.
        print(f"Failed to load image: {image_file}")
        continue
    result = ocr.ocr(image, cls=True)
    results.append({
        'file': image_file,
        'result': result,
    })

# Save all results as UTF-8 JSON (non-ASCII text is written unescaped).
with open('ocr_results.json', 'w', encoding='utf-8') as f:
    json.dump(results, f, ensure_ascii=False, indent=2)
from paddleocr import PaddleOCR
# Initialize with GPU support
# NOTE(review): MKLDNN is a CPU acceleration path; it presumably has no effect
# while use_gpu=True — confirm against the PaddleOCR inference options.
ocr = PaddleOCR(
use_angle_cls=True,
lang='en',
use_gpu=True, # Enable GPU
gpu_mem=500, # GPU memory in MB
enable_mkldnn=True # Enable MKLDNN acceleration
)
# Prepare training data
# Data format: image_path, text_content
# NOTE(review): the leading "!" is notebook shell syntax and tools/ and
# configs/ are relative paths — presumably these commands are run from a
# PaddleOCR repository checkout; confirm.
# Train custom detection model
!python tools/train.py -c configs/det/ch_PP-OCRv4/ch_PP-OCRv4_det.yml -o Global.pretrained_model=./your_model/best_accuracy
# Train custom recognition model
!python tools/train.py -c configs/rec/ch_PP-OCRv4/ch_PP-OCRv4_rec.yml -o Global.pretrained_model=./your_model/best_accuracy
# Export the detection model for inference
!python tools/export_model.py -c configs/det/ch_PP-OCRv4/ch_PP-OCRv4_det.yml -o Global.pretrained_model=./your_model/best_accuracy Global.save_inference_dir=./inference/det
from paddleocr import PaddleOCR
import cv2
# Load the image and run OCR with angle classification enabled
ocr = PaddleOCR(use_angle_cls=True, lang='en')
image = cv2.imread('document.png')
result = ocr.ocr(image, cls=True)
# Extract structured results
def extract_text_results(result):
    """Flatten an OCR result into a list of per-line dictionaries.

    Args:
        result: Iterable of pages. Each page is either falsy (``None`` or
            empty, meaning no text was found) or a sequence of
            ``(points, (text, confidence))`` entries, where ``points`` is a
            sequence of ``(x, y)`` corner coordinates. ``None`` is accepted
            and treated as "no pages".

    Returns:
        A list of dicts with the keys ``text``, ``confidence``, ``bbox``
        (axis-aligned ``x1``/``y1``/``x2``/``y2`` enclosing ``points``) and
        ``points`` (the original corner sequence, unmodified).
    """
    extracted = []
    for page in result or []:
        # Pages without detected text are None; iterating them would raise.
        if not page:
            continue
        for points, (text, confidence) in page:
            xs = [point[0] for point in points]
            ys = [point[1] for point in points]
            extracted.append({
                'text': text,
                'confidence': confidence,
                'bbox': {
                    'x1': min(xs),
                    'y1': min(ys),
                    'x2': max(xs),
                    'y2': max(ys),
                },
                'points': points,
            })
    return extracted
# Get the structured results.
structured_results = extract_text_results(result)

# Sort by the top edge of each box (top to bottom).
sorted_results = sorted(structured_results, key=lambda x: x['bbox']['y1'])

# Print each line with its confidence.
for item in sorted_results:
    print(f"{item['text']} (confidence: {item['confidence']:.2f})")
import cv2
import numpy as np
from paddleocr import PaddleOCR
# Load the image and run OCR; both the image and the result are drawn below
ocr = PaddleOCR(use_angle_cls=True, lang='en')
image = cv2.imread('document.png')
result = ocr.ocr(image, cls=True)
# Draw bounding boxes
def draw_ocr_results(image, result):
    """Return a copy of ``image`` annotated with OCR boxes and labels.

    Args:
        image: Image array; it is copied, never modified in place.
        result: Iterable of pages. Each page is either falsy (``None`` or
            empty, meaning no text was found) or a sequence of
            ``(points, (text, confidence))`` entries. ``None`` is accepted
            and treated as "no pages".

    Returns:
        The annotated copy. Each text line gets its polygon outlined and a
        ``"text (confidence)"`` label placed 10 px above its first corner.
    """
    image_copy = image.copy()
    for page in result or []:
        # Pages without detected text are None; iterating them would raise.
        if not page:
            continue
        for bbox, (text, confidence) in page:
            # cv2.polylines needs integer coordinates.
            points = np.array(bbox, dtype=np.int32)
            # One color above 0.9 confidence, another otherwise
            # (presumably green/orange for a BGR image as loaded by cv2.imread).
            color = (0, 255, 0) if confidence > 0.9 else (0, 165, 255)
            cv2.polylines(image_copy, [points], True, color, 2)
            # NOTE(review): Hershey fonts presumably cannot render non-ASCII
            # text — confirm if labels contain such characters.
            x, y = bbox[0]
            cv2.putText(
                image_copy,
                f"{text} ({confidence:.2f})",
                (int(x), int(y - 10)),
                cv2.FONT_HERSHEY_SIMPLEX,
                0.5,
                color,
                1,
            )
    return image_copy
# Visualize: draw the OCR overlay and write the annotated image to disk
result_image = draw_ocr_results(image, result)
cv2.imwrite('ocr_result.png', result_image)
# Use appropriate model size
# PP-OCRv4: Best accuracy, slower
# PP-OCRv4-mobile: Good accuracy, faster
# PP-OCRv4-server: Best accuracy for server deployment
# NOTE(review): TensorRT and FP16 presumably require a GPU build of Paddle
# with TensorRT available — confirm before enabling on CPU-only hosts.
ocr = PaddleOCR(
use_angle_cls=True,
lang='en',
det_algorithm='DB', # Detection algorithm
rec_algorithm='CRNN', # Recognition algorithm
use_tensorrt=True, # Enable TensorRT for faster inference
precision='fp16' # Use FP16 for faster inference
)
import cv2
import numpy as np
from paddleocr import PaddleOCR
def preprocess_image(image_path):
    """Load an image and binarize it for OCR.

    The image is converted to grayscale, denoised with non-local means and
    then binarized with Gaussian adaptive thresholding.

    Args:
        image_path: Path of the image file to load.

    Returns:
        The thresholded single-channel image.

    Raises:
        ValueError: If the image cannot be loaded.
    """
    image = cv2.imread(image_path)
    # cv2.imread returns None for a missing/unreadable file; fail here with a
    # clear message instead of a cryptic error inside cvtColor.
    if image is None:
        raise ValueError(f"Failed to load image: {image_path}")
    # Convert to grayscale.
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # Denoise (filter strength 10, template window 7, search window 21).
    denoised = cv2.fastNlMeansDenoising(gray, None, 10, 7, 21)
    # Adaptive threshold (max value 255, block size 11, constant 2).
    binary = cv2.adaptiveThreshold(
        denoised, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY, 11, 2,
    )
    return binary
# Run OCR on the image returned by preprocess_image instead of the raw file
ocr = PaddleOCR(use_angle_cls=True, lang='en')
processed_image = preprocess_image('document.png')
result = ocr.ocr(processed_image, cls=True)
from paddleocr import PaddleOCR
import cv2
def safe_ocr(image_path, ocr):
    """Run OCR on an image file, returning ``None`` instead of raising.

    Args:
        image_path: Path of the image to load with ``cv2.imread``.
        ocr: Object exposing ``ocr(image, cls=True)``.

    Returns:
        Whatever ``ocr.ocr`` returns, or ``None`` when loading the image or
        running OCR raised an exception (the error is printed).
    """
    try:
        image = cv2.imread(image_path)
        # cv2.imread reports an unreadable file by returning None.
        if image is None:
            raise ValueError(f"Failed to load image: {image_path}")
        return ocr.ocr(image, cls=True)
    except Exception as e:
        # Deliberate best-effort boundary: report the problem and signal
        # failure to the caller with None.
        print(f"OCR error: {e}")
        return None
# Initialize OCR.
ocr = PaddleOCR(use_angle_cls=True, lang='en')

# Process with error handling; safe_ocr returns None when anything failed.
result = safe_ocr('document.png', ocr)
if result:
    # Process results
    pass
def filter_by_confidence(result, threshold=0.8):
    """Keep only OCR lines whose confidence is at least ``threshold``.

    Args:
        result: Iterable of pages. Each page is either falsy (``None`` or
            empty, meaning no text was found) or a sequence of
            ``(bbox, (text, confidence))`` entries. ``None`` is accepted and
            treated as "no pages".
        threshold: Minimum confidence (inclusive) for a line to be kept.

    Returns:
        A flat list of dicts with the keys ``text``, ``confidence`` and
        ``bbox``, in the order the lines appear in ``result``.
    """
    filtered = []
    for page in result or []:
        # Pages without detected text are None; iterating them would raise.
        if not page:
            continue
        for bbox, (text, confidence) in page:
            if confidence >= threshold:
                filtered.append({
                    'text': text,
                    'confidence': confidence,
                    'bbox': bbox,
                })
    return filtered
# Filter out low-confidence results: keep lines with confidence >= 0.8
high_confidence_results = filter_by_confidence(result, threshold=0.8)
Weekly Installs
0
Repository
GitHub Stars
1
First Seen
Jan 1, 1970
Security Audits
AI Elements:基于shadcn/ui的AI原生应用组件库,快速构建对话界面
60,400 周安装