table-extractor by dkyazzentwatwa/chatgpt-skills
npx skills add https://github.com/dkyazzentwatwa/chatgpt-skills --skill table-extractor
从 PDF 和图像中提取表格,转换为结构化数据格式。
from table_extractor import TableExtractor
extractor = TableExtractor()
# 从 PDF 提取
extractor.load_pdf("document.pdf")
tables = extractor.extract_all()
# 将第一个表格保存为 CSV
tables[0].to_csv("table.csv")
# 从图像提取
extractor.load_image("scanned_table.png")
table = extractor.extract_table()
print(table)
# 从 PDF 提取
python table_extractor.py --input document.pdf --output tables/
# 提取指定页面
python table_extractor.py --input document.pdf --pages 1-3 --output tables/
# 从图像提取
python table_extractor.py --input scan.png --output table.csv
# 导出到 Excel
python table_extractor.py --input document.pdf --format xlsx --output tables.xlsx
# 对扫描的 PDF 使用 OCR
python table_extractor.py --input scanned.pdf --ocr --output tables/
广告位招租
在这里展示您的产品或服务
触达数万 AI 开发者,精准高效
class TableExtractor:
    def __init__(self)
    # 加载
    def load_pdf(self, filepath: str, pages: List[int] = None) -> 'TableExtractor'
    def load_image(self, filepath: str) -> 'TableExtractor'
    # 提取
    def extract_table(self, page: int = 0) -> pd.DataFrame
    def extract_all(self) -> List[pd.DataFrame]
    def extract_page(self, page: int) -> List[pd.DataFrame]
    # 检测
    def detect_tables(self, page: int = 0) -> List[Dict]
    def get_table_count(self) -> int
    # 配置
    def set_ocr(self, enabled: bool = True, lang: str = "eng") -> 'TableExtractor'
    def set_column_detection(self, mode: str = "auto") -> 'TableExtractor'
    # 导出
    def to_csv(self, tables: List, output_dir: str) -> List[str]
    def to_excel(self, tables: List, output: str) -> str
    def to_json(self, tables: List, output: str) -> str
# 检测表格但不提取内容
tables_info = extractor.detect_tables(page=0)
# 返回:
# [
# {"index": 0, "rows": 10, "cols": 5, "bbox": (x1, y1, x2, y2)},
# {"index": 1, "rows": 8, "cols": 3, "bbox": (x1, y1, x2, y2)}
# ]
extractor = TableExtractor()
extractor.load_pdf("quarterly_report.pdf")
# 提取所有表格
tables = extractor.extract_all()
# 将每个表格导出为 CSV
for i, table in enumerate(tables):
    table.to_csv(f"table_{i}.csv", index=False)
extractor = TableExtractor()
extractor.set_ocr(enabled=True, lang="eng")
extractor.load_image("scanned_form.png")
table = extractor.extract_table()
print(table)
每周安装量
93
代码仓库
GitHub 星标数
36
首次出现
2026 年 1 月 24 日
安全审计
安装于
opencode 78
gemini-cli 76
codex 75
cursor 73
github-copilot 69
amp 64
Extract tables from PDFs and images into structured data formats.
from table_extractor import TableExtractor
extractor = TableExtractor()
# Extract from PDF
extractor.load_pdf("document.pdf")
tables = extractor.extract_all()
# Save first table to CSV
tables[0].to_csv("table.csv")
# Extract from image
extractor.load_image("scanned_table.png")
table = extractor.extract_table()
print(table)
# Extract from PDF
python table_extractor.py --input document.pdf --output tables/
# Extract specific pages
python table_extractor.py --input document.pdf --pages 1-3 --output tables/
# Extract from image
python table_extractor.py --input scan.png --output table.csv
# Export to Excel
python table_extractor.py --input document.pdf --format xlsx --output tables.xlsx
# With OCR for scanned PDFs
python table_extractor.py --input scanned.pdf --ocr --output tables/
class TableExtractor:
    def __init__(self)
    # Loading
    def load_pdf(self, filepath: str, pages: List[int] = None) -> 'TableExtractor'
    def load_image(self, filepath: str) -> 'TableExtractor'
    # Extraction
    def extract_table(self, page: int = 0) -> pd.DataFrame
    def extract_all(self) -> List[pd.DataFrame]
    def extract_page(self, page: int) -> List[pd.DataFrame]
    # Detection
    def detect_tables(self, page: int = 0) -> List[Dict]
    def get_table_count(self) -> int
    # Configuration
    def set_ocr(self, enabled: bool = True, lang: str = "eng") -> 'TableExtractor'
    def set_column_detection(self, mode: str = "auto") -> 'TableExtractor'
    # Export
    def to_csv(self, tables: List, output_dir: str) -> List[str]
    def to_excel(self, tables: List, output: str) -> str
    def to_json(self, tables: List, output: str) -> str
# Detect tables without extracting
tables_info = extractor.detect_tables(page=0)
# Returns:
# [
# {"index": 0, "rows": 10, "cols": 5, "bbox": (x1, y1, x2, y2)},
# {"index": 1, "rows": 8, "cols": 3, "bbox": (x1, y1, x2, y2)}
# ]
extractor = TableExtractor()
extractor.load_pdf("quarterly_report.pdf")
# Extract all tables
tables = extractor.extract_all()
# Export each to CSV
for i, table in enumerate(tables):
    table.to_csv(f"table_{i}.csv", index=False)
extractor = TableExtractor()
extractor.set_ocr(enabled=True, lang="eng")
extractor.load_image("scanned_form.png")
table = extractor.extract_table()
print(table)
Weekly Installs
93
Repository
GitHub Stars
36
First Seen
Jan 24, 2026
Security Audits
Gen Agent Trust Hub: Pass · Socket: Pass · Snyk: Pass
Installed on
opencode 78
gemini-cli 76
codex 75
cursor 73
github-copilot 69
amp 64
Skills CLI 使用指南:AI Agent 技能包管理器安装与管理教程
43,100 周安装