office-to-md by claude-office-skills/skills
npx skills add https://github.com/claude-office-skills/skills --skill office-to-md
此技能利用 markitdown —— 微软开源的文档转 Markdown 工具,实现多种 Office 格式到 Markdown 的转换。非常适合让 Office 内容变得可搜索、可版本控制,并适配 AI 处理。
示例提示:
from markitdown import MarkItDown
# Initialize the converter
md = MarkItDown()
# Convert the file
result = md.convert("document.docx")
print(result.text_content)
# Save to a file. The encoding is explicit so that non-ASCII text in the
# converted Markdown does not depend on the platform's default encoding.
with open("output.md", "w", encoding="utf-8") as f:
    f.write(result.text_content)
| 格式 | 扩展名 | 备注 |
|---|---|---|
| Word | .docx | 完整文本、表格、基本格式 |
广告位招租
在这里展示您的产品或服务
触达数万 AI 开发者,精准高效
| Excel | .xlsx | 转换为 Markdown 表格 |
| PowerPoint | .pptx | 幻灯片作为章节 |
| PDF | .pdf | 文本提取 |
| HTML | .html | 干净的 Markdown |
| 图像 | .jpg, .png | 使用视觉模型进行 OCR |
| 音频 | .mp3, .wav | 转录 |
| ZIP | .zip | 处理包含的文件 |
from markitdown import MarkItDown
# 简单转换
md = MarkItDown()
result = md.convert("document.docx")
# 访问内容
markdown_text = result.text_content
# 使用选项
md = MarkItDown(
llm_client=None, # 用于增强处理的可选 LLM
llm_model=None # 使用 LLM 时的模型名称
)
# 安装
pip install markitdown
# 转换文件
markitdown document.docx > output.md
# 或指定输出文件
markitdown document.docx -o output.md
from markitdown import MarkItDown
md = MarkItDown()
# 转换 Word 文档
result = md.convert("report.docx")
# 输出保留:
# - 标题(作为 # 标题)
# - 粗体/斜体格式
# - 列表(项目符号和编号)
# - 表格(作为 Markdown 表格)
# - 超链接
print(result.text_content)
示例输出:
# 2024 年度报告
## 执行摘要
本报告总结了主要成就和挑战...
### 关键指标
| 指标 | 2023 | 2024 | 变化 |
|--------|------|------|--------|
| 收入 | $10M | $12M | +20% |
| 用户数 | 50K | 75K | +50% |
## 详细分析
以下部分提供...
from markitdown import MarkItDown
md = MarkItDown()
result = md.convert("data.xlsx")
# 每个工作表成为一个章节
# 数据转换为 Markdown 表格
print(result.text_content)
示例输出:
## Sheet1
| 姓名 | 部门 | 薪资 |
|------|------------|--------|
| John | 工程部 | $80,000 |
| Jane | 市场部 | $75,000 |
## Sheet2
| 产品 | Q1 | Q2 | Q3 | Q4 |
|---------|----|----|----|----|
| 部件 A | 100 | 120 | 150 | 180 |
from markitdown import MarkItDown
md = MarkItDown()
result = md.convert("presentation.pptx")
# 每张幻灯片成为一个章节
# 如果存在,包含演讲者备注
print(result.text_content)
示例输出:
# 幻灯片 1:公司概述
我们的使命是...
## 关键点
- 创新第一
- 以客户为中心
- 全球覆盖
---
# 幻灯片 2:市场分析
市场机会巨大...
**备注:** 在此处提及竞争对手分析
from markitdown import MarkItDown
md = MarkItDown()
result = md.convert("document.pdf")
# 提取文本内容
# 检测到的表格会被转换
print(result.text_content)
from markitdown import MarkItDown
import anthropic
# Use Claude to describe the image
# NOTE(review): markitdown's README demonstrates `llm_client` with an OpenAI
# client; confirm that an Anthropic client exposes the interface markitdown
# calls before relying on this example.
client = anthropic.Anthropic()
md = MarkItDown(
llm_client=client,
llm_model="claude-sonnet-4-20250514"
)
result = md.convert("diagram.png")
print(result.text_content)
# Output: a description of the image content
from markitdown import MarkItDown
from pathlib import Path
def batch_convert(input_dir, output_dir):
    """Convert the Office/PDF files directly inside a directory to Markdown.

    Every ``.docx``, ``.xlsx``, ``.pptx`` and ``.pdf`` file found at the top
    level of ``input_dir`` (the search is not recursive) is converted and
    written to ``output_dir`` as ``<stem>.md``. A failure on one file is
    printed and does not stop the rest of the batch.

    Args:
        input_dir: Directory to scan for source documents.
        output_dir: Directory that receives the Markdown files; it is created,
            together with any missing parents, when it does not exist.
    """
    md = MarkItDown()
    input_path = Path(input_dir)
    output_path = Path(output_dir)
    # parents=True lets a nested target such as "out/markdown" be created
    # in one go instead of failing on the missing intermediate directory.
    output_path.mkdir(parents=True, exist_ok=True)
    extensions = ['.docx', '.xlsx', '.pptx', '.pdf']
    for ext in extensions:
        for file in input_path.glob(f'*{ext}'):
            try:
                result = md.convert(str(file))
                # Only the stem is kept, so "a.docx" and "a.pdf" both map to
                # "a.md" and the one converted later overwrites the earlier.
                output_file = output_path / f"{file.stem}.md"
                # Explicit UTF-8: the converted text may contain characters
                # the platform's default encoding cannot represent.
                with open(output_file, 'w', encoding='utf-8') as f:
                    f.write(result.text_content)
                print(f"已转换: {file.name}")
            except Exception as e:
                # Best-effort batch: report the failure and keep going.
                print(f"转换 {file.name} 时出错: {e}")
batch_convert('./documents', './markdown')
import os
from datetime import datetime
from markitdown import MarkItDown
def archive_document(doc_path, archive_dir):
    """Convert a document to Markdown and archive it with a metadata header.

    The result is written to ``<archive_dir>/<base name>.md``. The file starts
    with a ``---`` delimited header that records the source file name and the
    conversion date, followed by the converted Markdown.

    Args:
        doc_path: Path of the document to convert.
        archive_dir: Directory that receives the archived Markdown file; it is
            created when it does not exist.

    Returns:
        The path of the Markdown file that was written.
    """
    md = MarkItDown()
    result = md.convert(doc_path)
    date_str = datetime.now().strftime('%Y-%m-%d')
    filename = os.path.basename(doc_path)
    base_name = os.path.splitext(filename)[0]
    # Metadata header followed by the converted content.
    output_content = f"""---
source: {filename}
converted: {date_str}
---
{result.text_content}
"""
    # Make sure the archive directory exists before writing. An empty
    # archive_dir means "current directory", which needs no creation.
    if archive_dir:
        os.makedirs(archive_dir, exist_ok=True)
    output_path = os.path.join(archive_dir, f"{base_name}.md")
    # Explicit UTF-8 so non-ASCII document text is written reliably
    # regardless of the platform's default encoding.
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(output_content)
    return output_path
from markitdown import MarkItDown
from pathlib import Path
import json
def create_ai_corpus(doc_folder, output_file):
    """Convert the documents under a folder into a JSON corpus.

    The folder is searched recursively for ``.docx``, ``.pdf``, ``.pptx`` and
    ``.xlsx`` files (the extension is matched case-insensitively). Each file
    that converts successfully contributes one record with the keys
    ``source``, ``filename``, ``content`` and ``type``; files that fail are
    reported and skipped.

    Args:
        doc_folder: Root folder to search.
        output_file: Path of the JSON file to write.

    Returns:
        The list of records that was written to ``output_file``.
    """
    md = MarkItDown()
    supported = {'.docx', '.pdf', '.pptx', '.xlsx'}
    corpus = []
    for doc in Path(doc_folder).glob('**/*'):
        # Lower-case the suffix so files such as "REPORT.DOCX" are not
        # silently left out of the corpus.
        suffix = doc.suffix.lower()
        if suffix not in supported:
            continue
        try:
            result = md.convert(str(doc))
            corpus.append({
                'source': str(doc),
                'filename': doc.name,
                'content': result.text_content,
                'type': suffix[1:],
            })
        except Exception as e:
            # Best-effort: one unreadable document must not abort the run.
            print(f"跳过 {doc.name}: {e}")
    # json.dump escapes non-ASCII characters by default, so the file content
    # is plain ASCII; the encoding is stated only to be explicit.
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(corpus, f, indent=2)
    print(f"创建了包含 {len(corpus)} 个文档的语料库")
    return corpus
from markitdown import MarkItDown
from pathlib import Path
def convert_docs_to_wiki(docs_folder, wiki_folder):
    """Mirror the ``.docx`` files under a folder as a Markdown wiki.

    Only ``.docx`` files are picked up (recursively). Each one is written to
    the same relative location under ``wiki_folder`` with a ``.md`` suffix,
    and an ``index.md`` linking to every converted page is written to the
    wiki root. Files that fail to convert are reported and left out of the
    index.

    Args:
        docs_folder: Root folder containing the source documents.
        wiki_folder: Root folder of the generated wiki; created, together
            with any missing parents, when it does not exist.
    """
    md = MarkItDown()
    docs_path = Path(docs_folder)
    wiki_path = Path(wiki_folder)
    # parents=True so a nested wiki root can be created on a fresh tree.
    wiki_path.mkdir(parents=True, exist_ok=True)
    # Collect index lines and join once instead of growing a string.
    index_lines = ["# 文档索引\n\n"]
    for doc in sorted(docs_path.glob('**/*.docx')):
        try:
            result = md.convert(str(doc))
            # Same relative location inside the wiki, with a .md suffix.
            rel_md = doc.relative_to(docs_path).with_suffix('.md')
            output_file = wiki_path / rel_md
            output_file.parent.mkdir(parents=True, exist_ok=True)
            # Explicit UTF-8 so non-ASCII page text is written reliably.
            with open(output_file, 'w', encoding='utf-8') as f:
                f.write(result.text_content)
            # Links always use forward slashes, also on Windows.
            index_lines.append(f"- [{doc.stem}]({rel_md.as_posix()})\n")
            print(f"已转换: {doc.name}")
        except Exception as e:
            # Best-effort: report the failure and continue with the rest.
            print(f"错误: {doc.name} - {e}")
    # The index is written last; a source file named "index.docx" at the
    # root would map to the same path and be overwritten here.
    with open(wiki_path / 'index.md', 'w', encoding='utf-8') as f:
        f.write("".join(index_lines))
convert_docs_to_wiki('./company_docs', './wiki')
from markitdown import MarkItDown
import re
from datetime import datetime
def process_meeting_notes(pptx_path):
    """Build a structured meeting-notes document from a PowerPoint file.

    The converted Markdown is scanned line by line. A line containing one of
    the English keywords below (case-insensitive) selects the current
    section; a bullet line (``-``, ``*`` or ``•``) that contains no keyword
    is stored under the current section. A bullet that itself contains a
    keyword only switches the section and is not stored.

    Args:
        pptx_path: Path of the presentation to convert.

    Returns:
        The structured notes as a Markdown string.
    """
    markdown = MarkItDown().convert(pptx_path).text_content
    # Checked in this order; the first rule with a matching keyword wins.
    heading_rules = (
        (('attendee', 'participant'), 'attendees'),
        (('agenda',), 'agenda'),
        (('decision',), 'decisions'),
        (('action',), 'action_items'),
    )
    sections = {name: [] for _, name in heading_rules}
    current_section = None
    for raw in markdown.split('\n'):
        lowered = raw.lower()
        matched = next(
            (name for words, name in heading_rules
             if any(word in lowered for word in words)),
            None,
        )
        if matched is not None:
            current_section = matched
            continue
        stripped = raw.strip()
        if current_section and stripped.startswith(('-', '*', '•')):
            # Drop the bullet marker and the whitespace around the text.
            sections[current_section].append(stripped[1:].strip())

    def as_list(items, prefix='- '):
        # Render items as newline-separated Markdown list entries.
        return '\n'.join(prefix + item for item in items)

    output = f"""# 会议记录
**日期:** {datetime.now().strftime('%Y-%m-%d')}
**来源:** {pptx_path}
## 与会者
{as_list(sections['attendees'])}
## 议程
{as_list(sections['agenda'])}
## 做出的决定
{as_list(sections['decisions'])}
## 待办事项
{as_list(sections['action_items'], '- [ ] ')}
"""
    return output
notes = process_meeting_notes('team_meeting.pptx')
print(notes)
from markitdown import MarkItDown
def excel_to_data_dictionary(xlsx_path):
    """Wrap the Markdown conversion of an Excel file as a data dictionary.

    The converted workbook content is placed between a title naming the
    source file and fixed "usage notes" and "change log" sections; the change
    log gets a single entry dated today.

    Args:
        xlsx_path: Path of the Excel file to convert.

    Returns:
        The data-dictionary document as a Markdown string.
    """
    # Imported here so the function works on its own: the example it belongs
    # to imports only MarkItDown, which left `datetime` undefined.
    from datetime import datetime

    md = MarkItDown()
    result = md.convert(xlsx_path)
    # Surround the converted tables with the documentation scaffolding.
    doc = f"""# 数据字典
生成自: `{xlsx_path}`
{result.text_content}
## 使用说明
- 所有表格均源自源 Excel 文件
- 使用前请检查数据类型和约束
- 如有疑问请联系数据团队
## 变更日志
| 日期 | 变更 | 作者 |
|------|--------|--------|
| {datetime.now().strftime('%Y-%m-%d')} | 初始生成 | 自动 |
"""
    return doc
documentation = excel_to_data_dictionary('data_model.xlsx')
# Explicit UTF-8: the generated document contains non-ASCII text, which the
# platform's default encoding may not be able to represent.
with open('data_dictionary.md', 'w', encoding='utf-8') as f:
    f.write(documentation)
pip install markitdown
# 用于图像/音频处理
pip install markitdown[all]
# 用于特定功能
pip install markitdown[images] # 图像 OCR
pip install markitdown[audio] # 音频转录
每周安装量
25
仓库
GitHub 星标数
5
首次出现
1 天前
安全审计
安装于
claude-code19
cursor8
gemini-cli7
kimi-cli7
amp7
cline7
This skill enables conversion from various Office formats to Markdown using markitdown - Microsoft's open-source tool for converting documents to Markdown. Perfect for making Office content searchable, version-controllable, and AI-friendly.
Example prompts:
from markitdown import MarkItDown
# Initialize the converter
md = MarkItDown()
# Convert the file
result = md.convert("document.docx")
print(result.text_content)
# Save to a file. The encoding is explicit so that non-ASCII text in the
# converted Markdown does not depend on the platform's default encoding.
with open("output.md", "w", encoding="utf-8") as f:
    f.write(result.text_content)
| Format | Extension | Notes |
|---|---|---|
| Word | .docx | Full text, tables, basic formatting |
| Excel | .xlsx | Converts to Markdown tables |
| PowerPoint | .pptx | Slides as sections |
| PDF | .pdf | Text extraction |
| HTML | .html | Clean markdown |
| Images | .jpg, .png | OCR with vision model |
| Audio | .mp3, .wav | Transcription |
| ZIP | .zip | Processes contained files |
from markitdown import MarkItDown
# Simple conversion
md = MarkItDown()
result = md.convert("document.docx")
# Access content
markdown_text = result.text_content
# With options
md = MarkItDown(
llm_client=None, # Optional LLM for enhanced processing
llm_model=None # Model name if using LLM
)
# Install
pip install markitdown
# Convert file
markitdown document.docx > output.md
# Or with output file
markitdown document.docx -o output.md
from markitdown import MarkItDown
md = MarkItDown()
# Convert Word document
result = md.convert("report.docx")
# Output preserves:
# - Headings (as # headers)
# - Bold/italic formatting
# - Lists (bulleted and numbered)
# - Tables (as markdown tables)
# - Hyperlinks
print(result.text_content)
Example Output:
# Annual Report 2024
## Executive Summary
This report summarizes the key achievements and challenges...
### Key Metrics
| Metric | 2023 | 2024 | Change |
|--------|------|------|--------|
| Revenue | $10M | $12M | +20% |
| Users | 50K | 75K | +50% |
## Detailed Analysis
The following sections provide...
from markitdown import MarkItDown
md = MarkItDown()
result = md.convert("data.xlsx")
# Each sheet becomes a section
# Data becomes markdown tables
print(result.text_content)
Example Output:
## Sheet1
| Name | Department | Salary |
|------|------------|--------|
| John | Engineering | $80,000 |
| Jane | Marketing | $75,000 |
## Sheet2
| Product | Q1 | Q2 | Q3 | Q4 |
|---------|----|----|----|----|
| Widget A | 100 | 120 | 150 | 180 |
from markitdown import MarkItDown
md = MarkItDown()
result = md.convert("presentation.pptx")
# Each slide becomes a section
# Speaker notes included if present
print(result.text_content)
Example Output:
# Slide 1: Company Overview
Our mission is to...
## Key Points
- Innovation first
- Customer focused
- Global reach
---
# Slide 2: Market Analysis
The market opportunity is significant...
**Notes:** Mention the competitor analysis here
from markitdown import MarkItDown
md = MarkItDown()
result = md.convert("document.pdf")
# Extracts text content
# Tables converted where detected
print(result.text_content)
from markitdown import MarkItDown
import anthropic
# Use Claude for image description
# NOTE(review): markitdown's README demonstrates `llm_client` with an OpenAI
# client; confirm that an Anthropic client exposes the interface markitdown
# calls before relying on this example.
client = anthropic.Anthropic()
md = MarkItDown(
llm_client=client,
llm_model="claude-sonnet-4-20250514"
)
result = md.convert("diagram.png")
print(result.text_content)
# Output: Description of the image content
from markitdown import MarkItDown
from pathlib import Path
def batch_convert(input_dir, output_dir):
    """Convert the Office/PDF files directly inside a directory to Markdown.

    Every ``.docx``, ``.xlsx``, ``.pptx`` and ``.pdf`` file found at the top
    level of ``input_dir`` (the search is not recursive) is converted and
    written to ``output_dir`` as ``<stem>.md``. A failure on one file is
    printed and does not stop the rest of the batch.

    Args:
        input_dir: Directory to scan for source documents.
        output_dir: Directory that receives the Markdown files; it is created,
            together with any missing parents, when it does not exist.
    """
    md = MarkItDown()
    input_path = Path(input_dir)
    output_path = Path(output_dir)
    # parents=True lets a nested target such as "out/markdown" be created
    # in one go instead of failing on the missing intermediate directory.
    output_path.mkdir(parents=True, exist_ok=True)
    extensions = ['.docx', '.xlsx', '.pptx', '.pdf']
    for ext in extensions:
        for file in input_path.glob(f'*{ext}'):
            try:
                result = md.convert(str(file))
                # Only the stem is kept, so "a.docx" and "a.pdf" both map to
                # "a.md" and the one converted later overwrites the earlier.
                output_file = output_path / f"{file.stem}.md"
                # Explicit UTF-8: the converted text may contain characters
                # the platform's default encoding cannot represent.
                with open(output_file, 'w', encoding='utf-8') as f:
                    f.write(result.text_content)
                print(f"Converted: {file.name}")
            except Exception as e:
                # Best-effort batch: report the failure and keep going.
                print(f"Error converting {file.name}: {e}")
batch_convert('./documents', './markdown')
import os
from datetime import datetime
from markitdown import MarkItDown
def archive_document(doc_path, archive_dir):
    """Convert a document to Markdown and archive it with a metadata header.

    The result is written to ``<archive_dir>/<base name>.md``. The file starts
    with a ``---`` delimited header that records the source file name and the
    conversion date, followed by the converted Markdown.

    Args:
        doc_path: Path of the document to convert.
        archive_dir: Directory that receives the archived Markdown file; it is
            created when it does not exist.

    Returns:
        The path of the Markdown file that was written.
    """
    md = MarkItDown()
    result = md.convert(doc_path)
    date_str = datetime.now().strftime('%Y-%m-%d')
    filename = os.path.basename(doc_path)
    base_name = os.path.splitext(filename)[0]
    # Metadata header followed by the converted content.
    output_content = f"""---
source: {filename}
converted: {date_str}
---
{result.text_content}
"""
    # Make sure the archive directory exists before writing. An empty
    # archive_dir means "current directory", which needs no creation.
    if archive_dir:
        os.makedirs(archive_dir, exist_ok=True)
    output_path = os.path.join(archive_dir, f"{base_name}.md")
    # Explicit UTF-8 so non-ASCII document text is written reliably
    # regardless of the platform's default encoding.
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(output_content)
    return output_path
from markitdown import MarkItDown
from pathlib import Path
import json
def create_ai_corpus(doc_folder, output_file):
    """Convert the documents under a folder into a JSON corpus.

    The folder is searched recursively for ``.docx``, ``.pdf``, ``.pptx`` and
    ``.xlsx`` files (the extension is matched case-insensitively). Each file
    that converts successfully contributes one record with the keys
    ``source``, ``filename``, ``content`` and ``type``; files that fail are
    reported and skipped.

    Args:
        doc_folder: Root folder to search.
        output_file: Path of the JSON file to write.

    Returns:
        The list of records that was written to ``output_file``.
    """
    md = MarkItDown()
    supported = {'.docx', '.pdf', '.pptx', '.xlsx'}
    corpus = []
    for doc in Path(doc_folder).glob('**/*'):
        # Lower-case the suffix so files such as "REPORT.DOCX" are not
        # silently left out of the corpus.
        suffix = doc.suffix.lower()
        if suffix not in supported:
            continue
        try:
            result = md.convert(str(doc))
            corpus.append({
                'source': str(doc),
                'filename': doc.name,
                'content': result.text_content,
                'type': suffix[1:],
            })
        except Exception as e:
            # Best-effort: one unreadable document must not abort the run.
            print(f"Skipped {doc.name}: {e}")
    # json.dump escapes non-ASCII characters by default, so the file content
    # is plain ASCII; the encoding is stated only to be explicit.
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(corpus, f, indent=2)
    print(f"Created corpus with {len(corpus)} documents")
    return corpus
from markitdown import MarkItDown
from pathlib import Path
def convert_docs_to_wiki(docs_folder, wiki_folder):
    """Mirror the ``.docx`` files under a folder as a Markdown wiki.

    Only ``.docx`` files are picked up (recursively). Each one is written to
    the same relative location under ``wiki_folder`` with a ``.md`` suffix,
    and an ``index.md`` linking to every converted page is written to the
    wiki root. Files that fail to convert are reported and left out of the
    index.

    Args:
        docs_folder: Root folder containing the source documents.
        wiki_folder: Root folder of the generated wiki; created, together
            with any missing parents, when it does not exist.
    """
    md = MarkItDown()
    docs_path = Path(docs_folder)
    wiki_path = Path(wiki_folder)
    # parents=True so a nested wiki root can be created on a fresh tree.
    wiki_path.mkdir(parents=True, exist_ok=True)
    # Collect index lines and join once instead of growing a string.
    index_lines = ["# Documentation Index\n\n"]
    for doc in sorted(docs_path.glob('**/*.docx')):
        try:
            result = md.convert(str(doc))
            # Same relative location inside the wiki, with a .md suffix.
            rel_md = doc.relative_to(docs_path).with_suffix('.md')
            output_file = wiki_path / rel_md
            output_file.parent.mkdir(parents=True, exist_ok=True)
            # Explicit UTF-8 so non-ASCII page text is written reliably.
            with open(output_file, 'w', encoding='utf-8') as f:
                f.write(result.text_content)
            # Links always use forward slashes, also on Windows.
            index_lines.append(f"- [{doc.stem}]({rel_md.as_posix()})\n")
            print(f"Converted: {doc.name}")
        except Exception as e:
            # Best-effort: report the failure and continue with the rest.
            print(f"Error: {doc.name} - {e}")
    # The index is written last; a source file named "index.docx" at the
    # root would map to the same path and be overwritten here.
    with open(wiki_path / 'index.md', 'w', encoding='utf-8') as f:
        f.write("".join(index_lines))
convert_docs_to_wiki('./company_docs', './wiki')
from markitdown import MarkItDown
import re
from datetime import datetime
def process_meeting_notes(pptx_path):
    """Build a structured meeting-notes document from a PowerPoint file.

    The converted Markdown is scanned line by line. A line containing one of
    the keywords below (case-insensitive) selects the current section; a
    bullet line (``-``, ``*`` or ``•``) that contains no keyword is stored
    under the current section. A bullet that itself contains a keyword only
    switches the section and is not stored.

    Args:
        pptx_path: Path of the presentation to convert.

    Returns:
        The structured notes as a Markdown string.
    """
    markdown = MarkItDown().convert(pptx_path).text_content
    # Checked in this order; the first rule with a matching keyword wins.
    heading_rules = (
        (('attendee', 'participant'), 'attendees'),
        (('agenda',), 'agenda'),
        (('decision',), 'decisions'),
        (('action',), 'action_items'),
    )
    sections = {name: [] for _, name in heading_rules}
    current_section = None
    for raw in markdown.split('\n'):
        lowered = raw.lower()
        matched = next(
            (name for words, name in heading_rules
             if any(word in lowered for word in words)),
            None,
        )
        if matched is not None:
            current_section = matched
            continue
        stripped = raw.strip()
        if current_section and stripped.startswith(('-', '*', '•')):
            # Drop the bullet marker and the whitespace around the text.
            sections[current_section].append(stripped[1:].strip())

    def as_list(items, prefix='- '):
        # Render items as newline-separated Markdown list entries.
        return '\n'.join(prefix + item for item in items)

    output = f"""# Meeting Notes
**Date:** {datetime.now().strftime('%Y-%m-%d')}
**Source:** {pptx_path}
## Attendees
{as_list(sections['attendees'])}
## Agenda
{as_list(sections['agenda'])}
## Decisions Made
{as_list(sections['decisions'])}
## Action Items
{as_list(sections['action_items'], '- [ ] ')}
"""
    return output
notes = process_meeting_notes('team_meeting.pptx')
print(notes)
from markitdown import MarkItDown
def excel_to_data_dictionary(xlsx_path):
    """Wrap the Markdown conversion of an Excel file as a data dictionary.

    The converted workbook content is placed between a title naming the
    source file and fixed "usage notes" and "change log" sections; the change
    log gets a single entry dated today.

    Args:
        xlsx_path: Path of the Excel file to convert.

    Returns:
        The data-dictionary document as a Markdown string.
    """
    # Imported here so the function works on its own: the example it belongs
    # to imports only MarkItDown, which left `datetime` undefined.
    from datetime import datetime

    md = MarkItDown()
    result = md.convert(xlsx_path)
    # Surround the converted tables with the documentation scaffolding.
    doc = f"""# Data Dictionary
Generated from: `{xlsx_path}`
{result.text_content}
## Usage Notes
- All tables are derived from the source Excel file
- Review data types and constraints before use
- Contact data team for clarifications
## Change Log
| Date | Change | Author |
|------|--------|--------|
| {datetime.now().strftime('%Y-%m-%d')} | Initial generation | Auto |
"""
    return doc
documentation = excel_to_data_dictionary('data_model.xlsx')
# Explicit UTF-8: the converted workbook text may contain characters the
# platform's default encoding cannot represent.
with open('data_dictionary.md', 'w', encoding='utf-8') as f:
    f.write(documentation)
pip install markitdown
# For image/audio processing
pip install markitdown[all]
# For specific features
pip install markitdown[images] # Image OCR
pip install markitdown[audio] # Audio transcription
Weekly Installs
25
Repository
GitHub Stars
5
First Seen
1 day ago
Security Audits
Gen Agent Trust Hub: Pass · Socket: Pass · Snyk: Pass
Installed on
claude-code19
cursor8
gemini-cli7
kimi-cli7
amp7
cline7
Google Slides 演示文稿创建与共享自动化教程 - 使用 Google Workspace CLI
6,500 周安装
OpenAPI 转 TypeScript 工具 - 自动生成 API 接口与类型守卫
563 周安装
Rust Unsafe代码检查器 - 安全使用Unsafe Rust的完整指南与最佳实践
566 周安装
数据库模式设计器 - 内置最佳实践,自动生成生产级SQL/NoSQL数据库架构
565 周安装
Nx 生成器使用指南:自动化代码生成与单体仓库项目搭建
594 周安装
.NET并发编程模式指南:async/await、Channels、Akka.NET选择决策树
725 周安装
韩语语法检查器 - 基于国立国语院标准的拼写、空格、语法、标点错误检测与纠正
586 周安装