structured-output-extractor by patricio0312rev/skills
npx skills add https://github.com/patricio0312rev/skills --skill structured-output-extractor
从 LLM 响应中提取可靠、类型化的数据。
| 方法 | 可靠性 | 灵活性 | 最佳适用场景 |
|---|---|---|---|
| OpenAI JSON 模式 | 高 | 中 | 简单的 JSON |
| 函数调用 | 非常高 | 高 | 复杂模式 |
| Instructor | 非常高 | 高 | Python/TS 应用 |
| Zod + 提示 | 中 | 高 | 自定义解析 |
广告位招租
在这里展示您的产品或服务
触达数万 AI 开发者,精准高效
// extractors/json-mode.ts
import OpenAI from 'openai';
const openai = new OpenAI();
/**
 * Shape of the contact record returned by `extractContactInfo`.
 *
 * The extraction prompt describes `phone` and `company` as "string or null";
 * here they are typed as optional strings.
 */
interface ExtractedData {
// Always present.
name: string;
email: string;
// May be absent.
phone?: string;
company?: string;
}
/**
 * Extracts contact details from free-form text using OpenAI JSON mode.
 *
 * JSON mode only asks the model for syntactically valid JSON; it does not
 * enforce a shape. The reply is therefore checked before being returned, and
 * `null` values for the optional fields are dropped so the result matches
 * `ExtractedData`.
 *
 * @param text - Free-form text that may contain contact details.
 * @returns The extracted contact. `phone` and `company` are omitted unless the
 *   model returned them as strings.
 * @throws Error if the model returns no content, or the JSON is not an object
 *   with string `name` and `email` fields.
 * @throws SyntaxError if the returned content is not valid JSON.
 */
export async function extractContactInfo(text: string): Promise<ExtractedData> {
  const response = await openai.chat.completions.create({
    model: 'gpt-4-turbo-preview',
    response_format: { type: 'json_object' },
    messages: [
      {
        role: 'system',
        content: `Extract contact information from text. Return JSON with:
{
"name": "string",
"email": "string",
"phone": "string or null",
"company": "string or null"
}`,
      },
      { role: 'user', content: text },
    ],
  });

  // `content` is nullable in the SDK types; fail loudly instead of parsing null.
  const content = response.choices[0]?.message.content;
  if (!content) {
    throw new Error('Model returned no content to parse');
  }

  const parsed: unknown = JSON.parse(content);
  if (typeof parsed !== 'object' || parsed === null) {
    throw new Error('Expected a JSON object with contact information');
  }

  const { name, email, phone, company } = parsed as Record<string, unknown>;
  if (typeof name !== 'string' || typeof email !== 'string') {
    throw new Error('Extracted contact is missing a string "name" or "email"');
  }

  // The prompt allows null for the optional fields; the interface does not.
  const contact: ExtractedData = { name, email };
  if (typeof phone === 'string') contact.phone = phone;
  if (typeof company === 'string') contact.company = company;
  return contact;
}
// extractors/structured.ts
import OpenAI from 'openai';
import { z } from 'zod';
import { zodResponseFormat } from 'openai/helpers/zod';
// Zod schema for an extracted contact. `name` and `email` are plain strings
// (`email` must also pass Zod's email check); `phone`, `company` and `role`
// are nullable rather than optional, so the keys are expected with a string
// or null value. Each field carries a `.describe()` text.
const ContactSchema = z.object({
name: z.string().describe('联系人的全名'),
email: z.string().email().describe('电子邮件地址'),
phone: z.string().nullable().describe('电话号码(如果可用)'),
company: z.string().nullable().describe('公司名称(如果提及)'),
role: z.string().nullable().describe('职位或角色'),
});
// Static type inferred from the schema above.
type Contact = z.infer<typeof ContactSchema>;
/**
 * Extracts a contact from text using the SDK's structured-output `parse`
 * helper with `ContactSchema` as the response format.
 *
 * @param text - Free-form text that may contain contact details.
 * @returns The contact parsed by the SDK against `ContactSchema`.
 * @throws Error if the response has no choices, the model refuses, or no
 *   parsed value is present on the message.
 */
export async function extractContact(text: string): Promise<Contact> {
  const response = await openai.beta.chat.completions.parse({
    model: 'gpt-4o-2024-08-06',
    messages: [
      {
        role: 'system',
        content: '从提供的文本中提取联系信息。',
      },
      { role: 'user', content: text },
    ],
    response_format: zodResponseFormat(ContactSchema, 'contact'),
  });

  const message = response.choices[0]?.message;
  if (!message) {
    throw new Error('Model returned no choices');
  }
  // On a refusal the message carries `refusal` text and `parsed` stays null.
  if (message.refusal) {
    throw new Error(`Model refused to extract contact: ${message.refusal}`);
  }
  if (message.parsed == null) {
    throw new Error('Model returned no parsed contact');
  }
  return message.parsed;
}
// extractors/function-calling.ts
import OpenAI from 'openai';
const openai = new OpenAI();
// Function definitions passed to the chat completion as `functions`.
// The single entry, `extract_entities`, takes four required arrays (people,
// organizations, locations, dates). Within each array item only one field is
// required: `name`, or `text` for dates.
const functions = [
{
name: 'extract_entities',
description: '从文本中提取命名实体',
parameters: {
type: 'object',
properties: {
// People: name plus optional role and organization.
people: {
type: 'array',
items: {
type: 'object',
properties: {
name: { type: 'string' },
role: { type: 'string' },
organization: { type: 'string' },
},
required: ['name'],
},
},
// Organizations: name plus an optional category from a fixed enum.
organizations: {
type: 'array',
items: {
type: 'object',
properties: {
name: { type: 'string' },
type: { type: 'string', enum: ['company', 'nonprofit', 'government', 'other'] },
},
required: ['name'],
},
},
// Locations: name plus an optional category from a fixed enum.
locations: {
type: 'array',
items: {
type: 'object',
properties: {
name: { type: 'string' },
type: { type: 'string', enum: ['city', 'country', 'address', 'other'] },
},
required: ['name'],
},
},
// Dates: the text as written, plus an optional `normalized` string with format 'date'.
dates: {
type: 'array',
items: {
type: 'object',
properties: {
text: { type: 'string' },
normalized: { type: 'string', format: 'date' },
},
required: ['text'],
},
},
},
required: ['people', 'organizations', 'locations', 'dates'],
},
},
];
/**
 * Extracts named entities from text by forcing a call to the
 * `extract_entities` function.
 *
 * The arguments are model-generated JSON and are returned as parsed, without
 * validation against the declared parameter schema.
 *
 * @param text - Text to scan for entities.
 * @returns The parsed function-call arguments.
 * @throws Error if the response contains no function call.
 * @throws SyntaxError if the arguments are not valid JSON.
 */
export async function extractEntities(text: string) {
  const response = await openai.chat.completions.create({
    model: 'gpt-4-turbo-preview',
    messages: [
      {
        role: 'system',
        content: '从提供的文本中提取所有命名实体。',
      },
      { role: 'user', content: text },
    ],
    functions,
    function_call: { name: 'extract_entities' },
  });

  // `function_call` is optional on the message; guard instead of asserting.
  const functionCall = response.choices[0]?.message.function_call;
  if (!functionCall) {
    throw new Error('Model did not return a function call for extract_entities');
  }
  return JSON.parse(functionCall.arguments);
}
// extractors/tools.ts
import OpenAI from 'openai';
// Tool list passed to the chat completion as `tools`. It declares a single
// function tool, `classify_intent`; only `intent` and `confidence` are
// required, `entities` and `suggestedAction` are optional.
const tools: OpenAI.Chat.ChatCompletionTool[] = [
{
type: 'function',
function: {
name: 'classify_intent',
description: '根据用户消息分类其意图',
parameters: {
type: 'object',
properties: {
// One of a fixed set of intent labels.
intent: {
type: 'string',
enum: ['question', 'complaint', 'feedback', 'request', 'other'],
},
// Number declared with bounds 0..1.
confidence: {
type: 'number',
minimum: 0,
maximum: 1,
},
// Free-form details; no field inside is required.
entities: {
type: 'object',
properties: {
product: { type: 'string' },
issue: { type: 'string' },
sentiment: { type: 'string', enum: ['positive', 'negative', 'neutral'] },
},
},
suggestedAction: {
type: 'string',
},
},
required: ['intent', 'confidence'],
},
},
},
];
/**
 * Classifies a user message by forcing a call to the `classify_intent` tool.
 *
 * The arguments are model-generated JSON and are returned as parsed, without
 * validation against the declared parameter schema.
 *
 * @param message - The user message to classify.
 * @returns The parsed arguments of the first tool call.
 * @throws Error if the response contains no tool call.
 * @throws SyntaxError if the arguments are not valid JSON.
 */
export async function classifyMessage(message: string) {
  const response = await openai.chat.completions.create({
    model: 'gpt-4-turbo-preview',
    messages: [{ role: 'user', content: message }],
    tools,
    tool_choice: { type: 'function', function: { name: 'classify_intent' } },
  });

  // `tool_calls` is optional on the message; guard instead of asserting.
  const toolCall = response.choices[0]?.message.tool_calls?.[0];
  if (!toolCall) {
    throw new Error('Model did not return a tool call for classify_intent');
  }
  return JSON.parse(toolCall.function.arguments);
}
// extractors/zod-extractor.ts
import { z } from 'zod';
import OpenAI from 'openai';
const openai = new OpenAI();
// Define schema
// Zod schema for a structured product review: rating is bounded to 1..5,
// summary is capped at 200 characters, sentiment is one of four labels.
const ProductReviewSchema = z.object({
productName: z.string(),
rating: z.number().min(1).max(5),
pros: z.array(z.string()),
cons: z.array(z.string()),
summary: z.string().max(200),
wouldRecommend: z.boolean(),
sentiment: z.enum(['positive', 'negative', 'mixed', 'neutral']),
});
// Static type inferred from the schema above.
type ProductReview = z.infer<typeof ProductReviewSchema>;
// Generate JSON schema from Zod
/**
 * Builds a minimal JSON-schema-like object from a Zod object schema.
 *
 * Simplified - use zod-to-json-schema in production.
 *
 * Each field is converted with `zodTypeToJson`, and every field that is not
 * optional is listed under `required`.
 * NOTE(review): `zodTypeToJson` is not defined in the visible source - confirm
 * it exists elsewhere.
 *
 * @param schema - Zod object schema whose fields are converted.
 * @returns `{ type: 'object', properties, required }`.
 */
function zodToJsonSchema(schema: z.ZodObject<any>) {
  const fields = schema.shape;
  const required: string[] = [];
  const properties: Record<string, any> = {};

  for (const fieldName of Object.keys(fields)) {
    const fieldType = fields[fieldName] as z.ZodTypeAny;
    properties[fieldName] = zodTypeToJson(fieldType);
    if (fieldType.isOptional()) {
      continue;
    }
    required.push(fieldName);
  }

  return { type: 'object', properties, required };
}
/**
 * Extracts a structured product review from free-form review text.
 *
 * The JSON schema derived from `ProductReviewSchema` is embedded in the system
 * prompt, and the model's JSON reply is validated with the same Zod schema.
 *
 * @param reviewText - Raw review text.
 * @returns The validated review.
 * @throws Error if the model returns no content.
 * @throws SyntaxError if the content is not valid JSON.
 * @throws z.ZodError if the JSON does not satisfy `ProductReviewSchema`.
 */
export async function extractReview(reviewText: string): Promise<ProductReview> {
  const schema = zodToJsonSchema(ProductReviewSchema);
  const response = await openai.chat.completions.create({
    model: 'gpt-4-turbo-preview',
    response_format: { type: 'json_object' },
    messages: [
      {
        role: 'system',
        content: `提取结构化的产品评论。返回符合此模式的 JSON:
${JSON.stringify(schema, null, 2)}`,
      },
      { role: 'user', content: reviewText },
    ],
  });

  // `content` is nullable in the SDK types; fail loudly instead of parsing null.
  const content = response.choices[0]?.message.content;
  if (!content) {
    throw new Error('Model returned no content to parse');
  }

  const data: unknown = JSON.parse(content);
  // Validate with Zod
  return ProductReviewSchema.parse(data);
}
// extractors/retry.ts
/**
 * Requests JSON from the model and validates it against `schema`, re-asking
 * with the validation errors when validation fails.
 *
 * On a retry the model is shown its own previous reply (when there was one)
 * followed by the validation message, so it can correct that output rather
 * than start blind.
 *
 * @param schema - Zod schema the parsed JSON must satisfy.
 * @param prompt - System prompt describing the extraction.
 * @param text - User content to extract from.
 * @param maxRetries - Total number of attempts (not additional retries).
 * @returns The first reply that passes validation.
 * @throws Error after `maxRetries` failed attempts; when the last failure was
 *   a Zod validation error it is attached as the `cause` property.
 * @throws SyntaxError immediately (no retry) if a reply is not valid JSON.
 *   API errors also propagate immediately.
 */
export async function extractWithRetry<T>(
  schema: z.ZodSchema<T>,
  prompt: string,
  text: string,
  maxRetries = 3
): Promise<T> {
  let lastError: Error | null = null;
  let lastZodError: z.ZodError | undefined;
  let lastOutput: string | null = null;

  for (let attempt = 0; attempt < maxRetries; attempt++) {
    // Add error context on retry
    const retryContext = lastError
      ? [
          ...(lastOutput !== null
            ? [{ role: 'assistant' as const, content: lastOutput }]
            : []),
          {
            role: 'user' as const,
            content: `上一次尝试验证失败:${lastError.message}。请修正。`,
          },
        ]
      : [];

    const response = await openai.chat.completions.create({
      model: 'gpt-4-turbo-preview',
      response_format: { type: 'json_object' },
      messages: [
        { role: 'system', content: prompt },
        { role: 'user', content: text },
        ...retryContext,
      ],
    });

    const content = response.choices[0]?.message.content ?? null;
    lastOutput = content;
    if (content === null) {
      // Nothing to parse; treat as a failed attempt instead of parsing null.
      lastZodError = undefined;
      lastError = new Error('模型未返回任何内容');
      continue;
    }

    try {
      const data: unknown = JSON.parse(content);
      return schema.parse(data);
    } catch (error) {
      // Only validation failures are retried; anything else propagates.
      if (!(error instanceof z.ZodError)) {
        throw error;
      }
      lastZodError = error;
      lastError = new Error(
        error.issues.map((issue) => `${issue.path.join('.')}: ${issue.message}`).join(', ')
      );
    }
  }

  const failure = new Error(`在 ${maxRetries} 次尝试后失败:${lastError?.message}`);
  if (lastZodError) {
    // Keep the structured validation error reachable for callers.
    Object.assign(failure, { cause: lastZodError });
  }
  throw failure;
}
// extractors/hierarchical.ts
// Zod schema for a hierarchical document: title, authors, abstract, sections
// (each with an optional single level of subsections), references and keywords.
const DocumentSchema = z.object({
title: z.string(),
// Each author has a name and an optional affiliation.
authors: z.array(
z.object({
name: z.string(),
affiliation: z.string().optional(),
})
),
abstract: z.string(),
// Sections carry heading + content; subsections repeat that shape and may be absent.
sections: z.array(
z.object({
heading: z.string(),
content: z.string(),
subsections: z
.array(
z.object({
heading: z.string(),
content: z.string(),
})
)
.optional(),
})
),
// References: author names, title, numeric year, optional source.
references: z.array(
z.object({
authors: z.array(z.string()),
title: z.string(),
year: z.number(),
source: z.string().optional(),
})
),
keywords: z.array(z.string()),
});
// extractors/multistep.ts
/**
 * Multi-step extraction over a document: first the outline, then one
 * extraction per reported section (run concurrently), then the references
 * when the outline says there are any.
 *
 * NOTE(review): `extract` is not defined in the visible source - confirm which
 * helper it refers to.
 *
 * @param document - Full document text; passed to every extraction call.
 * @returns `{ sections, references }`; `references` is an empty array when the
 *   outline reports none.
 */
export async function extractComplex(document: string) {
  // Step 1: extract the structure.
  const outline = await extract(
    z.object({
      sections: z.array(z.string()),
      hasReferences: z.boolean(),
    }),
    '识别文档结构',
    document
  );

  // Step 2: extract each section.
  const describeSection = (section: string) =>
    extract(
      z.object({
        heading: z.string(),
        summary: z.string(),
        keyPoints: z.array(z.string()),
      }),
      `从部分提取详细信息:${section}`,
      document
    );
  const sections = await Promise.all(outline.sections.map((section) => describeSection(section)));

  // Step 3: extract references if present.
  const references = outline.hasReferences
    ? await extract(
        z.array(
          z.object({
            authors: z.array(z.string()),
            title: z.string(),
            year: z.number(),
          })
        ),
        '提取所有参考文献',
        document
      )
    : [];

  return { sections, references };
}
// extractors/streaming.ts
import { zodToJsonSchema } from 'zod-to-json-schema';
/**
 * Streams a JSON extraction, yielding a parsed snapshot whenever the text
 * buffered so far is valid JSON, then a final validated result.
 *
 * `JSON.parse` only succeeds on complete JSON, so intermediate snapshots
 * appear only at points where the accumulated text happens to parse.
 *
 * @param schema - Zod schema used for the prompt and for final validation.
 * @param prompt - Instruction placed before the schema in the system message.
 * @param text - User content to extract from.
 * @returns Async generator of `{ partial, complete }`; the last item has
 *   `complete: true` and holds the schema-validated value.
 * @throws SyntaxError if the full streamed text is not valid JSON.
 * @throws z.ZodError if the final JSON does not satisfy `schema`.
 */
export async function* streamExtract<T>(
  schema: z.ZodSchema<T>,
  prompt: string,
  text: string
): AsyncGenerator<{ partial: any; complete: boolean }> {
  const jsonSchema = zodToJsonSchema(schema);
  const stream = await openai.chat.completions.create({
    model: 'gpt-4-turbo-preview',
    response_format: { type: 'json_object' },
    messages: [
      {
        role: 'system',
        content: `${prompt}\n\n返回符合以下内容的 JSON:${JSON.stringify(jsonSchema)}`,
      },
      { role: 'user', content: text },
    ],
    stream: true,
  });

  let fullContent = '';
  for await (const chunk of stream) {
    const delta = chunk.choices[0]?.delta?.content;
    // Chunks without text leave the buffer unchanged; re-parsing it would
    // re-yield the same snapshot.
    if (!delta) {
      continue;
    }
    fullContent += delta;

    // Try to parse partial JSON. The yield stays outside the try so an error
    // thrown into the generator by the consumer is not swallowed.
    let partial: unknown;
    try {
      partial = JSON.parse(fullContent);
    } catch {
      // JSON not complete yet
      continue;
    }
    yield { partial, complete: false };
  }

  // Final parse and validate
  const validated = schema.parse(JSON.parse(fullContent));
  yield { partial: validated, complete: true };
}
// extractors/error-handling.ts
/**
 * Error describing a failed structured extraction.
 *
 * Carries the raw text associated with the failure and, when available, the
 * Zod validation error.
 */
export class ExtractionError extends Error {
  /** Raw text associated with the failed extraction. */
  public readonly raw: string;
  /** Zod validation error, when the failure was a validation failure. */
  public readonly validationErrors?: z.ZodError;

  /**
   * @param message - Human-readable failure description.
   * @param raw - Raw text associated with the failure.
   * @param validationErrors - Optional Zod error with per-field issues.
   */
  constructor(message: string, raw: string, validationErrors?: z.ZodError) {
    super(message);
    this.raw = raw;
    this.validationErrors = validationErrors;
    this.name = 'ExtractionError';
  }
}
/**
 * Runs `extractWithRetry` and converts any failure into a result object
 * instead of throwing.
 *
 * A Zod validation error is recognised whether it is thrown directly or
 * attached as the `cause` property of a wrapping error, and is then exposed
 * on `ExtractionError.validationErrors`.
 *
 * @param schema - Zod schema the extracted data must satisfy.
 * @param prompt - System prompt describing the extraction.
 * @param text - Input text; also stored as `raw` on the returned error.
 * @returns `{ success: true, data }` or `{ success: false, error }`.
 */
export async function safeExtract<T>(
  schema: z.ZodSchema<T>,
  prompt: string,
  text: string
): Promise<{ success: true; data: T } | { success: false; error: ExtractionError }> {
  try {
    const data = await extractWithRetry(schema, prompt, text);
    return { success: true, data };
  } catch (error) {
    // `cause` is read through a cast because older lib typings lack it.
    const cause =
      error instanceof Error ? (error as Error & { cause?: unknown }).cause : undefined;
    const zodError =
      error instanceof z.ZodError ? error : cause instanceof z.ZodError ? cause : undefined;

    if (zodError) {
      return {
        success: false,
        error: new ExtractionError('验证失败', text, zodError),
      };
    }
    return {
      success: false,
      error: new ExtractionError(
        error instanceof Error ? error.message : '未知错误',
        text
      ),
    };
  }
}
每个结构化提取都应包括:
每周安装量
70
代码仓库
GitHub 星标数
21
首次出现
2026 年 1 月 24 日
安全审计
安装于
codex: 60
opencode: 60
gemini-cli: 59
github-copilot: 55
cursor: 53
kimi-cli: 48
Extract reliable, typed data from LLM responses.
| Method | Reliability | Flexibility | Best For |
|---|---|---|---|
| OpenAI JSON Mode | High | Medium | Simple JSON |
| Function Calling | Very High | High | Complex schemas |
| Instructor | Very High | High | Python/TS apps |
| Zod + Prompting | Medium | High | Custom parsing |
// extractors/json-mode.ts
import OpenAI from 'openai';
const openai = new OpenAI();
/**
 * Shape of the contact record returned by `extractContactInfo`.
 *
 * The extraction prompt describes `phone` and `company` as "string or null";
 * here they are typed as optional strings.
 */
interface ExtractedData {
// Always present.
name: string;
email: string;
// May be absent.
phone?: string;
company?: string;
}
/**
 * Extracts contact details from free-form text using OpenAI JSON mode.
 *
 * JSON mode only asks the model for syntactically valid JSON; it does not
 * enforce a shape. The reply is therefore checked before being returned, and
 * `null` values for the optional fields are dropped so the result matches
 * `ExtractedData`.
 *
 * @param text - Free-form text that may contain contact details.
 * @returns The extracted contact. `phone` and `company` are omitted unless the
 *   model returned them as strings.
 * @throws Error if the model returns no content, or the JSON is not an object
 *   with string `name` and `email` fields.
 * @throws SyntaxError if the returned content is not valid JSON.
 */
export async function extractContactInfo(text: string): Promise<ExtractedData> {
  const response = await openai.chat.completions.create({
    model: 'gpt-4-turbo-preview',
    response_format: { type: 'json_object' },
    messages: [
      {
        role: 'system',
        content: `Extract contact information from text. Return JSON with:
{
"name": "string",
"email": "string",
"phone": "string or null",
"company": "string or null"
}`,
      },
      { role: 'user', content: text },
    ],
  });

  // `content` is nullable in the SDK types; fail loudly instead of parsing null.
  const content = response.choices[0]?.message.content;
  if (!content) {
    throw new Error('Model returned no content to parse');
  }

  const parsed: unknown = JSON.parse(content);
  if (typeof parsed !== 'object' || parsed === null) {
    throw new Error('Expected a JSON object with contact information');
  }

  const { name, email, phone, company } = parsed as Record<string, unknown>;
  if (typeof name !== 'string' || typeof email !== 'string') {
    throw new Error('Extracted contact is missing a string "name" or "email"');
  }

  // The prompt allows null for the optional fields; the interface does not.
  const contact: ExtractedData = { name, email };
  if (typeof phone === 'string') contact.phone = phone;
  if (typeof company === 'string') contact.company = company;
  return contact;
}
// extractors/structured.ts
import OpenAI from 'openai';
import { z } from 'zod';
import { zodResponseFormat } from 'openai/helpers/zod';
// Zod schema for an extracted contact. `name` and `email` are plain strings
// (`email` must also pass Zod's email check); `phone`, `company` and `role`
// are nullable rather than optional, so the keys are expected with a string
// or null value. Each field carries a `.describe()` text.
const ContactSchema = z.object({
name: z.string().describe('Full name of the contact'),
email: z.string().email().describe('Email address'),
phone: z.string().nullable().describe('Phone number if available'),
company: z.string().nullable().describe('Company name if mentioned'),
role: z.string().nullable().describe('Job title or role'),
});
// Static type inferred from the schema above.
type Contact = z.infer<typeof ContactSchema>;
/**
 * Extracts a contact from text using the SDK's structured-output `parse`
 * helper with `ContactSchema` as the response format.
 *
 * @param text - Free-form text that may contain contact details.
 * @returns The contact parsed by the SDK against `ContactSchema`.
 * @throws Error if the response has no choices, the model refuses, or no
 *   parsed value is present on the message.
 */
export async function extractContact(text: string): Promise<Contact> {
  const response = await openai.beta.chat.completions.parse({
    model: 'gpt-4o-2024-08-06',
    messages: [
      {
        role: 'system',
        content: 'Extract contact information from the provided text.',
      },
      { role: 'user', content: text },
    ],
    response_format: zodResponseFormat(ContactSchema, 'contact'),
  });

  const message = response.choices[0]?.message;
  if (!message) {
    throw new Error('Model returned no choices');
  }
  // On a refusal the message carries `refusal` text and `parsed` stays null.
  if (message.refusal) {
    throw new Error(`Model refused to extract contact: ${message.refusal}`);
  }
  if (message.parsed == null) {
    throw new Error('Model returned no parsed contact');
  }
  return message.parsed;
}
// extractors/function-calling.ts
import OpenAI from 'openai';
const openai = new OpenAI();
// Function definitions passed to the chat completion as `functions`.
// The single entry, `extract_entities`, takes four required arrays (people,
// organizations, locations, dates). Within each array item only one field is
// required: `name`, or `text` for dates.
const functions = [
{
name: 'extract_entities',
description: 'Extract named entities from text',
parameters: {
type: 'object',
properties: {
// People: name plus optional role and organization.
people: {
type: 'array',
items: {
type: 'object',
properties: {
name: { type: 'string' },
role: { type: 'string' },
organization: { type: 'string' },
},
required: ['name'],
},
},
// Organizations: name plus an optional category from a fixed enum.
organizations: {
type: 'array',
items: {
type: 'object',
properties: {
name: { type: 'string' },
type: { type: 'string', enum: ['company', 'nonprofit', 'government', 'other'] },
},
required: ['name'],
},
},
// Locations: name plus an optional category from a fixed enum.
locations: {
type: 'array',
items: {
type: 'object',
properties: {
name: { type: 'string' },
type: { type: 'string', enum: ['city', 'country', 'address', 'other'] },
},
required: ['name'],
},
},
// Dates: the text as written, plus an optional `normalized` string with format 'date'.
dates: {
type: 'array',
items: {
type: 'object',
properties: {
text: { type: 'string' },
normalized: { type: 'string', format: 'date' },
},
required: ['text'],
},
},
},
required: ['people', 'organizations', 'locations', 'dates'],
},
},
];
/**
 * Extracts named entities from text by forcing a call to the
 * `extract_entities` function.
 *
 * The arguments are model-generated JSON and are returned as parsed, without
 * validation against the declared parameter schema.
 *
 * @param text - Text to scan for entities.
 * @returns The parsed function-call arguments.
 * @throws Error if the response contains no function call.
 * @throws SyntaxError if the arguments are not valid JSON.
 */
export async function extractEntities(text: string) {
  const response = await openai.chat.completions.create({
    model: 'gpt-4-turbo-preview',
    messages: [
      {
        role: 'system',
        content: 'Extract all named entities from the provided text.',
      },
      { role: 'user', content: text },
    ],
    functions,
    function_call: { name: 'extract_entities' },
  });

  // `function_call` is optional on the message; guard instead of asserting.
  const functionCall = response.choices[0]?.message.function_call;
  if (!functionCall) {
    throw new Error('Model did not return a function call for extract_entities');
  }
  return JSON.parse(functionCall.arguments);
}
// extractors/tools.ts
import OpenAI from 'openai';
// Tool list passed to the chat completion as `tools`. It declares a single
// function tool, `classify_intent`; only `intent` and `confidence` are
// required, `entities` and `suggestedAction` are optional.
const tools: OpenAI.Chat.ChatCompletionTool[] = [
{
type: 'function',
function: {
name: 'classify_intent',
description: 'Classify the user intent from their message',
parameters: {
type: 'object',
properties: {
// One of a fixed set of intent labels.
intent: {
type: 'string',
enum: ['question', 'complaint', 'feedback', 'request', 'other'],
},
// Number declared with bounds 0..1.
confidence: {
type: 'number',
minimum: 0,
maximum: 1,
},
// Free-form details; no field inside is required.
entities: {
type: 'object',
properties: {
product: { type: 'string' },
issue: { type: 'string' },
sentiment: { type: 'string', enum: ['positive', 'negative', 'neutral'] },
},
},
suggestedAction: {
type: 'string',
},
},
required: ['intent', 'confidence'],
},
},
},
];
/**
 * Classifies a user message by forcing a call to the `classify_intent` tool.
 *
 * The arguments are model-generated JSON and are returned as parsed, without
 * validation against the declared parameter schema.
 *
 * @param message - The user message to classify.
 * @returns The parsed arguments of the first tool call.
 * @throws Error if the response contains no tool call.
 * @throws SyntaxError if the arguments are not valid JSON.
 */
export async function classifyMessage(message: string) {
  const response = await openai.chat.completions.create({
    model: 'gpt-4-turbo-preview',
    messages: [{ role: 'user', content: message }],
    tools,
    tool_choice: { type: 'function', function: { name: 'classify_intent' } },
  });

  // `tool_calls` is optional on the message; guard instead of asserting.
  const toolCall = response.choices[0]?.message.tool_calls?.[0];
  if (!toolCall) {
    throw new Error('Model did not return a tool call for classify_intent');
  }
  return JSON.parse(toolCall.function.arguments);
}
// extractors/zod-extractor.ts
import { z } from 'zod';
import OpenAI from 'openai';
const openai = new OpenAI();
// Define schema
// Zod schema for a structured product review: rating is bounded to 1..5,
// summary is capped at 200 characters, sentiment is one of four labels.
const ProductReviewSchema = z.object({
productName: z.string(),
rating: z.number().min(1).max(5),
pros: z.array(z.string()),
cons: z.array(z.string()),
summary: z.string().max(200),
wouldRecommend: z.boolean(),
sentiment: z.enum(['positive', 'negative', 'mixed', 'neutral']),
});
// Static type inferred from the schema above.
type ProductReview = z.infer<typeof ProductReviewSchema>;
// Generate JSON schema from Zod
/**
 * Builds a minimal JSON-schema-like object from a Zod object schema.
 *
 * Simplified - use zod-to-json-schema in production.
 *
 * Each field is converted with `zodTypeToJson`, and every field that is not
 * optional is listed under `required`.
 * NOTE(review): `zodTypeToJson` is not defined in the visible source - confirm
 * it exists elsewhere.
 *
 * @param schema - Zod object schema whose fields are converted.
 * @returns `{ type: 'object', properties, required }`.
 */
function zodToJsonSchema(schema: z.ZodObject<any>) {
  const fields = schema.shape;
  const required: string[] = [];
  const properties: Record<string, any> = {};

  for (const fieldName of Object.keys(fields)) {
    const fieldType = fields[fieldName] as z.ZodTypeAny;
    properties[fieldName] = zodTypeToJson(fieldType);
    if (fieldType.isOptional()) {
      continue;
    }
    required.push(fieldName);
  }

  return { type: 'object', properties, required };
}
/**
 * Extracts a structured product review from free-form review text.
 *
 * The JSON schema derived from `ProductReviewSchema` is embedded in the system
 * prompt, and the model's JSON reply is validated with the same Zod schema.
 *
 * @param reviewText - Raw review text.
 * @returns The validated review.
 * @throws Error if the model returns no content.
 * @throws SyntaxError if the content is not valid JSON.
 * @throws z.ZodError if the JSON does not satisfy `ProductReviewSchema`.
 */
export async function extractReview(reviewText: string): Promise<ProductReview> {
  const schema = zodToJsonSchema(ProductReviewSchema);
  const response = await openai.chat.completions.create({
    model: 'gpt-4-turbo-preview',
    response_format: { type: 'json_object' },
    messages: [
      {
        role: 'system',
        content: `Extract a structured product review. Return JSON matching this schema:
${JSON.stringify(schema, null, 2)}`,
      },
      { role: 'user', content: reviewText },
    ],
  });

  // `content` is nullable in the SDK types; fail loudly instead of parsing null.
  const content = response.choices[0]?.message.content;
  if (!content) {
    throw new Error('Model returned no content to parse');
  }

  const data: unknown = JSON.parse(content);
  // Validate with Zod
  return ProductReviewSchema.parse(data);
}
// extractors/retry.ts
/**
 * Requests JSON from the model and validates it against `schema`, re-asking
 * with the validation errors when validation fails.
 *
 * On a retry the model is shown its own previous reply (when there was one)
 * followed by the validation message, so it can correct that output rather
 * than start blind.
 *
 * @param schema - Zod schema the parsed JSON must satisfy.
 * @param prompt - System prompt describing the extraction.
 * @param text - User content to extract from.
 * @param maxRetries - Total number of attempts (not additional retries).
 * @returns The first reply that passes validation.
 * @throws Error after `maxRetries` failed attempts; when the last failure was
 *   a Zod validation error it is attached as the `cause` property.
 * @throws SyntaxError immediately (no retry) if a reply is not valid JSON.
 *   API errors also propagate immediately.
 */
export async function extractWithRetry<T>(
  schema: z.ZodSchema<T>,
  prompt: string,
  text: string,
  maxRetries = 3
): Promise<T> {
  let lastError: Error | null = null;
  let lastZodError: z.ZodError | undefined;
  let lastOutput: string | null = null;

  for (let attempt = 0; attempt < maxRetries; attempt++) {
    // Add error context on retry
    const retryContext = lastError
      ? [
          ...(lastOutput !== null
            ? [{ role: 'assistant' as const, content: lastOutput }]
            : []),
          {
            role: 'user' as const,
            content: `Previous attempt failed validation: ${lastError.message}. Please fix.`,
          },
        ]
      : [];

    const response = await openai.chat.completions.create({
      model: 'gpt-4-turbo-preview',
      response_format: { type: 'json_object' },
      messages: [
        { role: 'system', content: prompt },
        { role: 'user', content: text },
        ...retryContext,
      ],
    });

    const content = response.choices[0]?.message.content ?? null;
    lastOutput = content;
    if (content === null) {
      // Nothing to parse; treat as a failed attempt instead of parsing null.
      lastZodError = undefined;
      lastError = new Error('model returned no content');
      continue;
    }

    try {
      const data: unknown = JSON.parse(content);
      return schema.parse(data);
    } catch (error) {
      // Only validation failures are retried; anything else propagates.
      if (!(error instanceof z.ZodError)) {
        throw error;
      }
      lastZodError = error;
      lastError = new Error(
        error.issues.map((issue) => `${issue.path.join('.')}: ${issue.message}`).join(', ')
      );
    }
  }

  const failure = new Error(`Failed after ${maxRetries} attempts: ${lastError?.message}`);
  if (lastZodError) {
    // Keep the structured validation error reachable for callers.
    Object.assign(failure, { cause: lastZodError });
  }
  throw failure;
}
// extractors/hierarchical.ts
// Zod schema for a hierarchical document: title, authors, abstract, sections
// (each with an optional single level of subsections), references and keywords.
const DocumentSchema = z.object({
title: z.string(),
// Each author has a name and an optional affiliation.
authors: z.array(
z.object({
name: z.string(),
affiliation: z.string().optional(),
})
),
abstract: z.string(),
// Sections carry heading + content; subsections repeat that shape and may be absent.
sections: z.array(
z.object({
heading: z.string(),
content: z.string(),
subsections: z
.array(
z.object({
heading: z.string(),
content: z.string(),
})
)
.optional(),
})
),
// References: author names, title, numeric year, optional source.
references: z.array(
z.object({
authors: z.array(z.string()),
title: z.string(),
year: z.number(),
source: z.string().optional(),
})
),
keywords: z.array(z.string()),
});
// extractors/multistep.ts
/**
 * Multi-step extraction over a document: first the outline, then one
 * extraction per reported section (run concurrently), then the references
 * when the outline says there are any.
 *
 * NOTE(review): `extract` is not defined in the visible source - confirm which
 * helper it refers to.
 *
 * @param document - Full document text; passed to every extraction call.
 * @returns `{ sections, references }`; `references` is an empty array when the
 *   outline reports none.
 */
export async function extractComplex(document: string) {
  // Step 1: extract the structure.
  const outline = await extract(
    z.object({
      sections: z.array(z.string()),
      hasReferences: z.boolean(),
    }),
    'Identify the document structure',
    document
  );

  // Step 2: extract each section.
  const describeSection = (section: string) =>
    extract(
      z.object({
        heading: z.string(),
        summary: z.string(),
        keyPoints: z.array(z.string()),
      }),
      `Extract details from section: ${section}`,
      document
    );
  const sections = await Promise.all(outline.sections.map((section) => describeSection(section)));

  // Step 3: extract references if present.
  const references = outline.hasReferences
    ? await extract(
        z.array(
          z.object({
            authors: z.array(z.string()),
            title: z.string(),
            year: z.number(),
          })
        ),
        'Extract all references',
        document
      )
    : [];

  return { sections, references };
}
// extractors/streaming.ts
import { zodToJsonSchema } from 'zod-to-json-schema';
/**
 * Streams a JSON extraction, yielding a parsed snapshot whenever the text
 * buffered so far is valid JSON, then a final validated result.
 *
 * `JSON.parse` only succeeds on complete JSON, so intermediate snapshots
 * appear only at points where the accumulated text happens to parse.
 *
 * @param schema - Zod schema used for the prompt and for final validation.
 * @param prompt - Instruction placed before the schema in the system message.
 * @param text - User content to extract from.
 * @returns Async generator of `{ partial, complete }`; the last item has
 *   `complete: true` and holds the schema-validated value.
 * @throws SyntaxError if the full streamed text is not valid JSON.
 * @throws z.ZodError if the final JSON does not satisfy `schema`.
 */
export async function* streamExtract<T>(
  schema: z.ZodSchema<T>,
  prompt: string,
  text: string
): AsyncGenerator<{ partial: any; complete: boolean }> {
  const jsonSchema = zodToJsonSchema(schema);
  const stream = await openai.chat.completions.create({
    model: 'gpt-4-turbo-preview',
    response_format: { type: 'json_object' },
    messages: [
      {
        role: 'system',
        content: `${prompt}\n\nReturn JSON matching: ${JSON.stringify(jsonSchema)}`,
      },
      { role: 'user', content: text },
    ],
    stream: true,
  });

  let fullContent = '';
  for await (const chunk of stream) {
    const delta = chunk.choices[0]?.delta?.content;
    // Chunks without text leave the buffer unchanged; re-parsing it would
    // re-yield the same snapshot.
    if (!delta) {
      continue;
    }
    fullContent += delta;

    // Try to parse partial JSON. The yield stays outside the try so an error
    // thrown into the generator by the consumer is not swallowed.
    let partial: unknown;
    try {
      partial = JSON.parse(fullContent);
    } catch {
      // JSON not complete yet
      continue;
    }
    yield { partial, complete: false };
  }

  // Final parse and validate
  const validated = schema.parse(JSON.parse(fullContent));
  yield { partial: validated, complete: true };
}
// extractors/error-handling.ts
/**
 * Error describing a failed structured extraction.
 *
 * Carries the raw text associated with the failure and, when available, the
 * Zod validation error.
 */
export class ExtractionError extends Error {
  /** Raw text associated with the failed extraction. */
  public readonly raw: string;
  /** Zod validation error, when the failure was a validation failure. */
  public readonly validationErrors?: z.ZodError;

  /**
   * @param message - Human-readable failure description.
   * @param raw - Raw text associated with the failure.
   * @param validationErrors - Optional Zod error with per-field issues.
   */
  constructor(message: string, raw: string, validationErrors?: z.ZodError) {
    super(message);
    this.raw = raw;
    this.validationErrors = validationErrors;
    this.name = 'ExtractionError';
  }
}
/**
 * Runs `extractWithRetry` and converts any failure into a result object
 * instead of throwing.
 *
 * A Zod validation error is recognised whether it is thrown directly or
 * attached as the `cause` property of a wrapping error, and is then exposed
 * on `ExtractionError.validationErrors`.
 *
 * @param schema - Zod schema the extracted data must satisfy.
 * @param prompt - System prompt describing the extraction.
 * @param text - Input text; also stored as `raw` on the returned error.
 * @returns `{ success: true, data }` or `{ success: false, error }`.
 */
export async function safeExtract<T>(
  schema: z.ZodSchema<T>,
  prompt: string,
  text: string
): Promise<{ success: true; data: T } | { success: false; error: ExtractionError }> {
  try {
    const data = await extractWithRetry(schema, prompt, text);
    return { success: true, data };
  } catch (error) {
    // `cause` is read through a cast because older lib typings lack it.
    const cause =
      error instanceof Error ? (error as Error & { cause?: unknown }).cause : undefined;
    const zodError =
      error instanceof z.ZodError ? error : cause instanceof z.ZodError ? cause : undefined;

    if (zodError) {
      return {
        success: false,
        error: new ExtractionError('Validation failed', text, zodError),
      };
    }
    return {
      success: false,
      error: new ExtractionError(
        error instanceof Error ? error.message : 'Unknown error',
        text
      ),
    };
  }
}
Every structured extraction should include:
Weekly Installs
70
Repository
GitHub Stars
21
First Seen
Jan 24, 2026
Security Audits
Gen Agent Trust Hub: Pass · Socket: Pass · Snyk: Pass
Installed on
codex: 60
opencode: 60
gemini-cli: 59
github-copilot: 55
cursor: 53
kimi-cli: 48
AI 代码实施计划编写技能 | 自动化开发任务分解与 TDD 流程规划工具
50,900 周安装