mirror of https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl (synced 2025-08-14 14:35:57 +08:00)

commit b6802bc443 ("merged with main")
parent 8192d756e9
@@ -121,6 +121,10 @@ export async function generateOpenAICompletions(
   }
 
   let schema = options.schema;
+  if (schema) {
+    schema = removeDefaultProperty(schema);
+  }
+
   if (schema && schema.type === "array") {
     schema = {
       type: "object",
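The four added lines run the caller-supplied schema through removeDefaultProperty before any shape-specific handling. A plausible motivation, inferred from the `strict: true` json_schema request visible later in this file: OpenAI's strict structured-output mode rejects schemas carrying keywords such as `default`. A hypothetical before/after sketch (the input schema is invented for illustration):

```ts
// Hypothetical input a caller might pass via options.schema.
const withDefaults = {
  type: "object",
  properties: {
    price: { type: "number", default: 0 }, // "default" is the problem keyword
  },
  required: ["price"],
  additionalProperties: false,
};

// After removeDefaultProperty (behavior inferred from its name and usage):
const stripped = {
  type: "object",
  properties: {
    price: { type: "number" }, // default removed; everything else untouched
  },
  required: ["price"],
  additionalProperties: false,
};
```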
@@ -134,10 +138,12 @@ export async function generateOpenAICompletions(
     schema = {
       type: "object",
       properties: Object.fromEntries(
-        Object.entries(schema).map(([key, value]) => [key, { type: value }]),
+        Object.entries(schema).map(([key, value]) => {
+          return [key, removeDefaultProperty(value)];
+        })
       ),
       required: Object.keys(schema),
-      additionalProperties: false,
+      additionalProperties: false
     };
   }
 
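This hunk fixes how shorthand schemas (objects without a top-level `type`) are normalized. The old arrow body `[key, { type: value }]` assumed each value was a bare type name; when callers pass full per-property schema objects, that wraps them one level too deep. A minimal self-contained sketch of the difference; the one-level `stripDefault` helper here is a stand-in, since the diff shows only the first line of the real `removeDefaultProperty`:

```ts
// Stand-in for removeDefaultProperty: drops a "default" key one level deep.
// The real helper is presumably recursive; only its signature appears in this diff.
function stripDefault(schema: any): any {
  if (typeof schema !== "object" || schema === null) return schema;
  const { default: _omitted, ...rest } = schema;
  return rest;
}

// Hypothetical shorthand schema: no top-level "type", values are schema objects.
const shorthand: Record<string, any> = {
  title: { type: "string", default: "Untitled" },
};

// Old mapping: treats the value as a type name and nests it incorrectly.
const oldShape = Object.fromEntries(
  Object.entries(shorthand).map(([key, value]) => [key, { type: value }]),
);
// oldShape.title -> { type: { type: "string", default: "Untitled" } }

// New mapping: keeps the value as a schema object and strips its default.
const newShape = Object.fromEntries(
  Object.entries(shorthand).map(([key, value]) => {
    return [key, stripDefault(value)];
  }),
);
// newShape.title -> { type: "string" }
```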
@@ -183,124 +189,6 @@ export async function generateOpenAICompletions(
 
   if (extract === null && jsonCompletion.choices[0].message.content !== null) {
     try {
-      // Encode the message into tokens
-      const tokens = encoder.encode(markdown);
-
-      // Return the number of tokens
-      numTokens = tokens.length;
-    } catch (error) {
-      logger.warn("Calculating num tokens of string failed", { error, markdown });
-
-      markdown = markdown.slice(0, maxTokens * modifier);
-
-      let w = "Failed to derive number of LLM tokens the extraction might use -- the input has been automatically trimmed to the maximum number of tokens (" + maxTokens + ") we support.";
-      warning = previousWarning === undefined ? w : w + " " + previousWarning;
-    } finally {
-      // Free the encoder resources after use
-      encoder.free();
-    }
-
-    if (numTokens > maxTokens) {
-      // trim the document to the maximum number of tokens, tokens != characters
-      markdown = markdown.slice(0, maxTokens * modifier);
-
-      const w = "The extraction content would have used more tokens (" + numTokens + ") than the maximum we allow (" + maxTokens + "). -- the input has been automatically trimmed.";
-      warning = previousWarning === undefined ? w : w + " " + previousWarning;
-    }
-
-    let schema = options.schema;
-    if (schema) {
-      schema = removeDefaultProperty(schema);
-    }
-
-    if (schema && schema.type === "array") {
-      schema = {
-        type: "object",
-        properties: {
-          items: options.schema,
-        },
-        required: ["items"],
-        additionalProperties: false,
-      };
-    } else if (schema && typeof schema === 'object' && !schema.type) {
-      schema = {
-        type: "object",
-        properties: Object.fromEntries(
-          Object.entries(schema).map(([key, value]) => {
-            return [key, removeDefaultProperty(value)];
-          })
-        ),
-        required: Object.keys(schema),
-        additionalProperties: false
-      };
-    }
-
-    schema = normalizeSchema(schema);
-
-    const jsonCompletion = await openai.beta.chat.completions.parse({
-      model,
-      temperature: 0,
-      messages: [
-        {
-          role: "system",
-          content: options.systemPrompt,
-        },
-        {
-          role: "user",
-          content: [{ type: "text", text: markdown }],
-        },
-        {
-          role: "user",
-          content: options.prompt !== undefined
-            ? `Transform the above content into structured JSON output based on the following user request: ${options.prompt}`
-            : "Transform the above content into structured JSON output.",
-        },
-      ],
-      response_format: options.schema ? {
-        type: "json_schema",
-        json_schema: {
-          name: "websiteContent",
-          schema: schema,
-          strict: true,
-        }
-      } : { type: "json_object" },
-    });
-
-    if (jsonCompletion.choices[0].message.refusal !== null) {
-      throw new LLMRefusalError(jsonCompletion.choices[0].message.refusal);
-    }
-
-    extract = jsonCompletion.choices[0].message.parsed;
-
-    if (extract === null && jsonCompletion.choices[0].message.content !== null) {
-      try {
-        if (!isExtractEndpoint) {
-          extract = JSON.parse(jsonCompletion.choices[0].message.content);
-        } else {
-          const extractData = JSON.parse(jsonCompletion.choices[0].message.content);
-          extract = options.schema ? extractData.data.extract : extractData;
-        }
-      } catch (e) {
-        logger.error("Failed to parse returned JSON, no schema specified.", { error: e });
-        throw new LLMRefusalError("Failed to parse returned JSON. Please specify a schema in the extract object.");
-      }
-    }
-
-    // If the users actually wants the items object, they can specify it as 'required' in the schema
-    // otherwise, we just return the items array
-    if (options.schema && options.schema.type === "array" && !schema?.required?.includes("items")) {
-      extract = extract?.items;
-    }
-    return { extract, warning, numTokens };
-  }
-
-export async function performLLMExtract(meta: Meta, document: Document): Promise<Document> {
-  if (meta.options.formats.includes("extract")) {
-    const { extract, warning } = await generateOpenAICompletions(
-      meta.logger.child({ method: "performLLMExtract/generateOpenAICompletions" }),
-      meta.options.extract!,
-      document.markdown,
-      document.warning,
       if (!isExtractEndpoint) {
         extract = JSON.parse(jsonCompletion.choices[0].message.content);
       } else {
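The 118 lines removed in this hunk appear to be a duplicate copy left behind by the merge; the surviving copy of the same logic is what the context lines resume into. One detail worth calling out from that logic: a top-level array schema is wrapped in an object under a required `items` key (strict `json_schema` response formats need an object at the root), and the parsed result is unwrapped again afterwards. A hedged sketch of that round trip, assembled from the diff; `options`, `schema`, and `extract` are stand-ins for the function's local state:

```ts
// Sketch of the wrap/unwrap round trip for top-level array schemas.
type ExtractOptions = { schema?: any };

function wrapArraySchema(options: ExtractOptions): any {
  let schema = options.schema;
  if (schema && schema.type === "array") {
    // Strict structured outputs require a top-level object, so the array
    // schema is nested under a single required "items" property.
    schema = {
      type: "object",
      properties: { items: options.schema },
      required: ["items"],
      additionalProperties: false,
    };
  }
  return schema;
}

function unwrapItems(options: ExtractOptions, schema: any, extract: any): any {
  // Condition copied from the diff: if the schema does not explicitly require
  // "items", return the bare array rather than the wrapper object.
  if (options.schema?.type === "array" && !schema?.required?.includes("items")) {
    return extract?.items;
  }
  return extract;
}

// Example with a hypothetical top-level array schema:
const options: ExtractOptions = { schema: { type: "array", items: { type: "string" } } };
const parsed = { items: ["a", "b"] }; // shape a wrapped model response would have
console.log(unwrapItems(options, options.schema, parsed)); // ["a", "b"]
```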
@@ -331,6 +219,26 @@ export async function performLLMExtract(meta: Meta, document: Document): Promise<Document>
   return { extract, warning, numTokens };
 }
 
+export async function performLLMExtract(
+  meta: Meta,
+  document: Document,
+): Promise<Document> {
+  if (meta.options.formats.includes("extract")) {
+    const { extract, warning } = await generateOpenAICompletions(
+      meta.logger.child({
+        method: "performLLMExtract/generateOpenAICompletions",
+      }),
+      meta.options.extract!,
+      document.markdown,
+      document.warning,
+    );
+    document.extract = extract;
+    document.warning = warning;
+  }
+
+  return document;
+}
+
 export function removeDefaultProperty(schema: any): any {
   if (typeof schema !== 'object' || schema === null) return schema;
 
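The diff truncates removeDefaultProperty after its first line. For reference, a plausible completion consistent with that line and with how the helper is used above (recursively stripping `default` keys from nested schemas and arrays); this is a reconstruction, not necessarily the exact upstream body:

```ts
export function removeDefaultProperty(schema: any): any {
  if (typeof schema !== 'object' || schema === null) return schema;

  // Drop this level's "default" key, then recurse into nested values so that
  // defaults buried in properties, items, anyOf, etc. are removed as well.
  const { default: _omitted, ...rest } = schema;

  for (const key in rest) {
    if (Array.isArray(rest[key])) {
      rest[key] = rest[key].map((item: any) => removeDefaultProperty(item));
    } else if (typeof rest[key] === 'object' && rest[key] !== null) {
      rest[key] = removeDefaultProperty(rest[key]);
    }
  }

  return rest;
}
```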