mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-16 11:45:56 +08:00
fixes
This commit is contained in:
parent
2fdff9cc45
commit
2ffde5abc1
@ -1,5 +1,8 @@
|
|||||||
import { logger } from "../../../lib/logger";
|
import { logger } from "../../../lib/logger";
|
||||||
import { generateCompletions, GenerateCompletionsOptions } from "../../../scraper/scrapeURL/transformers/llmExtract";
|
import {
|
||||||
|
generateCompletions,
|
||||||
|
GenerateCompletionsOptions,
|
||||||
|
} from "../../../scraper/scrapeURL/transformers/llmExtract";
|
||||||
import { buildDocument } from "../build-document";
|
import { buildDocument } from "../build-document";
|
||||||
import { ExtractResponse, TokenUsage } from "../../../controllers/v1/types";
|
import { ExtractResponse, TokenUsage } from "../../../controllers/v1/types";
|
||||||
import { Document } from "../../../controllers/v1/types";
|
import { Document } from "../../../controllers/v1/types";
|
||||||
@ -54,7 +57,7 @@ export async function batchExtractPromise(
|
|||||||
|
|
||||||
const { extractedDataArray, warning } = await extractData({
|
const { extractedDataArray, warning } = await extractData({
|
||||||
extractOptions: generationOptions,
|
extractOptions: generationOptions,
|
||||||
url: doc.metadata.sourceURL || doc.metadata.url || "",
|
urls: [doc.metadata.sourceURL || doc.metadata.url || ""],
|
||||||
});
|
});
|
||||||
|
|
||||||
await fs.writeFile(
|
await fs.writeFile(
|
||||||
@ -74,5 +77,5 @@ export async function batchExtractPromise(
|
|||||||
},
|
},
|
||||||
warning: warning,
|
warning: warning,
|
||||||
sources: [doc.metadata.url || doc.metadata.sourceURL || ""],
|
sources: [doc.metadata.url || doc.metadata.sourceURL || ""],
|
||||||
}
|
};
|
||||||
}
|
}
|
||||||
|
@ -1,5 +1,8 @@
|
|||||||
import { logger } from "../../../lib/logger";
|
import { logger } from "../../../lib/logger";
|
||||||
import { generateCompletions, GenerateCompletionsOptions } from "../../../scraper/scrapeURL/transformers/llmExtract";
|
import {
|
||||||
|
generateCompletions,
|
||||||
|
GenerateCompletionsOptions,
|
||||||
|
} from "../../../scraper/scrapeURL/transformers/llmExtract";
|
||||||
import { buildDocument } from "../build-document";
|
import { buildDocument } from "../build-document";
|
||||||
import { Document, TokenUsage } from "../../../controllers/v1/types";
|
import { Document, TokenUsage } from "../../../controllers/v1/types";
|
||||||
import { getModel } from "../../../lib/generic-ai";
|
import { getModel } from "../../../lib/generic-ai";
|
||||||
@ -35,7 +38,9 @@ export async function singleAnswerCompletion({
|
|||||||
prompt: "Today is: " + new Date().toISOString() + ".\n" + prompt,
|
prompt: "Today is: " + new Date().toISOString() + ".\n" + prompt,
|
||||||
schema: rSchema,
|
schema: rSchema,
|
||||||
},
|
},
|
||||||
markdown: singleAnswerDocs.map((x, i) => `[ID: ${i}]` + buildDocument(x)).join("\n"),
|
markdown: singleAnswerDocs
|
||||||
|
.map((x, i) => `[ID: ${i}]` + buildDocument(x))
|
||||||
|
.join("\n"),
|
||||||
isExtractEndpoint: true,
|
isExtractEndpoint: true,
|
||||||
model: getModel("gemini-2.0-flash", "google"),
|
model: getModel("gemini-2.0-flash", "google"),
|
||||||
};
|
};
|
||||||
@ -58,7 +63,6 @@ export async function singleAnswerCompletion({
|
|||||||
),
|
),
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
// const completion = await generateCompletions({
|
// const completion = await generateCompletions({
|
||||||
// logger: logger.child({ module: "extract", method: "generateCompletions" }),
|
// logger: logger.child({ module: "extract", method: "generateCompletions" }),
|
||||||
// options: {
|
// options: {
|
||||||
@ -79,7 +83,7 @@ export async function singleAnswerCompletion({
|
|||||||
// );
|
// );
|
||||||
return {
|
return {
|
||||||
extract: completion.extract,
|
extract: completion.extract,
|
||||||
tokenUsage: completion.totalUsage,
|
tokenUsage: completion.tokenUsage,
|
||||||
sources: singleAnswerDocs.map(
|
sources: singleAnswerDocs.map(
|
||||||
(doc) => doc.metadata.url || doc.metadata.sourceURL || "",
|
(doc) => doc.metadata.url || doc.metadata.sourceURL || "",
|
||||||
),
|
),
|
||||||
|
@ -455,11 +455,12 @@ export async function performExtraction(
|
|||||||
);
|
);
|
||||||
|
|
||||||
// Race between timeout and completion
|
// Race between timeout and completion
|
||||||
const multiEntityCompletion = await completionPromise as Awaited<ReturnType<typeof batchExtractPromise>>;
|
const multiEntityCompletion = (await completionPromise) as Awaited<
|
||||||
|
ReturnType<typeof batchExtractPromise>
|
||||||
|
>;
|
||||||
|
|
||||||
// TODO: merge multiEntityCompletion.extract to fit the multiEntitySchema
|
// TODO: merge multiEntityCompletion.extract to fit the multiEntitySchema
|
||||||
|
|
||||||
|
|
||||||
// Track multi-entity extraction tokens
|
// Track multi-entity extraction tokens
|
||||||
if (multiEntityCompletion) {
|
if (multiEntityCompletion) {
|
||||||
tokenUsage.push(multiEntityCompletion.totalUsage);
|
tokenUsage.push(multiEntityCompletion.totalUsage);
|
||||||
@ -520,7 +521,9 @@ export async function performExtraction(
|
|||||||
);
|
);
|
||||||
extractionResults.push(...validResults);
|
extractionResults.push(...validResults);
|
||||||
// Merge all extracts from valid results into a single array
|
// Merge all extracts from valid results into a single array
|
||||||
const extractArrays = validResults.map(r => Array.isArray(r.extract) ? r.extract : [r.extract]);
|
const extractArrays = validResults.map((r) =>
|
||||||
|
Array.isArray(r.extract) ? r.extract : [r.extract],
|
||||||
|
);
|
||||||
const mergedExtracts = extractArrays.flat();
|
const mergedExtracts = extractArrays.flat();
|
||||||
multiEntityCompletions.push(...mergedExtracts);
|
multiEntityCompletions.push(...mergedExtracts);
|
||||||
logger.debug("All multi-entity completion chunks finished.", {
|
logger.debug("All multi-entity completion chunks finished.", {
|
||||||
@ -684,7 +687,7 @@ export async function performExtraction(
|
|||||||
tokenUsage: singleAnswerTokenUsage,
|
tokenUsage: singleAnswerTokenUsage,
|
||||||
sources: singleAnswerSources,
|
sources: singleAnswerSources,
|
||||||
} = await singleAnswerCompletion({
|
} = await singleAnswerCompletion({
|
||||||
url: request.urls?.[0] || "",
|
urls: [request.urls?.[0] || ""],
|
||||||
singleAnswerDocs,
|
singleAnswerDocs,
|
||||||
rSchema,
|
rSchema,
|
||||||
links,
|
links,
|
||||||
@ -693,10 +696,7 @@ export async function performExtraction(
|
|||||||
});
|
});
|
||||||
logger.debug("Done generating singleAnswer completions.");
|
logger.debug("Done generating singleAnswer completions.");
|
||||||
|
|
||||||
singleAnswerResult = transformArrayToObject(
|
singleAnswerResult = transformArrayToObject(rSchema, completionResult);
|
||||||
rSchema,
|
|
||||||
completionResult,
|
|
||||||
);
|
|
||||||
|
|
||||||
singleAnswerResult = deduplicateObjectsArray(singleAnswerResult);
|
singleAnswerResult = deduplicateObjectsArray(singleAnswerResult);
|
||||||
|
|
||||||
|
@ -11,7 +11,7 @@ const commonSmartScrapeProperties = {
|
|||||||
shouldUseSmartscrape: {
|
shouldUseSmartscrape: {
|
||||||
type: "boolean",
|
type: "boolean",
|
||||||
description:
|
description:
|
||||||
"Set to `true` if any of the extractedData is null and you think you can find the information by performing user-like interactions (e.g., clicking buttons/accordions to reveal hidden text, scrolling down to load more content). SmartScrape can perform these actions to access the data.",
|
"Set to `true` if any of the extractedData is null and you think you can find the information by performing user-like interactions (e.g., clicking buttons/accordions to reveal hidden text). SmartScrape can perform these actions to access the data.",
|
||||||
},
|
},
|
||||||
// Note: extractedData is added dynamically in prepareSmartScrapeSchema
|
// Note: extractedData is added dynamically in prepareSmartScrapeSchema
|
||||||
};
|
};
|
||||||
@ -22,13 +22,13 @@ const commonReasoningPromptProperties = {
|
|||||||
type: ["string", "null"],
|
type: ["string", "null"],
|
||||||
// Using the more detailed multi-step description as the common one
|
// Using the more detailed multi-step description as the common one
|
||||||
description:
|
description:
|
||||||
"Reasoning for why a SmartScrape step/action is needed. Explain which data is missing or requires interaction.",
|
"Reasoning for why a SmartScrape is needed. Explain which data is missing or requires interaction.",
|
||||||
},
|
},
|
||||||
smartscrape_prompt: {
|
smartscrape_prompt: {
|
||||||
type: ["string", "null"],
|
type: ["string", "null"],
|
||||||
// Using the more detailed multi-step description as the common one
|
// Using the more detailed multi-step description as the common one
|
||||||
description:
|
description:
|
||||||
"Prompt detailing the specific actions SmartScrape should perform (e.g., 'click button X', 'scroll down').",
|
"Prompt detailing the specific actions SmartScrape should perform (e.g., 'click button X''). Dont mention anything about extraction, smartscrape just returns page content",
|
||||||
},
|
},
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -52,7 +52,7 @@ const multiSmartScrapeWrapperSchemaDefinition = {
|
|||||||
smartScrapePages: {
|
smartScrapePages: {
|
||||||
type: "array",
|
type: "array",
|
||||||
description:
|
description:
|
||||||
"Make an entry for each page we want to run smart scrape on.",
|
"Make an entry for each page we want to run smart scrape on, no matter how many actions it should be one entry per page.",
|
||||||
items: {
|
items: {
|
||||||
type: "object",
|
type: "object",
|
||||||
properties: {
|
properties: {
|
||||||
@ -185,7 +185,7 @@ export async function extractData({
|
|||||||
//WRAP SCHEMA
|
//WRAP SCHEMA
|
||||||
const schema = extractOptions.options.schema;
|
const schema = extractOptions.options.schema;
|
||||||
const logger = extractOptions.logger;
|
const logger = extractOptions.logger;
|
||||||
const isSingleUrl = urls.length === 0;
|
const isSingleUrl = urls.length === 1;
|
||||||
console.log("!!!!!!!!!!!!!!!!!!hereee");
|
console.log("!!!!!!!!!!!!!!!!!!hereee");
|
||||||
const { schemaToUse } = prepareSmartScrapeSchema(schema, logger, isSingleUrl);
|
const { schemaToUse } = prepareSmartScrapeSchema(schema, logger, isSingleUrl);
|
||||||
const extractOptionsNewSchema = {
|
const extractOptionsNewSchema = {
|
||||||
|
@ -413,12 +413,13 @@ export async function performLLMExtract(
|
|||||||
// model: getModel("o3-mini", "openai"), // Keeping existing model selection
|
// model: getModel("o3-mini", "openai"), // Keeping existing model selection
|
||||||
// model: getModel("o3-mini", "openai"),
|
// model: getModel("o3-mini", "openai"),
|
||||||
// model: getModel("qwen-qwq-32b", "groq"),
|
// model: getModel("qwen-qwq-32b", "groq"),
|
||||||
|
// model: getModel("gemini-2.0-flash", "google"),
|
||||||
model: getModel("gemini-2.5-pro-exp-03-25", "google"),
|
model: getModel("gemini-2.5-pro-exp-03-25", "google"),
|
||||||
};
|
};
|
||||||
|
|
||||||
const { extractedDataArray, warning } = await extractData({
|
const { extractedDataArray, warning } = await extractData({
|
||||||
extractOptions: generationOptions,
|
extractOptions: generationOptions,
|
||||||
url: meta.url,
|
urls: [meta.url],
|
||||||
});
|
});
|
||||||
|
|
||||||
//TODO: add merge here
|
//TODO: add merge here
|
||||||
|
Loading…
x
Reference in New Issue
Block a user