mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-14 05:05:55 +08:00
Nick: fixed extract issues
This commit is contained in:
parent
6d77879d68
commit
79e335636a
@ -61,7 +61,7 @@ export async function extractController(
|
|||||||
const baseUrl = url.replace("/*", "");
|
const baseUrl = url.replace("/*", "");
|
||||||
// const pathPrefix = baseUrl.split('/').slice(3).join('/'); // Get path after domain if any
|
// const pathPrefix = baseUrl.split('/').slice(3).join('/'); // Get path after domain if any
|
||||||
|
|
||||||
const allowExternalLinks = req.body.allowExternalLinks ?? true;
|
const allowExternalLinks = req.body.allowExternalLinks;
|
||||||
let urlWithoutWww = baseUrl.replace("www.", "");
|
let urlWithoutWww = baseUrl.replace("www.", "");
|
||||||
let mapUrl =
|
let mapUrl =
|
||||||
req.body.prompt && allowExternalLinks
|
req.body.prompt && allowExternalLinks
|
||||||
@ -84,6 +84,8 @@ export async function extractController(
|
|||||||
includeSubdomains: req.body.includeSubdomains,
|
includeSubdomains: req.body.includeSubdomains,
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// console.log("mapResults", mapResults);
|
||||||
|
|
||||||
let mappedLinks = mapResults.links as MapDocument[];
|
let mappedLinks = mapResults.links as MapDocument[];
|
||||||
// Limit number of links to MAX_EXTRACT_LIMIT
|
// Limit number of links to MAX_EXTRACT_LIMIT
|
||||||
mappedLinks = mappedLinks.slice(0, MAX_EXTRACT_LIMIT);
|
mappedLinks = mappedLinks.slice(0, MAX_EXTRACT_LIMIT);
|
||||||
@ -92,6 +94,7 @@ export async function extractController(
|
|||||||
(x) =>
|
(x) =>
|
||||||
`url: ${x.url}, title: ${x.title}, description: ${x.description}`,
|
`url: ${x.url}, title: ${x.title}, description: ${x.description}`,
|
||||||
);
|
);
|
||||||
|
// console.log("mappedLinksRerank", mappedLinksRerank);
|
||||||
|
|
||||||
// Filter by path prefix if present
|
// Filter by path prefix if present
|
||||||
// wrong
|
// wrong
|
||||||
@ -150,15 +153,20 @@ export async function extractController(
|
|||||||
} else {
|
} else {
|
||||||
// Handle direct URLs without glob pattern
|
// Handle direct URLs without glob pattern
|
||||||
if (!isUrlBlocked(url)) {
|
if (!isUrlBlocked(url)) {
|
||||||
|
// console.log("url", url);
|
||||||
return [url];
|
return [url];
|
||||||
}
|
}
|
||||||
return [];
|
return [];
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// console.log("urlPromises", urlPromises.length);
|
||||||
|
|
||||||
// Wait for all URL processing to complete and flatten results
|
// Wait for all URL processing to complete and flatten results
|
||||||
const processedUrls = await Promise.all(urlPromises);
|
const processedUrls = await Promise.all(urlPromises);
|
||||||
links.push(...processedUrls.flat());
|
const flattenedUrls = processedUrls.flat().filter(url => url); // Filter out any null/undefined values
|
||||||
|
links.push(...flattenedUrls);
|
||||||
|
// console.log("links", links.length, "flattenedUrls", flattenedUrls.length);
|
||||||
|
|
||||||
if (links.length === 0) {
|
if (links.length === 0) {
|
||||||
return res.status(400).json({
|
return res.status(400).json({
|
||||||
|
@ -198,7 +198,7 @@ export const extractV1Options = z
|
|||||||
limit: z.number().int().positive().finite().safe().optional(),
|
limit: z.number().int().positive().finite().safe().optional(),
|
||||||
ignoreSitemap: z.boolean().default(false),
|
ignoreSitemap: z.boolean().default(false),
|
||||||
includeSubdomains: z.boolean().default(true),
|
includeSubdomains: z.boolean().default(true),
|
||||||
allowExternalLinks: z.boolean().default(true),
|
allowExternalLinks: z.boolean().default(false),
|
||||||
origin: z.string().optional().default("api"),
|
origin: z.string().optional().default("api"),
|
||||||
timeout: z.number().int().positive().finite().safe().default(60000),
|
timeout: z.number().int().positive().finite().safe().default(60000),
|
||||||
})
|
})
|
||||||
|
Loading…
x
Reference in New Issue
Block a user