mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-07-03 20:35:09 +08:00
Caleb: now extracting and returning a list of all links on the page for a customer
This commit is contained in:
parent
db0545014f
commit
d39d3be649
@ -89,6 +89,7 @@ export class Document {
|
|||||||
warning?: string;
|
warning?: string;
|
||||||
|
|
||||||
index?: number;
|
index?: number;
|
||||||
|
linksOnPage?: string[]; // Add this new field as a separate property
|
||||||
|
|
||||||
constructor(data: Partial<Document>) {
|
constructor(data: Partial<Document>) {
|
||||||
if (!data.content) {
|
if (!data.content) {
|
||||||
@ -102,6 +103,7 @@ export class Document {
|
|||||||
this.markdown = data.markdown || "";
|
this.markdown = data.markdown || "";
|
||||||
this.childrenLinks = data.childrenLinks || undefined;
|
this.childrenLinks = data.childrenLinks || undefined;
|
||||||
this.provider = data.provider || undefined;
|
this.provider = data.provider || undefined;
|
||||||
|
this.linksOnPage = data.linksOnPage; // Assign linksOnPage if provided
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -109,6 +109,38 @@ function getScrapingFallbackOrder(
|
|||||||
return scrapersInOrder as (typeof baseScrapers)[number][];
|
return scrapersInOrder as (typeof baseScrapers)[number][];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Turns a single raw `href` attribute value into the link string to report,
 * or `undefined` when the value should not be reported at all.
 *
 * - Empty and fragment-only (`#...`) values are ignored.
 * - Absolute `http(s)://` URLs and `mailto:` links are returned as written
 *   (after trimming surrounding whitespace).
 * - Everything else is resolved against `base` with the WHATWG URL algorithm,
 *   which correctly handles root-relative (`/x`), document-relative (`x`,
 *   `../x`, `?q=1`) and protocol-relative (`//host/x`) references.
 * - Values that resolve to a non-http(s) scheme (`tel:`, `javascript:`,
 *   `data:`, ...) or that cannot be parsed are ignored instead of being
 *   glued onto the page URL.
 */
function resolveLinkHref(rawHref: string, base: URL): string | undefined {
  const href = rawHref.trim();
  if (href === '' || href.startsWith('#')) {
    return undefined;
  }

  // Already-absolute web links and mailto: links are passed through untouched
  // so callers see them exactly as the page author wrote them.
  if (/^https?:\/\//i.test(href) || /^mailto:/i.test(href)) {
    return href;
  }

  try {
    const resolved = new URL(href, base);
    const isWebLink =
      resolved.protocol === 'http:' || resolved.protocol === 'https:';
    return isWebLink ? resolved.href : undefined;
  } catch {
    // Malformed href: skip it rather than failing the whole extraction.
    return undefined;
  }
}

/**
 * Collects the targets of every `<a href>` in `html` as absolute links.
 *
 * @param html - Markup of the page to scan.
 * @param baseUrl - URL the page was loaded from; relative hrefs are resolved
 *   against it.
 * @returns De-duplicated links in first-seen document order. Contains
 *   http(s) URLs and `mailto:` links only; fragment-only anchors are omitted.
 * @throws TypeError if `baseUrl` is not a valid absolute URL.
 */
function extractLinks(html: string, baseUrl: string): string[] {
  const $ = cheerio.load(html);

  // Parsed once up front; every relative href is resolved against it.
  const base = new URL(baseUrl);

  // A Set keeps insertion order while dropping duplicates.
  const links = new Set<string>();

  $('a').each((_, element) => {
    const href = $(element).attr('href');
    if (!href) {
      return;
    }

    const link = resolveLinkHref(href, base);
    if (link !== undefined) {
      links.add(link);
    }
  });

  return [...links];
}
|
||||||
|
|
||||||
export async function scrapSingleUrl(
|
export async function scrapSingleUrl(
|
||||||
urlToScrap: string,
|
urlToScrap: string,
|
||||||
pageOptions: PageOptions = {
|
pageOptions: PageOptions = {
|
||||||
@ -234,7 +266,6 @@ export async function scrapSingleUrl(
|
|||||||
scraperResponse.text = customScrapedContent.html;
|
scraperResponse.text = customScrapedContent.html;
|
||||||
screenshot = customScrapedContent.screenshot;
|
screenshot = customScrapedContent.screenshot;
|
||||||
}
|
}
|
||||||
|
|
||||||
//* TODO: add an optional to return markdown or structured/extracted content
|
//* TODO: add an optional to return markdown or structured/extracted content
|
||||||
let cleanedHtml = removeUnwantedElements(scraperResponse.text, pageOptions);
|
let cleanedHtml = removeUnwantedElements(scraperResponse.text, pageOptions);
|
||||||
return {
|
return {
|
||||||
@ -309,6 +340,10 @@ export async function scrapSingleUrl(
|
|||||||
const soup = cheerio.load(rawHtml);
|
const soup = cheerio.load(rawHtml);
|
||||||
const metadata = extractMetadata(soup, urlToScrap);
|
const metadata = extractMetadata(soup, urlToScrap);
|
||||||
|
|
||||||
|
let linksOnPage: string[] | undefined;
|
||||||
|
|
||||||
|
linksOnPage = extractLinks(rawHtml, urlToScrap);
|
||||||
|
|
||||||
let document: Document;
|
let document: Document;
|
||||||
if (screenshot && screenshot.length > 0) {
|
if (screenshot && screenshot.length > 0) {
|
||||||
document = {
|
document = {
|
||||||
@ -317,9 +352,10 @@ export async function scrapSingleUrl(
|
|||||||
html: pageOptions.includeHtml ? html : undefined,
|
html: pageOptions.includeHtml ? html : undefined,
|
||||||
rawHtml:
|
rawHtml:
|
||||||
pageOptions.includeRawHtml ||
|
pageOptions.includeRawHtml ||
|
||||||
extractorOptions.mode === "llm-extraction-from-raw-html"
|
extractorOptions.mode === "llm-extraction-from-raw-html"
|
||||||
? rawHtml
|
? rawHtml
|
||||||
: undefined,
|
: undefined,
|
||||||
|
linksOnPage,
|
||||||
metadata: {
|
metadata: {
|
||||||
...metadata,
|
...metadata,
|
||||||
screenshot: screenshot,
|
screenshot: screenshot,
|
||||||
@ -335,7 +371,7 @@ export async function scrapSingleUrl(
|
|||||||
html: pageOptions.includeHtml ? html : undefined,
|
html: pageOptions.includeHtml ? html : undefined,
|
||||||
rawHtml:
|
rawHtml:
|
||||||
pageOptions.includeRawHtml ||
|
pageOptions.includeRawHtml ||
|
||||||
extractorOptions.mode === "llm-extraction-from-raw-html"
|
extractorOptions.mode === "llm-extraction-from-raw-html"
|
||||||
? rawHtml
|
? rawHtml
|
||||||
: undefined,
|
: undefined,
|
||||||
metadata: {
|
metadata: {
|
||||||
@ -344,6 +380,7 @@ export async function scrapSingleUrl(
|
|||||||
pageStatusCode: pageStatusCode,
|
pageStatusCode: pageStatusCode,
|
||||||
pageError: pageError,
|
pageError: pageError,
|
||||||
},
|
},
|
||||||
|
linksOnPage,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -354,6 +391,7 @@ export async function scrapSingleUrl(
|
|||||||
content: "",
|
content: "",
|
||||||
markdown: "",
|
markdown: "",
|
||||||
html: "",
|
html: "",
|
||||||
|
linksOnPage: [],
|
||||||
metadata: {
|
metadata: {
|
||||||
sourceURL: urlToScrap,
|
sourceURL: urlToScrap,
|
||||||
pageStatusCode: pageStatusCode,
|
pageStatusCode: pageStatusCode,
|
||||||
|
Loading…
x
Reference in New Issue
Block a user