Caleb: now extracting and returning a list of all links on the page for a customer

This commit is contained in:
Caleb Peffer 2024-07-16 18:38:03 -07:00
parent db0545014f
commit d39d3be649
2 changed files with 44 additions and 4 deletions

View File

@ -89,6 +89,7 @@ export class Document {
warning?: string; warning?: string;
index?: number; index?: number;
linksOnPage?: string[]; // Add this new field as a separate property
constructor(data: Partial<Document>) { constructor(data: Partial<Document>) {
if (!data.content) { if (!data.content) {
@ -102,6 +103,7 @@ export class Document {
this.markdown = data.markdown || ""; this.markdown = data.markdown || "";
this.childrenLinks = data.childrenLinks || undefined; this.childrenLinks = data.childrenLinks || undefined;
this.provider = data.provider || undefined; this.provider = data.provider || undefined;
this.linksOnPage = data.linksOnPage; // Assign linksOnPage if provided
} }
} }

View File

@ -109,6 +109,38 @@ function getScrapingFallbackOrder(
return scrapersInOrder as (typeof baseScrapers)[number][]; return scrapersInOrder as (typeof baseScrapers)[number][];
} }
/**
 * Collects the targets of every `<a href>` in an HTML document.
 *
 * - Absolute `http://` / `https://` hrefs are returned verbatim.
 * - `mailto:` hrefs are returned verbatim.
 * - Fragment-only hrefs (`#section`) and empty hrefs are ignored.
 * - Every other href (root-relative, document-relative, `../`, `?query`,
 *   protocol-relative `//host/path`) is resolved against `baseUrl` using
 *   standard URL resolution rather than string concatenation, so a base such
 *   as `https://a.com/docs/page.html` with href `next.html` yields
 *   `https://a.com/docs/next.html`.
 * - Hrefs that resolve to a non-web scheme (`javascript:`, `tel:`, `data:` …)
 *   or that cannot be parsed are skipped.
 *
 * @param html - Markup to scan for anchor elements.
 * @param baseUrl - Absolute URL of the page the markup came from.
 * @returns De-duplicated links in first-seen document order.
 * @throws TypeError if `baseUrl` is not a valid absolute URL.
 */
function extractLinks(html: string, baseUrl: string): string[] {
  const $ = cheerio.load(html);

  // Parse once up front: an invalid base fails loudly here instead of being
  // silently swallowed per-link below.
  const base = new URL(baseUrl);

  // A Set de-duplicates while preserving insertion order.
  const links = new Set<string>();

  $('a').each((_, element) => {
    const href = $(element).attr('href')?.trim();

    if (!href || href.startsWith('#')) {
      // Nothing to follow: missing/empty href or same-page fragment.
    } else if (
      href.startsWith('http://') ||
      href.startsWith('https://') ||
      href.startsWith('mailto:')
    ) {
      // Already absolute (or a mail link): keep exactly as authored.
      links.add(href);
    } else {
      try {
        const resolved = new URL(href, base);
        // Only keep navigable web links; drops javascript:, tel:, data:, etc.
        if (resolved.protocol === 'http:' || resolved.protocol === 'https:') {
          links.add(resolved.href);
        }
      } catch {
        // Malformed href that cannot be resolved against the base; skip it.
      }
    }
  });

  return [...links];
}
export async function scrapSingleUrl( export async function scrapSingleUrl(
urlToScrap: string, urlToScrap: string,
pageOptions: PageOptions = { pageOptions: PageOptions = {
@ -234,7 +266,6 @@ export async function scrapSingleUrl(
scraperResponse.text = customScrapedContent.html; scraperResponse.text = customScrapedContent.html;
screenshot = customScrapedContent.screenshot; screenshot = customScrapedContent.screenshot;
} }
//* TODO: add an optional to return markdown or structured/extracted content //* TODO: add an optional to return markdown or structured/extracted content
let cleanedHtml = removeUnwantedElements(scraperResponse.text, pageOptions); let cleanedHtml = removeUnwantedElements(scraperResponse.text, pageOptions);
return { return {
@ -309,6 +340,10 @@ export async function scrapSingleUrl(
const soup = cheerio.load(rawHtml); const soup = cheerio.load(rawHtml);
const metadata = extractMetadata(soup, urlToScrap); const metadata = extractMetadata(soup, urlToScrap);
let linksOnPage: string[] | undefined;
linksOnPage = extractLinks(rawHtml, urlToScrap);
let document: Document; let document: Document;
if (screenshot && screenshot.length > 0) { if (screenshot && screenshot.length > 0) {
document = { document = {
@ -317,9 +352,10 @@ export async function scrapSingleUrl(
html: pageOptions.includeHtml ? html : undefined, html: pageOptions.includeHtml ? html : undefined,
rawHtml: rawHtml:
pageOptions.includeRawHtml || pageOptions.includeRawHtml ||
extractorOptions.mode === "llm-extraction-from-raw-html" extractorOptions.mode === "llm-extraction-from-raw-html"
? rawHtml ? rawHtml
: undefined, : undefined,
linksOnPage,
metadata: { metadata: {
...metadata, ...metadata,
screenshot: screenshot, screenshot: screenshot,
@ -335,7 +371,7 @@ export async function scrapSingleUrl(
html: pageOptions.includeHtml ? html : undefined, html: pageOptions.includeHtml ? html : undefined,
rawHtml: rawHtml:
pageOptions.includeRawHtml || pageOptions.includeRawHtml ||
extractorOptions.mode === "llm-extraction-from-raw-html" extractorOptions.mode === "llm-extraction-from-raw-html"
? rawHtml ? rawHtml
: undefined, : undefined,
metadata: { metadata: {
@ -344,6 +380,7 @@ export async function scrapSingleUrl(
pageStatusCode: pageStatusCode, pageStatusCode: pageStatusCode,
pageError: pageError, pageError: pageError,
}, },
linksOnPage,
}; };
} }
@ -354,6 +391,7 @@ export async function scrapSingleUrl(
content: "", content: "",
markdown: "", markdown: "",
html: "", html: "",
linksOnPage: [],
metadata: { metadata: {
sourceURL: urlToScrap, sourceURL: urlToScrap,
pageStatusCode: pageStatusCode, pageStatusCode: pageStatusCode,