Merge pull request #423 from mendableai/cjp/linksOnPage
Caleb: Return a list of links on a page by default
Commit: 8d5ebc9b9f
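In short: every scraped page now gets a linksOnPage array on the returned Document, populated from the anchors found in the raw HTML. A minimal caller-side sketch of what that looks like, assuming the scrapSingleUrl signature used in the tests below; the import paths are illustrative, not the repo's exact layout:

import { scrapSingleUrl } from "./scraper/WebScraper/single_url"; // path is an assumption
import { PageOptions } from "./lib/entities";                     // path is an assumption

async function listLinks(url: string): Promise<string[]> {
  const pageOptions: PageOptions = { includeHtml: true };
  const doc = await scrapSingleUrl(url, pageOptions);
  // linksOnPage is filled in by default; it is [] when nothing could be scraped.
  return doc.linksOnPage ?? [];
}

listLinks("https://mendable.ai").then((links) => console.log(links.length, "links found"));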
Document entity (lib/entities):

@@ -89,7 +89,8 @@ export class Document {
   warning?: string;
 
   index?: number;
+  linksOnPage?: string[]; // Add this new field as a separate property
 
   constructor(data: Partial<Document>) {
     if (!data.content) {
       throw new Error("Missing required fields");
@@ -102,6 +103,7 @@ export class Document {
     this.markdown = data.markdown || "";
     this.childrenLinks = data.childrenLinks || undefined;
     this.provider = data.provider || undefined;
+    this.linksOnPage = data.linksOnPage; // Assign linksOnPage if provided
   }
 }
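For illustration, a quick sketch of how the new field behaves on the entity itself. This is not part of the PR; it only exercises the constructor lines shown above, and the import path and extra fields passed in are assumptions:

import { Document } from "./lib/entities"; // path is an assumption

const doc = new Document({
  content: "# Roast",                        // content is required: the constructor throws without it
  markdown: "# Roast",
  linksOnPage: ["https://mendable.ai/blog"], // stored as-is on the instance
});
console.log(doc.linksOnPage); // ["https://mendable.ai/blog"]

const bare = new Document({ content: "hello" });
console.log(bare.linksOnPage); // undefined — the field stays unset when not provided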
scrapSingleUrl test file:

@@ -1,3 +1,7 @@
+import { scrapSingleUrl } from '../single_url';
+import { PageOptions } from '../../../lib/entities';
+
+
 jest.mock('../single_url', () => {
   const originalModule = jest.requireActual('../single_url');
   originalModule.fetchHtmlContent = jest.fn().mockResolvedValue('<html><head><title>Test</title></head><body><h1>Roast</h1></body></html>');
@@ -5,9 +9,6 @@ jest.mock('../single_url', () => {
   return originalModule;
 });
 
-import { scrapSingleUrl } from '../single_url';
-import { PageOptions } from '../../../lib/entities';
-
 describe('scrapSingleUrl', () => {
   it('should handle includeHtml option correctly', async () => {
     const url = 'https://roastmywebsite.ai';
@@ -22,3 +23,15 @@ describe('scrapSingleUrl', () => {
   }, 10000);
 });
 
+it('should return a list of links on the mendable.ai page', async () => {
+  const url = 'https://mendable.ai';
+  const pageOptions: PageOptions = { includeHtml: true };
+
+  const result = await scrapSingleUrl(url, pageOptions);
+
+  // Check if the result contains a list of links
+  expect(result.linksOnPage).toBeDefined();
+  expect(Array.isArray(result.linksOnPage)).toBe(true);
+  expect(result.linksOnPage.length).toBeGreaterThan(0);
+  expect(result.linksOnPage).toContain('https://mendable.ai/blog')
+}, 10000);
single_url.ts:

@@ -16,6 +16,7 @@ import { scrapWithFetch } from "./scrapers/fetch";
 import { scrapWithFireEngine } from "./scrapers/fireEngine";
 import { scrapWithPlaywright } from "./scrapers/playwright";
 import { scrapWithScrapingBee } from "./scrapers/scrapingBee";
+import { extractLinks } from "./utils/utils";
 
 dotenv.config();
 
@@ -109,6 +110,8 @@ function getScrapingFallbackOrder(
   return scrapersInOrder as (typeof baseScrapers)[number][];
 }
 
 export async function scrapSingleUrl(
   urlToScrap: string,
   pageOptions: PageOptions = {
@@ -234,7 +237,6 @@
     scraperResponse.text = customScrapedContent.html;
     screenshot = customScrapedContent.screenshot;
   }
 
   //* TODO: add an optional to return markdown or structured/extracted content
   let cleanedHtml = removeUnwantedElements(scraperResponse.text, pageOptions);
   return {
@@ -309,6 +311,10 @@
   const soup = cheerio.load(rawHtml);
   const metadata = extractMetadata(soup, urlToScrap);
 
+  let linksOnPage: string[] | undefined;
+
+  linksOnPage = extractLinks(rawHtml, urlToScrap);
+
   let document: Document;
   if (screenshot && screenshot.length > 0) {
     document = {
@@ -317,9 +323,10 @@
       html: pageOptions.includeHtml ? html : undefined,
       rawHtml:
         pageOptions.includeRawHtml ||
-        extractorOptions.mode === "llm-extraction-from-raw-html"
+        extractorOptions.mode === "llm-extraction-from-raw-html"
           ? rawHtml
           : undefined,
+      linksOnPage,
       metadata: {
         ...metadata,
         screenshot: screenshot,
@@ -335,7 +342,7 @@
       html: pageOptions.includeHtml ? html : undefined,
       rawHtml:
         pageOptions.includeRawHtml ||
-        extractorOptions.mode === "llm-extraction-from-raw-html"
+        extractorOptions.mode === "llm-extraction-from-raw-html"
           ? rawHtml
           : undefined,
       metadata: {
@@ -344,6 +351,7 @@
         pageStatusCode: pageStatusCode,
         pageError: pageError,
       },
+      linksOnPage,
     };
   }
 
@@ -354,6 +362,7 @@
     content: "",
     markdown: "",
     html: "",
+    linksOnPage: [],
     metadata: {
       sourceURL: urlToScrap,
       pageStatusCode: pageStatusCode,
utils/utils.ts:

@@ -1,4 +1,6 @@
 import axios from "axios";
+import * as cheerio from "cheerio";
+
 
 export async function attemptScrapWithRequests(
   urlToScrap: string
@@ -21,3 +23,35 @@
 export function sanitizeText(text: string): string {
   return text.replace("\u0000", "");
 }
+
+export function extractLinks(html: string, baseUrl: string): string[] {
+  const $ = cheerio.load(html);
+  const links: string[] = [];
+
+  // Parse the base URL to get the origin
+  const urlObject = new URL(baseUrl);
+  const origin = urlObject.origin;
+
+  $('a').each((_, element) => {
+    const href = $(element).attr('href');
+    if (href) {
+      if (href.startsWith('http://') || href.startsWith('https://')) {
+        // Absolute URL, add as is
+        links.push(href);
+      } else if (href.startsWith('/')) {
+        // Relative URL starting with '/', append to origin
+        links.push(`${origin}${href}`);
+      } else if (!href.startsWith('#') && !href.startsWith('mailto:')) {
+        // Relative URL not starting with '/', append to base URL
+        links.push(`${baseUrl}/${href}`);
+      } else if (href.startsWith('mailto:')) {
+        // mailto: links, add as is
+        links.push(href);
+      }
+      // Fragment-only links (#) are ignored
+    }
+  });
+
+  // Remove duplicates and return
+  return [...new Set(links)];
+}
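To make the resolution rules concrete, here is a small sketch of what extractLinks returns for each kind of href handled above. The sample HTML and base URL are made up, and extractLinks is assumed to be imported from the module above:

const html = `
  <a href="https://example.com/docs">absolute</a>
  <a href="/pricing">root-relative</a>
  <a href="blog/post-1">relative</a>
  <a href="mailto:team@example.com">email</a>
  <a href="#section">fragment</a>
  <a href="/pricing">duplicate</a>
`;

const links = extractLinks(html, "https://example.com/blog");
// [
//   "https://example.com/docs",             // absolute URL, kept as-is
//   "https://example.com/pricing",          // "/pricing" joined to the origin
//   "https://example.com/blog/blog/post-1", // other relative hrefs are appended to the full base URL, not resolved against it
//   "mailto:team@example.com"               // mailto links are kept
// ]
// "#section" is skipped, and the repeated "/pricing" is removed by the Set.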