Mirror of https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl (synced 2025-08-12 16:59:05 +08:00)

Merge branch 'main' into nsc/semantic-index-extract

This commit is contained in commit 81cf05885b.

README.md (23 changed lines)
@@ -365,19 +365,18 @@ curl -X POST https://api.firecrawl.dev/v1/batch/scrape \
   }'
 ```
 
-### Search (v0) (Beta)
+### Search
 
-Used to search the web, get the most relevant results, scrape each page and return the markdown.
+The search endpoint combines web search with Firecrawl’s scraping capabilities to return full page content for any query.
 
+Include `scrapeOptions` with `formats: ["markdown"]` to get complete markdown content for each search result; otherwise it defaults to returning SERP results only (url, title, description).
+
 ```bash
-curl -X POST https://api.firecrawl.dev/v0/search \
+curl -X POST https://api.firecrawl.dev/v1/search \
   -H 'Content-Type: application/json' \
   -H 'Authorization: Bearer YOUR_API_KEY' \
   -d '{
-    "query": "firecrawl",
-    "pageOptions": {
-      "fetchPageContent": true // false for a fast serp api
-    }
+    "query": "What is Mendable?"
   }'
 ```
@@ -387,14 +386,8 @@ curl -X POST https://api.firecrawl.dev/v0/search \
   "data": [
     {
       "url": "https://mendable.ai",
-      "markdown": "# Markdown Content",
-      "provider": "web-scraper",
-      "metadata": {
-        "title": "Mendable | AI for CX and Sales",
-        "description": "AI for CX and Sales",
-        "language": null,
-        "sourceURL": "https://www.mendable.ai/"
-      }
+      "title": "Mendable | AI for CX and Sales",
+      "description": "AI for CX and Sales"
     }
   ]
 }
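For reference, a minimal TypeScript sketch of the v1 search call documented above, requesting full markdown for each result. This example is not part of the commit: it assumes Node 18+ for the global `fetch`, `YOUR_API_KEY` is a placeholder, and the endpoint, body fields, and response shape follow the README and the controller added below.

```ts
// Minimal sketch: POST /v1/search and print title + markdown for each result.
async function searchWithMarkdown(query: string) {
  const res = await fetch("https://api.firecrawl.dev/v1/search", {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      Authorization: "Bearer YOUR_API_KEY", // placeholder
    },
    body: JSON.stringify({
      query,
      limit: 5,
      // Leave formats empty (or omit scrapeOptions) for SERP-only results:
      // url, title, description. With ["markdown"], each result is scraped.
      scrapeOptions: { formats: ["markdown"] },
    }),
  });

  if (!res.ok) throw new Error(`Search failed with status ${res.status}`);
  const { success, data, error } = await res.json();
  if (!success) throw new Error(error ?? "search returned success: false");

  for (const doc of data) {
    console.log(doc.url, "-", doc.title);
    console.log((doc.markdown ?? "").slice(0, 200));
  }
}

searchWithMarkdown("What is Mendable?").catch(console.error);
```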
apps/api/src/controllers/v1/search.ts (new file, 224 lines)
@@ -0,0 +1,224 @@
import { Response } from "express";
import { logger } from "../../lib/logger";
import {
  Document,
  RequestWithAuth,
  SearchRequest,
  SearchResponse,
  searchRequestSchema,
  ScrapeOptions,
} from "./types";
import { billTeam } from "../../services/billing/credit_billing";
import { v4 as uuidv4 } from "uuid";
import { addScrapeJob, waitForJob } from "../../services/queue-jobs";
import { logJob } from "../../services/logging/log_job";
import { getJobPriority } from "../../lib/job-priority";
import { PlanType, Mode } from "../../types";
import { getScrapeQueue } from "../../services/queue-service";
import { search } from "../../search";
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
import * as Sentry from "@sentry/node";
import { BLOCKLISTED_URL_MESSAGE } from "../../lib/strings";

async function scrapeSearchResult(
  searchResult: { url: string; title: string; description: string },
  options: {
    teamId: string;
    plan: PlanType | undefined;
    origin: string;
    timeout: number;
    scrapeOptions: ScrapeOptions;
  },
): Promise<Document> {
  const jobId = uuidv4();
  const jobPriority = await getJobPriority({
    plan: options.plan as PlanType,
    team_id: options.teamId,
    basePriority: 10,
  });

  try {
    if (isUrlBlocked(searchResult.url)) {
      throw new Error("Could not scrape url: " + BLOCKLISTED_URL_MESSAGE);
    }
    await addScrapeJob(
      {
        url: searchResult.url,
        mode: "single_urls" as Mode,
        team_id: options.teamId,
        scrapeOptions: options.scrapeOptions,
        internalOptions: {},
        plan: options.plan || "free",
        origin: options.origin,
        is_scrape: true,
      },
      {},
      jobId,
      jobPriority,
    );

    const doc = await waitForJob<Document>(jobId, options.timeout);
    await getScrapeQueue().remove(jobId);

    // Move SERP results to top level
    return {
      title: searchResult.title,
      description: searchResult.description,
      url: searchResult.url,
      ...doc,
    };
  } catch (error) {
    logger.error(`Error in scrapeSearchResult: ${error}`, {
      url: searchResult.url,
      teamId: options.teamId,
    });

    let statusCode = 0;
    if (error.message.includes("Could not scrape url")) {
      statusCode = 403;
    }
    // Return a minimal document with SERP results at top level
    return {
      title: searchResult.title,
      description: searchResult.description,
      url: searchResult.url,
      metadata: {
        statusCode,
        error: error.message,
      },
    };
  }
}

export async function searchController(
  req: RequestWithAuth<{}, SearchResponse, SearchRequest>,
  res: Response<SearchResponse>,
) {
  try {
    req.body = searchRequestSchema.parse(req.body);

    const jobId = uuidv4();
    const startTime = new Date().getTime();

    let limit = req.body.limit;

    // Buffer results by 50% to account for filtered URLs
    const num_results_buffer = Math.floor(limit * 1.5);

    let searchResults = await search({
      query: req.body.query,
      advanced: false,
      num_results: num_results_buffer,
      tbs: req.body.tbs,
      filter: req.body.filter,
      lang: req.body.lang,
      country: req.body.country,
      location: req.body.location,
    });

    // Filter blocked URLs early to avoid unnecessary billing
    if (searchResults.length > limit) {
      searchResults = searchResults.slice(0, limit);
    }

    if (searchResults.length === 0) {
      return res.status(200).json({
        success: true,
        data: [],
        warning: "No search results found",
      });
    }

    if (
      !req.body.scrapeOptions.formats ||
      req.body.scrapeOptions.formats.length === 0
    ) {
      billTeam(req.auth.team_id, req.acuc?.sub_id, searchResults.length).catch(
        (error) => {
          logger.error(
            `Failed to bill team ${req.auth.team_id} for ${searchResults.length} credits: ${error}`,
          );
        },
      );
      return res.status(200).json({
        success: true,
        data: searchResults.map((r) => ({
          url: r.url,
          title: r.title,
          description: r.description,
        })) as Document[],
      });
    }

    // Scrape each non-blocked result, handling timeouts individually
    const scrapePromises = searchResults.map((result) =>
      scrapeSearchResult(result, {
        teamId: req.auth.team_id,
        plan: req.auth.plan,
        origin: req.body.origin,
        timeout: req.body.timeout,
        scrapeOptions: req.body.scrapeOptions,
      }),
    );

    const docs = await Promise.all(scrapePromises);

    // Bill for successful scrapes only
    billTeam(req.auth.team_id, req.acuc?.sub_id, docs.length).catch((error) => {
      logger.error(
        `Failed to bill team ${req.auth.team_id} for ${docs.length} credits: ${error}`,
      );
    });

    // Filter out empty content but keep docs with SERP results
    const filteredDocs = docs.filter(
      (doc) =>
        doc.serpResults || (doc.markdown && doc.markdown.trim().length > 0),
    );

    if (filteredDocs.length === 0) {
      return res.status(200).json({
        success: true,
        data: docs,
        warning: "No content found in search results",
      });
    }

    const endTime = new Date().getTime();
    const timeTakenInSeconds = (endTime - startTime) / 1000;

    logJob({
      job_id: jobId,
      success: true,
      num_docs: filteredDocs.length,
      docs: filteredDocs,
      time_taken: timeTakenInSeconds,
      team_id: req.auth.team_id,
      mode: "search",
      url: req.body.query,
      origin: req.body.origin,
    });

    return res.status(200).json({
      success: true,
      data: filteredDocs,
    });
  } catch (error) {
    if (
      error instanceof Error &&
      (error.message.startsWith("Job wait") || error.message === "timeout")
    ) {
      return res.status(408).json({
        success: false,
        error: "Request timed out",
      });
    }

    Sentry.captureException(error);
    logger.error("Unhandled error occurred in search", { error });
    return res.status(500).json({
      success: false,
      error: error.message,
    });
  }
}
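A note on the design: the controller awaits `Promise.all(scrapePromises)`, which would normally fail fast if any promise rejected. That is safe here only because `scrapeSearchResult` catches its own errors and resolves with a minimal document instead of rethrowing, so one blocked or timed-out URL cannot reject the whole batch. A standalone sketch of that per-item error-capture pattern (generic names, not from the codebase):

```ts
// Per-item error capture so Promise.all never rejects: each task resolves
// either with its result or with a fallback value describing the failure.
type Outcome<T> = { ok: true; value: T } | { ok: false; error: string };

async function scrapeAll<T>(
  urls: string[],
  scrapeOne: (url: string) => Promise<T>,
): Promise<Outcome<T>[]> {
  return Promise.all(
    urls.map(async (url) => {
      try {
        return { ok: true as const, value: await scrapeOne(url) };
      } catch (err) {
        // The failure is recorded, not rethrown, so the other scrapes continue.
        return {
          ok: false as const,
          error: err instanceof Error ? err.message : String(err),
        };
      }
    }),
  );
}

// Example usage with a fake scraper that fails for one URL.
scrapeAll(["https://a.example", "https://b.example"], async (url) => {
  if (url.includes("b.")) throw new Error("blocked");
  return `content of ${url}`;
}).then((outcomes) => console.log(outcomes));
```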
@@ -379,6 +379,9 @@ export const mapRequestSchema = crawlerOptions
 export type MapRequest = z.infer<typeof mapRequestSchema>;
 
 export type Document = {
+  title?: string;
+  description?: string;
+  url?: string;
   markdown?: string;
   html?: string;
   rawHtml?: string;
@@ -426,6 +429,11 @@ export type Document = {
     error?: string;
     [key: string]: string | string[] | number | undefined;
   };
+  serpResults?: {
+    title: string;
+    description: string;
+    url: string;
+  };
 }
 
 export type ErrorResponse = {
@@ -757,3 +765,36 @@ export function toLegacyDocument(
     warning: document.warning,
   };
 }
+
+export const searchRequestSchema = z.object({
+  query: z.string(),
+  limit: z.number().int().positive().finite().safe().max(10).optional().default(5),
+  tbs: z.string().optional(),
+  filter: z.string().optional(),
+  lang: z.string().optional().default("en"),
+  country: z.string().optional().default("us"),
+  location: z.string().optional(),
+  origin: z.string().optional().default("api"),
+  timeout: z.number().int().positive().finite().safe().default(60000),
+  scrapeOptions: scrapeOptions.extend({
+    formats: z.array(z.enum([
+      "markdown",
+      "html",
+      "rawHtml",
+      "links",
+      "screenshot",
+      "screenshot@fullPage",
+      "extract"
+    ])).default([])
+  }).default({}),
+}).strict("Unrecognized key in body -- please review the v1 API documentation for request body changes");
+
+export type SearchRequest = z.infer<typeof searchRequestSchema>;
+
+export type SearchResponse =
+  | ErrorResponse
+  | {
+      success: true;
+      warning?: string;
+      data: Document[];
+    };
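The schema above gives every optional field a default and rejects unknown keys (such as the old v0 `pageOptions`) via `.strict()`. A simplified stand-in (illustration only; the real schema nests the shared `scrapeOptions` schema, which is not reproduced here) showing how the defaults and the strict check behave:

```ts
import { z } from "zod";

// Simplified stand-in for the v1 search request schema (illustration only).
const searchRequestSchema = z
  .object({
    query: z.string(),
    limit: z.number().int().positive().max(10).optional().default(5),
    lang: z.string().optional().default("en"),
    country: z.string().optional().default("us"),
    timeout: z.number().int().positive().default(60000),
    scrapeOptions: z
      .object({
        formats: z
          .array(z.enum(["markdown", "html", "rawHtml", "links"]))
          .default([]),
      })
      .default({}),
  })
  .strict();

// A minimal body: every omitted field is filled with its default, so the
// controller sees limit 5, lang "en", country "us", timeout 60000, and an
// empty formats array (i.e. SERP-only mode).
const parsed = searchRequestSchema.parse({ query: "firecrawl" });
console.log(parsed);

// Unknown keys such as the v0-style pageOptions are rejected by .strict().
const bad = searchRequestSchema.safeParse({ query: "x", pageOptions: {} });
console.log(bad.success); // false
```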
@@ -33,6 +33,7 @@ import { extractController } from "../controllers/v1/extract";
 // import { readinessController } from "../controllers/v1/readiness";
 import { creditUsageController } from "../controllers/v1/credit-usage";
 import { BLOCKLISTED_URL_MESSAGE } from "../lib/strings";
+import { searchController } from "../controllers/v1/search";
 
 function checkCreditsMiddleware(
   minimum?: number,
@@ -169,6 +170,13 @@ v1Router.post(
   wrap(batchScrapeController),
 );
 
+v1Router.post(
+  "/search",
+  authMiddleware(RateLimiterMode.Search),
+  checkCreditsMiddleware(),
+  wrap(searchController),
+);
+
 v1Router.post(
   "/map",
   authMiddleware(RateLimiterMode.Map),

@@ -231,3 +239,6 @@ v1Router.get(
   authMiddleware(RateLimiterMode.CrawlStatus),
   wrap(creditUsageController),
 );
+
+
+
@@ -8,7 +8,7 @@ import { serper_search } from "./serper";
 export async function search({
   query,
   advanced = false,
-  num_results = 7,
+  num_results = 5,
   tbs = undefined,
   filter = undefined,
   lang = "en",
@@ -10,6 +10,8 @@ import { InternalOptions } from "./scraper/scrapeURL";
 
 type Mode = "crawl" | "single_urls" | "sitemap";
 
+export { Mode };
+
 export interface CrawlResult {
   source: string;
   content: string;
@@ -1,6 +1,6 @@
 {
   "name": "@mendable/firecrawl-js",
-  "version": "1.10.1",
+  "version": "1.11.2",
   "description": "JavaScript SDK for Firecrawl API",
   "main": "dist/index.js",
   "types": "dist/index.d.ts",
@@ -381,8 +381,45 @@ describe('FirecrawlApp E2E Tests', () => {
     expect(filteredLinks?.length).toBeGreaterThan(0);
   }, 30000); // 30 seconds timeout
 
-  test('should throw NotImplementedError for search on v1', async () => {
+  test('should search with string query', async () => {
     const app = new FirecrawlApp({ apiUrl: API_URL, apiKey: TEST_API_KEY });
-    await expect(app.search("test query")).rejects.toThrow("Search is not supported in v1");
+    const response = await app.search("firecrawl");
+    expect(response.success).toBe(true);
+    console.log(response.data);
+    expect(response.data?.length).toBeGreaterThan(0);
+    expect(response.data?.[0]?.markdown).toBeDefined();
+    expect(response.data?.[0]?.metadata).toBeDefined();
+    expect(response.data?.[0]?.metadata?.title).toBeDefined();
+    expect(response.data?.[0]?.metadata?.description).toBeDefined();
+  });
+
+  test('should search with params object', async () => {
+    const app = new FirecrawlApp({ apiUrl: API_URL, apiKey: TEST_API_KEY });
+    const response = await app.search("firecrawl", {
+      limit: 3,
+      lang: 'en',
+      country: 'us',
+      scrapeOptions: {
+        formats: ['markdown', 'html', 'links'],
+        onlyMainContent: true
+      }
+    });
+    expect(response.success).toBe(true);
+    expect(response.data.length).toBeLessThanOrEqual(3);
+    for (const doc of response.data) {
+      expect(doc.markdown).toBeDefined();
+      expect(doc.html).toBeDefined();
+      expect(doc.links).toBeDefined();
+      expect(doc.metadata).toBeDefined();
+      expect(doc.metadata?.title).toBeDefined();
+      expect(doc.metadata?.description).toBeDefined();
+    }
+  });
+
+  test('should handle invalid API key for search', async () => {
+    const app = new FirecrawlApp({ apiUrl: API_URL, apiKey: "invalid_api_key" });
+    await expect(app.search("test query")).rejects.toThrow("Request failed with status code 404");
   });
 });
@@ -1,5 +1,5 @@
 import axios, { type AxiosResponse, type AxiosRequestHeaders, AxiosError } from "axios";
-import type * as zt from "zod";
+import * as zt from "zod";
 import { zodToJsonSchema } from "zod-to-json-schema";
 import { WebSocket } from "isows";
 import { TypedEventTarget } from "typescript-event-target";
@@ -68,6 +68,9 @@ export interface FirecrawlDocument<T = any, ActionsSchema extends (ActionsResult
   screenshot?: string;
   metadata?: FirecrawlDocumentMetadata;
   actions: ActionsSchema;
+  // v1 search only
+  title?: string;
+  description?: string;
 }
 
 /**
@@ -244,7 +247,7 @@ export interface MapResponse {
  */
 export interface ExtractParams<LLMSchema extends zt.ZodSchema = any> {
   prompt?: string;
-  schema?: LLMSchema;
+  schema?: LLMSchema | object;
   systemPrompt?: string;
   allowExternalLinks?: boolean;
   includeSubdomains?: boolean;
@@ -282,6 +285,33 @@ export class FirecrawlError extends Error {
   }
 }
 
+/**
+ * Parameters for search operations.
+ * Defines options for searching and scraping search results.
+ */
+export interface SearchParams {
+  limit?: number;
+  tbs?: string;
+  filter?: string;
+  lang?: string;
+  country?: string;
+  location?: string;
+  origin?: string;
+  timeout?: number;
+  scrapeOptions?: ScrapeParams;
+}
+
+/**
+ * Response interface for search operations.
+ * Defines the structure of the response received after a search operation.
+ */
+export interface SearchResponse {
+  success: boolean;
+  data: FirecrawlDocument<undefined>[];
+  warning?: string;
+  error?: string;
+}
+
 /**
  * Main class for interacting with the Firecrawl API.
  * Provides methods for scraping, searching, crawling, and mapping web content.
@@ -369,16 +399,80 @@ export default class FirecrawlApp {
   }
 
   /**
-   * This method is intended to search for a query using the Firecrawl API. However, it is not supported in version 1 of the API.
+   * Searches using the Firecrawl API and optionally scrapes the results.
    * @param query - The search query string.
-   * @param params - Additional parameters for the search.
-   * @returns Throws an error advising to use version 0 of the API.
+   * @param params - Optional parameters for the search request.
+   * @returns The response from the search operation.
    */
-  async search(
-    query: string,
-    params?: any
-  ): Promise<any> {
-    throw new FirecrawlError("Search is not supported in v1, please downgrade Firecrawl to 0.0.36.", 400);
+  async search(query: string, params?: SearchParams | Record<string, any>): Promise<SearchResponse> {
+    const headers: AxiosRequestHeaders = {
+      "Content-Type": "application/json",
+      Authorization: `Bearer ${this.apiKey}`,
+    } as AxiosRequestHeaders;
+
+    let jsonData: any = {
+      query,
+      limit: params?.limit ?? 5,
+      tbs: params?.tbs,
+      filter: params?.filter,
+      lang: params?.lang ?? "en",
+      country: params?.country ?? "us",
+      location: params?.location,
+      origin: params?.origin ?? "api",
+      timeout: params?.timeout ?? 60000,
+      scrapeOptions: params?.scrapeOptions ?? { formats: [] },
+    };
+
+    if (jsonData?.scrapeOptions?.extract?.schema) {
+      let schema = jsonData.scrapeOptions.extract.schema;
+
+      // Try parsing the schema as a Zod schema
+      try {
+        schema = zodToJsonSchema(schema);
+      } catch (error) {
+
+      }
+      jsonData = {
+        ...jsonData,
+        scrapeOptions: {
+          ...jsonData.scrapeOptions,
+          extract: {
+            ...jsonData.scrapeOptions.extract,
+            schema: schema,
+          },
+        },
+      };
+    }
+
+    try {
+      const response: AxiosResponse = await this.postRequest(
+        this.apiUrl + `/v1/search`,
+        jsonData,
+        headers
+      );
+
+      if (response.status === 200) {
+        const responseData = response.data;
+        if (responseData.success) {
+          return {
+            success: true,
+            data: responseData.data as FirecrawlDocument<any>[],
+            warning: responseData.warning,
+          };
+        } else {
+          throw new FirecrawlError(`Failed to search. Error: ${responseData.error}`, response.status);
+        }
+      } else {
+        this.handleError(response, "search");
+      }
+    } catch (error: any) {
+      if (error.response?.data?.error) {
+        throw new FirecrawlError(`Request failed with status code ${error.response.status}. Error: ${error.response.data.error} ${error.response.data.details ? ` - ${JSON.stringify(error.response.data.details)}` : ''}`, error.response.status);
+      } else {
+        throw new FirecrawlError(error.message, 500);
+      }
+    }
+    return { success: false, error: "Internal server error.", data: [] };
   }
 
   /**
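A usage sketch of the new SDK method, mirroring the e2e tests above; the API key is a placeholder, and with `formats: ["markdown"]` each result is scraped, otherwise only url, title, and description come back.

```ts
import FirecrawlApp from "@mendable/firecrawl-js";

async function main() {
  // Placeholder API key; apiUrl is omitted to use the hosted API.
  const app = new FirecrawlApp({ apiKey: "fc-YOUR_API_KEY" });

  const response = await app.search("firecrawl", {
    limit: 3,
    scrapeOptions: { formats: ["markdown"] },
  });

  for (const doc of response.data) {
    console.log(doc.url, "-", doc.title);     // SERP fields are hoisted to the top level
    console.log(doc.markdown?.slice(0, 120)); // populated because "markdown" was requested
  }
}

main().catch(console.error);
```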
@@ -741,16 +835,18 @@ export default class FirecrawlApp {
   async extract<T extends zt.ZodSchema = any>(urls: string[], params?: ExtractParams<T>): Promise<ExtractResponse<zt.infer<T>> | ErrorResponse> {
     const headers = this.prepareHeaders();
 
-    if (!params?.prompt) {
-      throw new FirecrawlError("Prompt is required", 400);
-    }
-
     let jsonData: { urls: string[] } & ExtractParams<T> = { urls, ...params };
     let jsonSchema: any;
     try {
-      jsonSchema = params?.schema ? zodToJsonSchema(params.schema) : undefined;
+      if (!params?.schema) {
+        jsonSchema = undefined;
+      } else if (params.schema instanceof zt.ZodType) {
+        jsonSchema = zodToJsonSchema(params.schema);
+      } else {
+        jsonSchema = params.schema;
+      }
     } catch (error: any) {
-      throw new FirecrawlError("Invalid schema. Use a valid Zod schema.", 400);
+      throw new FirecrawlError("Invalid schema. Schema must be either a valid Zod schema or JSON schema object.", 400);
     }
 
     try {
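With this change, `extract()` accepts either a Zod schema or a plain JSON-schema object, which is now passed through as-is instead of being run through `zodToJsonSchema`. A usage sketch (the URL and schema below are illustrative only):

```ts
import FirecrawlApp from "@mendable/firecrawl-js";

async function runExtract() {
  const app = new FirecrawlApp({ apiKey: "fc-YOUR_API_KEY" }); // placeholder key

  // A plain JSON-schema object; a Zod schema still works as before.
  const schema = {
    type: "object",
    properties: {
      company_name: { type: "string" },
      is_open_source: { type: "boolean" },
    },
    required: ["company_name"],
  };

  const result = await app.extract(["https://firecrawl.dev"], {
    prompt: "Extract the company name and whether the project is open source.",
    schema,
  });

  console.log(result);
}

runExtract().catch(console.error);
```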
@@ -13,7 +13,7 @@ import os
 
 from .firecrawl import FirecrawlApp # noqa
 
-__version__ = "1.7.1"
+__version__ = "1.8.0"
 
 # Define the logger for the Firecrawl project
 logger: logging.Logger = logging.getLogger("firecrawl")
@@ -371,4 +371,70 @@ def test_search_e2e():
     # assert isinstance(llm_extraction['supports_sso'], bool)
     # assert isinstance(llm_extraction['is_open_source'], bool)
 
+def test_search_with_string_query():
+    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
+    response = app.search("firecrawl")
+    assert response["success"] is True
+    assert len(response["data"]) > 0
+    assert response["data"][0]["markdown"] is not None
+    assert response["data"][0]["metadata"] is not None
+    assert response["data"][0]["metadata"]["title"] is not None
+    assert response["data"][0]["metadata"]["description"] is not None
+
+def test_search_with_params_dict():
+    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
+    response = app.search("firecrawl", {
+        "limit": 3,
+        "lang": "en",
+        "country": "us",
+        "scrapeOptions": {
+            "formats": ["markdown", "html", "links"],
+            "onlyMainContent": True
+        }
+    })
+    assert response["success"] is True
+    assert len(response["data"]) <= 3
+    for doc in response["data"]:
+        assert doc["markdown"] is not None
+        assert doc["html"] is not None
+        assert doc["links"] is not None
+        assert doc["metadata"] is not None
+        assert doc["metadata"]["title"] is not None
+        assert doc["metadata"]["description"] is not None
+
+def test_search_with_params_object():
+    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
+    params = SearchParams(
+        query="firecrawl",
+        limit=3,
+        lang="en",
+        country="us",
+        scrapeOptions={
+            "formats": ["markdown", "html", "links"],
+            "onlyMainContent": True
+        }
+    )
+    response = app.search(params.query, params)
+    assert response["success"] is True
+    assert len(response["data"]) <= 3
+    for doc in response["data"]:
+        assert doc["markdown"] is not None
+        assert doc["html"] is not None
+        assert doc["links"] is not None
+        assert doc["metadata"] is not None
+        assert doc["metadata"]["title"] is not None
+        assert doc["metadata"]["description"] is not None
+
+def test_search_invalid_api_key():
+    app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key")
+    with pytest.raises(Exception) as e:
+        app.search("test query")
+    assert "404" in str(e.value)
+
+def test_search_with_invalid_params():
+    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
+    with pytest.raises(Exception) as e:
+        app.search("test query", {"invalid_param": "value"})
+    assert "ValidationError" in str(e.value)
@@ -21,7 +21,28 @@ import websockets
 
 logger : logging.Logger = logging.getLogger("firecrawl")
 
+class SearchParams(pydantic.BaseModel):
+    query: str
+    limit: Optional[int] = 5
+    tbs: Optional[str] = None
+    filter: Optional[str] = None
+    lang: Optional[str] = "en"
+    country: Optional[str] = "us"
+    location: Optional[str] = None
+    origin: Optional[str] = "api"
+    timeout: Optional[int] = 60000
+    scrapeOptions: Optional[Dict[str, Any]] = None
+
 class FirecrawlApp:
+    class SearchResponse(pydantic.BaseModel):
+        """
+        Response from the search operation.
+        """
+        success: bool
+        data: List[Dict[str, Any]]
+        warning: Optional[str] = None
+        error: Optional[str] = None
+
     class ExtractParams(pydantic.BaseModel):
         """
         Parameters for the extract operation.
@@ -109,22 +130,36 @@ class FirecrawlApp:
         else:
             self._handle_error(response, 'scrape URL')
 
-    def search(self, query: str, params: Optional[Dict[str, Any]] = None) -> Any:
+    def search(self, query: str, params: Optional[Union[Dict[str, Any], SearchParams]] = None) -> Dict[str, Any]:
         """
-        Perform a search using the Firecrawl API.
+        Search for content using the Firecrawl API.
 
         Args:
-            query (str): The search query.
-            params (Optional[Dict[str, Any]]): Additional parameters for the search request.
+            query (str): The search query string.
+            params (Optional[Union[Dict[str, Any], SearchParams]]): Additional search parameters.
 
         Returns:
-            Any: The search results if the request is successful.
-
-        Raises:
-            NotImplementedError: If the search request is attempted on API version v1.
-            Exception: If the search request fails.
+            Dict[str, Any]: The search response containing success status and search results.
         """
-        raise NotImplementedError("Search is not supported in v1.")
+        if params is None:
+            params = {}
+
+        if isinstance(params, dict):
+            search_params = SearchParams(query=query, **params)
+        else:
+            search_params = params
+            search_params.query = query
+
+        response = requests.post(
+            f"{self.api_url}/v1/search",
+            headers={"Authorization": f"Bearer {self.api_key}"},
+            json=search_params.dict(exclude_none=True)
+        )
+
+        if response.status_code != 200:
+            raise Exception(f"Request failed with status code {response.status_code}")
+
+        return response.json()
 
     def crawl_url(self, url: str,
                   params: Optional[Dict[str, Any]] = None,