Mirror of https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl (synced 2025-08-13 22:46:08 +08:00)

Merge branch 'main' into sdk-improv/async

This commit is contained in commit e425667005.
@@ -5,7 +5,9 @@ const sleepForBatchBilling = () => sleep(20000);
 
 beforeAll(async () => {
   // Wait for previous test runs to stop billing processing
-  await sleep(40000);
+  if (!process.env.TEST_SUITE_SELF_HOSTED) {
+    await sleep(40000);
+  }
 }, 50000);
 
 describe("Billing tests", () => {
@@ -95,7 +97,7 @@ describe("Billing tests", () => {
       const rc2 = (await creditUsage()).remaining_credits;
 
       expect(rc1 - rc2).toBe(12);
-    }, 300000);
+    }, 600000);
 
     it("bills crawl correctly", async () => {
       const rc1 = (await creditUsage()).remaining_credits;
@@ -137,7 +139,7 @@ describe("Billing tests", () => {
       if (crawl1.success && crawl2.success) {
         expect(rc1 - rc2).toBe(crawl1.completed + crawl2.completed * 5);
       }
-    }, 300000);
+    }, 600000);
 
     it("bills map correctly", async () => {
       const rc1 = (await creditUsage()).remaining_credits;
@@ -29,6 +29,28 @@ describe("Extract tests", () => {
     expect(typeof res.data.is_open_source).toBe("boolean");
     expect(res.data.is_open_source).toBe(true);
   }, 60000);
+
+  it.concurrent("works with unsupported JSON schema parameters", async () => {
+    const res = await extract({
+      urls: ["https://firecrawl.dev"],
+      schema: {
+        "type": "object",
+        "properties": {
+          "company_name": {
+            "type": "string",
+            "pattern": "^[a-zA-Z0-9]+$"
+          },
+        },
+        "required": [
+          "company_name"
+        ]
+      },
+      origin: "api-sdk",
+    });
+
+    expect(res.data).toHaveProperty("company_name");
+    expect(typeof res.data.company_name).toBe("string")
+  }, 60000);
 } else {
   it.concurrent("dummy test", () => {
     expect(true).toBe(true);
@@ -10,6 +10,7 @@ export const deepResearchRequestSchema = z.object({
   maxDepth: z.number().min(1).max(12).default(7).describe('Maximum depth of research iterations'),
   maxUrls: z.number().min(1).max(1000).default(20).describe('Maximum number of URLs to analyze'),
   timeLimit: z.number().min(30).max(600).default(300).describe('Time limit in seconds'),
+  analysisPrompt: z.string().describe('The prompt to use for the final analysis').optional(),
   // @deprecated Use query instead
   topic: z.string().describe('The topic or question to research').optional(),
 }).refine(data => data.query || data.topic, {
@@ -56,6 +56,7 @@ export async function getMapResults({
   allowExternalLinks,
   abort = new AbortController().signal, // noop
   mock,
+  filterByPath = true,
 }: {
   url: string;
   search?: string;
@@ -70,6 +71,7 @@ export async function getMapResults({
   allowExternalLinks?: boolean;
   abort?: AbortSignal;
   mock?: string;
+  filterByPath?: boolean;
 }): Promise<MapResult> {
   const id = uuidv4();
   let links: string[] = [url];
@@ -247,6 +249,29 @@ export async function getMapResults({
       links = links.filter((x) => isSameSubdomain(x, url));
     }
 
+    // Filter by path if enabled
+    if (filterByPath && !allowExternalLinks) {
+      try {
+        const urlObj = new URL(url);
+        const urlPath = urlObj.pathname;
+        // Only apply path filtering if the URL has a significant path (not just '/' or empty)
+        // This means we only filter by path if the user has not selected a root domain
+        if (urlPath && urlPath !== '/' && urlPath.length > 1) {
+          links = links.filter(link => {
+            try {
+              const linkObj = new URL(link);
+              return linkObj.pathname.startsWith(urlPath);
+            } catch (e) {
+              return false;
+            }
+          });
+        }
+      } catch (e) {
+        // If URL parsing fails, continue without path filtering
+        logger.warn(`Failed to parse URL for path filtering: ${url}`, { error: e });
+      }
+    }
+
     // remove duplicates that could be due to http/https or www
     links = removeDuplicateUrls(links);
   }
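Note: the new filter only keeps links whose pathname starts with the path of the requested URL, and only when that path is more than a bare "/". A minimal standalone sketch of the same check (the sample URLs are hypothetical, not from this repo):

    const base = new URL("https://example.com/docs/guide"); // hypothetical input
    const links = [
      "https://example.com/docs/guide/intro",
      "https://example.com/pricing",
    ];
    const kept = links.filter((link) => {
      try {
        // same prefix test as the hunk above; keeps /docs/guide/intro only
        return new URL(link).pathname.startsWith(base.pathname);
      } catch {
        return false;
      }
    });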
@@ -300,6 +325,7 @@ export async function mapController(
       plan: req.auth.plan,
       abort: abort.signal,
       mock: req.body.useMock,
+      filterByPath: req.body.filterByPath !== false,
     }),
     ...(req.body.timeout !== undefined ? [
       new Promise((resolve, reject) => setTimeout(() => {
@@ -19,12 +19,22 @@ export async function scrapeController(
   req: RequestWithAuth<{}, ScrapeResponse, ScrapeRequest>,
   res: Response<ScrapeResponse>,
 ) {
+  const jobId = uuidv4();
+  const preNormalizedBody = { ...req.body };
+
+  logger.debug("Scrape " + jobId + " starting", {
+    scrapeId: jobId,
+    request: req.body,
+    originalRequest: preNormalizedBody,
+    teamId: req.auth.team_id,
+    account: req.account,
+  });
+
   req.body = scrapeRequestSchema.parse(req.body);
   let earlyReturn = false;
 
   const origin = req.body.origin;
   const timeout = req.body.timeout;
-  const jobId = uuidv4();
 
   const startTime = new Date().getTime();
   const jobPriority = await getJobPriority({
@@ -114,6 +114,7 @@ export const actionsSchema = z
     z.object({
       type: z.literal("click"),
       selector: z.string(),
+      all: z.boolean().default(false),
     }),
     z.object({
       type: z.literal("screenshot"),
@@ -314,7 +315,8 @@ export const extractV1Options = z
   .object({
     urls: url
       .array()
-      .max(10, "Maximum of 10 URLs allowed per request while in beta."),
+      .max(10, "Maximum of 10 URLs allowed per request while in beta.")
+      .optional(),
     prompt: z.string().max(10000).optional(),
     systemPrompt: z.string().max(10000).optional(),
     schema: z
@@ -354,6 +356,12 @@ export const extractV1Options = z
       .optional(),
   })
   .strict(strictMessage)
+  .refine(
+    (obj) => obj.urls || obj.prompt,
+    {
+      message: "Either 'urls' or 'prompt' must be provided.",
+    },
+  )
   .transform((obj) => ({
     ...obj,
     allowExternalLinks: obj.allowExternalLinks || obj.enableWebSearch,
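Note: with urls now optional and the refine() above, an extract request is valid as long as it carries either urls or a prompt. A minimal request body that would pass this schema (values are illustrative only):

    const extractBody = {
      // no urls: accepted because a prompt is present
      prompt: "Find the founders of Firecrawl and return their names.",
      schema: {
        type: "object",
        properties: { founders: { type: "array", items: { type: "string" } } },
        required: ["founders"],
      },
    };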
@@ -506,6 +514,7 @@ export const mapRequestSchema = crawlerOptions
     limit: z.number().min(1).max(30000).default(5000),
     timeout: z.number().positive().finite().optional(),
     useMock: z.string().optional(),
+    filterByPath: z.boolean().default(true),
   })
   .strict(strictMessage);
 
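Note: filterByPath defaults to true, so map requests against a URL with a path are now scoped to that path unless the caller opts out. Illustrative request body (URL is hypothetical):

    const mapRequest = {
      url: "https://example.com/blog/some-post", // hypothetical
      filterByPath: false,                       // opt out of the new path filter
    };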
@@ -1001,7 +1010,7 @@ export const generateLLMsTextRequestSchema = z.object({
   maxUrls: z
     .number()
     .min(1)
-    .max(100)
+    .max(5000)
     .default(10)
     .describe("Maximum number of URLs to process"),
   showFullText: z
@@ -89,9 +89,6 @@ const HOST = process.env.HOST ?? "localhost";
 function startServer(port = DEFAULT_PORT) {
   const server = app.listen(Number(port), HOST, () => {
     logger.info(`Worker ${process.pid} listening on port ${port}`);
-    logger.info(
-      `For the Queue UI, open: http://${HOST}:${port}/admin/${process.env.BULL_AUTH_KEY}/queues`,
-    );
   });
 
   const exitHandler = () => {
@@ -27,7 +27,7 @@ export async function saveCrawl(id: string, crawl: StoredCrawl) {
     plan: crawl.plan,
   });
   await redisConnection.set("crawl:" + id, JSON.stringify(crawl));
-  await redisConnection.expire("crawl:" + id, 24 * 60 * 60, "NX");
+  await redisConnection.expire("crawl:" + id, 24 * 60 * 60);
 }
 
 export async function getCrawl(id: string): Promise<StoredCrawl | null> {
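Note on this and the several similar EXPIRE changes below: dropping the "NX" flag means the 24-hour TTL is refreshed on every write or read instead of only being set when the key has no TTL yet, giving active crawls a sliding expiry. Rough illustration with ioredis-style calls (the key name is hypothetical):

    // Before: TTL set only if the key had none ("NX"), so it counted from the first write.
    await redisConnection.expire("crawl:123", 24 * 60 * 60, "NX");
    // After: TTL reset on each call, so the key lives 24h past the latest activity.
    await redisConnection.expire("crawl:123", 24 * 60 * 60);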
@@ -37,6 +37,7 @@ export async function getCrawl(id: string): Promise<StoredCrawl | null> {
     return null;
   }
 
+  await redisConnection.expire("crawl:" + id, 24 * 60 * 60);
   return JSON.parse(x);
 }
 
@@ -56,7 +57,7 @@ export async function addCrawlJob(id: string, job_id: string) {
     crawlId: id,
   });
   await redisConnection.sadd("crawl:" + id + ":jobs", job_id);
-  await redisConnection.expire("crawl:" + id + ":jobs", 24 * 60 * 60, "NX");
+  await redisConnection.expire("crawl:" + id + ":jobs", 24 * 60 * 60);
 }
 
 export async function addCrawlJobs(id: string, job_ids: string[]) {
@@ -69,7 +70,7 @@ export async function addCrawlJobs(id: string, job_ids: string[]) {
     crawlId: id,
   });
   await redisConnection.sadd("crawl:" + id + ":jobs", ...job_ids);
-  await redisConnection.expire("crawl:" + id + ":jobs", 24 * 60 * 60, "NX");
+  await redisConnection.expire("crawl:" + id + ":jobs", 24 * 60 * 60);
 }
 
 export async function addCrawlJobDone(
@@ -87,7 +88,6 @@ export async function addCrawlJobDone(
   await redisConnection.expire(
     "crawl:" + id + ":jobs_done",
     24 * 60 * 60,
-    "NX",
   );
 
   if (success) {
@@ -104,11 +104,11 @@ export async function addCrawlJobDone(
     await redisConnection.expire(
       "crawl:" + id + ":jobs_done_ordered",
       24 * 60 * 60,
-      "NX",
     );
   }
 
 export async function getDoneJobsOrderedLength(id: string): Promise<number> {
+  await redisConnection.expire("crawl:" + id + ":jobs_done_ordered", 24 * 60 * 60);
   return await redisConnection.llen("crawl:" + id + ":jobs_done_ordered");
 }
 
@@ -117,6 +117,7 @@ export async function getDoneJobsOrdered(
   start = 0,
   end = -1,
 ): Promise<string[]> {
+  await redisConnection.expire("crawl:" + id + ":jobs_done_ordered", 24 * 60 * 60);
   return await redisConnection.lrange(
     "crawl:" + id + ":jobs_done_ordered",
     start,
@@ -125,6 +126,7 @@ export async function getDoneJobsOrdered(
 }
 
 export async function isCrawlFinished(id: string) {
+  await redisConnection.expire("crawl:" + id + ":kickoff:finish", 24 * 60 * 60);
   return (
     (await redisConnection.scard("crawl:" + id + ":jobs_done")) ===
     (await redisConnection.scard("crawl:" + id + ":jobs")) &&
@@ -133,6 +135,7 @@ export async function isCrawlFinished(id: string) {
 }
 
 export async function isCrawlKickoffFinished(id: string) {
+  await redisConnection.expire("crawl:" + id + ":kickoff:finish", 24 * 60 * 60);
   return (
     (await redisConnection.get("crawl:" + id + ":kickoff:finish")) !== null
   );
@@ -159,9 +162,7 @@ export async function finishCrawl(id: string) {
       crawlId: id,
     });
     const set = await redisConnection.setnx("crawl:" + id + ":finish", "yes");
-    if (set === 1) {
-      await redisConnection.expire("crawl:" + id + ":finish", 24 * 60 * 60);
-    }
+    await redisConnection.expire("crawl:" + id + ":finish", 24 * 60 * 60);
     return set === 1;
   } else {
     _logger.debug("Crawl can not be finished yet, not marking as finished.", {
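Note: SETNX returns 1 only for the caller that creates the key, so the crawl is still marked finished exactly once; the change just applies the 24h TTL unconditionally instead of only inside the set === 1 branch. Sketch of the pattern (key name hypothetical):

    const created = await redisConnection.setnx("crawl:123:finish", "yes"); // 1 for the first caller, 0 afterwards
    await redisConnection.expire("crawl:123:finish", 24 * 60 * 60);         // refresh the TTL either way
    const isFirstToFinish = created === 1;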
@@ -294,14 +295,13 @@ export async function lockURL(
     res = x === permutations.length;
   }
 
-  await redisConnection.expire("crawl:" + id + ":visited", 24 * 60 * 60, "NX");
+  await redisConnection.expire("crawl:" + id + ":visited", 24 * 60 * 60);
 
   if (res) {
     await redisConnection.sadd("crawl:" + id + ":visited_unique", url);
     await redisConnection.expire(
       "crawl:" + id + ":visited_unique",
       24 * 60 * 60,
-      "NX",
     );
   }
 
@@ -334,7 +334,6 @@ export async function lockURLs(
   await redisConnection.expire(
     "crawl:" + id + ":visited_unique",
     24 * 60 * 60,
-    "NX",
   );
 
   let res: boolean;
@@ -353,7 +352,7 @@ export async function lockURLs(
     res = x === allPermutations.length;
   }
 
-  await redisConnection.expire("crawl:" + id + ":visited", 24 * 60 * 60, "NX");
+  await redisConnection.expire("crawl:" + id + ":visited", 24 * 60 * 60);
 
   logger.debug("lockURLs final result: " + res, { res });
   return res;
@@ -14,6 +14,7 @@ interface DeepResearchServiceOptions {
   maxDepth: number;
   maxUrls: number;
   timeLimit: number;
+  analysisPrompt: string;
   subId?: string;
 }
 
@@ -262,6 +263,7 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
       options.query,
       state.getFindings(),
       state.getSummaries(),
+      options.analysisPrompt,
     );
 
     await state.addActivity({
@@ -253,6 +253,7 @@ export class ResearchLLMService {
     topic: string,
     findings: DeepResearchFinding[],
     summaries: string[],
+    analysisPrompt: string,
   ): Promise<string> {
     const { extract } = await generateCompletions({
       logger: this.logger.child({
@@ -265,7 +266,9 @@ export class ResearchLLMService {
         "You are an expert research analyst who creates comprehensive, well-structured reports. Your reports are detailed, properly formatted in Markdown, and include clear sections with citations. Today's date is " +
         new Date().toISOString().split("T")[0],
       prompt: trimToTokenLimit(
-        `Create a comprehensive research report on "${topic}" based on the collected findings and analysis.
+        analysisPrompt
+          ? `${analysisPrompt}\n\nResearch data:\n${findings.map((f) => `[From ${f.source}]: ${f.text}`).join("\n")}`
+          : `Create a comprehensive research report on "${topic}" based on the collected findings and analysis.
 
 Research data:
 ${findings.map((f) => `[From ${f.source}]: ${f.text}`).join("\n")}
@@ -21,6 +21,7 @@ export type Action =
   | {
       type: "click";
       selector: string;
+      all?: boolean;
     }
   | {
       type: "screenshot";
@@ -105,3 +105,10 @@ export function buildBatchExtractSystemPrompt(
 export function buildBatchExtractPrompt(prompt: string): string {
   return `Today is: ${new Date().toISOString()}\n${prompt}`;
 }
+
+
+export function buildRephraseToSerpPrompt(prompt: string): string {
+  return `Rephrase the following prompt to be suitable for a search engine results page (SERP) query. Make sure the rephrased prompt is concise and focused on retrieving relevant search results:
+
+Original Prompt: "${prompt}"`;
+}
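Note: buildRephraseToSerpPrompt only formats the instruction text; the rephrasing itself happens when the result is sent to the LLM (see the extract-service changes further down). Example of what it produces for a hypothetical prompt:

    const text = buildRephraseToSerpPrompt("Which companies use Firecrawl for lead enrichment?"); // hypothetical input
    // text contains the instruction above followed by:
    // Original Prompt: "Which companies use Firecrawl for lead enrichment?"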
@@ -6,7 +6,7 @@ import {
 } from "../../controllers/v1/types";
 import { PlanType } from "../../types";
 import { logger as _logger } from "../logger";
-import { processUrl } from "./url-processor";
+import { generateBasicCompletion, processUrl } from "./url-processor";
 import { scrapeDocument } from "./document-scraper";
 import {
   generateCompletions,
@@ -38,6 +38,8 @@ import { singleAnswerCompletion } from "./completions/singleAnswer";
 import { SourceTracker } from "./helpers/source-tracker";
 import { getCachedDocs, saveCachedDocs } from "./helpers/cached-docs";
 import { normalizeUrl } from "../canonical-url";
+import { search } from "../../search";
+import { buildRephraseToSerpPrompt } from "./build-prompts";
 
 interface ExtractServiceOptions {
   request: ExtractRequest;
@@ -84,16 +86,43 @@ export async function performExtraction(
   let totalUrlsScraped = 0;
   let sources: Record<string, string[]> = {};
 
 
   const logger = _logger.child({
     module: "extract",
     method: "performExtraction",
     extractId,
   });
 
-  if (request.__experimental_cacheMode == "load" && request.__experimental_cacheKey) {
+  // If no URLs are provided, generate URLs from the prompt
+  if ((!request.urls || request.urls.length === 0) && request.prompt) {
+    logger.debug("Generating URLs from prompt...", {
+      prompt: request.prompt,
+    });
+    const rephrasedPrompt = await generateBasicCompletion(buildRephraseToSerpPrompt(request.prompt));
+    const searchResults = await search({
+      query: rephrasedPrompt.replace('"', "").replace("'", ""),
+      num_results: 10,
+    });
+
+    request.urls = searchResults.map(result => result.url) as string[];
+  }
+  if (request.urls && request.urls.length === 0) {
+    logger.error("No search results found", {
+      query: request.prompt,
+    });
+    return {
+      success: false,
+      error: "No search results found",
+      extractId,
+    };
+  }
+
+  const urls = request.urls || ([] as string[]);
+
+  if (request.__experimental_cacheMode == "load" && request.__experimental_cacheKey && urls) {
     logger.debug("Loading cached docs...");
     try {
-      const cache = await getCachedDocs(request.urls, request.__experimental_cacheKey);
+      const cache = await getCachedDocs(urls, request.__experimental_cacheKey);
       for (const doc of cache) {
         if (doc.metadata.url) {
           docsMap.set(normalizeUrl(doc.metadata.url), doc);
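Note: the new prompt-only path rephrases the prompt into a search query, runs it through the search provider, and uses the top results as request.urls. A condensed sketch of that flow using the helpers named above (the prompt is hypothetical):

    const rephrased = await generateBasicCompletion(
      buildRephraseToSerpPrompt("Latest Firecrawl SDK release notes"), // hypothetical prompt
    );
    const results = await search({ query: rephrased.replace('"', "").replace("'", ""), num_results: 10 });
    const urls = results.map((r) => r.url); // becomes request.urls when none were supplied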
@@ -122,11 +151,10 @@ export async function performExtraction(
   let startMap = Date.now();
   let aggMapLinks: string[] = [];
   logger.debug("Processing URLs...", {
-    urlCount: request.urls.length,
+    urlCount: request.urls?.length || 0,
   });
 
-  // Process URLs
-  const urlPromises = request.urls.map((url) =>
+  const urlPromises = urls.map((url) =>
     processUrl(
       {
         url,
@@ -746,7 +774,7 @@ export async function performExtraction(
       time_taken: (new Date().getTime() - Date.now()) / 1000,
       team_id: teamId,
       mode: "extract",
-      url: request.urls.join(", "),
+      url: request.urls?.join(", ") || "",
       scrapeOptions: request,
       origin: request.origin ?? "api",
       num_tokens: totalTokensUsed,
@@ -1,7 +1,9 @@
 import { supabase_rr_service, supabase_service } from "../../services/supabase";
 import { logger } from "../logger";
 
-export async function getTeamIdSyncB(teamId: string) {
+import { withAuth } from "../withAuth";
+
+async function getTeamIdSyncBOriginal(teamId: string) {
   try {
     const { data, error } = await supabase_rr_service
       .from("eb-sync")
@@ -17,3 +19,5 @@ export async function getTeamIdSyncB(teamId: string) {
     return null;
   }
 }
+
+export const getTeamIdSyncB = withAuth(getTeamIdSyncBOriginal, null);
@@ -52,7 +52,18 @@ function checkCreditsMiddleware(
       if (chunk) {
         req.acuc = chunk;
       }
+      req.account = { remainingCredits };
       if (!success) {
+        if (!minimum && req.body && (req.body as any).limit !== undefined && remainingCredits > 0) {
+          logger.warn("Adjusting limit to remaining credits", {
+            teamId: req.auth.team_id,
+            remainingCredits,
+            request: req.body,
+          });
+          (req.body as any).limit = remainingCredits;
+          return next();
+        }
+
        const currencyName = req.acuc.is_extract ? "tokens" : "credits"
        logger.error(
          `Insufficient ${currencyName}: ${JSON.stringify({ team_id: req.auth.team_id, minimum, remainingCredits })}`,
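Note: instead of rejecting a request that asks for more pages than the team has credits for, the middleware now clamps the request's limit to the remaining credits (only when no explicit minimum was required and some credits remain). Illustration of the effect, with hypothetical numbers:

    // Incoming body: { limit: 500 } with remainingCredits = 120 and success === false
    // After the new branch: (req.body as any).limit === 120 and the request proceeds via next()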
@@ -72,7 +83,6 @@ function checkCreditsMiddleware(
         });
       }
     }
-    req.account = { remainingCredits };
     next();
   })().catch((err) => next(err));
 };
@@ -271,7 +271,7 @@ export class WebCrawler {
       return urlsHandler(urls);
     } else {
       let filteredLinks = this.filterLinks(
-        [...new Set(urls)],
+        [...new Set(urls)].filter(x => this.filterURL(x, this.initialUrl) !== null),
         leftOfLimit,
         this.maxCrawledDepth,
         fromMap,
@@ -384,7 +384,6 @@ export class WebCrawler {
       await redisConnection.expire(
         "crawl:" + this.jobId + ":robots_blocked",
         24 * 60 * 60,
-        "NX",
       );
     })();
   }
@@ -456,7 +455,7 @@ export class WebCrawler {
         }
       }).filter(x => x !== null) as string[])];
     } catch (error) {
-      this.logger.error("Failed to call html-transformer! Falling back to cheerio...", {
+      this.logger.warn("Failed to call html-transformer! Falling back to cheerio...", {
        error,
        module: "scrapeURL", method: "extractMetadata"
      });
@@ -41,7 +41,7 @@ export async function extractLinks(html: string, baseUrl: string): Promise<string[]> {
   try {
     return await extractLinksRust(html, baseUrl);
   } catch (error) {
-    logger.error("Failed to call html-transformer! Falling back to cheerio...", {
+    logger.warn("Failed to call html-transformer! Falling back to cheerio...", {
      error,
      module: "scrapeURL", method: "extractLinks"
    });
@@ -26,7 +26,7 @@ export async function extractMetadata(
   try {
     return await extractMetadataRust(meta, html);
   } catch (error) {
-    meta.logger.error("Failed to call html-transformer! Falling back to cheerio...", {
+    meta.logger.warn("Failed to call html-transformer! Falling back to cheerio...", {
      error,
      module: "scrapeURL", method: "extractMetadata"
    });
@@ -65,7 +65,7 @@ export const htmlTransform = async (
       only_main_content: scrapeOptions.onlyMainContent,
     })
   } catch (error) {
-    logger.error("Failed to call html-transformer! Falling back to cheerio...", {
+    logger.warn("Failed to call html-transformer! Falling back to cheerio...", {
      error,
      module: "scrapeURL", method: "extractLinks"
    });
@@ -365,7 +365,37 @@ export async function performLLMExtract(
 export function removeDefaultProperty(schema: any): any {
   if (typeof schema !== "object" || schema === null) return schema;
 
-  const { default: _, ...rest } = schema;
+  const rest = { ...schema };
 
+  // unsupported global keys
+  delete rest.default;
+
+  // unsupported object keys
+  delete rest.patternProperties;
+  delete rest.unevaluatedProperties;
+  delete rest.propertyNames;
+  delete rest.minProperties;
+  delete rest.maxProperties;
+
+  // unsupported string keys
+  delete rest.minLength;
+  delete rest.maxLength;
+  delete rest.pattern;
+  delete rest.format;
+
+  // unsupported number keys
+  delete rest.minimum;
+  delete rest.maximum;
+  delete rest.multipleOf;
+
+  // unsupported array keys
+  delete rest.unevaluatedItems;
+  delete rest.contains;
+  delete rest.minContains;
+  delete rest.maxContains;
+  delete rest.minItems;
+  delete rest.maxItems;
+  delete rest.uniqueItems;
+
   for (const key in rest) {
     if (Array.isArray(rest[key])) {
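Note: removeDefaultProperty now strips a whole family of JSON Schema keywords that the downstream providers do not accept, not just default. Rough before/after for a hypothetical schema (matches the new test added near the top of this diff):

    const input = {
      type: "string",
      pattern: "^[a-zA-Z0-9]+$", // unsupported string key
      minLength: 2,              // unsupported string key
      default: "acme",           // unsupported global key
    };
    // removeDefaultProperty(input) -> { type: "string" }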
@@ -26,7 +26,7 @@ export async function searxng_search(
     // location: options.location, //not possible with SearXNG
     // num: options.num_results, //not possible with SearXNG
     engines: process.env.SEARXNG_ENGINES || "",
-    categories: process.env.SEARXNG_CATEGORIES || "general",
+    categories: process.env.SEARXNG_CATEGORIES || "",
     pageno: options.page ?? 1,
     format: "json"
   };
@@ -156,7 +156,8 @@ const workerFun = async (queue: Queue, jobProcessor: (token: string, job: Job) =
     const canAcceptConnection = await monitor.acceptConnection();
 
     if (!canAcceptConnection) {
-      logger.info("Cant accept connection");
+      console.log("Can't accept connection due to RAM/CPU load");
+      logger.info("Can't accept connection due to RAM/CPU load");
       cantAcceptConnectionCount++;
 
       if (cantAcceptConnectionCount >= 25) {
@@ -79,11 +79,13 @@ async function addScrapeJobRaw(
     // If above by 2x, send them an email
     // No need to 2x as if there are more than the max concurrency in the concurrency queue, it is already 2x
     if(concurrencyQueueJobs > maxConcurrency) {
-      logger.info("Concurrency limited 2x (single) - ", "Concurrency queue jobs: ", concurrencyQueueJobs, "Max concurrency: ", maxConcurrency);
+      logger.info("Concurrency limited 2x (single) - ", "Concurrency queue jobs: ", concurrencyQueueJobs, "Max concurrency: ", maxConcurrency, "Team ID: ", webScraperOptions.team_id);
       // sendNotificationWithCustomDays(webScraperOptions.team_id, NotificationType.CONCURRENCY_LIMIT_REACHED, 10, false).catch((error) => {
       //   logger.error("Error sending notification (concurrency limit reached): ", error);
       // });
     }
 
+    webScraperOptions.concurrencyLimited = true;
+
     await _addScrapeJobToConcurrencyQueue(
       webScraperOptions,
|
|||||||
|
|
||||||
// equals 2x the max concurrency
|
// equals 2x the max concurrency
|
||||||
if(addToCQ.length > maxConcurrency) {
|
if(addToCQ.length > maxConcurrency) {
|
||||||
logger.info("Concurrency limited 2x (multiple) - ", "Concurrency queue jobs: ", addToCQ.length, "Max concurrency: ", maxConcurrency);
|
logger.info("Concurrency limited 2x (multiple) - ", "Concurrency queue jobs: ", addToCQ.length, "Max concurrency: ", maxConcurrency, "Team ID: ", jobs[0].data.team_id);
|
||||||
// sendNotificationWithCustomDays(jobs[0].data.team_id, NotificationType.CONCURRENCY_LIMIT_REACHED, 10, false).catch((error) => {
|
// sendNotificationWithCustomDays(jobs[0].data.team_id, NotificationType.CONCURRENCY_LIMIT_REACHED, 10, false).catch((error) => {
|
||||||
// logger.error("Error sending notification (concurrency limit reached): ", error);
|
// logger.error("Error sending notification (concurrency limit reached): ", error);
|
||||||
// });
|
// });
|
||||||
|
@@ -412,6 +412,7 @@ const processDeepResearchJobInternal = async (
       timeLimit: job.data.request.timeLimit,
       subId: job.data.subId,
       maxUrls: job.data.request.maxUrls,
+      analysisPrompt: job.data.request.analysisPrompt,
     });
 
     if(result.success) {
@@ -567,7 +568,8 @@ const workerFun = async (
     const token = uuidv4();
     const canAcceptConnection = await monitor.acceptConnection();
     if (!canAcceptConnection) {
-      console.log("Cant accept connection");
+      console.log("Can't accept connection due to RAM/CPU load");
+      logger.info("Can't accept connection due to RAM/CPU load");
       cantAcceptConnectionCount++;
 
       if (cantAcceptConnectionCount >= 25) {
@@ -922,6 +924,10 @@ async function processJob(job: Job & { id: string }, token: string) {
           delete doc.rawHtml;
         }
 
+        if (job.data.concurrencyLimited) {
+          doc.warning = "This scrape job was throttled at your current concurrency limit. If you'd like to scrape faster, you can upgrade your plan." + (doc.warning ? " " + doc.warning : "");
+        }
+
         const data = {
           success: true,
           result: {
@@ -1,6 +1,6 @@
 {
   "name": "@mendable/firecrawl-js",
-  "version": "1.19.1",
+  "version": "1.20.1",
   "description": "JavaScript SDK for Firecrawl API",
   "main": "dist/index.js",
   "types": "dist/index.d.ts",
@@ -104,6 +104,7 @@ export type Action = {
 } | {
   type: "click",
   selector: string,
+  all?: boolean,
 } | {
   type: "screenshot",
   fullPage?: boolean,
@@ -371,6 +372,10 @@ export interface DeepResearchParams {
    * @default 20
    */
   maxUrls?: number;
+  /**
+   * The prompt to use for the final analysis
+   */
+  analysisPrompt?: string;
   /**
    * Experimental flag for streaming steps
    */
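Note: analysisPrompt is optional and only shapes the final report step; the research phase itself is unchanged. Hedged example of a params object using the new field (values illustrative):

    const params: DeepResearchParams = {
      maxUrls: 20,
      analysisPrompt: "Write the final report as a one-page executive brief with citations.", // new optional field
    };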
@@ -1135,14 +1140,14 @@ export default class FirecrawlApp {
   /**
    * Extracts information from URLs using the Firecrawl API.
    * Currently in Beta. Expect breaking changes on future minor versions.
-   * @param url - The URL to extract information from.
+   * @param urls - The URLs to extract information from. Optional if using other methods for data extraction.
    * @param params - Additional parameters for the extract request.
    * @returns The response from the extract operation.
    */
-  async extract<T extends zt.ZodSchema = any>(urls: string[], params?: ExtractParams<T>): Promise<ExtractResponse<zt.infer<T>> | ErrorResponse> {
+  async extract<T extends zt.ZodSchema = any>(urls?: string[], params?: ExtractParams<T>): Promise<ExtractResponse<zt.infer<T>> | ErrorResponse> {
     const headers = this.prepareHeaders();
 
-    let jsonData: { urls: string[] } & ExtractParams<T> = { urls, ...params };
+    let jsonData: { urls?: string[] } & ExtractParams<T> = { urls: urls, ...params };
     let jsonSchema: any;
     try {
       if (!params?.schema) {
|
|||||||
checkInterval: number
|
checkInterval: number
|
||||||
): Promise<CrawlStatusResponse | ErrorResponse> {
|
): Promise<CrawlStatusResponse | ErrorResponse> {
|
||||||
try {
|
try {
|
||||||
|
let failedTries = 0;
|
||||||
while (true) {
|
while (true) {
|
||||||
let statusResponse: AxiosResponse = await this.getRequest(
|
let statusResponse: AxiosResponse = await this.getRequest(
|
||||||
`${this.apiUrl}/v1/crawl/${id}`,
|
`${this.apiUrl}/v1/crawl/${id}`,
|
||||||
headers
|
headers
|
||||||
);
|
);
|
||||||
if (statusResponse.status === 200) {
|
if (statusResponse.status === 200) {
|
||||||
|
failedTries = 0;
|
||||||
let statusData = statusResponse.data;
|
let statusData = statusResponse.data;
|
||||||
if (statusData.status === "completed") {
|
if (statusData.status === "completed") {
|
||||||
if ("data" in statusData) {
|
if ("data" in statusData) {
|
||||||
@ -1385,7 +1392,10 @@ export default class FirecrawlApp {
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
this.handleError(statusResponse, "check crawl status");
|
failedTries++;
|
||||||
|
if (failedTries >= 3) {
|
||||||
|
this.handleError(statusResponse, "check crawl status");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} catch (error: any) {
|
} catch (error: any) {
|
||||||
@ -1399,7 +1409,7 @@ export default class FirecrawlApp {
|
|||||||
* @param {string} action - The action being performed when the error occurred.
|
* @param {string} action - The action being performed when the error occurred.
|
||||||
*/
|
*/
|
||||||
handleError(response: AxiosResponse, action: string): void {
|
handleError(response: AxiosResponse, action: string): void {
|
||||||
if ([400, 402, 408, 409, 500].includes(response.status)) {
|
if ([400, 402, 403, 408, 409, 500].includes(response.status)) {
|
||||||
const errorMessage: string =
|
const errorMessage: string =
|
||||||
response.data.error || "Unknown error occurred";
|
response.data.error || "Unknown error occurred";
|
||||||
const details = response.data.details ? ` - ${JSON.stringify(response.data.details)}` : '';
|
const details = response.data.details ? ` - ${JSON.stringify(response.data.details)}` : '';
|
||||||
|
@@ -13,7 +13,7 @@ import os
 
 from .firecrawl import FirecrawlApp  # noqa
 
-__version__ = "1.13.5"
+__version__ = "1.14.1"
 
 # Define the logger for the Firecrawl project
 logger: logging.Logger = logging.getLogger("firecrawl")
@@ -309,6 +309,7 @@ class DeepResearchParams(pydantic.BaseModel):
     maxDepth: Optional[int] = 7
     timeLimit: Optional[int] = 270
     maxUrls: Optional[int] = 20
+    analysisPrompt: Optional[str] = None
     __experimental_streamSteps: Optional[bool] = None
 
 class DeepResearchResponse(pydantic.BaseModel):
@@ -1126,7 +1127,7 @@ class FirecrawlApp:
 
     def extract(
             self,
-            urls: List[str],
+            urls: Optional[List[str]] = None,
             params: Optional[ExtractParams] = None) -> ExtractResponse[Any]:
         """
         Extract structured information from URLs.
@@ -1164,6 +1165,9 @@ class FirecrawlApp:
         if not params or (not params.get('prompt') and not params.get('schema')):
             raise ValueError("Either prompt or schema is required")
 
+        if not urls and not params.get('prompt'):
+            raise ValueError("Either urls or prompt is required")
+
         schema = params.get('schema')
         if schema:
             if hasattr(schema, 'model_json_schema'):
@@ -1180,6 +1184,8 @@ class FirecrawlApp:
             'origin': f'python-sdk@{get_version()}'
         }
 
+        if not request_data['urls']:
+            request_data['urls'] = []
         # Only add prompt and systemPrompt if they exist
         if params.get('prompt'):
             request_data['prompt'] = params['prompt']
@@ -1669,6 +1675,8 @@ class FirecrawlApp:
         """
         if status_code == 402:
             return f"Payment Required: Failed to {action}. {error_message} - {error_details}"
+        elif status_code == 403:
+            message = f"Website Not Supported: Failed to {action}. {error_message} - {error_details}"
         elif status_code == 408:
             return f"Request Timeout: Failed to {action} as the request timed out. {error_message} - {error_details}"
         elif status_code == 409:
examples/claude-3.7-stock-analyzer/claude-3.7-stock-analyzer.py (new file, 180 lines)
@@ -0,0 +1,180 @@
+import os
+from firecrawl import FirecrawlApp
+import json
+from dotenv import load_dotenv
+import anthropic
+from e2b_code_interpreter import Sandbox
+import base64
+
+# ANSI color codes
+class Colors:
+    CYAN = '\033[96m'
+    YELLOW = '\033[93m'
+    GREEN = '\033[92m'
+    RED = '\033[91m'
+    MAGENTA = '\033[95m'
+    BLUE = '\033[94m'
+    RESET = '\033[0m'
+
+# Load environment variables
+load_dotenv()
+
+# Retrieve API keys from environment variables
+firecrawl_api_key = os.getenv("FIRECRAWL_API_KEY")
+anthropic_api_key = os.getenv("ANTHROPIC_API_KEY")
+e2b_api_key = os.getenv("E2B_API_KEY")
+
+# Initialize the FirecrawlApp and Anthropic client
+app = FirecrawlApp(api_key=firecrawl_api_key)
+client = anthropic.Anthropic(api_key=anthropic_api_key)
+sandbox = Sandbox(api_key=e2b_api_key)
+
+# Find the relevant stock pages via map
+def find_relevant_page_via_map(stock_search_term, url, app):
+    try:
+        print(f"{Colors.CYAN}Searching for stock: {stock_search_term}{Colors.RESET}")
+        print(f"{Colors.CYAN}Initiating search on the website: {url}{Colors.RESET}")
+
+        map_search_parameter = stock_search_term
+
+        print(f"{Colors.GREEN}Search parameter: {map_search_parameter}{Colors.RESET}")
+
+        print(f"{Colors.YELLOW}Mapping website using the identified search parameter...{Colors.RESET}")
+        map_website = app.map_url(url, params={"search": map_search_parameter})
+        print(f"{Colors.GREEN}Website mapping completed successfully.{Colors.RESET}")
+        print(f"{Colors.GREEN}Located {len(map_website['links'])} relevant links.{Colors.RESET}")
+        return map_website['links']
+    except Exception as e:
+        print(f"{Colors.RED}Error encountered during relevant page identification: {str(e)}{Colors.RESET}")
+        return None
+
+# Function to plot the scores using e2b
+def plot_scores(stock_names, stock_scores):
+    print(f"{Colors.YELLOW}Plotting scores...{Colors.RESET}")
+    code_to_run = f"""
+import matplotlib.pyplot as plt
+
+stock_names = {stock_names}
+stock_scores = {stock_scores}
+
+plt.figure(figsize=(10, 5))
+plt.bar(stock_names, stock_scores, color='blue')
+plt.xlabel('Stock Names')
+plt.ylabel('Scores')
+plt.title('Stock Investment Scores')
+plt.xticks(rotation=45)
+plt.tight_layout()
+plt.savefig('chart.png')
+plt.show()
+"""
+    # Run the code inside the sandbox
+    execution = sandbox.run_code(code_to_run)
+
+    # Check if there are any results
+    if execution.results and execution.results[0].png:
+        first_result = execution.results[0]
+
+        # Get the directory where the current python file is located
+        current_dir = os.path.dirname(os.path.abspath(__file__))
+        # Save the png to a file in the examples directory. The png is in base64 format.
+        with open(os.path.join(current_dir, 'chart.png'), 'wb') as f:
+            f.write(base64.b64decode(first_result.png))
+        print('Chart saved as examples/chart.png')
+    else:
+        print(f"{Colors.RED}No results returned from the sandbox execution.{Colors.RESET}")
+
+# Analyze the top stocks and provide investment recommendation
+def analyze_top_stocks(map_website, app, client):
+    try:
+        # Get top 5 links from the map result
+        top_links = map_website[:10]
+        print(f"{Colors.CYAN}Proceeding to analyze top {len(top_links)} links: {top_links}{Colors.RESET}")
+
+        # Scrape the pages in batch
+        batch_scrape_result = app.batch_scrape_urls(top_links, {'formats': ['markdown']})
+        print(f"{Colors.GREEN}Batch page scraping completed successfully.{Colors.RESET}")
+
+        # Prepare content for LLM
+        stock_contents = []
+        for scrape_result in batch_scrape_result['data']:
+            stock_contents.append({
+                'content': scrape_result['markdown']
+            })
+
+        # Pass all the content to the LLM to analyze and decide which stock to invest in
+        analyze_prompt = f"""
+Based on the following information about different stocks from their Robinhood pages, analyze and determine which stock is the best investment opportunity. DO NOT include any other text, just the JSON.
+
+Return the result in the following JSON format. Only return the JSON, nothing else. Do not include backticks or any other formatting, just the JSON.
+{{
+    "scores": [
+        {{
+            "stock_name": "<stock_name>",
+            "score": <score-out-of-100>
+        }},
+        ...
+    ]
+}}
+
+Stock Information:
+"""
+
+        for stock in stock_contents:
+            analyze_prompt += f"Content:\n{stock['content']}\n"
+
+        print(f"{Colors.YELLOW}Analyzing stock information with LLM...{Colors.RESET}")
+        analyze_prompt += f"\n\nStart JSON:\n"
+        completion = client.messages.create(
+            model="claude-3-7-sonnet-20250219",
+            max_tokens=1000,
+            temperature=0,
+            system="You are a financial analyst. Only return the JSON, nothing else.",
+            messages=[
+                {
+                    "role": "user",
+                    "content": analyze_prompt
+                }
+            ]
+        )
+
+        result = completion.content[0].text
+        print(f"{Colors.GREEN}Analysis completed. Here is the recommendation:{Colors.RESET}")
+        print(f"{Colors.MAGENTA}{result}{Colors.RESET}")
+
+        # Plot the scores using e2b
+        try:
+            result_json = json.loads(result)
+            scores = result_json['scores']
+            stock_names = [score['stock_name'] for score in scores]
+            stock_scores = [score['score'] for score in scores]
+
+            plot_scores(stock_names, stock_scores)
+        except json.JSONDecodeError as json_err:
+            print(f"{Colors.RED}Error decoding JSON response: {str(json_err)}{Colors.RESET}")
+
+    except Exception as e:
+        print(f"{Colors.RED}Error encountered during stock analysis: {str(e)}{Colors.RESET}")
+
+# Main function to execute the process
+def main():
+    # Get user input
+    stock_search_term = input(f"{Colors.BLUE}Enter the stock you're interested in: {Colors.RESET}")
+    if not stock_search_term.strip():
+        print(f"{Colors.RED}No stock entered. Exiting.{Colors.RESET}")
+        return
+
+    url = "https://robinhood.com/stocks"
+
+    print(f"{Colors.YELLOW}Initiating stock analysis process...{Colors.RESET}")
+    # Find the relevant pages
+    map_website = find_relevant_page_via_map(stock_search_term, url, app)
+
+    if map_website:
+        print(f"{Colors.GREEN}Relevant stock pages identified. Proceeding with detailed analysis...{Colors.RESET}")
+        # Analyze top stocks
+        analyze_top_stocks(map_website, app, client)
+    else:
+        print(f"{Colors.RED}No relevant stock pages identified. Consider refining the search term or trying a different stock.{Colors.RESET}")
+
+if __name__ == "__main__":
+    main()