Nicolas 6634d236bf
(feat/fire-1) FIRE-1 (#1462)
* wip

* integrating smart-scrape

* integrate smartscrape into llmExtract

* wip

* smart scrape multiple links

* fixes

* fix

* wip

* it worked!

* wip. there's a bug on the batchExtract TypeError: Converting circular structure to JSON

* wip

* retry model

* retry models

* feat/scrape+json+extract interfaces ready

* vertex -> googleapi

* fix/transformArrayToObject. required params on schema is still a bug

* change model

* o3-mini -> gemini

* Update extractSmartScrape.ts

* sessionId

* sessionId

* Nick: f-0 start

* Update extraction-service-f0.ts

* Update types.ts

* Nick:

* Update queue-worker.ts

* Nick: new interface

* rename analyzeSchemaAndPrompt -> F0

* refactor: rename agent ID to model in types and extract logic

* agent

* id->model

* id->model

* refactor: standardize agent model handling and validation across extraction logic

* livecast agent

* (feat/f1) sdks (#1459)

* feat: add FIRE-1 agent support to Python and JavaScript SDKs

Co-Authored-By: hello@sideguide.dev <hello@sideguide.dev>

* feat: add FIRE-1 agent support to scrape methods in both SDKs

Co-Authored-By: hello@sideguide.dev <hello@sideguide.dev>

* feat: add prompt and sessionId to AgentOptions interface

Co-Authored-By: hello@sideguide.dev <hello@sideguide.dev>

* Update index.ts

---------

Co-authored-by: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Co-authored-by: hello@sideguide.dev <hello@sideguide.dev>
Co-authored-by: Nicolas <nicolascamara29@gmail.com>

* feat(v1): rate limits

* Update types.ts

* Update llmExtract.ts

* add cost tracking

* remove

* Update requests.http

* fix smart scrape cost calc

* log sm cost

* fix counts

* fix

* expose cost tracking

* models fix

* temp: skipLibcheck

* get rid of it

* fix ts

* dont skip lib check

* Update extractSmartScrape.ts

* Update queue-worker.ts

* Update smartScrape.ts

* Update requests.http

* fix(rate-limiter):

* types: fire-1 refine

* bill 150

* fix credits used on crawl

* ban from crawl

* route cost limit warning

* Update generic-ai.ts

* genres

* Update llmExtract.ts

* test server diff

* cletu

---------

Co-authored-by: rafaelmmiller <150964962+rafaelsideguide@users.noreply.github.com>
Co-authored-by: Thomas Kosmas <thomas510111@gmail.com>
Co-authored-by: Ademílson F. Tonato <ademilsonft@outlook.com>
Co-authored-by: devin-ai-integration[bot] <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Co-authored-by: hello@sideguide.dev <hello@sideguide.dev>
Co-authored-by: Gergő Móricz <mo.geryy@gmail.com>
2025-04-15 00:19:45 -07:00


import { Document as V0Document, ExtractorOptions, PageOptions } from "../../lib/entities";
import { Request, Response } from "express";
import {
billTeam,
checkTeamCredits,
} from "../../services/billing/credit_billing";
import { authenticateUser } from "../auth";
import { RateLimiterMode } from "../../types";
import { logJob } from "../../services/logging/log_job";
import {
  fromLegacyCombo,
  fromLegacyScrapeOptions,
  toLegacyDocument,
  url as urlSchema,
} from "../v1/types";
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
import { numTokensFromString } from "../../lib/LLM-extraction/helpers";
import {
defaultPageOptions,
defaultExtractorOptions,
defaultTimeout,
defaultOrigin,
} from "../../lib/default-values";
import { addScrapeJob, waitForJob } from "../../services/queue-jobs";
import { getScrapeQueue, redisConnection } from "../../services/queue-service";
import { v4 as uuidv4 } from "uuid";
import { logger } from "../../lib/logger";
import * as Sentry from "@sentry/node";
import { getJobPriority } from "../../lib/job-priority";
import { ZodError } from "zod";
import { BLOCKLISTED_URL_MESSAGE } from "../../lib/strings";
import { getJobFromGCS } from "../../lib/gcs-jobs";
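/**
 * Runs a single v0 scrape end to end: validates the URL, translates the
 * legacy options, enqueues the job, waits for it to complete, and maps
 * failures onto legacy HTTP-style return codes.
 */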
export async function scrapeHelper(
jobId: string,
req: Request,
team_id: string,
crawlerOptions: any,
pageOptions: PageOptions,
extractorOptions: ExtractorOptions,
timeout: number,
): Promise<{
success: boolean;
error?: string;
data?: V0Document | { url: string };
returnCode: number;
}> {
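  // urlSchema.parse throws a ZodError on invalid input, which the controller
  // below reports as "Invalid URL"; the typeof check after it is purely defensive.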
const url = urlSchema.parse(req.body.url);
if (typeof url !== "string") {
return { success: false, error: "Url is required", returnCode: 400 };
}
if (isUrlBlocked(url)) {
return {
success: false,
error: BLOCKLISTED_URL_MESSAGE,
returnCode: 403,
};
}
const jobPriority = await getJobPriority({ team_id, basePriority: 10 });
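  // Translate the legacy v0 page/extractor options into the v1
  // scrapeOptions/internalOptions shape consumed by the queue worker.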
const { scrapeOptions, internalOptions } = fromLegacyCombo(
pageOptions,
extractorOptions,
timeout,
crawlerOptions,
team_id,
);
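  // Enqueue a single-URL job; is_scrape marks this as a direct scrape request
  // rather than part of a crawl.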
await addScrapeJob(
{
url,
mode: "single_urls",
team_id,
scrapeOptions,
internalOptions,
origin: req.body.origin ?? defaultOrigin,
is_scrape: true,
},
{},
jobId,
jobPriority,
);
let doc;
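  // Wait for the job inside a Sentry span. Timeouts map to a 408, known LLM
  // extraction failures (thrown as strings) map to a 500, and anything else
  // is rethrown to the controller's catch block.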
const err = await Sentry.startSpan(
{
name: "Wait for job to finish",
op: "bullmq.wait",
attributes: { job: jobId },
},
async (span) => {
try {
doc = await waitForJob(jobId, timeout);
} catch (e) {
if (
e instanceof Error &&
(e.message.startsWith("Job wait") || e.message === "timeout")
) {
span.setAttribute("timedOut", true);
return {
success: false,
error: "Request timed out",
returnCode: 408,
};
} else if (
typeof e === "string" &&
(e.includes("Error generating completions: ") ||
e.includes("Invalid schema for function") ||
e.includes(
"LLM extraction did not match the extraction schema you provided.",
))
) {
return {
success: false,
error: e,
returnCode: 500,
};
} else {
throw e;
}
}
span.setAttribute("result", JSON.stringify(doc));
return null;
},
);
if (err !== null) {
return err;
}
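  // The job has settled either way; remove it from the queue.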
await getScrapeQueue().remove(jobId);
if (!doc) {
    logger.error("Scrape job completed but no document was returned", { jobId });
return {
success: true,
error: "No page found",
returnCode: 200,
data: doc,
};
}
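  // Strip internal bookkeeping fields before shaping the legacy response.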
delete doc.index;
delete doc.provider;
  // Remove rawHtml when pageOptions.includeRawHtml is false and extractorOptions.mode is "llm-extraction-from-raw-html"
if (
!pageOptions.includeRawHtml &&
extractorOptions.mode == "llm-extraction-from-raw-html"
) {
if (doc.rawHtml) {
delete doc.rawHtml;
}
}
if (!pageOptions.includeHtml) {
if (doc.html) {
delete doc.html;
}
}
return {
success: true,
data: toLegacyDocument(doc, internalOptions),
returnCode: 200,
};
}
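// Example of the request shape this controller accepts, inferred from the
// fields it reads below (not an exhaustive list of options):
//
//   POST /v0/scrape
//   {
//     "url": "https://example.com",
//     "pageOptions": { "onlyMainContent": true },
//     "extractorOptions": { "mode": "llm-extraction", "extractionSchema": { "type": "object" } },
//     "timeout": 30000
//   }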
export async function scrapeController(req: Request, res: Response) {
try {
let earlyReturn = false;
    // Authenticate the user first (expects an "Authorization: Bearer <token>" header)
const auth = await authenticateUser(req, res, RateLimiterMode.Scrape);
if (!auth.success) {
return res.status(auth.status).json({ error: auth.error });
}
const { team_id, chunk } = auth;
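    // Record that this team used the v0 API; fire-and-forget, so a Redis
    // failure only logs instead of failing the request.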
redisConnection.sadd("teams_using_v0", team_id)
.catch(error => logger.error("Failed to add team to teams_using_v0", { error, team_id }));
const crawlerOptions = req.body.crawlerOptions ?? {};
const pageOptions = { ...defaultPageOptions, ...req.body.pageOptions };
const extractorOptions = {
...defaultExtractorOptions,
...req.body.extractorOptions,
};
const origin = req.body.origin ?? defaultOrigin;
let timeout = req.body.timeout ?? defaultTimeout;
if (extractorOptions.mode.includes("llm-extraction")) {
if (
typeof extractorOptions.extractionSchema !== "object" ||
extractorOptions.extractionSchema === null
) {
return res.status(400).json({
error:
"extractorOptions.extractionSchema must be an object if llm-extraction mode is specified",
});
}
pageOptions.onlyMainContent = true;
timeout = req.body.timeout ?? 90000;
}
    // Check team credits before doing any work
try {
const { success: creditsCheckSuccess, message: creditsCheckMessage } =
await checkTeamCredits(chunk, team_id, 1);
if (!creditsCheckSuccess) {
earlyReturn = true;
return res.status(402).json({
error:
"Insufficient credits. For more credits, you can upgrade your plan at https://firecrawl.dev/pricing",
});
}
} catch (error) {
logger.error(error);
earlyReturn = true;
return res.status(500).json({
error:
"Error checking team credits. Please contact help@firecrawl.com for help.",
});
}
const jobId = uuidv4();
const startTime = new Date().getTime();
const result = await scrapeHelper(
jobId,
req,
team_id,
crawlerOptions,
pageOptions,
extractorOptions,
timeout,
);
const endTime = new Date().getTime();
const timeTakenInSeconds = (endTime - startTime) / 1000;
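    // Estimate token usage from the returned markdown, using the
    // gpt-3.5-turbo tokenizer as an approximation.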
const numTokens =
result.data && (result.data as V0Document).markdown
? numTokensFromString(
(result.data as V0Document).markdown!,
"gpt-3.5-turbo",
)
: 0;
if (result.success) {
let creditsToBeBilled = 1;
const creditsPerLLMExtract = 4;
if (extractorOptions.mode.includes("llm-extraction")) {
// creditsToBeBilled = creditsToBeBilled + (creditsPerLLMExtract * filteredDocs.length);
creditsToBeBilled += creditsPerLLMExtract;
}
if (earlyReturn) {
// Don't bill if we're early returning
return;
}
if (creditsToBeBilled > 0) {
// billing for doc done on queue end, bill only for llm extraction
billTeam(team_id, chunk?.sub_id, creditsToBeBilled, logger).catch(
(error) => {
logger.error(
`Failed to bill team ${team_id} for ${creditsToBeBilled} credits`,
{ error },
);
// Optionally, you could notify an admin or add to a retry queue here
},
);
}
}
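    // Honor the include flags on the way out: rawHtml only when explicitly
    // requested, and markdown dropped when the caller asked for the extract only.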
let doc = result.data;
if (!pageOptions || !pageOptions.includeRawHtml) {
if (doc && (doc as V0Document).rawHtml) {
delete (doc as V0Document).rawHtml;
}
}
if (pageOptions && pageOptions.includeExtract) {
if (!pageOptions.includeMarkdown && doc && (doc as V0Document).markdown) {
delete (doc as V0Document).markdown;
}
}
return res.status(result.returnCode).json(result);
} catch (error) {
Sentry.captureException(error);
logger.error("Scrape error occcurred", { error });
return res.status(500).json({
error:
error instanceof ZodError
? "Invalid URL"
: typeof error === "string"
? error
: (error?.message ?? "Internal Server Error"),
});
}
}