Merge branch 'main' into mog/index
commit d7fef33224

.github/workflows/test-server-self-host.yml (vendored, 5 lines changed)
@@ -128,7 +128,7 @@ jobs:
 - name: Kill SearXNG
 if: always() && matrix.search == 'searxng'
 run: |
-docker logs searxng > searxng/searxng.log 2>&1
+docker logs searxng > searxng.log 2>&1
 docker kill searxng
 working-directory: ./
 - uses: actions/upload-artifact@v4
@@ -149,5 +149,4 @@ jobs:
 with:
 name: SearXNG (${{ matrix.ai }}, ${{ matrix.engine }}, ${{ matrix.proxy }})
 path: |
-./searxng/searxng.log
+./searxng.log
-./searxng/settings.yml

apps/api/package.json
@@ -69,7 +69,6 @@
 "@google-cloud/storage": "^7.16.0",
 "@nangohq/node": "^0.40.8",
 "@openrouter/ai-sdk-provider": "^0.4.5",
-"@pinecone-database/pinecone": "^4.0.0",
 "@sentry/cli": "^2.33.1",
 "@sentry/node": "^8.26.0",
 "@sentry/profiling-node": "^8.26.0",

apps/api/pnpm-lock.yaml (generated, 12 lines changed)
@@ -59,9 +59,6 @@ importers:
 '@openrouter/ai-sdk-provider':
 specifier: ^0.4.5
 version: 0.4.5(zod@3.24.2)
-'@pinecone-database/pinecone':
-specifier: ^4.0.0
-version: 4.0.0
 '@sentry/cli':
 specifier: ^2.33.1
 version: 2.33.1(encoding@0.1.13)
@@ -1232,10 +1229,6 @@ packages:
 '@pdf-lib/upng@1.0.1':
 resolution: {integrity: sha512-dQK2FUMQtowVP00mtIksrlZhdFXQZPC+taih1q4CvPZ5vqdxR/LKBaFg0oAfzd1GlHZXXSPdQfzQnt+ViGvEIQ==}

-'@pinecone-database/pinecone@4.0.0':
-resolution: {integrity: sha512-INYS+GBys9v5BRTyn0tv8srVsPTlSRvE3BPE4Wkc/lOEyAIyB9F7DEMXbeF19FOLEgRwCuHTLjzm1niENl+4FA==}
-engines: {node: '>=18.0.0'}
-
 '@pkgjs/parseargs@0.11.0':
 resolution: {integrity: sha512-+1VkjdD0QBLPodGrJUeqarH8VAIvQODIbwh9XpP5Syisf7YoQgsJKPNFoqqLQlu+VQ/tVSshMR6loPMn8U+dPg==}
 engines: {node: '>=14'}
@@ -6331,10 +6324,6 @@ snapshots:
 dependencies:
 pako: 1.0.11

-'@pinecone-database/pinecone@4.0.0':
-dependencies:
-encoding: 0.1.13
-
 '@pkgjs/parseargs@0.11.0':
 optional: true

@@ -7921,6 +7910,7 @@ snapshots:
 encoding@0.1.13:
 dependencies:
 iconv-lite: 0.6.3
+optional: true

 end-of-stream@1.4.4:
 dependencies:

@@ -78,24 +78,24 @@ describe("Scrape tests", () => {
 expect(JSON.stringify(status)).toBe(JSON.stringify(response));
 }, 60000);

-// describe("Ad blocking (f-e dependant)", () => {
+describe("Ad blocking (f-e dependant)", () => {
-// it.concurrent("blocks ads by default", async () => {
+it.concurrent("blocks ads by default", async () => {
-// const response = await scrape({
+const response = await scrape({
-// url: "https://www.allrecipes.com/recipe/18185/yum/",
+url: "https://www.allrecipes.com/recipe/18185/yum/",
-// });
+});

-// expect(response.markdown).not.toContain(".g.doubleclick.net/");
+expect(response.markdown).not.toContain(".g.doubleclick.net/");
-// }, 30000);
+}, 30000);

-// it.concurrent("doesn't block ads if explicitly disabled", async () => {
+it.concurrent("doesn't block ads if explicitly disabled", async () => {
-// const response = await scrape({
+const response = await scrape({
-// url: "https://www.allrecipes.com/recipe/18185/yum/",
+url: "https://www.allrecipes.com/recipe/18185/yum/",
-// blockAds: false,
+blockAds: false,
-// });
+});

-// expect(response.markdown).toMatch(/(\.g\.doubleclick\.net|amazon-adsystem\.com)\//);
+expect(response.markdown).toMatch(/(\.g\.doubleclick\.net|amazon-adsystem\.com)\//);
-// }, 30000);
+}, 30000);
-// });
+});

 describe("Index", () => {
 it.concurrent("caches properly", async () => {

@@ -82,6 +82,7 @@ export async function scrapeHelper(
 internalOptions,
 origin: req.body.origin ?? defaultOrigin,
 is_scrape: true,
+startTime: Date.now(),
 },
 {},
 jobId,

@@ -112,6 +112,7 @@ export async function searchHelper(
 team_id: team_id,
 scrapeOptions,
 internalOptions,
+startTime: Date.now(),
 },
 opts: {
 jobId: uuid,

@@ -13,6 +13,7 @@ import { addScrapeJob, waitForJob } from "../../services/queue-jobs";
 import { getJobPriority } from "../../lib/job-priority";
 import { getScrapeQueue } from "../../services/queue-service";
 import { supabaseGetJobById } from "../../lib/supabase-jobs";
+import { calculateCreditsToBeBilled } from "../../lib/scrape-billing";

 export async function scrapeController(
 req: RequestWithAuth<{}, ScrapeResponse, ScrapeRequest>,
@@ -30,7 +31,6 @@ export async function scrapeController(
 });

 req.body = scrapeRequestSchema.parse(req.body);
-let earlyReturn = false;

 const origin = req.body.origin;
 const timeout = req.body.timeout;
@@ -42,6 +42,8 @@ export async function scrapeController(
 });
 //

+const isDirectToBullMQ = process.env.SEARCH_PREVIEW_TOKEN === req.body.__searchPreviewToken;
+
 await addScrapeJob(
 {
 url: req.body.url,
@@ -52,13 +54,16 @@ export async function scrapeController(
 teamId: req.auth.team_id,
 saveScrapeResultToGCS: process.env.GCS_FIRE_ENGINE_BUCKET_NAME ? true : false,
 unnormalizedSourceURL: preNormalizedBody.url,
+useCache: req.body.__experimental_cache ? true : false,
+bypassBilling: isDirectToBullMQ,
 },
 origin: req.body.origin,
-is_scrape: true,
+startTime,
 },
 {},
 jobId,
 jobPriority,
+isDirectToBullMQ,
 );

 const totalWait =
@@ -124,57 +129,13 @@ export async function scrapeController(

 await getScrapeQueue().remove(jobId);

-const endTime = new Date().getTime();
-const timeTakenInSeconds = (endTime - startTime) / 1000;
-const numTokens =
-doc && doc.extract
-? // ? numTokensFromString(doc.markdown, "gpt-3.5-turbo")
-0 // TODO: fix
-: 0;
-
-let creditsToBeBilled = 1; // Assuming 1 credit per document
-if (earlyReturn) {
-// Don't bill if we're early returning
-return;
-}
-if ((req.body.extract && req.body.formats?.includes("extract")) || (req.body.formats?.includes("changeTracking") && req.body.changeTrackingOptions?.modes?.includes("json"))) {
-creditsToBeBilled = 5;
-}
-
-if (req.body.agent?.model?.toLowerCase() === "fire-1" || req.body.extract?.agent?.model?.toLowerCase() === "fire-1" || req.body.jsonOptions?.agent?.model?.toLowerCase() === "fire-1") {
-if (process.env.USE_DB_AUTHENTICATION === "true") {
-// @Nick this is a hack pushed at 2AM pls help - mogery
-const job = await supabaseGetJobById(jobId);
-if (!job?.cost_tracking) {
-logger.warn("No cost tracking found for job", {
-jobId,
-});
-}
-creditsToBeBilled = Math.ceil((job?.cost_tracking?.totalCost ?? 1) * 1800);
-} else {
-creditsToBeBilled = 150;
-}
-}
-
-if (doc?.metadata?.proxyUsed === "stealth") {
-creditsToBeBilled += 4;
-}
-
-billTeam(req.auth.team_id, req.acuc?.sub_id, creditsToBeBilled).catch(
-(error) => {
-logger.error(
-`Failed to bill team ${req.auth.team_id} for ${creditsToBeBilled} credits: ${error}`,
-);
-// Optionally, you could notify an admin or add to a retry queue here
-},
-);
-
 if (!req.body.formats.includes("rawHtml")) {
 if (doc && doc.rawHtml) {
 delete doc.rawHtml;
 }
 }

 return res.status(200).json({
 success: true,
 data: doc,

@@ -40,23 +40,23 @@ export async function searchAndScrapeSearchResult(
 try {
 const searchResults = await search({
 query,
-num_results: 5
+num_results: 5,
 });

 const documents = await Promise.all(
-searchResults.map(result =>
+searchResults.map((result) =>
 scrapeSearchResult(
 {
 url: result.url,
 title: result.title,
-description: result.description
+description: result.description,
 },
 options,
 logger,
 costTracking,
-flags
+flags,
-)
+),
-)
+),
 );

 return documents;
@@ -76,6 +76,8 @@ async function scrapeSearchResult(
 logger: Logger,
 costTracking: CostTracking,
 flags: TeamFlags,
+directToBullMQ: boolean = false,
+isSearchPreview: boolean = false,
 ): Promise<Document> {
 const jobId = uuidv4();
 const jobPriority = await getJobPriority({
@@ -99,14 +101,15 @@ async function scrapeSearchResult(
 mode: "single_urls" as Mode,
 team_id: options.teamId,
 scrapeOptions: options.scrapeOptions,
-internalOptions: { teamId: options.teamId, useCache: true },
+internalOptions: { teamId: options.teamId, useCache: true, bypassBilling: true },
 origin: options.origin,
 is_scrape: true,
+startTime: Date.now(),
 },
 {},
 jobId,
 jobPriority,
+directToBullMQ,
 );

 const doc: Document = await waitForJob(jobId, options.timeout);
@@ -169,6 +172,8 @@ export async function searchController(
 };
 const startTime = new Date().getTime();
 const costTracking = new CostTracking();
+const isSearchPreview =
+process.env.SEARCH_PREVIEW_TOKEN === req.body.__searchPreviewToken;

 try {
 req.body = searchRequestSchema.parse(req.body);
@@ -197,7 +202,9 @@ export async function searchController(
 });

 if (req.body.ignoreInvalidURLs) {
-searchResults = searchResults.filter((result) => !isUrlBlocked(result.url, req.acuc?.flags ?? null));
+searchResults = searchResults.filter(
+(result) => !isUrlBlocked(result.url, req.acuc?.flags ?? null),
+);
 }

 logger.info("Searching completed", {
@@ -224,12 +231,20 @@ export async function searchController(
 } else {
 logger.info("Scraping search results");
 const scrapePromises = searchResults.map((result) =>
-scrapeSearchResult(result, {
+scrapeSearchResult(
+result,
+{
 teamId: req.auth.team_id,
 origin: req.body.origin,
 timeout: req.body.timeout,
 scrapeOptions: req.body.scrapeOptions,
-}, logger, costTracking, req.acuc?.flags ?? null),
+},
+logger,
+costTracking,
+req.acuc?.flags ?? null,
+(req.acuc?.price_credits ?? 0) <= 3000,
+isSearchPreview,
+),
 );

 const docs = await Promise.all(scrapePromises);
@@ -255,11 +270,23 @@ export async function searchController(
 }

 // Bill team once for all successful results
-billTeam(req.auth.team_id, req.acuc?.sub_id, responseData.data.length).catch((error) => {
+if (!isSearchPreview) {
+billTeam(
+req.auth.team_id,
+req.acuc?.sub_id,
+responseData.data.reduce((a, x) => {
+if (x.metadata?.numPages !== undefined && x.metadata.numPages > 0) {
+return a + x.metadata.numPages;
+} else {
+return a + 1;
+}
+}, 0),
+).catch((error) => {
 logger.error(
 `Failed to bill team ${req.auth.team_id} for ${responseData.data.length} credits: ${error}`,
 );
 });
+}

 const endTime = new Date().getTime();
 const timeTakenInSeconds = (endTime - startTime) / 1000;
@@ -269,7 +296,8 @@ export async function searchController(
 time_taken: timeTakenInSeconds,
 });

-logJob({
+logJob(
+{
 job_id: jobId,
 success: true,
 num_docs: responseData.data.length,
@@ -281,10 +309,12 @@ export async function searchController(
 scrapeOptions: req.body.scrapeOptions,
 origin: req.body.origin,
 cost_tracking: costTracking,
-});
+},
+false,
+isSearchPreview,
+);

 return res.status(200).json(responseData);

 } catch (error) {
 if (
 error instanceof Error &&

@@ -311,6 +311,8 @@ const baseScrapeOptions = z
 proxy: z.enum(["basic", "stealth", "auto"]).optional(),
 maxAge: z.number().int().gte(0).safe().default(0),
 storeInCache: z.boolean().default(true),
+__experimental_cache: z.boolean().default(false).optional(),
+__searchPreviewToken: z.string().optional(),
 })
 .strict(strictMessage);

@@ -1172,6 +1174,7 @@ export const searchRequestSchema = z
 origin: z.string().optional().default("api"),
 timeout: z.number().int().positive().finite().safe().default(60000),
 ignoreInvalidURLs: z.boolean().optional().default(false),
+__searchPreviewToken: z.string().optional(),
 scrapeOptions: baseScrapeOptions
 .extend({
 formats: z

@@ -95,8 +95,13 @@ function startServer(port = DEFAULT_PORT) {
 logger.info(`Worker ${process.pid} listening on port ${port}`);
 });

-const exitHandler = () => {
+const exitHandler = async () => {
 logger.info("SIGTERM signal received: closing HTTP server");
+if (process.env.IS_KUBERNETES === "true") {
+// Account for GCE load balancer drain timeout
+logger.info("Waiting 60s for GCE load balancer drain timeout");
+await new Promise((resolve) => setTimeout(resolve, 60000));
+}
 server.close(() => {
 logger.info("Server closed.");
 process.exit(0);

@@ -133,6 +133,7 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
 blockAds: false,
 maxAge: 0,
 dontStoreInCache: false,
+__experimental_cache: true,
 },
 }, logger, costTracking, acuc?.flags ?? null);
 return response.length > 0 ? response : [];

@@ -49,6 +49,7 @@ export async function scrapeDocument(
 origin: options.origin,
 is_scrape: true,
 from_extract: true,
+startTime: Date.now(),
 },
 {},
 jobId,

@@ -47,6 +47,7 @@ export async function scrapeDocument_F0(
 origin: options.origin,
 is_scrape: true,
 from_extract: true,
+startTime: Date.now(),
 },
 {},
 jobId,

apps/api/src/lib/extract/index/pinecone.ts (deleted, 164 lines)
@@ -1,164 +0,0 @@
-import { Pinecone } from "@pinecone-database/pinecone";
-import { Document } from "../../../controllers/v1/types";
-import { logger } from "../../logger";
-import { embed } from "ai";
-import { getEmbeddingModel } from "../../generic-ai";
-
-const pinecone = new Pinecone({
-apiKey: process.env.PINECONE_API_KEY!,
-});
-
-const INDEX_NAME = process.env.PINECONE_INDEX_NAME ?? "";
-
-const MAX_METADATA_SIZE = 30 * 1024; // 30KB in bytes
-
-export interface PageMetadata {
-url: string;
-originUrl: string;
-title?: string;
-description?: string;
-crawlId?: string;
-teamId?: string;
-timestamp: number;
-markdown?: string;
-}
-
-async function getEmbedding(text: string) {
-const { embedding } = await embed({
-model: getEmbeddingModel("text-embedding-3-small"),
-value: text,
-});
-
-return embedding;
-}
-
-function normalizeUrl(url: string) {
-const urlO = new URL(url);
-if (!urlO.hostname.startsWith("www.")) {
-urlO.hostname = "www." + urlO.hostname;
-}
-return urlO.href;
-}
-
-export async function indexPage({
-document,
-originUrl,
-crawlId,
-teamId,
-}: {
-document: Document;
-originUrl: string;
-crawlId?: string;
-teamId?: string;
-}) {
-try {
-const index = pinecone.index(INDEX_NAME);
-
-// Trim markdown if it's too long
-let trimmedMarkdown = document.markdown;
-if (
-trimmedMarkdown &&
-Buffer.byteLength(trimmedMarkdown, "utf-8") > MAX_METADATA_SIZE
-) {
-trimmedMarkdown = trimmedMarkdown.slice(
-0,
-Math.floor(MAX_METADATA_SIZE / 2),
-); // Using half the size to be safe with UTF-8 encoding
-}
-
-// Create text to embed
-const textToEmbed = [
-document.metadata.title,
-document.metadata.description,
-trimmedMarkdown,
-]
-.filter(Boolean)
-.join("\n\n");
-
-// Get embedding from OpenAI
-const embedding = await getEmbedding(textToEmbed);
-
-const normalizedUrl = normalizeUrl(
-document.metadata.sourceURL || document.metadata.url!,
-);
-
-// Prepare metadata
-const metadata: PageMetadata = {
-url: normalizedUrl,
-originUrl: normalizeUrl(originUrl),
-title: document.metadata.title ?? document.metadata.ogTitle ?? "",
-description:
-document.metadata.description ?? document.metadata.ogDescription ?? "",
-crawlId,
-teamId,
-markdown: trimmedMarkdown,
-timestamp: Date.now(),
-};
-
-// Upsert to Pinecone
-await index.upsert([
-{
-id: normalizedUrl,
-values: embedding,
-metadata: {
-...metadata,
-[document.metadata.sourceURL || document.metadata.url!]: true,
-},
-},
-]);
-
-logger.debug("Successfully indexed page in Pinecone", {
-url: metadata.url,
-crawlId,
-});
-} catch (error) {
-logger.error("Failed to index page in Pinecone", {
-error,
-url: document.metadata.sourceURL || document.metadata.url,
-crawlId,
-});
-}
-}
-
-export async function searchSimilarPages(
-query: string,
-originUrl?: string,
-limit: number = 1000,
-): Promise<any[]> {
-try {
-const index = pinecone.index(INDEX_NAME);
-
-// Get query embedding from OpenAI
-const queryEmbedding = await getEmbedding(query);
-
-const queryParams: any = {
-vector: queryEmbedding,
-topK: limit,
-includeMetadata: true,
-};
-
-const normalizedOriginUrl = originUrl ? normalizeUrl(originUrl) : undefined;
-// Add filter if originUrl is provided
-if (normalizedOriginUrl) {
-queryParams.filter = {
-originUrl: { $eq: normalizedOriginUrl },
-};
-}
-
-const results = await index.query(queryParams);
-return results.matches.map((match) => ({
-url: match.metadata?.url,
-title: match.metadata?.title,
-description: match.metadata?.description,
-score: match.score,
-markdown: match.metadata?.markdown,
-}));
-} catch (error) {
-logger.error("Failed to search similar pages in Pinecone", {
-error,
-query,
-originUrl,
-});
-return [];
-}
-}

@@ -4,7 +4,6 @@ import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
 import { logger } from "../logger";
 import { CohereClient } from "cohere-ai";
 import { extractConfig } from "./config";
-import { searchSimilarPages } from "./index/pinecone";
 import { generateCompletions } from "../../scraper/scrapeURL/transformers/llmExtract";
 import { buildRerankerUserPrompt } from "./build-prompts";
 import { buildRerankerSystemPrompt } from "./build-prompts";

apps/api/src/lib/scrape-billing.ts (new file, 49 lines)
@@ -0,0 +1,49 @@
+import { Document, ScrapeOptions } from "../controllers/v1/types";
+import { supabaseGetJobById } from "./supabase-jobs";
+import { logger } from "./logger";
+import { CostTracking } from "./extract/extraction-service";
+
+const creditsPerPDFPage = 1;
+const stealthProxyCostBonus = 4;
+
+export async function calculateCreditsToBeBilled(options: ScrapeOptions, document: Document, jobId: string, costTracking?: any) {
+let creditsToBeBilled = 1; // Assuming 1 credit per document
+if ((options.extract && options.formats?.includes("extract")) || (options.formats?.includes("changeTracking") && options.changeTrackingOptions?.modes?.includes("json"))) {
+creditsToBeBilled = 5;
+}
+
+if (options.agent?.model?.toLowerCase() === "fire-1" || options.extract?.agent?.model?.toLowerCase() === "fire-1" || options.jsonOptions?.agent?.model?.toLowerCase() === "fire-1") {
+if (process.env.USE_DB_AUTHENTICATION === "true") {
+// @Nick this is a hack pushed at 2AM pls help - mogery
+if (!costTracking) {
+const job = await supabaseGetJobById(jobId);
+costTracking = job?.cost_tracking;
+}
+
+if (!costTracking) {
+logger.warn("No cost tracking found for job", {
+jobId,
+scrapeId: jobId
+});
+}
+
+if (costTracking instanceof CostTracking) {
+costTracking = costTracking.toJSON();
+}
+
+creditsToBeBilled = Math.ceil((costTracking?.totalCost ?? 1) * 1800);
+} else {
+creditsToBeBilled = 150;
+}
+}
+
+if (document.metadata.numPages !== undefined && document.metadata.numPages > 1) {
+creditsToBeBilled += creditsPerPDFPage * (document.metadata.numPages - 1);
+}
+
+if (document?.metadata?.proxyUsed === "stealth") {
+creditsToBeBilled += stealthProxyCostBonus;
+}
+
+return creditsToBeBilled;
+}
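
Note on the new helper: it centralizes the credit math that this commit removes from the v1 scrape controller and that the queue worker below now calls: 1 credit per document, 5 credits when JSON extraction or JSON change tracking is requested, a cost-tracking-derived amount (or a flat 150 without DB authentication) for FIRE-1 agent runs, plus 1 credit per additional PDF page and 4 extra credits when the stealth proxy was used. A minimal sketch of a call site follows; the wrapper function is hypothetical, while the types, import paths, and signature come from this diff.

import { Document, ScrapeOptions } from "../controllers/v1/types";
import { CostTracking } from "./extract/extraction-service";
import { calculateCreditsToBeBilled } from "./scrape-billing";

// Hypothetical caller placed next to scrape-billing.ts; it only illustrates the call shape.
export async function creditsForFinishedScrape(
  options: ScrapeOptions,
  document: Document,
  jobId: string,
  costTracking?: CostTracking,
): Promise<number> {
  // When costTracking is omitted, the helper falls back to the cost_tracking
  // stored on the job row (see supabaseGetJobById in the new file).
  return calculateCreditsToBeBilled(options, document, jobId, costTracking);
}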

@@ -15,7 +15,6 @@ import {
 ScrapeUrlResponse,
 } from "../scraper/scrapeURL";
 import { Engine } from "../scraper/scrapeURL/engines";
-import { indexPage } from "../lib/extract/index/pinecone";
 import { CostTracking } from "../lib/extract/extraction-service";
 configDotenv();

@@ -224,7 +224,6 @@ export async function scrapeURLWithFireEngineChromeCDP(
 mobile: meta.options.mobile,
 timeout, // TODO: better timeout logic
 disableSmartWaitCache: meta.internalOptions.disableSmartWaitCache,
-blockAds: meta.options.blockAds,
 mobileProxy: meta.featureFlags.has("stealthProxy"),
 saveScrapeResultToGCS: meta.internalOptions.saveScrapeResultToGCS,
 // TODO: scrollXPaths

@@ -12,7 +12,6 @@ export type FireEngineScrapeRequestCommon = {
 headers?: { [K: string]: string };

 blockMedia?: boolean; // default: true
-blockAds?: boolean; // default: true
 // pageOptions?: any; // unused, .scrollXPaths is considered on FE side

 // useProxy?: boolean; // unused, default: true
@@ -39,7 +38,6 @@ export type FireEngineScrapeRequestChromeCDP = {
 blockMedia?: true; // cannot be false
 mobile?: boolean;
 disableSmartWaitCache?: boolean;
-blockAds?: boolean; // default: true
 saveScrapeResultToGCS?: boolean;
 };

@@ -58,7 +56,6 @@ export type FireEngineScrapeRequestTLSClient = {
 engine: "tlsclient";
 atsv?: boolean; // v0 only, default: false
 disableJsDom?: boolean; // v0 only, default: false
-// blockAds?: boolean; // default: true
 };

 const schema = z.object({

@@ -74,6 +74,7 @@ export const featureFlags = [
 "skipTlsVerification",
 "useFastMode",
 "stealthProxy",
+"disableAdblock",
 ] as const;

 export type FeatureFlag = (typeof featureFlags)[number];
@@ -95,6 +96,7 @@ export const featureFlagOptions: {
 mobile: { priority: 10 },
 skipTlsVerification: { priority: 10 },
 stealthProxy: { priority: 20 },
+disableAdblock: { priority: 10 },
 } as const;

 export type EngineScrapeResult = {
@@ -171,6 +173,7 @@ export const engineOptions: {
 skipTlsVerification: false,
 useFastMode: false,
 stealthProxy: false,
+disableAdblock: false,
 },
 quality: 1000, // cache should always be tried first
 },
@@ -205,6 +208,7 @@ export const engineOptions: {
 skipTlsVerification: true,
 useFastMode: false,
 stealthProxy: false,
+disableAdblock: false,
 },
 quality: 50,
 },
@@ -222,6 +226,7 @@ export const engineOptions: {
 skipTlsVerification: true,
 useFastMode: false,
 stealthProxy: false,
+disableAdblock: false,
 },
 quality: 45,
 },
@@ -256,6 +261,7 @@ export const engineOptions: {
 skipTlsVerification: true,
 useFastMode: false,
 stealthProxy: true,
+disableAdblock: false,
 },
 quality: -2,
 },
@@ -273,6 +279,7 @@ export const engineOptions: {
 skipTlsVerification: true,
 useFastMode: false,
 stealthProxy: true,
+disableAdblock: false,
 },
 quality: -5,
 },
@@ -290,6 +297,7 @@ export const engineOptions: {
 skipTlsVerification: false,
 useFastMode: false,
 stealthProxy: false,
+disableAdblock: true,
 },
 quality: 40,
 },
@@ -307,6 +315,7 @@ export const engineOptions: {
 skipTlsVerification: false,
 useFastMode: false,
 stealthProxy: true,
+disableAdblock: true,
 },
 quality: -10,
 },
@@ -324,6 +333,7 @@ export const engineOptions: {
 skipTlsVerification: false,
 useFastMode: false,
 stealthProxy: false,
+disableAdblock: false,
 },
 quality: 20,
 },
@@ -341,6 +351,7 @@ export const engineOptions: {
 skipTlsVerification: false,
 useFastMode: true,
 stealthProxy: false,
+disableAdblock: false,
 },
 quality: 10,
 },
@@ -358,6 +369,7 @@ export const engineOptions: {
 skipTlsVerification: false,
 useFastMode: true,
 stealthProxy: true,
+disableAdblock: false,
 },
 quality: -15,
 },
@@ -375,6 +387,7 @@ export const engineOptions: {
 skipTlsVerification: false,
 useFastMode: true,
 stealthProxy: false,
+disableAdblock: false,
 },
 quality: 5,
 },
@@ -392,6 +405,7 @@ export const engineOptions: {
 skipTlsVerification: false,
 useFastMode: true,
 stealthProxy: true, // kinda...
+disableAdblock: true,
 },
 quality: -20,
 },
@@ -409,6 +423,7 @@ export const engineOptions: {
 skipTlsVerification: false,
 useFastMode: true,
 stealthProxy: true, // kinda...
+disableAdblock: true,
 },
 quality: -20,
 },

@@ -120,6 +120,10 @@ function buildFeatureFlags(
 flags.add("docx");
 }

+if (options.blockAds === false) {
+flags.add("disableAdblock");
+}
+
 return flags;
 }

@@ -187,6 +191,7 @@ export type InternalOptions = {
 unnormalizedSourceURL?: string;

 saveScrapeResultToGCS?: boolean; // Passed along to fire-engine
+bypassBilling?: boolean;
 };

 export type EngineResultsTracker = {

@@ -5,6 +5,58 @@ import { logger } from "../lib/logger";

 dotenv.config();

+export async function fire_engine_search(
+q: string,
+options: {
+tbs?: string;
+filter?: string;
+lang?: string;
+country?: string;
+location?: string;
+numResults: number;
+page?: number;
+},
+abort?: AbortSignal,
+): Promise<SearchResult[]> {
+try {
+let data = JSON.stringify({
+query: q,
+lang: options.lang,
+country: options.country,
+location: options.location,
+tbs: options.tbs,
+numResults: options.numResults,
+page: options.page ?? 1,
+});
+
+if (!process.env.FIRE_ENGINE_BETA_URL) {
+return [];
+}
+
+const response = await fetch(`${process.env.FIRE_ENGINE_BETA_URL}/search`, {
+method: "POST",
+headers: {
+"Content-Type": "application/json",
+"X-Disable-Cache": "true",
+},
+body: data,
+signal: abort,
+});
+
+if (response.ok) {
+const responseData = await response.json();
+return responseData;
+} else {
+return [];
+}
+} catch (error) {
+logger.error(error);
+Sentry.captureException(error);
+return [];
+}
+}
+
 export async function fireEngineMap(
 q: string,
 options: {
@@ -34,7 +86,7 @@ export async function fireEngineMap(
 return [];
 }

-const response = await fetch(`${process.env.FIRE_ENGINE_BETA_URL}/search`, {
+const response = await fetch(`${process.env.FIRE_ENGINE_BETA_URL}/map`, {
 method: "POST",
 headers: {
 "Content-Type": "application/json",

@@ -4,6 +4,7 @@ import { googleSearch } from "./googlesearch";
 import { searchapi_search } from "./searchapi";
 import { serper_search } from "./serper";
 import { searxng_search } from "./searxng";
+import { fire_engine_search } from "./fireEngine";

 export async function search({
 query,
@@ -31,8 +32,19 @@ export async function search({
 timeout?: number;
 }): Promise<SearchResult[]> {
 try {
+if (process.env.FIRE_ENGINE_BETA_URL) {
+const results = await fire_engine_search(query, {
+numResults: num_results,
+tbs,
+filter,
+lang,
+country,
+location,
+});
+if (results.length > 0) return results;
+}
 if (process.env.SERPER_API_KEY) {
-return await serper_search(query, {
+const results = await serper_search(query, {
 num_results,
 tbs,
 filter,
@@ -40,9 +52,10 @@ export async function search({
 country,
 location,
 });
+if (results.length > 0) return results;
 }
 if (process.env.SEARCHAPI_API_KEY) {
-return await searchapi_search(query, {
+const results = await searchapi_search(query, {
 num_results,
 tbs,
 filter,
@@ -50,9 +63,10 @@ export async function search({
 country,
 location,
 });
+if (results.length > 0) return results;
 }
 if (process.env.SEARXNG_ENDPOINT) {
-return await searxng_search(query, {
+const results = await searxng_search(query, {
 num_results,
 tbs,
 filter,
@@ -60,6 +74,7 @@ export async function search({
 country,
 location,
 });
+if (results.length > 0) return results;
 }
 return await googleSearch(
 query,
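
Note on the change above: search() now tries fire-engine first whenever FIRE_ENGINE_BETA_URL is set, and every provider that returns an empty list falls through to the next one (serper, searchapi, searxng, then Google) instead of returning its empty result. A small, self-contained sketch of that fall-through pattern, with generic provider names rather than the real modules:

// Generic fall-through search, mirroring the behavior introduced above.
type SearchHit = { url: string; title?: string; description?: string };
type Provider = (query: string) => Promise<SearchHit[]>;

async function searchWithFallback(
  query: string,
  providers: Provider[],
): Promise<SearchHit[]> {
  for (const provider of providers) {
    const results = await provider(query);
    if (results.length > 0) {
      return results; // first provider with any results wins
    }
  }
  return []; // every provider came back empty
}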

@@ -21,13 +21,14 @@ function cleanOfNull<T>(x: T): T {
 }
 }

-export async function logJob(job: FirecrawlJob, force: boolean = false) {
+export async function logJob(job: FirecrawlJob, force: boolean = false, bypassLogging: boolean = false) {
 try {
 const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === "true";
 if (!useDbAuthentication) {
 return;
 }

 // Redact any pages that have an authorization header
 // actually, Don't. we use the db to retrieve results now. this breaks authed crawls - mogery
 // if (
@@ -63,12 +64,17 @@ export async function logJob(job: FirecrawlJob, force: boolean = false) {
 is_migrated: true,
 cost_tracking: job.cost_tracking,
 pdf_num_pages: job.pdf_num_pages ?? null,
+credits_billed: job.credits_billed ?? null,
 };

 if (process.env.GCS_BUCKET_NAME) {
 await saveJobToGCS(job);
 }

+if (bypassLogging) {
+return;
+}
+
 if (force) {
 let i = 0,
 done = false;

@@ -97,6 +97,7 @@ async function addScrapeJobRaw(
 options: any,
 jobId: string,
 jobPriority: number,
+directToBullMQ: boolean = false,
 ) {
 const hasCrawlDelay = webScraperOptions.crawl_id && webScraperOptions.crawlerOptions?.delay;

@@ -127,7 +128,7 @@ async function addScrapeJobRaw(

 const concurrencyQueueJobs = await getConcurrencyQueueJobsCount(webScraperOptions.team_id);

-if (concurrencyLimited) {
+if (concurrencyLimited && !directToBullMQ) {
 // Detect if they hit their concurrent limit
 // If above by 2x, send them an email
 // No need to 2x as if there are more than the max concurrency in the concurrency queue, it is already 2x
@@ -161,6 +162,7 @@ export async function addScrapeJob(
 options: any = {},
 jobId: string = uuidv4(),
 jobPriority: number = 10,
+directToBullMQ: boolean = false,
 ) {
 if (Sentry.isInitialized()) {
 const size = JSON.stringify(webScraperOptions).length;
@@ -187,11 +189,12 @@ export async function addScrapeJob(
 options,
 jobId,
 jobPriority,
+directToBullMQ,
 );
 },
 );
 } else {
-await addScrapeJobRaw(webScraperOptions, options, jobId, jobPriority);
+await addScrapeJobRaw(webScraperOptions, options, jobId, jobPriority, directToBullMQ);
 }
 }

@@ -66,10 +66,10 @@ export function getIndexQueue() {
 connection: redisConnection,
 defaultJobOptions: {
 removeOnComplete: {
-age: 90000, // 25 hours
+age: 3600, // 1 hour
 },
 removeOnFail: {
-age: 90000, // 25 hours
+age: 3600, // 1 hour
 },
 },
 });
@@ -120,10 +120,10 @@ export function getBillingQueue() {
 connection: redisConnection,
 defaultJobOptions: {
 removeOnComplete: {
-age: 90000, // 25 hours
+age: 3600, // 1 hour
 },
 removeOnFail: {
-age: 90000, // 25 hours
+age: 3600, // 1 hour
 },
 },
 });
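
Note on the retention change above: removeOnComplete and removeOnFail take an age in seconds in BullMQ, so these queues now drop finished and failed jobs after one hour instead of roughly 25 hours. A minimal sketch with a placeholder queue name and connection (not values from this repository):

import { Queue } from "bullmq";

// Placeholder queue; only defaultJobOptions mirrors the change above.
const exampleQueue = new Queue("example-queue", {
  connection: { host: "localhost", port: 6379 },
  defaultJobOptions: {
    removeOnComplete: { age: 3600 }, // keep completed jobs for 1 hour
    removeOnFail: { age: 3600 }, // keep failed jobs for 1 hour
  },
});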

@@ -61,7 +61,6 @@ import {
 } from "../lib/concurrency-limit";
 import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist";
 import { BLOCKLISTED_URL_MESSAGE } from "../lib/strings";
-import { indexPage } from "../lib/extract/index/pinecone";
 import { Document } from "../controllers/v1/types";
 import {
 ExtractResult,
@@ -85,9 +84,11 @@ import https from "https";
 import { cacheableLookup } from "../scraper/scrapeURL/lib/cacheableLookup";
 import { robustFetch } from "../scraper/scrapeURL/lib/fetch";
 import { RateLimiterMode } from "../types";
+import { calculateCreditsToBeBilled } from "../lib/scrape-billing";
 import { redisEvictConnection } from "./redis";
 import { generateURLSplits, hashURL, index_supabase_service, useIndex } from "./index";
 import { WebCrawler } from "../scraper/WebScraper/crawler";
+import type { Logger } from "winston";

 configDotenv();
@@ -320,7 +321,7 @@ async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) {
 scrapeOptions: sc.scrapeOptions,
 crawlerOptions: sc.crawlerOptions,
 origin: job.data.origin,
-});
+}, false, job.data.internalOptions?.bypassBilling ?? false);
 logger.info("Logged crawl!");

 const data = {
@@ -372,8 +373,10 @@ async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) {
 origin: job.data.origin,
 },
 true,
+job.data.internalOptions?.bypassBilling ?? false,
 );

 // v1 web hooks, call when done with no data, but with event completed
 if (job.data.v1 && job.data.webhook) {
 callWebhook(
@@ -1133,6 +1136,59 @@ async function processKickoffJob(job: Job & { id: string }, token: string) {
 }
 }

+async function billScrapeJob(job: Job & { id: string }, document: Document, logger: Logger, costTracking?: CostTracking) {
+let creditsToBeBilled: number | null = null;
+
+if (job.data.is_scrape !== true && !job.data.internalOptions?.bypassBilling) {
+creditsToBeBilled = await calculateCreditsToBeBilled(job.data.scrapeOptions, document, job.id, costTracking);
+
+if (
+job.data.team_id !== process.env.BACKGROUND_INDEX_TEAM_ID! &&
+process.env.USE_DB_AUTHENTICATION === "true"
+) {
+try {
+const billingJobId = uuidv4();
+logger.debug(
+`Adding billing job to queue for team ${job.data.team_id}`,
+{
+billingJobId,
+credits: creditsToBeBilled,
+is_extract: false,
+},
+);
+
+// Add directly to the billing queue - the billing worker will handle the rest
+await getBillingQueue().add(
+"bill_team",
+{
+team_id: job.data.team_id,
+subscription_id: undefined,
+credits: creditsToBeBilled,
+is_extract: false,
+timestamp: new Date().toISOString(),
+originating_job_id: job.id,
+},
+{
+jobId: billingJobId,
+priority: 10,
+},
+);
+
+return creditsToBeBilled;
+} catch (error) {
+logger.error(
+`Failed to add billing job to queue for team ${job.data.team_id} for ${creditsToBeBilled} credits`,
+{ error },
+);
+Sentry.captureException(error);
+return creditsToBeBilled;
+}
+}
+}
+
+return creditsToBeBilled;
+}
+
 async function processJob(job: Job & { id: string }, token: string) {
 const logger = _logger.child({
 module: "queue-worker",
@@ -1143,25 +1199,9 @@ async function processJob(job: Job & { id: string }, token: string) {
 teamId: job.data?.team_id ?? undefined,
 });
 logger.info(`🐂 Worker taking job ${job.id}`, { url: job.data.url });
-const start = Date.now();
+const start = job.data.startTime ?? Date.now();
+const remainingTime = job.data.scrapeOptions.timeout ? (job.data.scrapeOptions.timeout - (Date.now() - start)) : undefined;

-// Check if the job URL is researchhub and block it immediately
-// TODO: remove this once solve the root issue
-// if (
-// job.data.url &&
-// (job.data.url.includes("researchhub.com") ||
-// job.data.url.includes("ebay.com"))
-// ) {
-// logger.info(`🐂 Blocking job ${job.id} with URL ${job.data.url}`);
-// const data = {
-// success: false,
-// document: null,
-// project_id: job.data.project_id,
-// error:
-// "URL is blocked. Suspecious activity detected. Please contact help@firecrawl.com if you believe this is an error.",
-// };
-// return data;
-// }
 const costTracking = new CostTracking();

 try {
@@ -1172,6 +1212,11 @@ async function processJob(job: Job & { id: string }, token: string) {
 current_url: "",
 });

+if (remainingTime !== undefined && remainingTime < 0) {
+throw new Error("timeout");
+}
+const signal = remainingTime ? AbortSignal.timeout(remainingTime) : undefined;
+
 if (job.data.crawl_id) {
 const sc = (await getCrawl(job.data.crawl_id)) as StoredCrawl;
 if (sc && sc.cancelled) {
@@ -1185,16 +1230,22 @@ async function processJob(job: Job & { id: string }, token: string) {
 token,
 costTracking,
 }),
-...(job.data.scrapeOptions.timeout !== undefined
+...(remainingTime !== undefined
 ? [
 (async () => {
-await sleep(job.data.scrapeOptions.timeout);
+await sleep(remainingTime);
 throw new Error("timeout");
 })(),
 ]
 : []),
 ]);

+try {
+signal?.throwIfAborted();
+} catch (e) {
+throw new Error("timeout");
+}
+
 if (!pipeline.success) {
 throw pipeline.error;
 }
@@ -1295,45 +1346,6 @@ async function processJob(job: Job & { id: string }, token: string) {
 }
 }

-logger.debug("Logging job to DB...");
-await logJob(
-{
-job_id: job.id as string,
-success: true,
-num_docs: 1,
-docs: [doc],
-time_taken: timeTakenInSeconds,
-team_id: job.data.team_id,
-mode: job.data.mode,
-url: job.data.url,
-crawlerOptions: sc.crawlerOptions,
-scrapeOptions: job.data.scrapeOptions,
-origin: job.data.origin,
-crawl_id: job.data.crawl_id,
-cost_tracking: costTracking,
-pdf_num_pages: doc.metadata.numPages,
-},
-true,
-);
-
-if (job.data.webhook && job.data.mode !== "crawl" && job.data.v1) {
-logger.debug("Calling webhook with success...", {
-webhook: job.data.webhook,
-});
-await callWebhook(
-job.data.team_id,
-job.data.crawl_id,
-data,
-job.data.webhook,
-job.data.v1,
-job.data.crawlerOptions !== null ? "crawl.page" : "batch_scrape.page",
-true,
-);
-}
-
-logger.debug("Declaring job as done...");
-await addCrawlJobDone(job.data.crawl_id, job.id, true);
-
 if (job.data.crawlerOptions !== null) {
 if (!sc.cancelled) {
 const crawler = crawlToCrawler(
@ -1429,8 +1441,65 @@ async function processJob(job: Job & { id: string }, token: string) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
|
signal?.throwIfAborted();
|
||||||
|
} catch (e) {
|
||||||
|
throw new Error("timeout");
|
||||||
|
}
|
||||||
|
|
||||||
|
const credits_billed = await billScrapeJob(job, doc, logger, costTracking);
|
||||||
|
|
||||||
|
logger.debug("Logging job to DB...");
|
||||||
|
await logJob(
|
||||||
|
{
|
||||||
|
job_id: job.id as string,
|
||||||
|
success: true,
|
||||||
|
num_docs: 1,
|
||||||
|
docs: [doc],
|
||||||
|
time_taken: timeTakenInSeconds,
|
||||||
|
team_id: job.data.team_id,
|
||||||
|
mode: job.data.mode,
|
||||||
|
url: job.data.url,
|
||||||
|
crawlerOptions: sc.crawlerOptions,
|
||||||
|
scrapeOptions: job.data.scrapeOptions,
|
||||||
|
origin: job.data.origin,
|
||||||
|
crawl_id: job.data.crawl_id,
|
||||||
|
cost_tracking: costTracking,
|
||||||
|
pdf_num_pages: doc.metadata.numPages,
|
||||||
|
credits_billed,
|
||||||
|
},
|
||||||
|
true,
|
||||||
|
job.data.internalOptions?.bypassBilling ?? false,
|
||||||
|
);
|
||||||
|
|
||||||
|
if (job.data.webhook && job.data.mode !== "crawl" && job.data.v1) {
|
||||||
|
logger.debug("Calling webhook with success...", {
|
||||||
|
webhook: job.data.webhook,
|
||||||
|
});
|
||||||
|
await callWebhook(
|
||||||
|
job.data.team_id,
|
||||||
|
job.data.crawl_id,
|
||||||
|
data,
|
||||||
|
job.data.webhook,
|
||||||
|
job.data.v1,
|
||||||
|
job.data.crawlerOptions !== null ? "crawl.page" : "batch_scrape.page",
|
||||||
|
true,
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
logger.debug("Declaring job as done...");
|
||||||
|
await addCrawlJobDone(job.data.crawl_id, job.id, true);
|
||||||
|
|
||||||
await finishCrawlIfNeeded(job, sc);
|
await finishCrawlIfNeeded(job, sc);
|
||||||
} else {
|
} else {
|
||||||
|
try {
|
||||||
|
signal?.throwIfAborted();
|
||||||
|
} catch (e) {
|
||||||
|
throw new Error("timeout");
|
||||||
|
}
|
||||||
|
|
||||||
|
const credits_billed = await billScrapeJob(job, doc, logger, costTracking);
|
||||||
|
|
||||||
await logJob({
|
await logJob({
|
||||||
job_id: job.id,
|
job_id: job.id,
|
||||||
success: true,
|
success: true,
|
||||||
@ -1446,66 +1515,8 @@ async function processJob(job: Job & { id: string }, token: string) {
|
|||||||
num_tokens: 0, // TODO: fix
|
num_tokens: 0, // TODO: fix
|
||||||
cost_tracking: costTracking,
|
cost_tracking: costTracking,
|
||||||
pdf_num_pages: doc.metadata.numPages,
|
pdf_num_pages: doc.metadata.numPages,
|
||||||
});
|
credits_billed,
|
||||||
}
|
}, false, job.data.internalOptions?.bypassBilling ?? false);
|
||||||
|
|
||||||
if (job.data.is_scrape !== true) {
|
|
||||||
let creditsToBeBilled = 1; // Assuming 1 credit per document
|
|
||||||
if ((job.data.scrapeOptions.extract && job.data.scrapeOptions.formats?.includes("extract")) || (job.data.scrapeOptions.formats?.includes("changeTracking") && job.data.scrapeOptions.changeTrackingOptions?.modes?.includes("json"))) {
|
|
||||||
creditsToBeBilled = 5;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (job.data.scrapeOptions.agent?.model?.toLowerCase() === "fire-1" || job.data.scrapeOptions.extract?.agent?.model?.toLowerCase() === "fire-1" || job.data.scrapeOptions.jsonOptions?.agent?.model?.toLowerCase() === "fire-1") {
|
|
||||||
if (process.env.USE_DB_AUTHENTICATION === "true") {
|
|
||||||
creditsToBeBilled = Math.ceil((costTracking.toJSON().totalCost ?? 1) * 1800);
|
|
||||||
} else {
|
|
||||||
creditsToBeBilled = 150;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (doc.metadata?.proxyUsed === "stealth") {
|
|
||||||
creditsToBeBilled += 4;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (
|
|
||||||
job.data.team_id !== process.env.BACKGROUND_INDEX_TEAM_ID! &&
|
|
||||||
process.env.USE_DB_AUTHENTICATION === "true"
|
|
||||||
) {
|
|
||||||
try {
|
|
||||||
const billingJobId = uuidv4();
|
|
||||||
logger.debug(
|
|
||||||
`Adding billing job to queue for team ${job.data.team_id}`,
|
|
||||||
{
|
|
||||||
billingJobId,
|
|
||||||
credits: creditsToBeBilled,
|
|
||||||
is_extract: false,
|
|
||||||
},
|
|
||||||
);
|
|
||||||
|
|
||||||
// Add directly to the billing queue - the billing worker will handle the rest
|
|
||||||
await getBillingQueue().add(
|
|
||||||
"bill_team",
|
|
||||||
{
|
|
||||||
team_id: job.data.team_id,
|
|
||||||
subscription_id: undefined,
|
|
||||||
credits: creditsToBeBilled,
|
|
||||||
is_extract: false,
|
|
||||||
timestamp: new Date().toISOString(),
|
|
||||||
originating_job_id: job.id,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
jobId: billingJobId,
|
|
||||||
priority: 10,
|
|
||||||
},
|
|
||||||
);
|
|
||||||
} catch (error) {
|
|
||||||
logger.error(
|
|
||||||
`Failed to add billing job to queue for team ${job.data.team_id} for ${creditsToBeBilled} credits`,
|
|
||||||
{ error },
|
|
||||||
);
|
|
||||||
Sentry.captureException(error);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
logger.info(`🐂 Job done ${job.id}`);
|
logger.info(`🐂 Job done ${job.id}`);
|
||||||
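The removed block shows the credit logic that used to live inline in the worker: 1 credit per document, 5 for extract or JSON change-tracking, a cost-derived or flat charge for the FIRE-1 agent, a +4 stealth-proxy surcharge, and finally a job pushed onto the billing queue. The new code consolidates all of that behind billScrapeJob, records its result as credits_billed, and passes a bypassBilling flag through to logJob. The helper's body is not visible in this hunk; the sketch below is a hypothetical shape inferred only from the call sites and the pricing rules in the removed code, with a simplified parameter list (enqueueCredits stands in for the real billing queue).

  type CreditEnqueue = (teamId: string, credits: number) => Promise<void>;

  async function billScrapeJobSketch(
    job: {
      data: {
        team_id: string;
        is_scrape?: boolean;
        internalOptions?: { bypassBilling?: boolean };
        scrapeOptions: { formats?: string[] };
      };
    },
    doc: { metadata?: { proxyUsed?: string } },
    enqueueCredits: CreditEnqueue, // stand-in for the billing queue used in the real code
  ): Promise<number | null> {
    // Internal scrapes and bypassed jobs record no charge, matching the nullable credits_billed field.
    if (job.data.is_scrape === true || job.data.internalOptions?.bypassBilling) {
      return null;
    }

    let credits = 1; // base: 1 credit per document (from the removed inline logic)
    if (job.data.scrapeOptions.formats?.includes("extract")) {
      credits = 5; // extract / JSON change-tracking priced higher
    }
    if (doc.metadata?.proxyUsed === "stealth") {
      credits += 4; // stealth proxy surcharge
    }

    await enqueueCredits(job.data.team_id, credits);
    return credits; // the worker stores this as credits_billed in the job log
  }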
@@ -1604,6 +1615,7 @@ async function processJob(job: Job & { id: string }, token: string) {
         cost_tracking: costTracking,
       },
       true,
+      job.data.internalOptions?.bypassBilling ?? false,
     );
     return data;
   }
@@ -44,9 +44,15 @@ export interface WebScraperOptions {
   sitemapped?: boolean;
   webhook?: z.infer<typeof webhookSchema>;
   v1?: boolean;
+
+  /**
+   * Disables billing on the worker side.
+   */
   is_scrape?: boolean;
+
   isCrawlSourceScrape?: boolean;
   from_extract?: boolean;
+  startTime?: number;
 }

 export interface RunWebScraperParams {
@@ -95,6 +101,7 @@ export interface FirecrawlJob {
   sources?: Record<string, string[]>;
   cost_tracking?: CostTracking;
   pdf_num_pages?: number;
+  credits_billed?: number | null;
 }

 export interface FirecrawlScrapeResponse {
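The new startTime field is what makes the worker's remaining-time calculation meaningful: the producer stamps the job when it is enqueued, so time spent waiting in the queue counts against the same user-supplied timeout. A hedged producer-side sketch follows; the queue name, connection details, and payload fields other than startTime and timeout are assumptions for illustration only.

  import { Queue } from "bullmq";

  // Illustrative queue handle; the real project wires this up elsewhere.
  const scrapeQueue = new Queue("scrapeQueue", {
    connection: { host: "localhost", port: 6379 },
  });

  export async function enqueueScrape(url: string, timeoutMs?: number) {
    await scrapeQueue.add("scrape", {
      url,
      scrapeOptions: { timeout: timeoutMs },
      // Stamp the job at enqueue time so the worker can charge queue latency
      // against the same budget: remaining = timeout - (Date.now() - startTime).
      startTime: Date.now(),
    });
  }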
2
apps/rust-sdk/Cargo.lock
generated
@@ -680,7 +680,7 @@ checksum = "9fc0510504f03c51ada170672ac806f1f105a88aa97a5281117e1ddc3368e51a"

 [[package]]
 name = "firecrawl"
-version = "1.1.0"
+version = "1.2.0"
 dependencies = [
  "assert_matches",
  "axum",
@@ -1,7 +1,7 @@
 [package]
 name = "firecrawl"
 author= "Mendable.ai"
-version = "1.1.0"
+version = "1.2.0"
 edition = "2021"
 license = "MIT"
 homepage = "https://www.firecrawl.dev/"
@@ -99,6 +99,49 @@ impl From<CrawlScrapeOptions> for ScrapeOptions {
     }
 }

+/// Options for webhook notifications
+#[serde_with::skip_serializing_none]
+#[derive(Deserialize, Serialize, Debug, Default, Clone)]
+#[serde(rename_all = "camelCase")]
+pub struct WebhookOptions {
+    /// URL to send webhook notifications to
+    pub url: String,
+
+    /// Custom headers to include in webhook requests
+    pub headers: Option<HashMap<String, String>>,
+
+    /// Custom data included in all webhook payloads
+    pub metadata: Option<HashMap<String, String>>,
+
+    /// Event types to receive
+    pub events: Option<Vec<WebhookEvent>>,
+}
+
+impl From<String> for WebhookOptions {
+    fn from(value: String) -> Self {
+        Self {
+            url: value,
+            ..Default::default()
+        }
+    }
+}
+
+#[derive(Deserialize, Serialize, Debug, PartialEq, Eq, Clone, Copy)]
+#[serde(rename_all = "camelCase")]
+pub enum WebhookEvent {
+    /// Crawl finished successfully
+    Completed,
+
+    /// Crawl encountered an error
+    Failed,
+
+    /// Individual page scraped
+    Page,
+
+    /// Crawl job initiated
+    Started,
+}
+
 #[serde_with::skip_serializing_none]
 #[derive(Deserialize, Serialize, Debug, Default, Clone)]
 #[serde(rename_all = "camelCase")]
@@ -132,7 +175,7 @@ pub struct CrawlOptions {
     pub allow_external_links: Option<bool>,

     /// URL to send Webhook crawl events to.
-    pub webhook: Option<String>,
+    pub webhook: Option<WebhookOptions>,

     /// Idempotency key to send to the crawl endpoint.
     #[serde(skip)]
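With #[serde(rename_all = "camelCase")] and skip_serializing_none, a WebhookOptions value serializes to a plain JSON object with url, headers, metadata, and events, omitting unset fields, while the From<String> impl keeps code that passed a bare webhook URL compiling unchanged. The snippet below is only an illustration of that serialized shape; the field values are placeholders, and whether the API accepts exactly these event names should be confirmed against the API documentation rather than this diff.

  // Illustrative shape of the JSON a WebhookOptions value serializes to
  // (camelCase field names, None fields omitted); values are placeholders.
  const webhook = {
    url: "https://example.com/hooks/firecrawl",
    headers: { "X-Webhook-Secret": "placeholder" },
    metadata: { source: "rust-sdk-example" },
    events: ["started", "page", "completed", "failed"], // serde camelCase of the enum variants
  };

Passing a bare String still works through the From<String> impl, which fills in url and leaves the other fields unset.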