Merge branch 'main' into mog/pdf-pages-billing

commit 3ad202f234
@@ -48,4 +48,12 @@ describe("Batch scrape tests", () => {
       }, 180000);
     });
   }
+
+  it.concurrent("sourceURL stays unnormalized", async () => {
+    const response = await batchScrape({
+      urls: ["https://firecrawl.dev/?pagewanted=all&et_blog"],
+    });
+
+    expect(response.body.data[0].metadata.sourceURL).toBe("https://firecrawl.dev/?pagewanted=all&et_blog");
+  }, 35000);
 });
@@ -366,4 +366,21 @@ describe("Scrape tests", () => {
       }, 30000);
     });
   }
+
+  it.concurrent("sourceURL stays unnormalized", async () => {
+    const response = await scrape({
+      url: "https://firecrawl.dev/?pagewanted=all&et_blog",
+    });
+
+    expect(response.metadata.sourceURL).toBe("https://firecrawl.dev/?pagewanted=all&et_blog");
+  }, 30000);
+
+  it.concurrent("application/json content type is markdownified properly", async () => {
+    const response = await scrape({
+      url: "https://jsonplaceholder.typicode.com/todos/1",
+      formats: ["markdown"],
+    });
+
+    expect(response.markdown).toContain("```json");
+  }, 30000);
 });
@@ -1,4 +1,4 @@
-import { redisConnection } from "../../../services/queue-service";
+import { redisEvictConnection } from "../../../services/redis";
 import { supabase_service } from "../../../services/supabase";
 import { logger as _logger } from "../../../lib/logger";
 import { Request, Response } from "express";
@@ -10,7 +10,7 @@ async function cclog() {
 
   let cursor = 0;
   do {
-    const result = await redisConnection.scan(cursor, "MATCH", "concurrency-limiter:*", "COUNT", 100000);
+    const result = await redisEvictConnection.scan(cursor, "MATCH", "concurrency-limiter:*", "COUNT", 100000);
     cursor = parseInt(result[0], 10);
     const usable = result[1].filter(x => !x.includes("preview_"));
 
@@ -25,7 +25,7 @@ async function cclog() {
 
     for (const x of usable) {
       const at = new Date();
-      const concurrency = await redisConnection.zrangebyscore(x, Date.now(), Infinity);
+      const concurrency = await redisEvictConnection.zrangebyscore(x, Date.now(), Infinity);
       if (concurrency) {
         entries.push({
           team_id: x.split(":")[1],
@@ -6,7 +6,7 @@ import { logger } from "../../../src/lib/logger";
 import { getCrawl, saveCrawl } from "../../../src/lib/crawl-redis";
 import * as Sentry from "@sentry/node";
 import { configDotenv } from "dotenv";
-import { redisConnection } from "../../services/queue-service";
+import { redisEvictConnection } from "../../../src/services/redis";
 configDotenv();
 
 export async function crawlCancelController(req: Request, res: Response) {
@@ -20,7 +20,7 @@ export async function crawlCancelController(req: Request, res: Response) {
 
   const { team_id } = auth;
 
-  redisConnection.sadd("teams_using_v0", team_id)
+  redisEvictConnection.sadd("teams_using_v0", team_id)
     .catch(error => logger.error("Failed to add team to teams_using_v0", { error, team_id }));
 
   const sc = await getCrawl(req.params.jobId);
@@ -1,7 +1,8 @@
 import { Request, Response } from "express";
 import { authenticateUser } from "../auth";
 import { RateLimiterMode } from "../../../src/types";
-import { getScrapeQueue, redisConnection } from "../../../src/services/queue-service";
+import { getScrapeQueue } from "../../../src/services/queue-service";
+import { redisEvictConnection } from "../../../src/services/redis";
 import { logger } from "../../../src/lib/logger";
 import { getCrawl, getCrawlJobs } from "../../../src/lib/crawl-redis";
 import { supabaseGetJobsByCrawlId } from "../../../src/lib/supabase-jobs";
@@ -80,7 +81,7 @@ export async function crawlStatusController(req: Request, res: Response) {
 
   const { team_id } = auth;
 
-  redisConnection.sadd("teams_using_v0", team_id)
+  redisEvictConnection.sadd("teams_using_v0", team_id)
     .catch(error => logger.error("Failed to add team to teams_using_v0", { error, team_id }));
 
   const sc = await getCrawl(req.params.jobId);
@@ -24,7 +24,7 @@ import {
   saveCrawl,
   StoredCrawl,
 } from "../../../src/lib/crawl-redis";
-import { getScrapeQueue, redisConnection } from "../../../src/services/queue-service";
+import { redisEvictConnection } from "../../../src/services/redis";
 import { checkAndUpdateURL } from "../../../src/lib/validateUrl";
 import * as Sentry from "@sentry/node";
 import { getJobPriority } from "../../lib/job-priority";
@@ -41,7 +41,7 @@ export async function crawlController(req: Request, res: Response) {
 
   const { team_id, chunk } = auth;
 
-  redisConnection.sadd("teams_using_v0", team_id)
+  redisEvictConnection.sadd("teams_using_v0", team_id)
    .catch(error => logger.error("Failed to add team to teams_using_v0", { error, team_id }));
 
   if (req.headers["x-idempotency-key"]) {
@@ -2,7 +2,7 @@ import { AuthResponse, RateLimiterMode } from "../../types";
 
 import { Request, Response } from "express";
 import { authenticateUser } from "../auth";
-import { redisConnection } from "../../services/queue-service";
+import { redisEvictConnection } from "../../../src/services/redis";
 import { logger } from "../../lib/logger";
 
 export const keyAuthController = async (req: Request, res: Response) => {
@@ -13,7 +13,7 @@ export const keyAuthController = async (req: Request, res: Response) => {
     return res.status(auth.status).json({ error: auth.error });
   }
 
-  redisConnection.sadd("teams_using_v0", auth.team_id)
+  redisEvictConnection.sadd("teams_using_v0", auth.team_id)
     .catch(error => logger.error("Failed to add team to teams_using_v0", { error, team_id: auth.team_id }));
 
   // if success, return success: true
@@ -21,7 +21,8 @@ import {
   defaultOrigin,
 } from "../../lib/default-values";
 import { addScrapeJob, waitForJob } from "../../services/queue-jobs";
-import { getScrapeQueue, redisConnection } from "../../services/queue-service";
+import { getScrapeQueue } from "../../services/queue-service";
+import { redisEvictConnection } from "../../../src/services/redis";
 import { v4 as uuidv4 } from "uuid";
 import { logger } from "../../lib/logger";
 import * as Sentry from "@sentry/node";
@@ -184,7 +185,7 @@ export async function scrapeController(req: Request, res: Response) {
 
   const { team_id, chunk } = auth;
 
-  redisConnection.sadd("teams_using_v0", team_id)
+  redisEvictConnection.sadd("teams_using_v0", team_id)
     .catch(error => logger.error("Failed to add team to teams_using_v0", { error, team_id }));
 
   const crawlerOptions = req.body.crawlerOptions ?? {};
@@ -11,7 +11,8 @@ import { search } from "../../search";
 import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
 import { v4 as uuidv4 } from "uuid";
 import { logger } from "../../lib/logger";
-import { getScrapeQueue, redisConnection } from "../../services/queue-service";
+import { getScrapeQueue } from "../../services/queue-service";
+import { redisEvictConnection } from "../../../src/services/redis";
 import { addScrapeJob, waitForJob } from "../../services/queue-jobs";
 import * as Sentry from "@sentry/node";
 import { getJobPriority } from "../../lib/job-priority";
@@ -167,7 +168,7 @@ export async function searchController(req: Request, res: Response) {
   }
   const { team_id, chunk } = auth;
 
-  redisConnection.sadd("teams_using_v0", team_id)
+  redisEvictConnection.sadd("teams_using_v0", team_id)
     .catch(error => logger.error("Failed to add team to teams_using_v0", { error, team_id }));
 
   const crawlerOptions = req.body.crawlerOptions ?? {};
@@ -22,7 +22,6 @@ import { getJobPriority } from "../../lib/job-priority";
 import { addScrapeJobs } from "../../services/queue-jobs";
 import { callWebhook } from "../../services/webhook";
 import { logger as _logger } from "../../lib/logger";
-import { CostTracking } from "../../lib/extract/extraction-service";
 import { BLOCKLISTED_URL_MESSAGE } from "../../lib/strings";
 import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
 
@@ -30,6 +29,8 @@ export async function batchScrapeController(
   req: RequestWithAuth<{}, BatchScrapeResponse, BatchScrapeRequest>,
   res: Response<BatchScrapeResponse>,
 ) {
+  const preNormalizedBody = { ...req.body };
+
   if (req.body?.ignoreInvalidURLs === true) {
     req.body = batchScrapeRequestSchemaNoURLValidation.parse(req.body);
   } else {
@@ -46,6 +47,7 @@ export async function batchScrapeController(
   });
 
   let urls = req.body.urls;
+  let unnormalizedURLs = preNormalizedBody.urls;
   let invalidURLs: string[] | undefined = undefined;
 
   if (req.body.ignoreInvalidURLs) {
@@ -53,11 +55,13 @@ export async function batchScrapeController(
 
     let pendingURLs = urls;
     urls = [];
+    unnormalizedURLs = [];
     for (const u of pendingURLs) {
       try {
         const nu = urlSchema.parse(u);
         if (!isUrlBlocked(nu, req.acuc?.flags ?? null)) {
           urls.push(nu);
+          unnormalizedURLs.push(u);
         } else {
           invalidURLs.push(u);
         }
@@ -86,12 +90,6 @@ export async function batchScrapeController(
     await logCrawl(id, req.auth.team_id);
   }
 
-  let { remainingCredits } = req.account!;
-  const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === "true";
-  if (!useDbAuthentication) {
-    remainingCredits = Infinity;
-  }
-
   const sc: StoredCrawl = req.body.appendToId
     ? ((await getCrawl(req.body.appendToId)) as StoredCrawl)
     : {
@@ -127,7 +125,7 @@ export async function batchScrapeController(
   delete (scrapeOptions as any).urls;
   delete (scrapeOptions as any).appendToId;
 
-  const jobs = urls.map((x) => {
+  const jobs = urls.map((x, i) => {
     return {
       data: {
         url: x,
@@ -142,6 +140,7 @@ export async function batchScrapeController(
         webhook: req.body.webhook,
         internalOptions: {
           saveScrapeResultToGCS: process.env.GCS_FIRE_ENGINE_BUCKET_NAME ? true : false,
+          unnormalizedSourceURL: unnormalizedURLs[i],
         },
       },
       opts: {
@@ -4,7 +4,7 @@ import {
   RequestWithAuth,
 } from "./types";
 import { Response } from "express";
-import { redisConnection } from "../../services/queue-service";
+import { redisEvictConnection } from "../../../src/services/redis";
 
 // Basically just middleware and error wrapping
 export async function concurrencyCheckController(
@@ -13,7 +13,7 @@ export async function concurrencyCheckController(
 ) {
   const concurrencyLimiterKey = "concurrency-limiter:" + req.auth.team_id;
   const now = Date.now();
-  const activeJobsOfTeam = await redisConnection.zrangebyscore(
+  const activeJobsOfTeam = await redisEvictConnection.zrangebyscore(
     concurrencyLimiterKey,
     now,
     Infinity,
@@ -8,7 +8,8 @@ import {
   getCrawl,
   getCrawlJobs,
 } from "../../lib/crawl-redis";
-import { getScrapeQueue, redisConnection } from "../../services/queue-service";
+import { getScrapeQueue } from "../../services/queue-service";
+import { redisEvictConnection } from "../../../src/services/redis";
 import { configDotenv } from "dotenv";
 import { Job } from "bullmq";
 configDotenv();
@@ -65,7 +66,7 @@ export async function crawlErrorsController(
       url: x.data.url,
       error: x.failedReason,
     })),
-    robotsBlocked: await redisConnection.smembers(
+    robotsBlocked: await redisEvictConnection.smembers(
      "crawl:" + req.params.jobId + ":robots_blocked",
    ),
  });
@@ -33,11 +33,12 @@ export type PseudoJob<T> = {
   timestamp: number,
   data: {
     scrapeOptions: any,
+    teamId?: string,
   },
   failedReason?: string,
 }
 
-export type DBJob = { docs: any, success: boolean, page_options: any, date_added: any, message: string | null }
+export type DBJob = { docs: any, success: boolean, page_options: any, date_added: any, message: string | null, team_id: string}
 
 export async function getJob(id: string): Promise<PseudoJob<any> | null> {
   const [bullJob, dbJob, gcsJob] = await Promise.all([
@@ -18,10 +18,11 @@ export async function getExtractJob(id: string): Promise<PseudoJob<ExtractResult
 
   const job: PseudoJob<any> = {
     id,
-    getState: bullJob ? bullJob.getState : (() => dbJob!.success ? "completed" : "failed"),
+    getState: bullJob ? bullJob.getState.bind(bullJob) : (() => dbJob!.success ? "completed" : "failed"),
     returnvalue: data,
     data: {
       scrapeOptions: bullJob ? bullJob.data.scrapeOptions : dbJob!.page_options,
+      teamId: bullJob ? bullJob.data.teamId : dbJob!.team_id,
     },
     timestamp: bullJob ? bullJob.timestamp : new Date(dbJob!.date_added).valueOf(),
     failedReason: (bullJob ? bullJob.failedReason : dbJob!.message) || undefined,
@@ -36,7 +37,9 @@ export async function extractStatusController(
 ) {
   const extract = await getExtract(req.params.jobId);
 
-  if (!extract) {
+  let status = extract?.status;
+
+  if (extract && extract.team_id !== req.auth.team_id) {
     return res.status(404).json({
       success: false,
       error: "Extract job not found",
@@ -45,34 +48,46 @@ export async function extractStatusController(
 
   let data: ExtractResult | [] = [];
 
-  if (extract.status === "completed") {
+  if (!extract || extract.status === "completed") {
     const jobData = await getExtractJob(req.params.jobId);
-    if (!jobData) {
+    if ((!jobData && !extract) || (jobData && jobData.data.teamId !== req.auth.team_id)) {
       return res.status(404).json({
         success: false,
-        error: "Job not found",
+        error: "Extract job not found",
       });
     }
 
-    if (!jobData.returnvalue) {
+    if (jobData) {
+      const jobStatus = await jobData.getState();
+
+      if (jobStatus === "completed") {
+        status = "completed";
+      } else if (jobStatus === "failed") {
+        status = "failed";
+      } else {
+        status = "processing";
+      }
+    }
+
+    if (!jobData?.returnvalue) {
       // if we got in the split-second where the redis is updated but the bull isn't
       // just pretend it's still processing - MG
-      extract.status = "processing";
+      status = "processing";
     } else {
       data = jobData.returnvalue ?? [];
     }
   }
 
   return res.status(200).json({
-    success: extract.status === "failed" ? false : true,
+    success: status === "failed" ? false : true,
     data,
-    status: extract.status,
+    status,
     error: extract?.error ?? undefined,
     expiresAt: (await getExtractExpiry(req.params.jobId)).toISOString(),
-    steps: extract.showSteps ? extract.steps : undefined,
-    llmUsage: extract.showLLMUsage ? extract.llmUsage : undefined,
-    sources: extract.showSources ? extract.sources : undefined,
-    costTracking: extract.showCostTracking ? extract.costTracking : undefined,
-    sessionIds: extract.sessionIds ? extract.sessionIds : undefined,
+    steps: extract?.showSteps ? extract.steps : undefined,
+    llmUsage: extract?.showLLMUsage ? extract.llmUsage : undefined,
+    sources: extract?.showSources ? extract.sources : undefined,
+    costTracking: extract?.showCostTracking ? extract.costTracking : undefined,
+    sessionIds: extract?.sessionIds ? extract.sessionIds : undefined,
   });
 }
@@ -1,9 +1,18 @@
 import { Response } from "express";
 import { supabaseGetJobByIdOnlyData } from "../../lib/supabase-jobs";
 import { getJob } from "./crawl-status";
+import { logger as _logger } from "../../lib/logger";
 
 export async function scrapeStatusController(req: any, res: any) {
-  const job = await supabaseGetJobByIdOnlyData(req.params.jobId);
+  const logger = _logger.child({
+    module: "scrape-status",
+    method: "scrapeStatusController",
+    teamId: req.auth.team_id,
+    jobId: req.params.jobId,
+    scrapeId: req.params.jobId,
+  });
+
+  const job = await supabaseGetJobByIdOnlyData(req.params.jobId, logger);
 
   if (!job) {
     return res.status(404).json({
@@ -52,6 +52,7 @@ export async function scrapeController(
     internalOptions: {
       teamId: req.auth.team_id,
       saveScrapeResultToGCS: process.env.GCS_FIRE_ENGINE_BUCKET_NAME ? true : false,
+      unnormalizedSourceURL: preNormalizedBody.url,
     },
     origin: req.body.origin,
     is_scrape: true,
@@ -750,6 +750,7 @@ export type Document = {
     scrapeId?: string;
     error?: string;
     numPages?: number;
+    contentType?: string;
     proxyUsed: "basic" | "stealth";
     // [key: string]: string | string[] | number | { smartScrape: number; other: number; total: number } | undefined;
   };
@@ -1,4 +1,4 @@
-import { redisConnection } from "../services/queue-service";
+import { redisEvictConnection } from "../services/redis";
 import type { JobsOptions } from "bullmq";
 
 const constructKey = (team_id: string) => "concurrency-limiter:" + team_id;
@@ -12,14 +12,14 @@ export async function cleanOldConcurrencyLimitEntries(
   team_id: string,
   now: number = Date.now(),
 ) {
-  await redisConnection.zremrangebyscore(constructKey(team_id), -Infinity, now);
+  await redisEvictConnection.zremrangebyscore(constructKey(team_id), -Infinity, now);
 }
 
 export async function getConcurrencyLimitActiveJobs(
   team_id: string,
   now: number = Date.now(),
 ): Promise<string[]> {
-  return await redisConnection.zrangebyscore(
+  return await redisEvictConnection.zrangebyscore(
     constructKey(team_id),
     now,
     Infinity,
@@ -32,7 +32,7 @@ export async function pushConcurrencyLimitActiveJob(
   timeout: number,
   now: number = Date.now(),
 ) {
-  await redisConnection.zadd(
+  await redisEvictConnection.zadd(
     constructKey(team_id),
     now + timeout,
     id,
@@ -43,7 +43,7 @@ export async function removeConcurrencyLimitActiveJob(
   team_id: string,
   id: string,
 ) {
-  await redisConnection.zrem(constructKey(team_id), id);
+  await redisEvictConnection.zrem(constructKey(team_id), id);
 }
 
 export type ConcurrencyLimitedJob = {
@@ -56,8 +56,8 @@ export type ConcurrencyLimitedJob = {
 export async function takeConcurrencyLimitedJob(
   team_id: string,
 ): Promise<ConcurrencyLimitedJob | null> {
-  await redisConnection.zremrangebyscore(constructQueueKey(team_id), -Infinity, Date.now());
-  const res = await redisConnection.zmpop(1, constructQueueKey(team_id), "MIN");
+  await redisEvictConnection.zremrangebyscore(constructQueueKey(team_id), -Infinity, Date.now());
+  const res = await redisEvictConnection.zmpop(1, constructQueueKey(team_id), "MIN");
   if (res === null || res === undefined) {
     return null;
   }
@@ -70,7 +70,7 @@ export async function pushConcurrencyLimitedJob(
   job: ConcurrencyLimitedJob,
   timeout: number,
 ) {
-  await redisConnection.zadd(
+  await redisEvictConnection.zadd(
     constructQueueKey(team_id),
     Date.now() + timeout,
     JSON.stringify(job),
@@ -80,11 +80,11 @@ export async function pushConcurrencyLimitedJob(
 export async function getConcurrencyLimitedJobs(
   team_id: string,
 ) {
-  return new Set((await redisConnection.zrange(constructQueueKey(team_id), 0, -1)).map(x => JSON.parse(x).id));
+  return new Set((await redisEvictConnection.zrange(constructQueueKey(team_id), 0, -1)).map(x => JSON.parse(x).id));
 }
 
 export async function getConcurrencyQueueJobsCount(team_id: string): Promise<number> {
-  const count = await redisConnection.zcard(constructQueueKey(team_id));
+  const count = await redisEvictConnection.zcard(constructQueueKey(team_id));
   return count;
 }
 
@@ -92,14 +92,14 @@ export async function cleanOldCrawlConcurrencyLimitEntries(
   crawl_id: string,
   now: number = Date.now(),
 ) {
-  await redisConnection.zremrangebyscore(constructCrawlKey(crawl_id), -Infinity, now);
+  await redisEvictConnection.zremrangebyscore(constructCrawlKey(crawl_id), -Infinity, now);
 }
 
 export async function getCrawlConcurrencyLimitActiveJobs(
   crawl_id: string,
   now: number = Date.now(),
 ): Promise<string[]> {
-  return await redisConnection.zrangebyscore(
+  return await redisEvictConnection.zrangebyscore(
     constructCrawlKey(crawl_id),
     now,
     Infinity,
@@ -112,7 +112,7 @@ export async function pushCrawlConcurrencyLimitActiveJob(
   timeout: number,
   now: number = Date.now(),
 ) {
-  await redisConnection.zadd(
+  await redisEvictConnection.zadd(
     constructCrawlKey(crawl_id),
     now + timeout,
     id,
@@ -123,13 +123,13 @@ export async function removeCrawlConcurrencyLimitActiveJob(
   crawl_id: string,
   id: string,
 ) {
-  await redisConnection.zrem(constructCrawlKey(crawl_id), id);
+  await redisEvictConnection.zrem(constructCrawlKey(crawl_id), id);
 }
 
 export async function takeCrawlConcurrencyLimitedJob(
   crawl_id: string,
 ): Promise<ConcurrencyLimitedJob | null> {
-  const res = await redisConnection.zmpop(1, constructCrawlQueueKey(crawl_id), "MIN");
+  const res = await redisEvictConnection.zmpop(1, constructCrawlQueueKey(crawl_id), "MIN");
   if (res === null || res === undefined) {
     return null;
   }
@@ -140,7 +140,7 @@ export async function pushCrawlConcurrencyLimitedJob(
   crawl_id: string,
   job: ConcurrencyLimitedJob,
 ) {
-  await redisConnection.zadd(
+  await redisEvictConnection.zadd(
     constructCrawlQueueKey(crawl_id),
     job.priority ?? 1,
     JSON.stringify(job),
@@ -150,10 +150,10 @@ export async function pushCrawlConcurrencyLimitedJob(
 export async function getCrawlConcurrencyLimitedJobs(
   crawl_id: string,
 ) {
-  return new Set((await redisConnection.zrange(constructCrawlQueueKey(crawl_id), 0, -1)).map(x => JSON.parse(x).id));
+  return new Set((await redisEvictConnection.zrange(constructCrawlQueueKey(crawl_id), 0, -1)).map(x => JSON.parse(x).id));
 }
 
 export async function getCrawlConcurrencyQueueJobsCount(crawl_id: string): Promise<number> {
-  const count = await redisConnection.zcard(constructCrawlQueueKey(crawl_id));
+  const count = await redisEvictConnection.zcard(constructCrawlQueueKey(crawl_id));
   return count;
 }
@@ -1,7 +1,7 @@
 import { InternalOptions } from "../scraper/scrapeURL";
 import { ScrapeOptions, TeamFlags } from "../controllers/v1/types";
 import { WebCrawler } from "../scraper/WebScraper/crawler";
-import { redisConnection } from "../services/queue-service";
+import { redisEvictConnection } from "../services/redis";
 import { logger as _logger } from "./logger";
 import { getAdjustedMaxDepth } from "../scraper/WebScraper/utils/maxDepthUtils";
 
@@ -24,24 +24,24 @@ export async function saveCrawl(id: string, crawl: StoredCrawl) {
     crawlId: id,
     teamId: crawl.team_id,
   });
-  await redisConnection.set("crawl:" + id, JSON.stringify(crawl));
-  await redisConnection.expire("crawl:" + id, 24 * 60 * 60);
+  await redisEvictConnection.set("crawl:" + id, JSON.stringify(crawl));
+  await redisEvictConnection.expire("crawl:" + id, 24 * 60 * 60);
 }
 
 export async function getCrawl(id: string): Promise<StoredCrawl | null> {
-  const x = await redisConnection.get("crawl:" + id);
+  const x = await redisEvictConnection.get("crawl:" + id);
 
   if (x === null) {
     return null;
   }
 
-  await redisConnection.expire("crawl:" + id, 24 * 60 * 60);
+  await redisEvictConnection.expire("crawl:" + id, 24 * 60 * 60);
   return JSON.parse(x);
 }
 
 export async function getCrawlExpiry(id: string): Promise<Date> {
   const d = new Date();
-  const ttl = await redisConnection.pttl("crawl:" + id);
+  const ttl = await redisEvictConnection.pttl("crawl:" + id);
   d.setMilliseconds(d.getMilliseconds() + ttl);
   d.setMilliseconds(0);
   return d;
@@ -54,8 +54,8 @@ export async function addCrawlJob(id: string, job_id: string) {
     method: "addCrawlJob",
     crawlId: id,
   });
-  await redisConnection.sadd("crawl:" + id + ":jobs", job_id);
-  await redisConnection.expire("crawl:" + id + ":jobs", 24 * 60 * 60);
+  await redisEvictConnection.sadd("crawl:" + id + ":jobs", job_id);
+  await redisEvictConnection.expire("crawl:" + id + ":jobs", 24 * 60 * 60);
 }
 
 export async function addCrawlJobs(id: string, job_ids: string[]) {
@@ -67,8 +67,8 @@ export async function addCrawlJobs(id: string, job_ids: string[]) {
     method: "addCrawlJobs",
     crawlId: id,
   });
-  await redisConnection.sadd("crawl:" + id + ":jobs", ...job_ids);
-  await redisConnection.expire("crawl:" + id + ":jobs", 24 * 60 * 60);
+  await redisEvictConnection.sadd("crawl:" + id + ":jobs", ...job_ids);
+  await redisEvictConnection.expire("crawl:" + id + ":jobs", 24 * 60 * 60);
 }
 
 export async function addCrawlJobDone(
@@ -82,32 +82,32 @@ export async function addCrawlJobDone(
     method: "addCrawlJobDone",
     crawlId: id,
   });
-  await redisConnection.sadd("crawl:" + id + ":jobs_done", job_id);
-  await redisConnection.expire(
+  await redisEvictConnection.sadd("crawl:" + id + ":jobs_done", job_id);
+  await redisEvictConnection.expire(
     "crawl:" + id + ":jobs_done",
     24 * 60 * 60,
   );
 
   if (success) {
-    await redisConnection.rpush("crawl:" + id + ":jobs_done_ordered", job_id);
+    await redisEvictConnection.rpush("crawl:" + id + ":jobs_done_ordered", job_id);
   } else {
     // in case it's already been pushed, make sure it's removed
-    await redisConnection.lrem(
+    await redisEvictConnection.lrem(
       "crawl:" + id + ":jobs_done_ordered",
       -1,
       job_id,
    );
  }
 
-  await redisConnection.expire(
+  await redisEvictConnection.expire(
    "crawl:" + id + ":jobs_done_ordered",
    24 * 60 * 60,
  );
 }
 
 export async function getDoneJobsOrderedLength(id: string): Promise<number> {
-  await redisConnection.expire("crawl:" + id + ":jobs_done_ordered", 24 * 60 * 60);
-  return await redisConnection.llen("crawl:" + id + ":jobs_done_ordered");
+  await redisEvictConnection.expire("crawl:" + id + ":jobs_done_ordered", 24 * 60 * 60);
+  return await redisEvictConnection.llen("crawl:" + id + ":jobs_done_ordered");
 }
 
 export async function getDoneJobsOrdered(
@@ -115,8 +115,8 @@ export async function getDoneJobsOrdered(
   start = 0,
   end = -1,
 ): Promise<string[]> {
-  await redisConnection.expire("crawl:" + id + ":jobs_done_ordered", 24 * 60 * 60);
-  return await redisConnection.lrange(
+  await redisEvictConnection.expire("crawl:" + id + ":jobs_done_ordered", 24 * 60 * 60);
+  return await redisEvictConnection.lrange(
     "crawl:" + id + ":jobs_done_ordered",
     start,
     end,
@@ -124,27 +124,27 @@ export async function getDoneJobsOrdered(
 }
 
 export async function isCrawlFinished(id: string) {
-  await redisConnection.expire("crawl:" + id + ":kickoff:finish", 24 * 60 * 60);
+  await redisEvictConnection.expire("crawl:" + id + ":kickoff:finish", 24 * 60 * 60);
   return (
-    (await redisConnection.scard("crawl:" + id + ":jobs_done")) ===
-    (await redisConnection.scard("crawl:" + id + ":jobs")) &&
-    (await redisConnection.get("crawl:" + id + ":kickoff:finish")) !== null
+    (await redisEvictConnection.scard("crawl:" + id + ":jobs_done")) ===
+    (await redisEvictConnection.scard("crawl:" + id + ":jobs")) &&
+    (await redisEvictConnection.get("crawl:" + id + ":kickoff:finish")) !== null
   );
 }
 
 export async function isCrawlKickoffFinished(id: string) {
-  await redisConnection.expire("crawl:" + id + ":kickoff:finish", 24 * 60 * 60);
+  await redisEvictConnection.expire("crawl:" + id + ":kickoff:finish", 24 * 60 * 60);
   return (
-    (await redisConnection.get("crawl:" + id + ":kickoff:finish")) !== null
+    (await redisEvictConnection.get("crawl:" + id + ":kickoff:finish")) !== null
   );
 }
 
 export async function isCrawlFinishedLocked(id: string) {
-  return await redisConnection.exists("crawl:" + id + ":finish");
+  return await redisEvictConnection.exists("crawl:" + id + ":finish");
 }
 
 export async function finishCrawlKickoff(id: string) {
-  await redisConnection.set(
+  await redisEvictConnection.set(
     "crawl:" + id + ":kickoff:finish",
     "yes",
     "EX",
@@ -159,19 +159,19 @@ export async function finishCrawlPre(id: string) {
       method: "finishCrawlPre",
       crawlId: id,
     });
-    const set = await redisConnection.setnx("crawl:" + id + ":finished_pre", "yes");
-    await redisConnection.expire("crawl:" + id + ":finished_pre", 24 * 60 * 60);
+    const set = await redisEvictConnection.setnx("crawl:" + id + ":finished_pre", "yes");
+    await redisEvictConnection.expire("crawl:" + id + ":finished_pre", 24 * 60 * 60);
     return set === 1;
   } else {
-    _logger.debug("Crawl can not be pre-finished yet, not marking as finished.", {
-      module: "crawl-redis",
-      method: "finishCrawlPre",
-      crawlId: id,
-      jobs_done: await redisConnection.scard("crawl:" + id + ":jobs_done"),
-      jobs: await redisConnection.scard("crawl:" + id + ":jobs"),
-      kickoff_finished:
-        (await redisConnection.get("crawl:" + id + ":kickoff:finish")) !== null,
-    });
+    // _logger.debug("Crawl can not be pre-finished yet, not marking as finished.", {
+    //   module: "crawl-redis",
+    //   method: "finishCrawlPre",
+    //   crawlId: id,
+    //   jobs_done: await redisEvictConnection.scard("crawl:" + id + ":jobs_done"),
+    //   jobs: await redisEvictConnection.scard("crawl:" + id + ":jobs"),
+    //   kickoff_finished:
+    //     (await redisEvictConnection.get("crawl:" + id + ":kickoff:finish")) !== null,
+    // });
   }
 }
 
@@ -181,16 +181,16 @@ export async function finishCrawl(id: string) {
     method: "finishCrawl",
     crawlId: id,
   });
-  await redisConnection.set("crawl:" + id + ":finish", "yes");
-  await redisConnection.expire("crawl:" + id + ":finish", 24 * 60 * 60);
+  await redisEvictConnection.set("crawl:" + id + ":finish", "yes");
+  await redisEvictConnection.expire("crawl:" + id + ":finish", 24 * 60 * 60);
 }
 
 export async function getCrawlJobs(id: string): Promise<string[]> {
-  return await redisConnection.smembers("crawl:" + id + ":jobs");
+  return await redisEvictConnection.smembers("crawl:" + id + ":jobs");
 }
 
 export async function getCrawlJobCount(id: string): Promise<number> {
-  return await redisConnection.scard("crawl:" + id + ":jobs");
+  return await redisEvictConnection.scard("crawl:" + id + ":jobs");
 }
 
 export function normalizeURL(url: string, sc: StoredCrawl): string {
@@ -276,12 +276,12 @@ export async function lockURL(
 
   if (typeof sc.crawlerOptions?.limit === "number") {
     if (
-      (await redisConnection.scard("crawl:" + id + ":visited_unique")) >=
+      (await redisEvictConnection.scard("crawl:" + id + ":visited_unique")) >=
       sc.crawlerOptions.limit
     ) {
-      logger.debug(
-        "Crawl has already hit visited_unique limit, not locking URL.",
-      );
+      // logger.debug(
+      //   "Crawl has already hit visited_unique limit, not locking URL.",
+      // );
       return false;
     }
   }
@@ -291,22 +291,22 @@ export async function lockURL(
 
   let res: boolean;
   if (!sc.crawlerOptions?.deduplicateSimilarURLs) {
-    res = (await redisConnection.sadd("crawl:" + id + ":visited", url)) !== 0;
+    res = (await redisEvictConnection.sadd("crawl:" + id + ":visited", url)) !== 0;
   } else {
     const permutations = generateURLPermutations(url).map((x) => x.href);
     // logger.debug("Adding URL permutations for URL " + JSON.stringify(url) + "...", { permutations });
-    const x = await redisConnection.sadd(
+    const x = await redisEvictConnection.sadd(
      "crawl:" + id + ":visited",
      ...permutations,
    );
    res = x === permutations.length;
  }
 
-  await redisConnection.expire("crawl:" + id + ":visited", 24 * 60 * 60);
+  await redisEvictConnection.expire("crawl:" + id + ":visited", 24 * 60 * 60);
 
   if (res) {
-    await redisConnection.sadd("crawl:" + id + ":visited_unique", url);
-    await redisConnection.expire(
+    await redisEvictConnection.sadd("crawl:" + id + ":visited_unique", url);
+    await redisEvictConnection.expire(
      "crawl:" + id + ":visited_unique",
      24 * 60 * 60,
    );
@@ -336,29 +336,29 @@ export async function lockURLs(
 
   // Add to visited_unique set
   logger.debug("Locking " + urls.length + " URLs...");
-  await redisConnection.sadd("crawl:" + id + ":visited_unique", ...urls);
-  await redisConnection.expire(
+  await redisEvictConnection.sadd("crawl:" + id + ":visited_unique", ...urls);
+  await redisEvictConnection.expire(
    "crawl:" + id + ":visited_unique",
    24 * 60 * 60,
  );
 
   let res: boolean;
   if (!sc.crawlerOptions?.deduplicateSimilarURLs) {
-    const x = await redisConnection.sadd("crawl:" + id + ":visited", ...urls);
+    const x = await redisEvictConnection.sadd("crawl:" + id + ":visited", ...urls);
    res = x === urls.length;
  } else {
    const allPermutations = urls.flatMap((url) =>
      generateURLPermutations(url).map((x) => x.href),
    );
    logger.debug("Adding " + allPermutations.length + " URL permutations...");
-    const x = await redisConnection.sadd(
+    const x = await redisEvictConnection.sadd(
      "crawl:" + id + ":visited",
      ...allPermutations,
    );
    res = x === allPermutations.length;
  }
 
-  await redisConnection.expire("crawl:" + id + ":visited", 24 * 60 * 60);
+  await redisEvictConnection.expire("crawl:" + id + ":visited", 24 * 60 * 60);
 
   logger.debug("lockURLs final result: " + res, { res });
   return res;
@@ -1,4 +1,4 @@
-import { redisConnection } from "../../services/queue-service";
+import { redisEvictConnection } from "../../services/redis";
 import { logger as _logger } from "../logger";
 
 export enum DeepResearchStep {
@@ -52,12 +52,12 @@ const DEEP_RESEARCH_TTL = 6 * 60 * 60;
 
 export async function saveDeepResearch(id: string, research: StoredDeepResearch) {
   _logger.debug("Saving deep research " + id + " to Redis...");
-  await redisConnection.set("deep-research:" + id, JSON.stringify(research));
-  await redisConnection.expire("deep-research:" + id, DEEP_RESEARCH_TTL);
+  await redisEvictConnection.set("deep-research:" + id, JSON.stringify(research));
+  await redisEvictConnection.expire("deep-research:" + id, DEEP_RESEARCH_TTL);
 }
 
 export async function getDeepResearch(id: string): Promise<StoredDeepResearch | null> {
-  const x = await redisConnection.get("deep-research:" + id);
+  const x = await redisEvictConnection.get("deep-research:" + id);
   return x ? JSON.parse(x) : null;
 }
 
@@ -91,13 +91,13 @@ export async function updateDeepResearch(
 
 
 
-  await redisConnection.set("deep-research:" + id, JSON.stringify(updatedResearch));
-  await redisConnection.expire("deep-research:" + id, DEEP_RESEARCH_TTL);
+  await redisEvictConnection.set("deep-research:" + id, JSON.stringify(updatedResearch));
+  await redisEvictConnection.expire("deep-research:" + id, DEEP_RESEARCH_TTL);
 }
 
 export async function getDeepResearchExpiry(id: string): Promise<Date> {
   const d = new Date();
-  const ttl = await redisConnection.pttl("deep-research:" + id);
+  const ttl = await redisEvictConnection.pttl("deep-research:" + id);
   d.setMilliseconds(d.getMilliseconds() + ttl);
   d.setMilliseconds(0);
   return d;
@@ -1,4 +1,4 @@
-import { redisConnection } from "../../services/queue-service";
+import { redisEvictConnection } from "../../services/redis";
 import { logger as _logger } from "../logger";
 import { CostTracking } from "./extraction-service";
 
@@ -61,12 +61,12 @@ export async function saveExtract(id: string, extract: StoredExtract) {
       discoveredLinks: step.discoveredLinks?.slice(0, STEPS_MAX_DISCOVERED_LINKS)
     }))
   };
-  await redisConnection.set("extract:" + id, JSON.stringify(minimalExtract));
-  await redisConnection.expire("extract:" + id, EXTRACT_TTL);
+  await redisEvictConnection.set("extract:" + id, JSON.stringify(minimalExtract));
+  await redisEvictConnection.expire("extract:" + id, EXTRACT_TTL);
 }
 
 export async function getExtract(id: string): Promise<StoredExtract | null> {
-  const x = await redisConnection.get("extract:" + id);
+  const x = await redisEvictConnection.get("extract:" + id);
   return x ? JSON.parse(x) : null;
 }
 
@@ -111,13 +111,13 @@ export async function updateExtract(
 
   console.log(minimalExtract.sessionIds)
 
-  await redisConnection.set("extract:" + id, JSON.stringify(minimalExtract));
-  await redisConnection.expire("extract:" + id, EXTRACT_TTL);
+  await redisEvictConnection.set("extract:" + id, JSON.stringify(minimalExtract));
+  await redisEvictConnection.expire("extract:" + id, EXTRACT_TTL);
 }
 
 export async function getExtractExpiry(id: string): Promise<Date> {
   const d = new Date();
-  const ttl = await redisConnection.pttl("extract:" + id);
+  const ttl = await redisEvictConnection.pttl("extract:" + id);
   d.setMilliseconds(d.getMilliseconds() + ttl);
   d.setMilliseconds(0);
   return d;
@@ -105,9 +105,9 @@ export async function getJobFromGCS(jobId: string): Promise<Document[] | null> {
 
 // TODO: fix the any type (we have multiple Document types in the codebase)
 export async function getDocFromGCS(url: string): Promise<any | null> {
-  logger.info(`Getting f-engine document from GCS`, {
-    url,
-  });
+  // logger.info(`Getting f-engine document from GCS`, {
+  //   url,
+  // });
   try {
     if (!process.env.GCS_FIRE_ENGINE_BUCKET_NAME) {
       return null;
@@ -1,4 +1,4 @@
-import { redisConnection } from "../../services/queue-service";
+import { redisEvictConnection } from "../../services/redis";
 import { logger as _logger } from "../logger";
 
 export interface GenerationData {
@@ -20,12 +20,12 @@ const GENERATION_TTL = 24 * 60 * 60;
 
 export async function saveGeneratedLlmsTxt(id: string, data: GenerationData): Promise<void> {
   _logger.debug("Saving llmstxt generation " + id + " to Redis...");
-  await redisConnection.set("generation:" + id, JSON.stringify(data));
-  await redisConnection.expire("generation:" + id, GENERATION_TTL);
+  await redisEvictConnection.set("generation:" + id, JSON.stringify(data));
+  await redisEvictConnection.expire("generation:" + id, GENERATION_TTL);
 }
 
 export async function getGeneratedLlmsTxt(id: string): Promise<GenerationData | null> {
-  const x = await redisConnection.get("generation:" + id);
+  const x = await redisEvictConnection.get("generation:" + id);
   return x ? JSON.parse(x) : null;
 }
 
@@ -41,13 +41,13 @@ export async function updateGeneratedLlmsTxt(
     ...data
   };
 
-  await redisConnection.set("generation:" + id, JSON.stringify(updatedGeneration));
-  await redisConnection.expire("generation:" + id, GENERATION_TTL);
+  await redisEvictConnection.set("generation:" + id, JSON.stringify(updatedGeneration));
+  await redisEvictConnection.expire("generation:" + id, GENERATION_TTL);
 }
 
 export async function getGeneratedLlmsTxtExpiry(id: string): Promise<Date> {
   const d = new Date();
-  const ttl = await redisConnection.pttl("generation:" + id);
+  const ttl = await redisEvictConnection.pttl("generation:" + id);
   d.setMilliseconds(d.getMilliseconds() + ttl);
   d.setMilliseconds(0);
   return d;
@@ -1,6 +1,6 @@
 import { RateLimiterMode } from "../types";
 import { getACUC, getACUCTeam } from "../controllers/auth";
-import { redisConnection } from "../services/queue-service";
+import { redisEvictConnection } from "../services/redis";
 import { logger } from "./logger";
 
 const SET_KEY_PREFIX = "limit_team_id:";
@@ -9,10 +9,10 @@ export async function addJobPriority(team_id, job_id) {
     const setKey = SET_KEY_PREFIX + team_id;
 
     // Add scrape job id to the set
-    await redisConnection.sadd(setKey, job_id);
+    await redisEvictConnection.sadd(setKey, job_id);
 
     // This approach will reset the expiration time to 60 seconds every time a new job is added to the set.
-    await redisConnection.expire(setKey, 60);
+    await redisEvictConnection.expire(setKey, 60);
   } catch (e) {
     logger.error(`Add job priority (sadd) failed: ${team_id}, ${job_id}`);
   }
@@ -23,7 +23,7 @@ export async function deleteJobPriority(team_id, job_id) {
     const setKey = SET_KEY_PREFIX + team_id;
 
     // remove job_id from the set
-    await redisConnection.srem(setKey, job_id);
+    await redisEvictConnection.srem(setKey, job_id);
   } catch (e) {
     logger.error(`Delete job priority (srem) failed: ${team_id}, ${job_id}`);
   }
@@ -48,7 +48,7 @@ export async function getJobPriority({
     const setKey = SET_KEY_PREFIX + team_id;
 
     // Get the length of the set
-    const setLength = await redisConnection.scard(setKey);
+    const setLength = await redisEvictConnection.scard(setKey);
 
     // Determine the priority based on the plan and set length
     let planModifier = acuc?.plan_priority.planModifier ?? 1;
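The job-priority helpers above implement a rolling window with a Redis set per team: every SADD is followed by EXPIRE 60, so the set only survives while jobs keep arriving, and SCARD gives the number of jobs seen in the current window. A self-contained sketch of that sliding-window counter, assuming an ioredis client (the key prefix mirrors the one above; the rest is illustrative):

import IORedis from "ioredis";

const redis = new IORedis(process.env.REDIS_URL ?? "redis://localhost:6379");
const SET_KEY_PREFIX = "limit_team_id:";

async function trackJob(teamId: string, jobId: string): Promise<void> {
  const setKey = SET_KEY_PREFIX + teamId;
  await redis.sadd(setKey, jobId); // record this job in the team's set
  await redis.expire(setKey, 60);  // each new job pushes the expiry out again
}

async function recentJobCount(teamId: string): Promise<number> {
  // Cardinality of the set approximates how many jobs arrived within the last 60 seconds.
  return await redis.scard(SET_KEY_PREFIX + teamId);
}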
@@ -1,3 +1,4 @@
+import type { Logger } from "winston";
 import { supabase_rr_service, supabase_service } from "../services/supabase";
 import { logger } from "./logger";
 import * as Sentry from "@sentry/node";
@@ -73,7 +74,7 @@ export const supabaseGetJobsByCrawlId = async (crawlId: string) => {
   return data;
 };
 
-export const supabaseGetJobByIdOnlyData = async (jobId: string) => {
+export const supabaseGetJobByIdOnlyData = async (jobId: string, logger?: Logger) => {
   const { data, error } = await supabase_rr_service
     .from("firecrawl_jobs")
     .select("team_id")
@@ -81,6 +82,9 @@ export const supabaseGetJobByIdOnlyData = async (jobId: string) => {
     .single();
 
   if (error) {
+    if (logger) {
+      logger.error("Error in supabaseGetJobByIdOnlyData", { error });
+    }
     return null;
   }
 
@@ -7,7 +7,7 @@ import { getURLDepth } from "./utils/maxDepthUtils";
 import { axiosTimeout } from "../../lib/timeout";
 import { logger as _logger } from "../../lib/logger";
 import https from "https";
-import { redisConnection } from "../../services/queue-service";
+import { redisEvictConnection } from "../../services/redis";
 import { extractLinks } from "../../lib/html-transformer";
 import { TimeoutSignal } from "../../controllers/v1/types";
 export class WebCrawler {
@@ -287,7 +287,7 @@ export class WebCrawler {
         let uniqueURLs: string[] = [];
         for (const url of filteredLinks) {
           if (
-            await redisConnection.sadd(
+            await redisEvictConnection.sadd(
               "sitemap:" + this.jobId + ":links",
               normalizeUrl(url),
             )
@@ -296,7 +296,7 @@ export class WebCrawler {
           }
         }
 
-        await redisConnection.expire(
+        await redisEvictConnection.expire(
           "sitemap:" + this.jobId + ":links",
           3600,
           "NX",
@@ -324,7 +324,7 @@ export class WebCrawler {
 
       if (count > 0) {
         if (
-          await redisConnection.sadd(
+          await redisEvictConnection.sadd(
             "sitemap:" + this.jobId + ":links",
             normalizeUrl(this.initialUrl),
           )
@@ -334,6 +334,12 @@ export class WebCrawler {
          count++;
         }
 
+      await redisEvictConnection.expire(
+        "sitemap:" + this.jobId + ":links",
+        3600,
+        "NX",
+      );
+
       return count;
     } catch (error) {
       if (error.message === "Sitemap fetch timeout") {
@@ -384,11 +390,11 @@ export class WebCrawler {
       !this.isRobotsAllowed(fullUrl, this.ignoreRobotsTxt)
     ) {
       (async () => {
-        await redisConnection.sadd(
+        await redisEvictConnection.sadd(
           "crawl:" + this.jobId + ":robots_blocked",
           fullUrl,
         );
-        await redisConnection.expire(
+        await redisEvictConnection.expire(
           "crawl:" + this.jobId + ":robots_blocked",
           24 * 60 * 60,
         );
@@ -30,7 +30,7 @@ export async function scrapeURLWithFetch(
     url: string;
     body: string,
     status: number;
-    headers: any;
+    headers: [string, string][];
   };
 
   if (meta.mock !== null) {
@@ -117,5 +117,8 @@ export async function scrapeURLWithFetch(
     url: response.url,
     html: response.body,
     statusCode: response.status,
+    contentType: (response.headers.find(
+      (x) => x[0].toLowerCase() === "content-type",
+    ) ?? [])[1] ?? undefined,
   };
 }
@@ -245,18 +245,18 @@ export async function scrapeURLWithFireEngineChromeCDP(
     meta.options.formats.includes("screenshot") ||
     meta.options.formats.includes("screenshot@fullPage")
   ) {
-    meta.logger.debug(
-      "Transforming screenshots from actions into screenshot field",
-      { screenshots: response.screenshots },
-    );
+    // meta.logger.debug(
+    //   "Transforming screenshots from actions into screenshot field",
+    //   { screenshots: response.screenshots },
+    // );
     if (response.screenshots) {
       response.screenshot = response.screenshots.slice(-1)[0];
       response.screenshots = response.screenshots.slice(0, -1);
     }
-    meta.logger.debug("Screenshot transformation done", {
-      screenshots: response.screenshots,
-      screenshot: response.screenshot,
-    });
+    // meta.logger.debug("Screenshot transformation done", {
+    //   screenshots: response.screenshots,
+    //   screenshot: response.screenshot,
+    // });
   }
 
   if (!response.url) {
@@ -273,6 +273,10 @@ export async function scrapeURLWithFireEngineChromeCDP(
     error: response.pageError,
     statusCode: response.pageStatusCode,
 
+    contentType: (Object.entries(response.responseHeaders ?? {}).find(
+      (x) => x[0].toLowerCase() === "content-type",
+    ) ?? [])[1] ?? undefined,
+
     screenshot: response.screenshot,
     ...(actions.length > 0
       ? {
@@ -336,6 +340,10 @@ export async function scrapeURLWithFireEnginePlaywright(
     error: response.pageError,
     statusCode: response.pageStatusCode,
 
+    contentType: (Object.entries(response.responseHeaders ?? {}).find(
+      (x) => x[0].toLowerCase() === "content-type",
+    ) ?? [])[1] ?? undefined,
+
     ...(response.screenshots !== undefined && response.screenshots.length > 0
       ? {
           screenshot: response.screenshots[0],
@@ -391,5 +399,9 @@ export async function scrapeURLWithFireEngineTLSClient(
     html: response.content,
     error: response.pageError,
     statusCode: response.pageStatusCode,
+
+    contentType: (Object.entries(response.responseHeaders ?? {}).find(
+      (x) => x[0].toLowerCase() === "content-type",
+    ) ?? [])[1] ?? undefined,
   };
 }
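Each engine now reports the response content type with the same lookup: scan the headers case-insensitively for content-type and fall back to undefined when it is absent. The fetch engine receives headers as [name, value] pairs, while the fire-engine responses expose a plain header record, hence the Object.entries variant. A small sketch of both shapes (function names are illustrative, not part of the codebase):

// Headers as [name, value] pairs (the fetch engine shape).
function contentTypeFromPairs(headers: [string, string][]): string | undefined {
  return (headers.find((x) => x[0].toLowerCase() === "content-type") ?? [])[1] ?? undefined;
}

// Headers as a plain record (the fire-engine shape).
function contentTypeFromRecord(headers: Record<string, string> | undefined): string | undefined {
  return (Object.entries(headers ?? {}).find(
    (x) => x[0].toLowerCase() === "content-type",
  ) ?? [])[1] ?? undefined;
}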
@@ -111,6 +111,8 @@ export type EngineScrapeResult = {
   };
 
   numPages?: number;
+
+  contentType?: string;
 };
 
 const engineHandlers: {
@@ -383,9 +385,8 @@ export function buildFallbackList(meta: Meta): {
     if (cacheIndex !== -1) {
       _engines.splice(cacheIndex, 1);
     }
-  } else {
-    meta.logger.debug("Cache engine enabled by useCache option");
   }
 
   const prioritySum = [...meta.featureFlags].reduce(
     (a, x) => a + featureFlagOptions[x].priority,
     0,
@@ -424,24 +425,6 @@ export function buildFallbackList(meta: Meta): {
 
     if (supportScore >= priorityThreshold) {
       selectedEngines.push({ engine, supportScore, unsupportedFeatures });
-      meta.logger.debug(`Engine ${engine} meets feature priority threshold`, {
-        supportScore,
-        prioritySum,
-        priorityThreshold,
-        featureFlags: [...meta.featureFlags],
-        unsupportedFeatures,
-      });
-    } else {
-      meta.logger.debug(
-        `Engine ${engine} does not meet feature priority threshold`,
-        {
-          supportScore,
-          prioritySum,
-          priorityThreshold,
-          featureFlags: [...meta.featureFlags],
-          unsupportedFeatures,
-        },
-      );
     }
   }
 
@@ -459,6 +442,10 @@ export function buildFallbackList(meta: Meta): {
     );
   }
 
+  meta.logger.info("Selected engines", {
+    selectedEngines,
+  });
+
   return selectedEngines;
 }
 
@@ -182,6 +182,7 @@ export type InternalOptions = {
   fromCache?: boolean; // Indicates if the document was retrieved from cache
   abort?: AbortSignal;
   urlInvisibleInCurrentCrawl?: boolean;
+  unnormalizedSourceURL?: string;
 
   saveScrapeResultToGCS?: boolean; // Passed along to fire-engine
 };
@@ -373,11 +374,12 @@ async function scrapeURLLoop(meta: Meta): Promise<ScrapeUrlResponse> {
     screenshot: result.result.screenshot,
     actions: result.result.actions,
     metadata: {
-      sourceURL: meta.url,
+      sourceURL: meta.internalOptions.unnormalizedSourceURL ?? meta.url,
       url: result.result.url,
       statusCode: result.result.statusCode,
      error: result.result.error,
      numPages: result.result.numPages,
+      contentType: result.result.contentType,
       proxyUsed: meta.featureFlags.has("stealthProxy") ? "stealth" : "basic",
     },
   };
@@ -61,6 +61,17 @@ export async function deriveMarkdownFromHTML(
     );
   }
 
+  if (document.metadata.contentType?.includes("application/json")) {
+    if (document.rawHtml === undefined) {
+      throw new Error(
+        "rawHtml is undefined -- this transformer is being called out of order",
+      );
+    }
+
+    document.markdown = "```json\n" + document.rawHtml + "\n```";
+    return document;
+  }
+
   document.markdown = await parseMarkdown(document.html);
   return document;
 }
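The new branch short-circuits the HTML-to-markdown path when the detected content type is application/json: instead of running the body through the markdown parser, the raw JSON is wrapped in a fenced json code block. A rough standalone sketch of the transformation (the document shape is simplified relative to the real Document type):

type Doc = { rawHtml?: string; markdown?: string; metadata: { contentType?: string } };

function markdownifyJson(doc: Doc): Doc {
  if (doc.metadata.contentType?.includes("application/json")) {
    if (doc.rawHtml === undefined) {
      // Mirrors the guard above: the raw body must already be populated.
      throw new Error("rawHtml is undefined -- this transformer is being called out of order");
    }
    // Wrap the raw JSON body in a fenced code block so it renders verbatim.
    doc.markdown = "```json\n" + doc.rawHtml + "\n```";
  }
  return doc;
}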
@@ -9,7 +9,7 @@ import { getACUC, setCachedACUC, setCachedACUCTeam } from "../../controllers/aut
 // Configuration constants
 const BATCH_KEY = "billing_batch";
 const BATCH_LOCK_KEY = "billing_batch_lock";
-const BATCH_SIZE = 50; // Batch size for processing
+const BATCH_SIZE = 100; // Batch size for processing
 const BATCH_TIMEOUT = 15000; // 15 seconds processing interval
 const LOCK_TIMEOUT = 30000; // 30 seconds lock timeout
 
@@ -47,18 +47,18 @@ async function indexJob(job: FirecrawlJob): Promise<void> {
 
     if (!response.ok) {
       const errorData = await response.json();
-      logger.error(`Failed to send job to external server: ${response.status} ${response.statusText}`, {
-        error: errorData,
-        scrapeId: job.job_id,
-      });
+      // logger.error(`Failed to send job to external server: ${response.status} ${response.statusText}`, {
+      //   error: errorData,
+      //   scrapeId: job.job_id,
+      // });
     } else {
-      logger.debug("Job sent to external server successfully!", { scrapeId: job.job_id });
+      // logger.debug("Job sent to external server successfully!", { scrapeId: job.job_id });
     }
   } catch (error) {
-    logger.error(`Error sending job to external server: ${error.message}`, {
-      error,
-      scrapeId: job.job_id,
-    });
+    // logger.error(`Error sending job to external server: ${error.message}`, {
+    //   error,
+    //   scrapeId: job.job_id,
+    // });
   }
 }
 
@@ -7,7 +7,7 @@ import { sendSlackWebhook } from "../alerts/slack";
 import { getNotificationString } from "./notification_string";
 import { AuthCreditUsageChunk } from "../../controllers/v1/types";
 import { redlock } from "../redlock";
-import { redisConnection } from "../queue-service";
+import { redisEvictConnection } from "../redis";
 
 const emailTemplates: Record<
   NotificationType,
@@ -268,14 +268,14 @@ export async function sendNotificationWithCustomDays(
 ) => {
   const redisKey = "notification_sent:" + notificationType + ":" + team_id;
 
-  const didSendRecentNotification = (await redisConnection.get(redisKey)) !== null;
+  const didSendRecentNotification = (await redisEvictConnection.get(redisKey)) !== null;
 
   if (didSendRecentNotification && !bypassRecentChecks) {
     logger.debug(`Notification already sent within the last ${daysBetweenEmails} days for team_id: ${team_id} and notificationType: ${notificationType}`);
     return { success: true };
   }
 
-  await redisConnection.set(redisKey, "1", "EX", daysBetweenEmails * 24 * 60 * 60);
+  await redisEvictConnection.set(redisKey, "1", "EX", daysBetweenEmails * 24 * 60 * 60);
 
   const now = new Date();
   const pastDate = new Date(now.getTime() - daysBetweenEmails * 24 * 60 * 60 * 1000);
@@ -289,13 +289,13 @@ export async function sendNotificationWithCustomDays(
 
   if (recentNotificationsError) {
     logger.debug(`Error fetching recent notifications: ${recentNotificationsError}`);
-    await redisConnection.del(redisKey); // free up redis, let it try again
+    await redisEvictConnection.del(redisKey); // free up redis, let it try again
     return { success: false };
   }
 
   if (recentNotifications.length > 0 && !bypassRecentChecks) {
     logger.debug(`Notification already sent within the last ${daysBetweenEmails} days for team_id: ${team_id} and notificationType: ${notificationType}`);
-    await redisConnection.set(redisKey, "1", "EX", daysBetweenEmails * 24 * 60 * 60);
+    await redisEvictConnection.set(redisKey, "1", "EX", daysBetweenEmails * 24 * 60 * 60);
     return { success: true };
   }
 
@@ -310,7 +310,7 @@ export async function sendNotificationWithCustomDays(
 
   if (emailsError) {
     logger.debug(`Error fetching emails: ${emailsError}`);
-    await redisConnection.del(redisKey); // free up redis, let it try again
+    await redisEvictConnection.del(redisKey); // free up redis, let it try again
     return { success: false };
   }
 
@@ -341,7 +341,7 @@ export async function sendNotificationWithCustomDays(
 
   if (insertError) {
     logger.debug(`Error inserting notification record: ${insertError}`);
-    await redisConnection.del(redisKey); // free up redis, let it try again
+    await redisEvictConnection.del(redisKey); // free up redis, let it try again
     return { success: false };
   }
 
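The notification path uses a plain SET ... EX key as a cheap dedup guard: if the key is present a recent email was already sent, the key is (re)claimed before sending, and any downstream failure deletes it so a later attempt can retry. A minimal sketch of that guard, assuming an ioredis client (names and window size are illustrative):

import IORedis from "ioredis";

const redis = new IORedis(process.env.REDIS_URL ?? "redis://localhost:6379");

async function sendOncePerWindow(key: string, windowSeconds: number, send: () => Promise<void>): Promise<void> {
  if ((await redis.get(key)) !== null) return; // a notification went out recently
  await redis.set(key, "1", "EX", windowSeconds); // claim the window up front
  try {
    await send();
  } catch (err) {
    await redis.del(key); // free the key so the next attempt can try again
    throw err;
  }
}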
@@ -132,13 +132,13 @@ async function addScrapeJobRaw(
       // If above by 2x, send them an email
       // No need to 2x as if there are more than the max concurrency in the concurrency queue, it is already 2x
       if(concurrencyQueueJobs > maxConcurrency) {
-        logger.info("Concurrency limited 2x (single) - ", "Concurrency queue jobs: ", concurrencyQueueJobs, "Max concurrency: ", maxConcurrency, "Team ID: ", webScraperOptions.team_id);
+        // logger.info("Concurrency limited 2x (single) - ", "Concurrency queue jobs: ", concurrencyQueueJobs, "Max concurrency: ", maxConcurrency, "Team ID: ", webScraperOptions.team_id);
 
         // Only send notification if it's not a crawl or batch scrape
         const shouldSendNotification = await shouldSendConcurrencyLimitNotification(webScraperOptions.team_id);
         if (shouldSendNotification) {
           sendNotificationWithCustomDays(webScraperOptions.team_id, NotificationType.CONCURRENCY_LIMIT_REACHED, 15, false).catch((error) => {
-            logger.error("Error sending notification (concurrency limit reached): ", error);
+            logger.error("Error sending notification (concurrency limit reached)", { error });
           });
         }
       }
@@ -231,13 +231,13 @@ export async function addScrapeJobs(
 
     // equals 2x the max concurrency
     if(addToCQ.length > maxConcurrency) {
-      logger.info(`Concurrency limited 2x (multiple) - Concurrency queue jobs: ${addToCQ.length} Max concurrency: ${maxConcurrency} Team ID: ${jobs[0].data.team_id}`);
+      // logger.info(`Concurrency limited 2x (multiple) - Concurrency queue jobs: ${addToCQ.length} Max concurrency: ${maxConcurrency} Team ID: ${jobs[0].data.team_id}`);
       // Only send notification if it's not a crawl or batch scrape
       if (!isCrawlOrBatchScrape(dontAddToCCQ[0].data)) {
         const shouldSendNotification = await shouldSendConcurrencyLimitNotification(dontAddToCCQ[0].data.team_id);
         if (shouldSendNotification) {
           sendNotificationWithCustomDays(dontAddToCCQ[0].data.team_id, NotificationType.CONCURRENCY_LIMIT_REACHED, 15, false).catch((error) => {
-            logger.error("Error sending notification (concurrency limit reached): ", error);
+            logger.error("Error sending notification (concurrency limit reached)", { error });
           });
         }
       }
@@ -131,7 +131,3 @@ export function getBillingQueue() {
   }
   return billingQueue;
 }
-
-// === REMOVED IN FAVOR OF POLLING -- NOT RELIABLE
-// import { QueueEvents } from 'bullmq';
-// export const scrapeQueueEvents = new QueueEvents(scrapeQueueName, { connection: redisConnection.duplicate() });
@@ -6,11 +6,8 @@ import {
   getScrapeQueue,
   getExtractQueue,
   getDeepResearchQueue,
-  redisConnection,
-  scrapeQueueName,
-  extractQueueName,
-  deepResearchQueueName,
   getIndexQueue,
+  redisConnection,
   getGenerateLlmsTxtQueue,
   getBillingQueue,
 } from "./queue-service";
@@ -89,6 +86,7 @@ import { cacheableLookup } from "../scraper/scrapeURL/lib/cacheableLookup";
 import { robustFetch } from "../scraper/scrapeURL/lib/fetch";
 import { RateLimiterMode } from "../types";
 import { calculateCreditsToBeBilled } from "../lib/scrape-billing";
+import { redisEvictConnection } from "./redis";
 
 configDotenv();
 
@@ -133,11 +131,11 @@ async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) {
     logger.info("Crawl is pre-finished, checking if we need to add more jobs");
     if (
       job.data.crawlerOptions &&
-      !(await redisConnection.exists(
+      !(await redisEvictConnection.exists(
         "crawl:" + job.data.crawl_id + ":invisible_urls",
       ))
     ) {
-      await redisConnection.set(
+      await redisEvictConnection.set(
        "crawl:" + job.data.crawl_id + ":invisible_urls",
        "done",
        "EX",
@@ -147,7 +145,7 @@ async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) {
      const sc = (await getCrawl(job.data.crawl_id))!;
 
      const visitedUrls = new Set(
-        await redisConnection.smembers(
+        await redisEvictConnection.smembers(
          "crawl:" + job.data.crawl_id + ":visited_unique",
        ),
      );
@@ -262,7 +260,7 @@ async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) {
          ? normalizeUrlOnlyHostname(sc.originUrl)
          : undefined;
        // Get all visited unique URLs from Redis
-        const visitedUrls = await redisConnection.smembers(
+        const visitedUrls = await redisEvictConnection.smembers(
          "crawl:" + job.data.crawl_id + ":visited_unique",
        );
        // Upload to Supabase if we have URLs and this is a crawl (not a batch scrape)
@@ -1164,21 +1162,6 @@ async function processJob(job: Job & { id: string }, token: string) {
        document: doc,
      };
 
-      if (job.data.webhook && job.data.mode !== "crawl" && job.data.v1) {
-        logger.debug("Calling webhook with success...", {
-          webhook: job.data.webhook,
-        });
-        await callWebhook(
-          job.data.team_id,
-          job.data.crawl_id,
-          data,
-          job.data.webhook,
-          job.data.v1,
-          job.data.crawlerOptions !== null ? "crawl.page" : "batch_scrape.page",
-          true,
-        );
-      }
-
      if (job.data.crawl_id) {
        const sc = (await getCrawl(job.data.crawl_id)) as StoredCrawl;
 
@@ -1231,7 +1214,7 @@ async function processJob(job: Job & { id: string }, token: string) {
 
          // Prevent redirect target from being visited in the crawl again
          // See lockURL
-          const x = await redisConnection.sadd(
+          const x = await redisEvictConnection.sadd(
            "crawl:" + job.data.crawl_id + ":visited",
            ...p1.map((x) => x.href),
          );
@@ -1264,6 +1247,21 @@ async function processJob(job: Job & { id: string }, token: string) {
          true,
        );
 
+        if (job.data.webhook && job.data.mode !== "crawl" && job.data.v1) {
+          logger.debug("Calling webhook with success...", {
+            webhook: job.data.webhook,
+          });
+          await callWebhook(
+            job.data.team_id,
+            job.data.crawl_id,
+            data,
+            job.data.webhook,
+            job.data.v1,
+            job.data.crawlerOptions !== null ? "crawl.page" : "batch_scrape.page",
+            true,
+          );
+        }
+
        indexJob(job, doc);
 
        logger.debug("Declaring job as done...");
@@ -1438,7 +1436,7 @@ async function processJob(job: Job & { id: string }, token: string) {
 
      logger.debug("Declaring job as done...");
      await addCrawlJobDone(job.data.crawl_id, job.id, false);
-      await redisConnection.srem(
+      await redisEvictConnection.srem(
        "crawl:" + job.data.crawl_id + ":visited_unique",
        normalizeURL(job.data.url, sc),
      );
@@ -1,4 +1,4 @@
-import Redis from "ioredis";
+import IORedis from "ioredis";
 import { redisRateLimitClient } from "./rate-limiter";
 import { logger } from "../lib/logger";
 
@@ -70,3 +70,6 @@ const deleteKey = async (key: string) => {
 };
 
 export { setValue, getValue, deleteKey };
+
+const redisEvictURL = process.env.REDIS_EVICT_URL ?? process.env.REDIS_RATE_LIMIT_URL;
+export const redisEvictConnection = new IORedis(redisEvictURL!);
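This is the connection the rest of the diff migrates to: a dedicated ioredis client whose URL comes from REDIS_EVICT_URL, falling back to REDIS_RATE_LIMIT_URL, so that short-lived bookkeeping keys (crawl sets, TTL caches, dedup markers) can live on a Redis instance that is allowed to evict under memory pressure rather than on the BullMQ queue connection. A usage sketch under those assumptions (key name and TTL are illustrative; the import path matches the worker above):

import { redisEvictConnection } from "./redis";

async function markCrawlInvisibleUrlsDone(crawlId: string): Promise<void> {
  // Ephemeral, TTL-bound bookkeeping goes through the evict connection,
  // not through the queue connection that BullMQ depends on.
  await redisEvictConnection.set("crawl:" + crawlId + ":invisible_urls", "done", "EX", 60 * 60 * 24);
}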
@@ -1,6 +1,6 @@
 {
   "name": "@mendable/firecrawl-js",
-  "version": "1.25.1",
+  "version": "1.25.2",
   "description": "JavaScript SDK for Firecrawl API",
   "main": "dist/index.js",
   "types": "dist/index.d.ts",
@@ -119,7 +119,7 @@ export interface CrawlScrapeOptions {
   skipTlsVerification?: boolean;
   removeBase64Images?: boolean;
   blockAds?: boolean;
-  proxy?: "basic" | "stealth";
+  proxy?: "basic" | "stealth" | "auto";
 }
 
 export type Action = {
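Both SDKs widen the accepted proxy values with "auto". In the JS SDK this is just a broader union on CrawlScrapeOptions, so a caller could opt in roughly as in the sketch below (the API key and target URL are placeholders, and the method name assumes the SDK's scrapeUrl helper):

import FirecrawlApp from "@mendable/firecrawl-js";

async function main() {
  const app = new FirecrawlApp({ apiKey: "fc-YOUR-API-KEY" });
  // "auto" is now accepted alongside "basic" and "stealth".
  const doc = await app.scrapeUrl("https://example.com", { proxy: "auto" });
  console.log(doc);
}

main();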
@@ -13,7 +13,7 @@ import os
 
 from .firecrawl import FirecrawlApp, AsyncFirecrawlApp, JsonConfig, ScrapeOptions, ChangeTrackingOptions # noqa
 
-__version__ = "2.7.0"
+__version__ = "2.7.1"
 
 # Define the logger for the Firecrawl project
 logger: logging.Logger = logging.getLogger("firecrawl")
@@ -155,7 +155,7 @@ class ScrapeOptions(pydantic.BaseModel):
     skipTlsVerification: Optional[bool] = None
     removeBase64Images: Optional[bool] = None
     blockAds: Optional[bool] = None
-    proxy: Optional[Literal["basic", "stealth"]] = None
+    proxy: Optional[Literal["basic", "stealth", "auto"]] = None
     changeTrackingOptions: Optional[ChangeTrackingOptions] = None
 
 class WaitAction(pydantic.BaseModel):
@@ -459,7 +459,7 @@ class FirecrawlApp:
         skip_tls_verification: Optional[bool] = None,
         remove_base64_images: Optional[bool] = None,
         block_ads: Optional[bool] = None,
-        proxy: Optional[Literal["basic", "stealth"]] = None,
+        proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
         extract: Optional[JsonConfig] = None,
         json_options: Optional[JsonConfig] = None,
         actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
@@ -481,7 +481,7 @@ class FirecrawlApp:
             skip_tls_verification (Optional[bool]): Skip TLS verification
             remove_base64_images (Optional[bool]): Remove base64 images
             block_ads (Optional[bool]): Block ads
-            proxy (Optional[Literal["basic", "stealth"]]): Proxy type (basic/stealth)
+            proxy (Optional[Literal["basic", "stealth", "auto"]]): Proxy type (basic/stealth)
             extract (Optional[JsonConfig]): Content extraction settings
             json_options (Optional[JsonConfig]): JSON extraction settings
             actions (Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]]): Actions to perform
@@ -1191,7 +1191,7 @@ class FirecrawlApp:
         skip_tls_verification: Optional[bool] = None,
         remove_base64_images: Optional[bool] = None,
         block_ads: Optional[bool] = None,
-        proxy: Optional[Literal["basic", "stealth"]] = None,
+        proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
         extract: Optional[JsonConfig] = None,
         json_options: Optional[JsonConfig] = None,
         actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
@@ -1325,7 +1325,7 @@ class FirecrawlApp:
         skip_tls_verification: Optional[bool] = None,
         remove_base64_images: Optional[bool] = None,
         block_ads: Optional[bool] = None,
-        proxy: Optional[Literal["basic", "stealth"]] = None,
+        proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
         extract: Optional[JsonConfig] = None,
         json_options: Optional[JsonConfig] = None,
         actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
@@ -1457,7 +1457,7 @@ class FirecrawlApp:
         skip_tls_verification: Optional[bool] = None,
         remove_base64_images: Optional[bool] = None,
         block_ads: Optional[bool] = None,
-        proxy: Optional[Literal["basic", "stealth"]] = None,
+        proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
         extract: Optional[JsonConfig] = None,
         json_options: Optional[JsonConfig] = None,
         actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
@@ -2852,7 +2852,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
         skip_tls_verification: Optional[bool] = None,
         remove_base64_images: Optional[bool] = None,
         block_ads: Optional[bool] = None,
-        proxy: Optional[Literal["basic", "stealth"]] = None,
+        proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
         extract: Optional[JsonConfig] = None,
         json_options: Optional[JsonConfig] = None,
         actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
@@ -2873,7 +2873,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
             skip_tls_verification (Optional[bool]): Skip TLS verification
             remove_base64_images (Optional[bool]): Remove base64 images
             block_ads (Optional[bool]): Block ads
-            proxy (Optional[Literal["basic", "stealth"]]): Proxy type (basic/stealth)
+            proxy (Optional[Literal["basic", "stealth", "auto"]]): Proxy type (basic/stealth)
             extract (Optional[JsonConfig]): Content extraction settings
             json_options (Optional[JsonConfig]): JSON extraction settings
             actions (Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]]): Actions to perform
@@ -2981,7 +2981,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
         skip_tls_verification: Optional[bool] = None,
         remove_base64_images: Optional[bool] = None,
         block_ads: Optional[bool] = None,
-        proxy: Optional[Literal["basic", "stealth"]] = None,
+        proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
         extract: Optional[JsonConfig] = None,
         json_options: Optional[JsonConfig] = None,
         actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
@@ -3120,7 +3120,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
         skip_tls_verification: Optional[bool] = None,
         remove_base64_images: Optional[bool] = None,
         block_ads: Optional[bool] = None,
-        proxy: Optional[Literal["basic", "stealth"]] = None,
+        proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
         extract: Optional[JsonConfig] = None,
         json_options: Optional[JsonConfig] = None,
         actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,