commit c7bfe4ffe8 (parent 6bdb1d045d)
Nick:
@@ -25,10 +25,11 @@ import {
 } from "../../src/lib/crawl-redis";
 import { getScrapeQueue } from "../../src/services/queue-service";
 import { checkAndUpdateURL } from "../../src/lib/validateUrl";
+import { getJobPriority } from "../../src/lib/job-priority";
 
 export async function crawlController(req: Request, res: Response) {
   try {
-    const { success, team_id, error, status } = await authenticateUser(
+    const { success, team_id, error, status, plan } = await authenticateUser(
       req,
       res,
       RateLimiterMode.Crawl
@@ -126,6 +127,7 @@ export async function crawlController(req: Request, res: Response) {
       crawlerOptions,
       pageOptions,
       team_id,
+      plan,
       createdAt: Date.now(),
     };
 
@@ -175,6 +177,10 @@ export async function crawlController(req: Request, res: Response) {
       await getScrapeQueue().addBulk(jobs);
     } else {
       await lockURL(id, sc, url);
+
+      // Not needed, first one should be 15.
+      // const jobPriority = await getJobPriority({plan, team_id, basePriority: 10})
+
       const job = await addScrapeJob(
         {
           url,
@@ -10,7 +10,7 @@ import { checkAndUpdateURL } from "../../src/lib/validateUrl";
 
 export async function crawlPreviewController(req: Request, res: Response) {
   try {
-    const { success, error, status } = await authenticateUser(
+    const { success, error, status, team_id:a, plan } = await authenticateUser(
       req,
       res,
       RateLimiterMode.Preview
@@ -88,6 +88,7 @@ export async function crawlPreviewController(req: Request, res: Response) {
       crawlerOptions,
       pageOptions,
       team_id,
+      plan,
       robots,
       createdAt: Date.now(),
     };
@@ -38,7 +38,7 @@ export async function scrapeHelper(
     return { success: false, error: "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.", returnCode: 403 };
   }
 
-  const jobPriority = await getJobPriority({plan, team_id})
+  const jobPriority = await getJobPriority({plan, team_id, basePriority: 10})
 
   const job = await addScrapeJob({
     url,
@@ -213,7 +213,7 @@ async function sendScrapeRequests() {
   await Promise.all(requests);
 }
 
-sendScrapeRequests();
+// sendScrapeRequests();
 // const sq = getScrapeQueue();
 
 // sq.on("waiting", j => ScrapeEvents.logJobEvent(j, "waiting"));
@@ -6,6 +6,7 @@ export type StoredCrawl = {
   crawlerOptions: any;
   pageOptions: any;
   team_id: string;
+  plan: string;
   robots?: string;
   cancelled?: boolean;
   createdAt: number;
@@ -30,9 +30,11 @@ export async function deleteJobPriority(team_id, job_id) {
 export async function getJobPriority({
   plan,
   team_id,
+  basePriority = 10
 }: {
   plan: PlanType;
   team_id: string;
+  basePriority: number;
 }): Promise<number> {
   const setKey = SET_KEY_PREFIX + team_id;
 
@@ -40,11 +42,18 @@ export async function getJobPriority({
   const setLength = await redisConnection.scard(setKey);
 
   // Determine the priority based on the plan and set length
-  let basePriority = 10;
   let planModifier = 1;
   let bucketLimit = 0;
 
   switch (plan) {
+    case "free":
+      bucketLimit = 25;
+      planModifier = 1;
+      break;
+    case "hobby":
+      bucketLimit = 50;
+      planModifier = 0.5;
+      break;
     case "standard":
     case "standardnew":
       bucketLimit = 100;
@@ -55,11 +64,8 @@ export async function getJobPriority({
       bucketLimit = 200;
       planModifier = 0.2;
       break;
-    case "hobby":
-      bucketLimit = 50;
-      planModifier = 0.5;
-      break;
-    case "free":
     default:
       bucketLimit = 25;
       planModifier = 1;
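
The diff context ends before getJobPriority's return statement, so the final computation is not visible in this commit. Below is a minimal sketch of how the pieces shown above could fit together, assuming the priority stays at basePriority until the team's active-job set exceeds bucketLimit and then grows with setLength scaled by planModifier. redisConnection, SET_KEY_PREFIX, and PlanType are taken from the surrounding hunks; the return logic itself is an assumption, not the committed code.

// Sketch only: the return logic below is assumed, not shown in this diff.
export async function getJobPriority({
  plan,
  team_id,
  basePriority = 10,
}: {
  plan: PlanType;
  team_id: string;
  basePriority: number;
}): Promise<number> {
  const setKey = SET_KEY_PREFIX + team_id;
  // How many jobs this team currently has in flight.
  const setLength = await redisConnection.scard(setKey);

  let planModifier = 1;
  let bucketLimit = 0;
  switch (plan) {
    case "free":
      bucketLimit = 25;
      planModifier = 1;
      break;
    case "hobby":
      bucketLimit = 50;
      planModifier = 0.5;
      break;
    // "standard" / "standardnew" and higher plans as in the hunks above.
    default:
      bucketLimit = 25;
      planModifier = 1;
  }

  // Assumed: under the bucket limit, every job keeps the caller's base
  // priority; over it, the number grows (the job sinks in the queue) with
  // the backlog, dampened for higher plans via planModifier.
  if (setLength === 0 || setLength < bucketLimit) {
    return basePriority;
  }
  return Math.ceil(basePriority + Math.ceil(setLength * planModifier));
}
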
@@ -10,7 +10,7 @@ export async function addScrapeJob(
   jobPriority: number = 10
 ): Promise<Job> {
   return await getScrapeQueue().add(jobId, webScraperOptions, {
-    priority: webScraperOptions.crawl_id ? 20 : jobPriority,
+    priority: jobPriority,
     ...options,
     jobId,
   });
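
With the webScraperOptions.crawl_id ? 20 : jobPriority ternary removed, addScrapeJob no longer second-guesses its caller: whoever enqueues the job decides the priority. A hedged usage sketch, assuming the parameter order (webScraperOptions, options, jobId, jobPriority) implied by the call site in processJob further down:

// Assumed call shape, mirroring the `}, {}, jobId, jobPriority)` call below.
const jobId = uuidv4();
const jobPriority = await getJobPriority({
  plan,
  team_id,
  basePriority: 10, // direct scrapes; crawl sub-jobs pass 20 instead
});
const job = await addScrapeJob({ url, mode: "single_urls" }, {}, jobId, jobPriority);
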
@@ -21,7 +21,8 @@ import { addCrawlJob, addCrawlJobDone, crawlToCrawler, finishCrawl, getCrawl, ge
 import { StoredCrawl } from "../lib/crawl-redis";
 import { addScrapeJob } from "./queue-jobs";
 import { supabaseGetJobById } from "../../src/lib/supabase-jobs";
-import { addJobPriority, deleteJobPriority } from "../../src/lib/job-priority";
+import { addJobPriority, deleteJobPriority, getJobPriority } from "../../src/lib/job-priority";
+import { PlanType } from "../types";
 
 if (process.env.ENV === "production") {
   initSDK({
@@ -216,6 +217,15 @@ async function processJob(job: Job, token: string) {
 
       for (const link of links) {
         if (await lockURL(job.data.crawl_id, sc, link)) {
+
+          const jobPriority = await getJobPriority({plan:sc.plan as PlanType, team_id: sc.team_id, basePriority: job.data.crawl_id ? 20 : 10})
+          const jobId = uuidv4();
+
+          console.log("plan: ", sc.plan);
+          console.log("team_id: ", sc.team_id)
+          console.log("base priority: ", job.data.crawl_id ? 20 : 10)
+          console.log("job priority: " , jobPriority, "\n\n\n")
+
           const newJob = await addScrapeJob({
             url: link,
             mode: "single_urls",
@@ -224,7 +234,7 @@ async function processJob(job: Job, token: string) {
             pageOptions: sc.pageOptions,
             origin: job.data.origin,
             crawl_id: job.data.crawl_id,
-          });
+          }, {}, jobId, jobPriority);
 
           await addCrawlJob(job.data.crawl_id, newJob.id);
         }
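
Net effect of the commit: plan is captured at authentication time, persisted on StoredCrawl, and read back in the worker, so every link discovered during a crawl is enqueued with a plan-aware priority (base 20 for crawl sub-jobs, 10 for direct scrapes). Assuming the queue behind getScrapeQueue() is BullMQ, which its add(jobId, data, { priority }) shape suggests, a smaller priority number is served sooner:

// Assumed BullMQ semantics: priority 1 runs first, larger numbers run later.
// Under load, a busy team's crawl sub-jobs (base 20 plus any backlog penalty
// from getJobPriority) yield to fresh direct scrapes (base 10).
await getScrapeQueue().add(jobId, webScraperOptions, {
  priority: jobPriority,
  jobId,
});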