diff --git a/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts
index 8aabf748..eef65125 100644
--- a/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts
+++ b/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts
@@ -278,23 +278,24 @@ describe("E2E Tests for v1 API Routes", () => {
     expect(response.body.data.metadata.statusCode).toBe(401);
   }, 60000);
 
-  it.concurrent('should return a successful response for a scrape with 403 page', async () => {
-    const response: ScrapeResponseRequestTest = await request(TEST_URL)
-      .post('/v1/scrape')
-      .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
-      .set('Content-Type', 'application/json')
-      .send({ url: 'https://httpstat.us/403' });
-    await new Promise((r) => setTimeout(r, 5000));
+  // Removed this test as we want to retry and fall back to the next scraper
+  // it.concurrent('should return a successful response for a scrape with 403 page', async () => {
+  //   const response: ScrapeResponseRequestTest = await request(TEST_URL)
+  //     .post('/v1/scrape')
+  //     .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
+  //     .set('Content-Type', 'application/json')
+  //     .send({ url: 'https://httpstat.us/403' });
+  //   await new Promise((r) => setTimeout(r, 5000));
 
-    expect(response.statusCode).toBe(200);
-    expect(response.body).toHaveProperty('data');
-    if (!("data" in response.body)) {
-      throw new Error("Expected response body to have 'data' property");
-    }
-    expect(response.body.data).toHaveProperty('markdown');
-    expect(response.body.data).toHaveProperty('metadata');
-    expect(response.body.data.metadata.statusCode).toBe(403);
-  }, 60000);
+  //   expect(response.statusCode).toBe(200);
+  //   expect(response.body).toHaveProperty('data');
+  //   if (!("data" in response.body)) {
+  //     throw new Error("Expected response body to have 'data' property");
+  //   }
+  //   expect(response.body.data).toHaveProperty('markdown');
+  //   expect(response.body.data).toHaveProperty('metadata');
+  //   expect(response.body.data.metadata.statusCode).toBe(403);
+  // }, 60000);
 
   it.concurrent('should return a successful response for a scrape with 404 page', async () => {
     const response: ScrapeResponseRequestTest = await request(TEST_URL)
@@ -314,41 +315,41 @@ describe("E2E Tests for v1 API Routes", () => {
     expect(response.body.data.metadata.statusCode).toBe(404);
   }, 60000);
 
-  it.concurrent('should return a successful response for a scrape with 405 page', async () => {
-    const response: ScrapeResponseRequestTest = await request(TEST_URL)
-      .post('/v1/scrape')
-      .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
-      .set('Content-Type', 'application/json')
-      .send({ url: 'https://httpstat.us/405' });
-    await new Promise((r) => setTimeout(r, 5000));
+  // it.concurrent('should return a successful response for a scrape with 405 page', async () => {
+  //   const response: ScrapeResponseRequestTest = await request(TEST_URL)
+  //     .post('/v1/scrape')
+  //     .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
+  //     .set('Content-Type', 'application/json')
+  //     .send({ url: 'https://httpstat.us/405' });
+  //   await new Promise((r) => setTimeout(r, 5000));
 
-    expect(response.statusCode).toBe(200);
-    expect(response.body).toHaveProperty('data');
-    if (!("data" in response.body)) {
-      throw new Error("Expected response body to have 'data' property");
-    }
-    expect(response.body.data).toHaveProperty('markdown');
-    expect(response.body.data).toHaveProperty('metadata');
-    expect(response.body.data.metadata.statusCode).toBe(405);
-  }, 60000);
+  //   expect(response.statusCode).toBe(200);
+  //   expect(response.body).toHaveProperty('data');
+  //   if (!("data" in response.body)) {
+  //     throw new Error("Expected response body to have 'data' property");
+  //   }
+  //   expect(response.body.data).toHaveProperty('markdown');
+  //   expect(response.body.data).toHaveProperty('metadata');
+  //   expect(response.body.data.metadata.statusCode).toBe(405);
+  // }, 60000);
 
-  it.concurrent('should return a successful response for a scrape with 500 page', async () => {
-    const response: ScrapeResponseRequestTest = await request(TEST_URL)
-      .post('/v1/scrape')
-      .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
-      .set('Content-Type', 'application/json')
-      .send({ url: 'https://httpstat.us/500' });
-    await new Promise((r) => setTimeout(r, 5000));
+  // it.concurrent('should return a successful response for a scrape with 500 page', async () => {
+  //   const response: ScrapeResponseRequestTest = await request(TEST_URL)
+  //     .post('/v1/scrape')
+  //     .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
+  //     .set('Content-Type', 'application/json')
+  //     .send({ url: 'https://httpstat.us/500' });
+  //   await new Promise((r) => setTimeout(r, 5000));
 
-    expect(response.statusCode).toBe(200);
-    expect(response.body).toHaveProperty('data');
-    if (!("data" in response.body)) {
-      throw new Error("Expected response body to have 'data' property");
-    }
-    expect(response.body.data).toHaveProperty('markdown');
-    expect(response.body.data).toHaveProperty('metadata');
-    expect(response.body.data.metadata.statusCode).toBe(500);
-  }, 60000);
+  //   expect(response.statusCode).toBe(200);
+  //   expect(response.body).toHaveProperty('data');
+  //   if (!("data" in response.body)) {
+  //     throw new Error("Expected response body to have 'data' property");
+  //   }
+  //   expect(response.body.data).toHaveProperty('markdown');
+  //   expect(response.body.data).toHaveProperty('metadata');
+  //   expect(response.body.data.metadata.statusCode).toBe(500);
+  // }, 60000);
 
   it.concurrent("should return a timeout error when scraping takes longer than the specified timeout", async () => {
     const response: ScrapeResponseRequestTest = await request(TEST_URL)
@@ -680,7 +681,7 @@ describe("POST /v1/crawl", () => {
       .set("Content-Type", "application/json")
       .send({
         url: "https://firecrawl.dev",
-        limit: 10,
+        limit: 40,
         includePaths: ["blog/*"],
       });
 
@@ -736,7 +737,7 @@ describe("POST /v1/crawl", () => {
       .set("Content-Type", "application/json")
      .send({
        url: "https://firecrawl.dev",
-       limit: 10,
+       limit: 40,
        excludePaths: ["blog/*"],
      });
 
@@ -928,7 +929,7 @@ describe("GET /v1/crawl/:jobId", () => {
       .post("/v1/crawl")
       .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
       .set("Content-Type", "application/json")
-      .send({ url: "https://docs.tatum.io", limit: 200 });
+      .send({ url: "https://docs.firecrawl.dev", limit: 10 });
 
     expect(crawlResponse.statusCode).toBe(200);
 
diff --git a/apps/api/src/controllers/v0/crawl-status.ts b/apps/api/src/controllers/v0/crawl-status.ts
index 1b1ffdc5..4c50b375 100644
--- a/apps/api/src/controllers/v0/crawl-status.ts
+++ b/apps/api/src/controllers/v0/crawl-status.ts
@@ -49,12 +49,27 @@ export async function crawlStatusController(req: Request, res: Response) {
     if (sc.team_id !== team_id) {
       return res.status(403).json({ error: "Forbidden" });
     }
+    let jobIDs = await getCrawlJobs(req.params.jobId);
+    let jobs = await getJobs(req.params.jobId, jobIDs);
+    let jobStatuses = await Promise.all(jobs.map(x => x.getState()));
 
-    const jobIDs = await getCrawlJobs(req.params.jobId);
+    // Combine jobs and jobStatuses into a single array of objects
+    let jobsWithStatuses = jobs.map((job, index) => ({
+      job,
+      status: jobStatuses[index]
+    }));
 
-    const jobs = (await getJobs(req.params.jobId, jobIDs)).sort((a, b) => a.timestamp - b.timestamp);
-    const jobStatuses = await Promise.all(jobs.map(x => x.getState()));
-    const jobStatus = sc.cancelled ? "failed" : jobStatuses.every(x => x === "completed") ? "completed" : jobs.some((x, i) => jobStatuses[i] === "failed" && x.failedReason !== "Concurrency limit hit") ? "failed" : "active";
+    // Filter out failed jobs
+    jobsWithStatuses = jobsWithStatuses.filter(x => x.status !== "failed");
+
+    // Sort jobs by timestamp
+    jobsWithStatuses.sort((a, b) => a.job.timestamp - b.job.timestamp);
+
+    // Extract sorted jobs and statuses
+    jobs = jobsWithStatuses.map(x => x.job);
+    jobStatuses = jobsWithStatuses.map(x => x.status);
+
+    const jobStatus = sc.cancelled ? "failed" : jobStatuses.every(x => x === "completed") ? "completed" : "active";
 
     const data = jobs.filter(x => x.failedReason !== "Concurreny limit hit").map(x => Array.isArray(x.returnvalue) ? x.returnvalue[0] : x.returnvalue);
 
diff --git a/apps/api/src/controllers/v1/crawl-status-ws.ts b/apps/api/src/controllers/v1/crawl-status-ws.ts
index 9832a948..b67e559b 100644
--- a/apps/api/src/controllers/v1/crawl-status-ws.ts
+++ b/apps/api/src/controllers/v1/crawl-status-ws.ts
@@ -94,11 +94,15 @@ async function crawlStatusWS(ws: WebSocket, req: RequestWithAuth<CrawlStatusParams, undefined, undefined>) {
   let jobStatuses = await Promise.all(jobIDs.map(async (x) => [x, await getScrapeQueue().getJobState(x)] as const));
   const throttledJobs = new Set(...await getThrottledJobs(req.auth.team_id));
   jobStatuses = jobStatuses.filter(x => !throttledJobs.has(x[0])); // throttled jobs can have a failed status, but they are not actually failed
-  const status: Exclude<CrawlStatusResponse, ErrorResponse>["status"] = sc.cancelled ? "cancelled" : jobStatuses.every(x => x[1] === "completed") ? "completed" : jobStatuses.some(x => x[1] === "failed") ? "failed" : "scraping";
+  // filter out failed jobs
+  jobIDs = jobIDs.filter(id => !jobStatuses.some(status => status[0] === id && status[1] === "failed"));
+  // filter the job statuses
+  jobStatuses = jobStatuses.filter(x => x[1] !== "failed");
+  const status: Exclude<CrawlStatusResponse, ErrorResponse>["status"] = sc.cancelled ? "cancelled" : jobStatuses.every(x => x[1] === "completed") ? "completed" : "scraping";
 
   const doneJobs = await getJobs(doneJobIDs);
   const data = doneJobs.map(x => x.returnvalue);
 
diff --git a/apps/api/src/controllers/v1/crawl-status.ts b/apps/api/src/controllers/v1/crawl-status.ts
index 9c0026a0..63331c9c 100644
--- a/apps/api/src/controllers/v1/crawl-status.ts
+++ b/apps/api/src/controllers/v1/crawl-status.ts
@@ -57,11 +57,15 @@ export async function crawlStatusController(req: RequestWithAuth<CrawlStatusParams, undefined, undefined>, res: Response<CrawlStatusResponse>) {
   let jobStatuses = await Promise.all(jobIDs.map(async (x) => [x, await getScrapeQueue().getJobState(x)] as const));
   const throttledJobs = new Set(...await getThrottledJobs(req.auth.team_id));
   jobStatuses = jobStatuses.filter(x => !throttledJobs.has(x[0])); // throttled jobs can have a failed status, but they are not actually failed
-  const status: Exclude<CrawlStatusResponse, ErrorResponse>["status"] = sc.cancelled ? "cancelled" : jobStatuses.every(x => x[1] === "completed") ? "completed" : jobStatuses.some(x => x[1] === "failed") ? "failed" : "scraping";
+  // filter out failed jobs
+  jobIDs = jobIDs.filter(id => !jobStatuses.some(status => status[0] === id && status[1] === "failed"));
+  // filter the job statuses
+  jobStatuses = jobStatuses.filter(x => x[1] !== "failed");
+  const status: Exclude<CrawlStatusResponse, ErrorResponse>["status"] = sc.cancelled ? "cancelled" : jobStatuses.every(x => x[1] === "completed") ? "completed" : "scraping";
 
   const doneJobsLength = await getDoneJobsOrderedLength(req.params.jobId);
   const doneJobsOrder = await getDoneJobsOrdered(req.params.jobId, start, end ?? -1);
 
diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts
index 3781eb78..01dff86a 100644
--- a/apps/api/src/controllers/v1/types.ts
+++ b/apps/api/src/controllers/v1/types.ts
@@ -461,8 +461,8 @@ export function legacyDocumentConverter(doc: any): Document {
       ...doc.metadata,
       pageError: undefined,
       pageStatusCode: undefined,
-      error: doc.metadata.pageError,
-      statusCode: doc.metadata.pageStatusCode,
+      error: doc.metadata?.pageError,
+      statusCode: doc.metadata?.pageStatusCode,
     },
   };
 }
diff --git a/apps/api/src/main/runWebScraper.ts b/apps/api/src/main/runWebScraper.ts
index 571122f9..6e642c65 100644
--- a/apps/api/src/main/runWebScraper.ts
+++ b/apps/api/src/main/runWebScraper.ts
@@ -59,6 +59,7 @@ export async function startWebScraperPipeline({
     is_scrape: job.data.is_scrape ?? false,
   })) as { success: boolean; message: string; docs: Document[] };
 }
+
 export async function runWebScraper({
   url,
   mode,
diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts
index 13ca7dd2..767f30e1 100644
--- a/apps/api/src/scraper/WebScraper/single_url.ts
+++ b/apps/api/src/scraper/WebScraper/single_url.ts
@@ -425,7 +425,7 @@ export async function scrapSingleUrl(
       Logger.debug(`⛏️ ${scraper}: Successfully scraped ${urlToScrap} with text length >= 100 or screenshot, breaking`);
       break;
     }
-    if (pageStatusCode && (pageStatusCode == 404 || pageStatusCode == 400)) {
+    if (pageStatusCode && (pageStatusCode == 404 || pageStatusCode == 400 || pageStatusCode == 401)) {
       Logger.debug(`⛏️ ${scraper}: Successfully scraped ${urlToScrap} with status code ${pageStatusCode}, breaking`);
       break;
     }
diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts
index 532e8fee..1ccf486e 100644
--- a/apps/api/src/services/queue-worker.ts
+++ b/apps/api/src/services/queue-worker.ts
@@ -395,6 +395,7 @@ async function processJob(job: Job, token: string) {
             pageOptions: sc.pageOptions,
             origin: job.data.origin,
             crawl_id: job.data.crawl_id,
+            webhook: job.data.webhook,
             v1: job.data.v1,
           },
           {},
@@ -468,9 +469,8 @@ async function processJob(job: Job, token: string) {
         }
       } else {
         const jobIDs = await getCrawlJobs(job.data.crawl_id);
-        const jobStatuses = await Promise.all(jobIDs.map((x) => getScrapeQueue().getJobState(x)));
         const jobStatus =
-          sc.cancelled || jobStatuses.some((x) => x === "failed")
+          sc.cancelled
             ? "failed"
             : "completed";
 
@@ -554,16 +554,16 @@ async function processJob(job: Job, token: string) {
         job.data.v1
       );
     }
-    if (job.data.v1) {
-      callWebhook(
-        job.data.team_id,
-        job.id as string,
-        [],
-        job.data.webhook,
-        job.data.v1,
-        "crawl.failed"
-      );
-    }
+    // if (job.data.v1) {
+    //   callWebhook(
+    //     job.data.team_id,
+    //     job.id as string,
+    //     [],
+    //     job.data.webhook,
+    //     job.data.v1,
+    //     "crawl.failed"
+    //   );
+    // }
 
     if (job.data.crawl_id) {
       await logJob({