Merge pull request #459 from mendableai/feat/queue-scrapes
feat: Move scraper to queue
commit 81b2479db3
@@ -24,8 +24,8 @@ kill_timeout = '30s'
 
   [http_service.concurrency]
     type = "requests"
-    hard_limit = 100
-    soft_limit = 50
+    # hard_limit = 100
+    soft_limit = 100
 
 [[http_service.checks]]
   grace_period = "10s"
@@ -51,12 +51,13 @@ kill_timeout = '30s'
 
   [services.concurrency]
     type = 'connections'
-    hard_limit = 25
-    soft_limit = 20
+    # hard_limit = 25
+    soft_limit = 100
 
 [[vm]]
-  size = 'performance-1x'
+  size = 'performance-2x'
   processes = ['app','worker']
+  memory = 8192
@@ -24,8 +24,8 @@ kill_timeout = '30s'
 
   [http_service.concurrency]
    type = "requests"
-    hard_limit = 200
-    soft_limit = 75
+    # hard_limit = 200
+    soft_limit = 200
 
 [[http_service.checks]]
   grace_period = "20s"
@@ -50,8 +50,8 @@ kill_timeout = '30s'
 
   [services.concurrency]
    type = 'connections'
-    hard_limit = 30
-    soft_limit = 12
+    # hard_limit = 30
+    soft_limit = 200
 
 [[vm]]
   size = 'performance-4x'
@@ -29,7 +29,6 @@
     "@jest/globals": "^29.7.0",
     "@tsconfig/recommended": "^1.0.3",
     "@types/body-parser": "^1.19.2",
-    "@types/bull": "^4.10.0",
     "@types/cors": "^2.8.13",
     "@types/express": "^4.17.17",
     "@types/jest": "^29.5.12",
@@ -63,7 +62,7 @@
     "async-mutex": "^0.5.0",
     "axios": "^1.3.4",
     "bottleneck": "^2.19.5",
-    "bull": "^4.15.0",
+    "bullmq": "^5.11.0",
     "cacheable-lookup": "^6.1.0",
     "cheerio": "^1.0.0-rc.12",
     "cohere": "^1.1.1",
@@ -99,6 +98,7 @@
     "robots-parser": "^3.0.1",
     "scrapingbee": "^1.7.4",
     "stripe": "^16.1.0",
+    "systeminformation": "^5.22.11",
     "turndown": "^7.1.3",
     "turndown-plugin-gfm": "^1.0.2",
     "typesense": "^1.5.4",
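The dependency swap above is the core of this PR: bull is replaced by bullmq, the stub @types/bull package is dropped, and systeminformation is added. For orientation, here is a minimal sketch of how a BullMQ queue, worker, and QueueEvents handle are typically wired to a shared ioredis connection. It is illustrative only, not the repo's actual queue-service.ts; the REDIS_URL variable and the "scrapeQueue" name are assumptions.

```ts
// Minimal BullMQ wiring sketch (illustrative, not the repo's queue-service.ts).
import { Queue, Worker, QueueEvents } from "bullmq";
import IORedis from "ioredis";

const connection = new IORedis(process.env.REDIS_URL ?? "redis://localhost:6379", {
  maxRetriesPerRequest: null, // required by BullMQ blocking clients
});

export const scrapeQueue = new Queue("scrapeQueue", { connection });
export const scrapeQueueEvents = new QueueEvents("scrapeQueue", { connection });

// A worker picks jobs off the queue; its return value becomes job.returnvalue,
// which callers can await via waitUntilFinished.
const worker = new Worker(
  "scrapeQueue",
  async (job) => {
    // ...run the scrape for job.data.url and return the resulting documents
    return [{ url: job.data.url, content: "..." }];
  },
  { connection }
);
```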
apps/api/pnpm-lock.yaml (generated, 55 lines changed)
@@ -56,9 +56,9 @@ importers:
       bottleneck:
         specifier: ^2.19.5
         version: 2.19.5
-      bull:
-        specifier: ^4.15.0
-        version: 4.15.0
+      bullmq:
+        specifier: ^5.11.0
+        version: 5.11.0
       cacheable-lookup:
         specifier: ^6.1.0
         version: 6.1.0
@@ -164,6 +164,9 @@ importers:
       stripe:
         specifier: ^16.1.0
         version: 16.1.0
+      systeminformation:
+        specifier: ^5.22.11
+        version: 5.22.11
       turndown:
         specifier: ^7.1.3
         version: 7.2.0
@@ -204,9 +207,6 @@ importers:
      '@types/body-parser':
        specifier: ^1.19.2
        version: 1.19.5
-      '@types/bull':
-        specifier: ^4.10.0
-        version: 4.10.0
      '@types/cors':
        specifier: ^2.8.13
        version: 2.8.17
@@ -1538,10 +1538,6 @@ packages:
   '@types/body-parser@1.19.5':
     resolution: {integrity: sha512-fB3Zu92ucau0iQ0JMCFQE7b/dv8Ot07NI3KaZIkIUNXq82k4eBAqUaneXfleGY9JWskeS9y+u0nXMyspcuQrCg==}
 
-  '@types/bull@4.10.0':
-    resolution: {integrity: sha512-RkYW8K2H3J76HT6twmHYbzJ0GtLDDotpLP9ah9gtiA7zfF6peBH1l5fEiK0oeIZ3/642M7Jcb9sPmor8Vf4w6g==}
-    deprecated: This is a stub types definition. bull provides its own type definitions, so you do not need this installed.
-
   '@types/bunyan@1.8.9':
     resolution: {integrity: sha512-ZqS9JGpBxVOvsawzmVt30sP++gSQMTejCkIAQ3VdadOcRE8izTyW66hufvwLeH+YEGP6Js2AW7Gz+RMyvrEbmw==}
 
@@ -1938,9 +1934,8 @@ packages:
   buffer@6.0.3:
     resolution: {integrity: sha512-FTiCpNxtwiZZHEZbcbTIcZjERVICn9yq/pDFkTl95/AxzD1naBctN7YO68riM/gLSDY7sdrMby8hofADYuuqOA==}
 
-  bull@4.15.0:
-    resolution: {integrity: sha512-nOEAfUXwUXtFbRPQP3bWCwpQ/NAerAu2Nym/ucv5C1E+Qh2x6RGdKKsYIfZam4mYncayTynTUN/HLhRgGi2N8w==}
-    engines: {node: '>=12'}
+  bullmq@5.11.0:
+    resolution: {integrity: sha512-qVzyWGZqie3VHaYEgRXhId/j8ebfmj6MExEJyUByMsUJA5pVciVle3hKLer5fyMwtQ8lTMP7GwhXV/NZ+HzlRA==}
 
   bytes@3.1.2:
     resolution: {integrity: sha512-/Nf7TyzTx6S3yRJObOAV7956r8cr2+Oj8AC5dt8wSP3BQAoeX58NoHyCU8P8zGkNXStjTSi6fzO6F0pBdcYbEg==}
@@ -2562,10 +2557,6 @@ packages:
     resolution: {integrity: sha512-pjzuKtY64GYfWizNAJ0fr9VqttZkNiK2iS430LtIHzjBEr6bX8Am2zm4sW4Ro5wjWW5cAlRL1qAMTcXbjNAO2Q==}
     engines: {node: '>=8.0.0'}
 
-  get-port@5.1.1:
-    resolution: {integrity: sha512-g/Q1aTSDOxFpchXC4i8ZWvxA1lnPqx/JHqcpIw0/LX9T8x/GBbi6YnlN5nhaKIFkT8oFsscUKgDJYxfwfS6QsQ==}
-    engines: {node: '>=8'}
-
   get-stream@5.2.0:
     resolution: {integrity: sha512-nBF+F1rAZVCu/p7rjzgA+Yb4lfYXrpl7a6VmJrU8wF9I1CKvP/QwPNZHnOlwbTkY6dvtFIzFMSyQXbLoTQPRpA==}
     engines: {node: '>=8'}
@@ -4268,6 +4259,12 @@ packages:
     resolution: {integrity: sha512-SzRP5LQ6Ts2G5NyAa/jg16s8e3R7rfdFjizy1zeoecYWw+nGL+YA1xZvW/+iJmidBGSdLkuvdwTYEyJEb+EiUw==}
     engines: {node: '>=0.2.6'}
 
+  systeminformation@5.22.11:
+    resolution: {integrity: sha512-aLws5yi4KCHTb0BVvbodQY5bY8eW4asMRDTxTW46hqw9lGjACX6TlLdJrkdoHYRB0qs+MekqEq1zG7WDnWE8Ug==}
+    engines: {node: '>=8.0.0'}
+    os: [darwin, linux, win32, freebsd, openbsd, netbsd, sunos, android]
+    hasBin: true
+
   tar-fs@3.0.5:
     resolution: {integrity: sha512-JOgGAmZyMgbqpLwct7ZV8VzkEB6pxXFBVErLtb+XCOqzc6w1xiWKI9GVd6bwk68EX7eJ4DWmfXVmq8K2ziZTGg==}
 
@@ -4460,10 +4457,6 @@ packages:
     resolution: {integrity: sha512-8XkAphELsDnEGrDxUOHB3RGvXz6TeuYSGEZBOjtTtPm2lwhGBjLgOzLHB63IUWfBpNucQjND6d3AOudO+H3RWQ==}
     hasBin: true
 
-  uuid@8.3.2:
-    resolution: {integrity: sha512-+NYs2QeMWy+GWFOEm9xnn6HCDp0l7QBD7ml8zLUmJ+93Q5NF0NocErnwkTkXVFNiX3/fpC6afS8Dhb/gz7R7eg==}
-    hasBin: true
-
   uuid@9.0.1:
     resolution: {integrity: sha512-b+1eJOlsR9K8HJpow9Ok3fiWOWSIcIzXodvv0rQjVoOVNpWMpxf1wZNpt4y9h10odCNrqnYp1OBzRktckBe3sA==}
     hasBin: true
@@ -6447,12 +6440,6 @@ snapshots:
       '@types/connect': 3.4.38
       '@types/node': 20.14.1
 
-  '@types/bull@4.10.0':
-    dependencies:
-      bull: 4.15.0
-    transitivePeerDependencies:
-      - supports-color
-
   '@types/bunyan@1.8.9':
     dependencies:
       '@types/node': 20.14.1
@@ -6923,15 +6910,15 @@ snapshots:
       base64-js: 1.5.1
       ieee754: 1.2.1
 
-  bull@4.15.0:
+  bullmq@5.11.0:
     dependencies:
       cron-parser: 4.9.0
-      get-port: 5.1.1
       ioredis: 5.4.1
-      lodash: 4.17.21
       msgpackr: 1.10.2
+      node-abort-controller: 3.1.1
       semver: 7.6.2
-      uuid: 8.3.2
+      tslib: 2.6.3
+      uuid: 9.0.1
     transitivePeerDependencies:
       - supports-color
 
@@ -7532,8 +7519,6 @@ snapshots:
 
   get-package-type@0.1.0: {}
 
-  get-port@5.1.1: {}
-
   get-stream@5.2.0:
     dependencies:
       pump: 3.0.0
@@ -9433,6 +9418,8 @@ snapshots:
 
   sylvester@0.0.12: {}
 
+  systeminformation@5.22.11: {}
+
   tar-fs@3.0.5:
     dependencies:
       pump: 3.0.0
@@ -9605,8 +9592,6 @@ snapshots:
 
   uuid@10.0.0: {}
 
-  uuid@8.3.2: {}
-
   uuid@9.0.1: {}
 
   v8-compile-cache-lib@3.0.1: {}
@@ -217,7 +217,6 @@ describe("E2E Tests for API Routes", () => {
       expect(response.body.data).toHaveProperty('content');
       expect(response.body.data).toHaveProperty('metadata');
       expect(response.body.data.metadata.pageStatusCode).toBe(404);
-      expect(response.body.data.metadata.pageError.toLowerCase()).toContain("not found");
     }, 60000); // 60 seconds
 
     it.concurrent('should return a successful response for a scrape with 405 page', async () => {
@@ -233,7 +232,6 @@ describe("E2E Tests for API Routes", () => {
       expect(response.body.data).toHaveProperty('content');
       expect(response.body.data).toHaveProperty('metadata');
       expect(response.body.data.metadata.pageStatusCode).toBe(405);
-      expect(response.body.data.metadata.pageError.toLowerCase()).toContain("method not allowed");
     }, 60000); // 60 seconds
 
     it.concurrent('should return a successful response for a scrape with 500 page', async () => {
@@ -249,7 +247,6 @@ describe("E2E Tests for API Routes", () => {
       expect(response.body.data).toHaveProperty('content');
       expect(response.body.data).toHaveProperty('metadata');
       expect(response.body.data.metadata.pageStatusCode).toBe(500);
-      expect(response.body.data.metadata.pageError.toLowerCase()).toContain("internal server error");
     }, 60000); // 60 seconds
   });
 
@@ -1,8 +1,8 @@
 import { Request, Response } from "express";
 
-import { Job } from "bull";
+import { Job } from "bullmq";
 import { Logger } from "../../lib/logger";
-import { getWebScraperQueue } from "../../services/queue-service";
+import { getScrapeQueue } from "../../services/queue-service";
 import { checkAlerts } from "../../services/alerts";
 
 export async function cleanBefore24hCompleteJobsController(
@@ -11,13 +11,13 @@ export async function cleanBefore24hCompleteJobsController(
 ) {
   Logger.info("🐂 Cleaning jobs older than 24h");
   try {
-    const webScraperQueue = getWebScraperQueue();
+    const scrapeQueue = getScrapeQueue();
     const batchSize = 10;
     const numberOfBatches = 9; // Adjust based on your needs
     const completedJobsPromises: Promise<Job[]>[] = [];
     for (let i = 0; i < numberOfBatches; i++) {
       completedJobsPromises.push(
-        webScraperQueue.getJobs(
+        scrapeQueue.getJobs(
           ["completed"],
           i * batchSize,
           i * batchSize + batchSize,
@@ -68,10 +68,10 @@ export async function checkQueuesController(req: Request, res: Response) {
 // Use this as a "health check" that way we dont destroy the server
 export async function queuesController(req: Request, res: Response) {
   try {
-    const webScraperQueue = getWebScraperQueue();
+    const scrapeQueue = getScrapeQueue();
 
     const [webScraperActive] = await Promise.all([
-      webScraperQueue.getActiveCount(),
+      scrapeQueue.getActiveCount(),
     ]);
 
     const noActiveJobs = webScraperActive === 0;
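The admin controller above now pages through completed jobs with BullMQ's getJobs. Below is a sketch of the same batched-cleanup pattern against BullMQ's API, assuming a Queue instance is passed in; the 24-hour cutoff and batch sizes mirror the controller's intent rather than its exact code.

```ts
// Sketch of the batched "clean jobs older than 24h" pattern using BullMQ.
import { Queue, Job } from "bullmq";

async function removeCompletedOlderThan24h(scrapeQueue: Queue, batchSize = 10, batches = 9) {
  const cutoff = Date.now() - 24 * 60 * 60 * 1000;

  for (let i = 0; i < batches; i++) {
    // getJobs(types, start, end) pages through jobs in the given states.
    const jobs: Job[] = await scrapeQueue.getJobs(["completed"], i * batchSize, i * batchSize + batchSize);
    await Promise.all(
      jobs
        .filter((job) => job.finishedOn !== undefined && job.finishedOn < cutoff)
        .map((job) => job.remove())
    );
  }
}
```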
@@ -1,11 +1,9 @@
 import { Request, Response } from "express";
 import { authenticateUser } from "./auth";
 import { RateLimiterMode } from "../../src/types";
-import { addWebScraperJob } from "../../src/services/queue-jobs";
-import { getWebScraperQueue } from "../../src/services/queue-service";
 import { supabase_service } from "../../src/services/supabase";
-import { billTeam } from "../../src/services/billing/credit_billing";
 import { Logger } from "../../src/lib/logger";
+import { getCrawl, saveCrawl } from "../../src/lib/crawl-redis";
 
 export async function crawlCancelController(req: Request, res: Response) {
   try {
@@ -19,8 +17,9 @@ export async function crawlCancelController(req: Request, res: Response) {
     if (!success) {
       return res.status(status).json({ error });
     }
-    const job = await getWebScraperQueue().getJob(req.params.jobId);
-    if (!job) {
+    const sc = await getCrawl(req.params.jobId);
+    if (!sc) {
       return res.status(404).json({ error: "Job not found" });
     }
 
@@ -40,27 +39,13 @@ export async function crawlCancelController(req: Request, res: Response) {
       }
     }
 
-    const jobState = await job.getState();
-    const { partialDocs } = await job.progress();
-
-    if (partialDocs && partialDocs.length > 0 && jobState === "active") {
-      Logger.info("Billing team for partial docs...");
-      // Note: the credits that we will bill them here might be lower than the actual
-      // due to promises that are not yet resolved
-      await billTeam(team_id, partialDocs.length);
-    }
-
     try {
-      await getWebScraperQueue().client.del(job.lockKey());
-      await job.takeLock();
-      await job.discard();
-      await job.moveToFailed(Error("Job cancelled by user"), true);
+      sc.cancelled = true;
+      await saveCrawl(req.params.jobId, sc);
     } catch (error) {
       Logger.error(error);
     }
 
-    const newJobState = await job.getState();
-
     res.json({
       status: "cancelled"
     });
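Cancellation no longer force-fails a Bull job; it flips a cancelled flag on the crawl stored in Redis, and queued scrape jobs are expected to consult that flag. The worker-side check is not part of this excerpt; the sketch below only illustrates the idea, using the getCrawl helper introduced by this PR (the import path is an assumption).

```ts
// Hypothetical worker-side guard; the actual worker changes are not shown in this excerpt.
// Assumes the getCrawl helper from the new crawl-redis module (import path is a guess).
import { getCrawl } from "../lib/crawl-redis";

async function shouldSkipJob(crawlId?: string): Promise<boolean> {
  if (!crawlId) return false;
  const crawl = await getCrawl(crawlId);
  return crawl?.cancelled === true; // a cancelled crawl short-circuits its queued scrape jobs
}
```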
@@ -1,10 +1,10 @@
 import { Request, Response } from "express";
 import { authenticateUser } from "./auth";
 import { RateLimiterMode } from "../../src/types";
-import { addWebScraperJob } from "../../src/services/queue-jobs";
-import { getWebScraperQueue } from "../../src/services/queue-service";
-import { supabaseGetJobById } from "../../src/lib/supabase-jobs";
+import { getScrapeQueue } from "../../src/services/queue-service";
 import { Logger } from "../../src/lib/logger";
+import { getCrawl, getCrawlJobs } from "../../src/lib/crawl-redis";
+import { supabaseGetJobById } from "../../src/lib/supabase-jobs";
 
 export async function crawlStatusController(req: Request, res: Response) {
   try {
@@ -16,33 +16,42 @@ export async function crawlStatusController(req: Request, res: Response) {
     if (!success) {
       return res.status(status).json({ error });
     }
-    const job = await getWebScraperQueue().getJob(req.params.jobId);
-    if (!job) {
+    const sc = await getCrawl(req.params.jobId);
+    if (!sc) {
       return res.status(404).json({ error: "Job not found" });
     }
 
-    const { current, current_url, total, current_step, partialDocs } = await job.progress();
-
-    let data = job.returnvalue;
-    if (process.env.USE_DB_AUTHENTICATION === "true") {
-      const supabaseData = await supabaseGetJobById(req.params.jobId);
-
-      if (supabaseData) {
-        data = supabaseData.docs;
-      }
+    if (sc.team_id !== team_id) {
+      return res.status(403).json({ error: "Forbidden" });
     }
 
-    const jobStatus = await job.getState();
+    const jobIDs = await getCrawlJobs(req.params.jobId);
+
+    const jobs = (await Promise.all(jobIDs.map(async x => {
+      const job = await getScrapeQueue().getJob(x);
+
+      if (process.env.USE_DB_AUTHENTICATION === "true") {
+        const supabaseData = await supabaseGetJobById(job.id);
+
+        if (supabaseData) {
+          job.returnvalue = supabaseData.docs;
+        }
+      }
+
+      return job;
+    }))).sort((a, b) => a.timestamp - b.timestamp);
+    const jobStatuses = await Promise.all(jobs.map(x => x.getState()));
+    const jobStatus = sc.cancelled ? "failed" : jobStatuses.every(x => x === "completed") ? "completed" : jobStatuses.some(x => x === "failed") ? "failed" : "active";
+
+    const data = jobs.map(x => Array.isArray(x.returnvalue) ? x.returnvalue[0] : x.returnvalue);
 
     res.json({
       status: jobStatus,
-      // progress: job.progress(),
-      current,
-      current_url,
-      current_step,
-      total,
-      data: data ? data : null,
-      partial_data: jobStatus == 'completed' ? [] : partialDocs,
+      current: jobStatuses.filter(x => x === "completed" || x === "failed").length,
+      total: jobs.length,
+      data: jobStatus === "completed" ? data : null,
+      partial_data: jobStatus === "completed" ? [] : data.filter(x => x !== null),
     });
   } catch (error) {
     Logger.error(error);
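The crawl's status is now derived from the states of its individual scrape jobs rather than from a single Bull job. The aggregation rule used above, written out as a small pure function for clarity:

```ts
// The aggregation rule from the controller above, factored out as a pure function.
// "failed" wins if the crawl was cancelled or any job failed; "completed" only when every job finished.
type JobState = "completed" | "failed" | "active" | "waiting" | "delayed" | string;

function aggregateCrawlStatus(cancelled: boolean | undefined, states: JobState[]): "completed" | "failed" | "active" {
  if (cancelled) return "failed";
  if (states.every((s) => s === "completed")) return "completed";
  if (states.some((s) => s === "failed")) return "failed";
  return "active";
}

// e.g. aggregateCrawlStatus(false, ["completed", "active"]) === "active"
```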
@@ -1,10 +1,8 @@
 import { Request, Response } from "express";
-import { WebScraperDataProvider } from "../../src/scraper/WebScraper";
-import { billTeam } from "../../src/services/billing/credit_billing";
 import { checkTeamCredits } from "../../src/services/billing/credit_billing";
 import { authenticateUser } from "./auth";
 import { RateLimiterMode } from "../../src/types";
-import { addWebScraperJob } from "../../src/services/queue-jobs";
+import { addScrapeJob } from "../../src/services/queue-jobs";
 import { isUrlBlocked } from "../../src/scraper/WebScraper/utils/blocklist";
 import { logCrawl } from "../../src/services/logging/crawl_log";
 import { validateIdempotencyKey } from "../../src/services/idempotency/validate";
@@ -12,6 +10,9 @@ import { createIdempotencyKey } from "../../src/services/idempotency/create";
 import { defaultCrawlPageOptions, defaultCrawlerOptions, defaultOrigin } from "../../src/lib/default-values";
 import { v4 as uuidv4 } from "uuid";
 import { Logger } from "../../src/lib/logger";
+import { addCrawlJob, addCrawlJobs, crawlToCrawler, lockURL, lockURLs, saveCrawl, StoredCrawl } from "../../src/lib/crawl-redis";
+import { getScrapeQueue } from "../../src/services/queue-service";
+import { checkAndUpdateURL } from "../../src/lib/validateUrl";
 
 export async function crawlController(req: Request, res: Response) {
   try {
@@ -43,10 +44,17 @@ export async function crawlController(req: Request, res: Response) {
       return res.status(402).json({ error: "Insufficient credits" });
     }
 
-    const url = req.body.url;
+    let url = req.body.url;
     if (!url) {
       return res.status(400).json({ error: "Url is required" });
     }
+    try {
+      url = checkAndUpdateURL(url).url;
+    } catch (e) {
+      return res
+        .status(e instanceof Error && e.message === "Invalid URL" ? 400 : 500)
+        .json({ error: e.message ?? e });
+    }
 
     if (isUrlBlocked(url)) {
       return res
@@ -62,47 +70,100 @@ export async function crawlController(req: Request, res: Response) {
     const crawlerOptions = { ...defaultCrawlerOptions, ...req.body.crawlerOptions };
     const pageOptions = { ...defaultCrawlPageOptions, ...req.body.pageOptions };
 
-    if (mode === "single_urls" && !url.includes(",")) { // NOTE: do we need this?
-      try {
-        const a = new WebScraperDataProvider();
-        await a.setOptions({
-          jobId: uuidv4(),
-          mode: "single_urls",
-          urls: [url],
-          crawlerOptions: { ...crawlerOptions, returnOnlyUrls: true },
-          pageOptions: pageOptions,
-        });
-
-        const docs = await a.getDocuments(false, (progress) => {
-          job.progress({
-            current: progress.current,
-            total: progress.total,
-            current_step: "SCRAPING",
-            current_url: progress.currentDocumentUrl,
-          });
-        });
-        return res.json({
-          success: true,
-          documents: docs,
-        });
-      } catch (error) {
-        Logger.error(error);
-        return res.status(500).json({ error: error.message });
-      }
-    }
-
-    const job = await addWebScraperJob({
-      url: url,
-      mode: mode ?? "crawl", // fix for single urls not working
-      crawlerOptions: crawlerOptions,
-      team_id: team_id,
-      pageOptions: pageOptions,
-      origin: req.body.origin ?? defaultOrigin,
-    });
-
-    await logCrawl(job.id.toString(), team_id);
-
-    res.json({ jobId: job.id });
+    // if (mode === "single_urls" && !url.includes(",")) { // NOTE: do we need this?
+    //   try {
+    //     const a = new WebScraperDataProvider();
+    //     await a.setOptions({
+    //       jobId: uuidv4(),
+    //       mode: "single_urls",
+    //       urls: [url],
+    //       crawlerOptions: { ...crawlerOptions, returnOnlyUrls: true },
+    //       pageOptions: pageOptions,
+    //     });
+
+    //     const docs = await a.getDocuments(false, (progress) => {
+    //       job.updateProgress({
+    //         current: progress.current,
+    //         total: progress.total,
+    //         current_step: "SCRAPING",
+    //         current_url: progress.currentDocumentUrl,
+    //       });
+    //     });
+    //     return res.json({
+    //       success: true,
+    //       documents: docs,
+    //     });
+    //   } catch (error) {
+    //     Logger.error(error);
+    //     return res.status(500).json({ error: error.message });
+    //   }
+    // }
+
+    const id = uuidv4();
+
+    await logCrawl(id, team_id);
+
+    const sc: StoredCrawl = {
+      originUrl: url,
+      crawlerOptions,
+      pageOptions,
+      team_id,
+      createdAt: Date.now(),
+    };
+
+    const crawler = crawlToCrawler(id, sc);
+
+    try {
+      sc.robots = await crawler.getRobotsTxt();
+    } catch (_) {}
+
+    await saveCrawl(id, sc);
+
+    const sitemap = sc.crawlerOptions?.ignoreSitemap ? null : await crawler.tryGetSitemap();
+
+    if (sitemap !== null) {
+      const jobs = sitemap.map(x => {
+        const url = x.url;
+        const uuid = uuidv4();
+        return {
+          name: uuid,
+          data: {
+            url,
+            mode: "single_urls",
+            crawlerOptions: crawlerOptions,
+            team_id: team_id,
+            pageOptions: pageOptions,
+            origin: req.body.origin ?? defaultOrigin,
+            crawl_id: id,
+            sitemapped: true,
+          },
+          opts: {
+            jobId: uuid,
+            priority: 20,
+          }
+        };
+      })
+
+      await lockURLs(id, jobs.map(x => x.data.url));
+      await addCrawlJobs(id, jobs.map(x => x.opts.jobId));
+      await getScrapeQueue().addBulk(jobs);
+    } else {
+      await lockURL(id, sc, url);
+      const job = await addScrapeJob({
+        url,
+        mode: "single_urls",
+        crawlerOptions: crawlerOptions,
+        team_id: team_id,
+        pageOptions: pageOptions,
+        origin: req.body.origin ?? defaultOrigin,
+        crawl_id: id,
+      }, {
+        priority: 15, // prioritize request 0 of crawl jobs same as scrape jobs
+      });
+      await addCrawlJob(id, job.id);
+    }
+
+    res.json({ jobId: id });
   } catch (error) {
     Logger.error(error);
     return res.status(500).json({ error: error.message });
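When a sitemap is found, the controller fans out one scrape job per URL with addBulk, pre-locking the URLs and registering the generated job IDs against the crawl. The sketch below shows the bulk payload shape BullMQ expects; the helper name and its inputs are hypothetical, but the data fields mirror the PR's job payload.

```ts
// Shape of the bulk payload handed to getScrapeQueue().addBulk above.
// BullMQ's Queue#addBulk takes { name, data, opts } entries; setting opts.jobId lets
// the crawl record reference each job by a known UUID.
import { Queue } from "bullmq";
import { randomUUID } from "node:crypto";

// crawlId, teamId, and urls are hypothetical inputs for illustration.
async function enqueueSitemapUrls(queue: Queue, crawlId: string, teamId: string, urls: string[]) {
  const entries = urls.map((url) => {
    const jobId = randomUUID();
    return {
      name: jobId,
      data: {
        url,
        mode: "single_urls",
        crawlerOptions: {},
        team_id: teamId,
        pageOptions: {},
        origin: "api",
        crawl_id: crawlId,
        sitemapped: true,
      },
      opts: { jobId, priority: 20 }, // known jobId lets the crawl set reference the job
    };
  });
  return queue.addBulk(entries);
}
```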
@@ -1,44 +1,125 @@
 import { Request, Response } from "express";
 import { authenticateUser } from "./auth";
 import { RateLimiterMode } from "../../src/types";
-import { addWebScraperJob } from "../../src/services/queue-jobs";
 import { isUrlBlocked } from "../../src/scraper/WebScraper/utils/blocklist";
+import { v4 as uuidv4 } from "uuid";
 import { Logger } from "../../src/lib/logger";
+import { addCrawlJob, crawlToCrawler, lockURL, saveCrawl, StoredCrawl } from "../../src/lib/crawl-redis";
+import { addScrapeJob } from "../../src/services/queue-jobs";
 
 export async function crawlPreviewController(req: Request, res: Response) {
   try {
-    const { success, team_id, error, status } = await authenticateUser(
+    const { success, error, status } = await authenticateUser(
       req,
       res,
       RateLimiterMode.Preview
     );
+
+    const team_id = "preview";
+
     if (!success) {
       return res.status(status).json({ error });
     }
-    // authenticate on supabase
+
     const url = req.body.url;
     if (!url) {
       return res.status(400).json({ error: "Url is required" });
     }
 
     if (isUrlBlocked(url)) {
-      return res.status(403).json({ error: "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." });
+      return res
+        .status(403)
+        .json({
+          error:
+            "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
+        });
     }
 
-    const mode = req.body.mode ?? "crawl";
     const crawlerOptions = req.body.crawlerOptions ?? {};
     const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false, removeTags: [] };
 
-    const job = await addWebScraperJob({
-      url: url,
-      mode: mode ?? "crawl", // fix for single urls not working
-      crawlerOptions: { ...crawlerOptions, limit: 5, maxCrawledLinks: 5 },
-      team_id: "preview",
-      pageOptions: pageOptions,
-      origin: "website-preview",
-    });
-
-    res.json({ jobId: job.id });
+    // if (mode === "single_urls" && !url.includes(",")) { // NOTE: do we need this?
+    //   try {
+    //     const a = new WebScraperDataProvider();
+    //     await a.setOptions({
+    //       jobId: uuidv4(),
+    //       mode: "single_urls",
+    //       urls: [url],
+    //       crawlerOptions: { ...crawlerOptions, returnOnlyUrls: true },
+    //       pageOptions: pageOptions,
+    //     });
+
+    //     const docs = await a.getDocuments(false, (progress) => {
+    //       job.updateProgress({
+    //         current: progress.current,
+    //         total: progress.total,
+    //         current_step: "SCRAPING",
+    //         current_url: progress.currentDocumentUrl,
+    //       });
+    //     });
+    //     return res.json({
+    //       success: true,
+    //       documents: docs,
+    //     });
+    //   } catch (error) {
+    //     Logger.error(error);
+    //     return res.status(500).json({ error: error.message });
+    //   }
+    // }
+
+    const id = uuidv4();
+
+    let robots;
+
+    try {
+      robots = await this.getRobotsTxt();
+    } catch (_) {}
+
+    const sc: StoredCrawl = {
+      originUrl: url,
+      crawlerOptions,
+      pageOptions,
+      team_id,
+      robots,
+      createdAt: Date.now(),
+    };
+
+    await saveCrawl(id, sc);
+
+    const crawler = crawlToCrawler(id, sc);
+
+    const sitemap = sc.crawlerOptions?.ignoreSitemap ? null : await crawler.tryGetSitemap();
+
+    if (sitemap !== null) {
+      for (const url of sitemap.map(x => x.url)) {
+        await lockURL(id, sc, url);
+        const job = await addScrapeJob({
+          url,
+          mode: "single_urls",
+          crawlerOptions: crawlerOptions,
+          team_id: team_id,
+          pageOptions: pageOptions,
+          origin: "website-preview",
+          crawl_id: id,
+          sitemapped: true,
+        });
+        await addCrawlJob(id, job.id);
+      }
+    } else {
+      await lockURL(id, sc, url);
+      const job = await addScrapeJob({
+        url,
+        mode: "single_urls",
+        crawlerOptions: crawlerOptions,
+        team_id: team_id,
+        pageOptions: pageOptions,
+        origin: "website-preview",
+        crawl_id: id,
+      });
+      await addCrawlJob(id, job.id);
+    }
+
+    res.json({ jobId: id });
   } catch (error) {
     Logger.error(error);
     return res.status(500).json({ error: error.message });
@@ -1,6 +1,5 @@
 import { ExtractorOptions, PageOptions } from './../lib/entities';
 import { Request, Response } from "express";
-import { WebScraperDataProvider } from "../scraper/WebScraper";
 import { billTeam, checkTeamCredits } from "../services/billing/credit_billing";
 import { authenticateUser } from "./auth";
 import { RateLimiterMode } from "../types";
@@ -9,6 +8,8 @@ import { Document } from "../lib/entities";
 import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function
 import { numTokensFromString } from '../lib/LLM-extraction/helpers';
 import { defaultPageOptions, defaultExtractorOptions, defaultTimeout, defaultOrigin } from '../lib/default-values';
+import { addScrapeJob } from '../services/queue-jobs';
+import { scrapeQueueEvents } from '../services/queue-service';
 import { v4 as uuidv4 } from "uuid";
 import { Logger } from '../lib/logger';
 
@@ -36,50 +37,49 @@ export async function scrapeHelper(
     return { success: false, error: "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.", returnCode: 403 };
   }
 
-  const a = new WebScraperDataProvider();
-  await a.setOptions({
-    jobId,
-    mode: "single_urls",
-    urls: [url],
-    crawlerOptions: {
-      ...crawlerOptions,
-    },
-    pageOptions: pageOptions,
-    extractorOptions: extractorOptions,
-  });
+  const job = await addScrapeJob({
+    url,
+    mode: "single_urls",
+    crawlerOptions,
+    team_id,
+    pageOptions,
+    extractorOptions,
+    origin: req.body.origin ?? defaultOrigin,
+  }, {}, jobId);
 
-  const timeoutPromise = new Promise<{ success: boolean; error?: string; returnCode: number }>((_, reject) =>
-    setTimeout(() => reject({ success: false, error: "Request timed out. Increase the timeout by passing `timeout` param to the request.", returnCode: 408 }), timeout)
-  );
-
-  const docsPromise = a.getDocuments(false);
-
-  let docs;
+  let doc;
   try {
-    docs = await Promise.race([docsPromise, timeoutPromise]);
-  } catch (error) {
-    return error;
+    doc = (await job.waitUntilFinished(scrapeQueueEvents, timeout))[0]; //60 seconds timeout
+  } catch (e) {
+    if (e instanceof Error && e.message.startsWith("Job wait")) {
+      return {
+        success: false,
+        error: "Request timed out",
+        returnCode: 408,
+      }
+    } else {
+      throw e;
+    }
   }
 
-  // make sure doc.content is not empty
-  let filteredDocs = docs.filter(
-    (doc: { content?: string }) => doc.content && doc.content.trim().length > 0
-  );
-  if (filteredDocs.length === 0) {
-    return { success: true, error: "No page found", returnCode: 200, data: docs[0] };
+  await job.remove();
+
+  if (!doc) {
+    console.error("!!! PANIC DOC IS", doc, job);
+    return { success: true, error: "No page found", returnCode: 200, data: doc };
   }
 
+  delete doc.index;
+  delete doc.provider;
+
   // Remove rawHtml if pageOptions.rawHtml is false and extractorOptions.mode is llm-extraction-from-raw-html
   if (!pageOptions.includeRawHtml && extractorOptions.mode == "llm-extraction-from-raw-html") {
-    filteredDocs.forEach(doc => {
-      delete doc.rawHtml;
-    });
+    delete doc.rawHtml;
   }
 
   return {
     success: true,
-    data: filteredDocs[0],
+    data: doc,
     returnCode: 200,
   };
 }
@@ -143,7 +143,7 @@ export async function scrapeController(req: Request, res: Response) {
   const numTokens = (result.data && result.data.markdown) ? numTokensFromString(result.data.markdown, "gpt-3.5-turbo") : 0;
 
   if (result.success) {
-    let creditsToBeBilled = 1; // Assuming 1 credit per document
+    let creditsToBeBilled = 0; // billing for doc done on queue end
    const creditsPerLLMExtract = 50;
 
     if (extractorOptions.mode.includes("llm-extraction")) {
@@ -9,6 +9,7 @@ import { search } from "../search";
 import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist";
 import { v4 as uuidv4 } from "uuid";
 import { Logger } from "../lib/logger";
+import { getScrapeQueue, scrapeQueueEvents } from "../services/queue-service";
 
 export async function searchHelper(
   jobId: string,
@@ -75,26 +76,28 @@ export async function searchHelper(
 
   // filter out social media links
 
-  const a = new WebScraperDataProvider();
-  await a.setOptions({
-    jobId,
-    mode: "single_urls",
-    urls: res.map((r) => r.url).slice(0, Math.min(searchOptions.limit ?? 5, 5)),
-    crawlerOptions: {
-      ...crawlerOptions,
-    },
-    pageOptions: {
-      ...pageOptions,
-      onlyMainContent: pageOptions?.onlyMainContent ?? true,
-      fetchPageContent: pageOptions?.fetchPageContent ?? true,
-      includeHtml: pageOptions?.includeHtml ?? false,
-      removeTags: pageOptions?.removeTags ?? [],
-      fallback: false,
-    },
-  });
-
-  const docs = await a.getDocuments(false);
+  const jobDatas = res.map(x => {
+    const url = x.url;
+    const uuid = uuidv4();
+    return {
+      name: uuid,
+      data: {
+        url,
+        mode: "single_urls",
+        crawlerOptions: crawlerOptions,
+        team_id: team_id,
+        pageOptions: pageOptions,
+      },
+      opts: {
+        jobId: uuid,
+        priority: 10,
+      }
+    };
+  })
+
+  const jobs = await getScrapeQueue().addBulk(jobDatas);
+
+  const docs = (await Promise.all(jobs.map(x => x.waitUntilFinished(scrapeQueueEvents, 60000)))).map(x => x[0]);
 
   if (docs.length === 0) {
     return { success: true, error: "No search results found", returnCode: 200 };
@@ -109,19 +112,6 @@ export async function searchHelper(
     return { success: true, error: "No page found", returnCode: 200, data: docs };
   }
 
-  const billingResult = await billTeam(
-    team_id,
-    filteredDocs.length
-  );
-  if (!billingResult.success) {
-    return {
-      success: false,
-      error:
-        "Failed to bill team. Insufficient credits or subscription not found.",
-      returnCode: 402,
-    };
-  }
-
   return {
     success: true,
     data: filteredDocs,
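searchHelper applies the same idea in bulk: one scrape job per search result, then a Promise.all over waitUntilFinished; the billTeam call disappears from this path because billing now happens when the queued job completes. A condensed sketch, with hypothetical names and the 60-second per-job wait taken from the diff:

```ts
// Fan-out/fan-in over the scrape queue, as used by searchHelper above (sketch only).
import { Queue, QueueEvents } from "bullmq";

async function scrapeAll(queue: Queue, events: QueueEvents, urls: string[]) {
  // Enqueue one job per URL in a single round trip.
  const jobs = await queue.addBulk(
    urls.map((url, i) => ({ name: `search-${i}`, data: { url, mode: "single_urls" } }))
  );
  // Wait for every job; each worker returns an array, so take the first document.
  const results = await Promise.all(jobs.map((j) => j.waitUntilFinished(events, 60_000)));
  return results.map((r) => (Array.isArray(r) ? r[0] : r));
}
```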
@@ -1,39 +1,51 @@
 import { Request, Response } from "express";
-import { getWebScraperQueue } from "../../src/services/queue-service";
-import { supabaseGetJobById } from "../../src/lib/supabase-jobs";
 import { Logger } from "../../src/lib/logger";
+import { getCrawl, getCrawlJobs } from "../../src/lib/crawl-redis";
+import { getScrapeQueue } from "../../src/services/queue-service";
+import { supabaseGetJobById } from "../../src/lib/supabase-jobs";
 
 export async function crawlJobStatusPreviewController(req: Request, res: Response) {
   try {
-    const job = await getWebScraperQueue().getJob(req.params.jobId);
-    if (!job) {
+    const sc = await getCrawl(req.params.jobId);
+    if (!sc) {
       return res.status(404).json({ error: "Job not found" });
     }
 
-    const { current, current_url, total, current_step, partialDocs } = await job.progress();
-    let data = job.returnvalue;
-    if (process.env.USE_DB_AUTHENTICATION === "true") {
-      const supabaseData = await supabaseGetJobById(req.params.jobId);
-
-      if (supabaseData) {
-        data = supabaseData.docs;
-      }
-    }
+    const jobIDs = await getCrawlJobs(req.params.jobId);
+
+    // let data = job.returnvalue;
+    // if (process.env.USE_DB_AUTHENTICATION === "true") {
+    //   const supabaseData = await supabaseGetJobById(req.params.jobId);
+
+    //   if (supabaseData) {
+    //     data = supabaseData.docs;
+    //   }
+    // }
+
+    const jobs = (await Promise.all(jobIDs.map(async x => {
+      const job = await getScrapeQueue().getJob(x);
+
+      if (process.env.USE_DB_AUTHENTICATION === "true") {
+        const supabaseData = await supabaseGetJobById(job.id);
+
+        if (supabaseData) {
+          job.returnvalue = supabaseData.docs;
+        }
+      }
+
+      return job;
+    }))).sort((a, b) => a.timestamp - b.timestamp);
 
-    let jobStatus = await job.getState();
-    if (jobStatus === 'waiting' || jobStatus === 'stuck') {
-      jobStatus = 'active';
-    }
+    const jobStatuses = await Promise.all(jobs.map(x => x.getState()));
+    const jobStatus = sc.cancelled ? "failed" : jobStatuses.every(x => x === "completed") ? "completed" : jobStatuses.some(x => x === "failed") ? "failed" : "active";
+
+    const data = jobs.map(x => Array.isArray(x.returnvalue) ? x.returnvalue[0] : x.returnvalue);
 
     res.json({
       status: jobStatus,
-      // progress: job.progress(),
-      current,
-      current_url,
-      current_step,
-      total,
-      data: data ? data : null,
-      partial_data: jobStatus == 'completed' ? [] : partialDocs,
+      current: jobStatuses.filter(x => x === "completed" || x === "failed").length,
+      total: jobs.length,
+      data: jobStatus === "completed" ? data : null,
+      partial_data: jobStatus === "completed" ? [] : data.filter(x => x !== null),
     });
   } catch (error) {
     Logger.error(error);
@@ -2,7 +2,7 @@ import express from "express";
 import bodyParser from "body-parser";
 import cors from "cors";
 import "dotenv/config";
-import { getWebScraperQueue } from "./services/queue-service";
+import { getScrapeQueue } from "./services/queue-service";
 import { v0Router } from "./routes/v0";
 import { initSDK } from "@hyperdx/node-opentelemetry";
 import cluster from "cluster";
@@ -58,7 +58,7 @@ if (cluster.isMaster) {
   serverAdapter.setBasePath(`/admin/${process.env.BULL_AUTH_KEY}/queues`);
 
   const { addQueue, removeQueue, setQueues, replaceQueues } = createBullBoard({
-    queues: [new BullAdapter(getWebScraperQueue())],
+    queues: [new BullAdapter(getScrapeQueue())],
     serverAdapter: serverAdapter,
   });
 
@@ -104,9 +104,9 @@ if (cluster.isMaster) {
 
   app.get(`/serverHealthCheck`, async (req, res) => {
     try {
-      const webScraperQueue = getWebScraperQueue();
+      const scrapeQueue = getScrapeQueue();
       const [waitingJobs] = await Promise.all([
-        webScraperQueue.getWaitingCount(),
+        scrapeQueue.getWaitingCount(),
       ]);
 
       const noWaitingJobs = waitingJobs === 0;
@@ -126,9 +126,9 @@ if (cluster.isMaster) {
     const timeout = 60000; // 1 minute // The timeout value for the check in milliseconds
 
     const getWaitingJobsCount = async () => {
-      const webScraperQueue = getWebScraperQueue();
+      const scrapeQueue = getScrapeQueue();
       const [waitingJobsCount] = await Promise.all([
-        webScraperQueue.getWaitingCount(),
+        scrapeQueue.getWaitingCount(),
       ]);
 
       return waitingJobsCount;
@@ -181,13 +181,12 @@ if (cluster.isMaster) {
   Logger.info(`Worker ${process.pid} started`);
 }
 
-const wsq = getWebScraperQueue();
-
-wsq.on("waiting", j => ScrapeEvents.logJobEvent(j, "waiting"));
-wsq.on("active", j => ScrapeEvents.logJobEvent(j, "active"));
-wsq.on("completed", j => ScrapeEvents.logJobEvent(j, "completed"));
-wsq.on("paused", j => ScrapeEvents.logJobEvent(j, "paused"));
-wsq.on("resumed", j => ScrapeEvents.logJobEvent(j, "resumed"));
-wsq.on("removed", j => ScrapeEvents.logJobEvent(j, "removed"));
+// const sq = getScrapeQueue();
+
+// sq.on("waiting", j => ScrapeEvents.logJobEvent(j, "waiting"));
+// sq.on("active", j => ScrapeEvents.logJobEvent(j, "active"));
+// sq.on("completed", j => ScrapeEvents.logJobEvent(j, "completed"));
+// sq.on("paused", j => ScrapeEvents.logJobEvent(j, "paused"));
+// sq.on("resumed", j => ScrapeEvents.logJobEvent(j, "resumed"));
+// sq.on("removed", j => ScrapeEvents.logJobEvent(j, "removed"));
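The old queue-level wsq.on(...) logging is only commented out here. In BullMQ, queue-wide lifecycle notifications come from a QueueEvents instance and carry a job ID rather than a Job object, so a re-wiring of that logging could look roughly like the sketch below; this is hypothetical and not part of the PR.

```ts
// Hypothetical QueueEvents-based replacement for the commented-out wsq.on(...) handlers.
// QueueEvents payloads contain a jobId (plus failedReason/returnvalue where relevant).
import { QueueEvents } from "bullmq";
import IORedis from "ioredis";

const connection = new IORedis(process.env.REDIS_URL ?? "redis://localhost:6379", {
  maxRetriesPerRequest: null,
});
const scrapeQueueEvents = new QueueEvents("scrapeQueue", { connection }); // queue name assumed

scrapeQueueEvents.on("waiting", ({ jobId }) => console.log(`job ${jobId} waiting`));
scrapeQueueEvents.on("active", ({ jobId }) => console.log(`job ${jobId} active`));
scrapeQueueEvents.on("completed", ({ jobId }) => console.log(`job ${jobId} completed`));
scrapeQueueEvents.on("failed", ({ jobId, failedReason }) => console.log(`job ${jobId} failed: ${failedReason}`));
```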
apps/api/src/lib/crawl-redis.ts (new file, 91 lines)
@@ -0,0 +1,91 @@
+import { WebCrawler } from "../scraper/WebScraper/crawler";
+import { redisConnection } from "../services/queue-service";
+
+export type StoredCrawl = {
+  originUrl: string;
+  crawlerOptions: any;
+  pageOptions: any;
+  team_id: string;
+  robots?: string;
+  cancelled?: boolean;
+  createdAt: number;
+};
+
+export async function saveCrawl(id: string, crawl: StoredCrawl) {
+  await redisConnection.set("crawl:" + id, JSON.stringify(crawl));
+  await redisConnection.expire("crawl:" + id, 24 * 60 * 60, "NX");
+}
+
+export async function getCrawl(id: string): Promise<StoredCrawl | null> {
+  const x = await redisConnection.get("crawl:" + id);
+
+  if (x === null) {
+    return null;
+  }
+
+  return JSON.parse(x);
+}
+
+export async function addCrawlJob(id: string, job_id: string) {
+  await redisConnection.sadd("crawl:" + id + ":jobs", job_id);
+  await redisConnection.expire("crawl:" + id + ":jobs", 24 * 60 * 60, "NX");
+}
+
+export async function addCrawlJobs(id: string, job_ids: string[]) {
+  await redisConnection.sadd("crawl:" + id + ":jobs", ...job_ids);
+  await redisConnection.expire("crawl:" + id + ":jobs", 24 * 60 * 60, "NX");
+}
+
+export async function addCrawlJobDone(id: string, job_id: string) {
+  await redisConnection.sadd("crawl:" + id + ":jobs_done", job_id);
+  await redisConnection.expire("crawl:" + id + ":jobs_done", 24 * 60 * 60, "NX");
+}
+
+export async function isCrawlFinished(id: string) {
+  return (await redisConnection.scard("crawl:" + id + ":jobs_done")) === (await redisConnection.scard("crawl:" + id + ":jobs"));
+}
+
+export async function getCrawlJobs(id: string): Promise<string[]> {
+  return await redisConnection.smembers("crawl:" + id + ":jobs");
+}
+
+export async function lockURL(id: string, sc: StoredCrawl, url: string): Promise<boolean> {
+  if (typeof sc.crawlerOptions?.limit === "number") {
+    if (await redisConnection.scard("crawl:" + id + ":visited") >= sc.crawlerOptions.limit) {
+      return false;
+    }
+  }
+  const res = (await redisConnection.sadd("crawl:" + id + ":visited", url)) !== 0
+  await redisConnection.expire("crawl:" + id + ":visited", 24 * 60 * 60, "NX");
+  return res;
+}
+
+/// NOTE: does not check limit. only use if limit is checked beforehand e.g. with sitemap
+export async function lockURLs(id: string, urls: string[]): Promise<boolean> {
+  const res = (await redisConnection.sadd("crawl:" + id + ":visited", ...urls)) !== 0
+  await redisConnection.expire("crawl:" + id + ":visited", 24 * 60 * 60, "NX");
+  return res;
+}
+
+export function crawlToCrawler(id: string, sc: StoredCrawl): WebCrawler {
+  const crawler = new WebCrawler({
+    jobId: id,
+    initialUrl: sc.originUrl,
+    includes: sc.crawlerOptions?.includes ?? [],
+    excludes: sc.crawlerOptions?.excludes ?? [],
+    maxCrawledLinks: sc.crawlerOptions?.maxCrawledLinks ?? 1000,
+    maxCrawledDepth: sc.crawlerOptions?.maxDepth ?? 10,
+    limit: sc.crawlerOptions?.limit ?? 10000,
+    generateImgAltText: sc.crawlerOptions?.generateImgAltText ?? false,
+    allowBackwardCrawling: sc.crawlerOptions?.allowBackwardCrawling ?? false,
+    allowExternalContentLinks: sc.crawlerOptions?.allowExternalContentLinks ?? false,
+  });
+
+  if (sc.robots !== undefined) {
+    try {
+      crawler.importRobotsTxt(sc.robots);
+    } catch (_) {}
+  }
+
+  return crawler;
+}
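The new module keeps all per-crawl state in plain Redis keys: a JSON blob under crawl:<id>, sets of job IDs and finished job IDs, and a visited-URL set used for dedup, each with a 24-hour TTL. A usage sketch of that lifecycle, with hypothetical IDs and URLs:

```ts
// Lifecycle sketch for the crawl-redis helpers defined above (illustrative usage only).
// Keys per crawl id: "crawl:<id>" (JSON), "crawl:<id>:jobs", "crawl:<id>:jobs_done",
// and "crawl:<id>:visited"; each expires after 24h.
import { randomUUID } from "node:crypto";
import { saveCrawl, getCrawl, addCrawlJob, addCrawlJobDone, isCrawlFinished, lockURL, StoredCrawl } from "./crawl-redis";

async function exampleCrawlLifecycle() {
  const id = randomUUID();
  const sc: StoredCrawl = {
    originUrl: "https://example.com",
    crawlerOptions: { limit: 100 },
    pageOptions: {},
    team_id: "team_123", // hypothetical team id
    createdAt: Date.now(),
  };

  await saveCrawl(id, sc);

  // lockURL returns false if the URL was already visited or the crawl limit is hit,
  // so each URL is only enqueued once.
  if (await lockURL(id, sc, "https://example.com/page")) {
    const jobId = randomUUID();
    await addCrawlJob(id, jobId);
    // ...enqueue the scrape job with this jobId, then on completion:
    await addCrawlJobDone(id, jobId);
  }

  console.log(await isCrawlFinished(id)); // true once jobs_done catches up with jobs
  console.log(await getCrawl(id));        // the stored crawl, or null after expiry
}
```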
@@ -65,6 +65,7 @@ export type WebScraperOptions = {
   extractorOptions?: ExtractorOptions;
   concurrentRequests?: number;
   bullJobId?: string;
+  priority?: number;
 };
 
 export interface DocumentUrl {
@ -1,4 +1,4 @@
|
|||||||
import { Job, JobId } from "bull";
|
import { Job } from "bullmq";
|
||||||
import type { baseScrapers } from "../scraper/WebScraper/single_url";
|
import type { baseScrapers } from "../scraper/WebScraper/single_url";
|
||||||
import { supabase_service as supabase } from "../services/supabase";
|
import { supabase_service as supabase } from "../services/supabase";
|
||||||
import { Logger } from "./logger";
|
import { Logger } from "./logger";
|
||||||
@ -70,7 +70,7 @@ export class ScrapeEvents {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static async logJobEvent(job: Job | JobId, event: ScrapeQueueEvent["event"]) {
|
static async logJobEvent(job: Job | any, event: ScrapeQueueEvent["event"]) {
|
||||||
try {
|
try {
|
||||||
await this.insert(((job as any).id ? (job as any).id : job) as string, {
|
await this.insert(((job as any).id ? (job as any).id : job) as string, {
|
||||||
type: "queue",
|
type: "queue",
|
||||||
|
38
apps/api/src/lib/validateUrl.ts
Normal file
38
apps/api/src/lib/validateUrl.ts
Normal file
@ -0,0 +1,38 @@
|
|||||||
|
|
||||||
|
const protocolIncluded = (url: string) => {
|
||||||
|
// if :// not in the start of the url assume http (maybe https?)
|
||||||
|
// regex checks if :// appears before any .
|
||||||
|
return(/^([^.:]+:\/\/)/.test(url));
|
||||||
|
}
|
||||||
|
|
||||||
|
const getURLobj = (s: string) => {
|
||||||
|
// URL fails if we dont include the protocol ie google.com
|
||||||
|
let error = false;
|
||||||
|
let urlObj = {};
|
||||||
|
try {
|
||||||
|
urlObj = new URL(s);
|
||||||
|
} catch (err) {
|
||||||
|
error = true;
|
||||||
|
}
|
||||||
|
return { error, urlObj };
|
||||||
|
};
|
||||||
|
|
||||||
|
export const checkAndUpdateURL = (url: string) => {
|
||||||
|
|
||||||
|
if (!protocolIncluded(url)) {
|
||||||
|
url = `http://${url}`;
|
||||||
|
}
|
||||||
|
|
||||||
|
const { error, urlObj } = getURLobj(url);
|
||||||
|
if (error) {
|
||||||
|
throw new Error("Invalid URL");
|
||||||
|
}
|
||||||
|
|
||||||
|
const typedUrlObj = urlObj as URL;
|
||||||
|
|
||||||
|
if(typedUrlObj.protocol !== "http:" && typedUrlObj.protocol !== "https:") {
|
||||||
|
throw new Error("Invalid URL");
|
||||||
|
}
|
||||||
|
|
||||||
|
return { urlObj: typedUrlObj, url: url };
|
||||||
|
}
|
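checkAndUpdateURL both normalizes and validates: it prepends http:// when no protocol is present and rejects anything that is not http(s). A small illustrative call; the surrounding wrapper is invented for the example:

import { checkAndUpdateURL } from "./validateUrl"; // illustrative path

// Hypothetical wrapper a request handler might use before enqueueing a scrape.
function normalizeIncomingUrl(raw: string): string {
  const { url } = checkAndUpdateURL(raw); // throws Error("Invalid URL") for e.g. ftp:// or javascript:
  return url;                             // "example.com" becomes "http://example.com"
}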
@@ -1,4 +1,4 @@
-import { Job } from "bull";
+import { Job } from "bullmq";
 import {
   CrawlResult,
   WebScraperOptions,
@@ -12,17 +12,21 @@ import { Document } from "../lib/entities";
 import { supabase_service } from "../services/supabase";
 import { Logger } from "../lib/logger";
 import { ScrapeEvents } from "../lib/scrape-events";
+import { getScrapeQueue } from "../services/queue-service";

 export async function startWebScraperPipeline({
   job,
+  token,
 }: {
   job: Job<WebScraperOptions>;
+  token: string;
 }) {
   let partialDocs: Document[] = [];
   return (await runWebScraper({
     url: job.data.url,
     mode: job.data.mode,
     crawlerOptions: job.data.crawlerOptions,
+    extractorOptions: job.data.extractorOptions,
     pageOptions: job.data.pageOptions,
     inProgress: (progress) => {
       Logger.debug(`🐂 Job in progress ${job.id}`);
@@ -31,20 +35,21 @@ export async function startWebScraperPipeline({
         if (partialDocs.length > 50) {
           partialDocs = partialDocs.slice(-50);
         }
-        job.progress({ ...progress, partialDocs: partialDocs });
+        // job.updateProgress({ ...progress, partialDocs: partialDocs });
       }
     },
-    onSuccess: (result) => {
+    onSuccess: (result, mode) => {
       Logger.debug(`🐂 Job completed ${job.id}`);
-      saveJob(job, result);
+      saveJob(job, result, token, mode);
     },
     onError: (error) => {
       Logger.error(`🐂 Job failed ${job.id}`);
       ScrapeEvents.logJobEvent(job, "failed");
-      job.moveToFailed(error);
+      job.moveToFailed(error, token, false);
     },
     team_id: job.data.team_id,
     bull_job_id: job.id.toString(),
+    priority: job.opts.priority,
   })) as { success: boolean; message: string; docs: Document[] };
 }
 export async function runWebScraper({
@@ -52,11 +57,13 @@ export async function runWebScraper({
   mode,
   crawlerOptions,
   pageOptions,
+  extractorOptions,
   inProgress,
   onSuccess,
   onError,
   team_id,
   bull_job_id,
+  priority,
 }: RunWebScraperParams): Promise<RunWebScraperResult> {
   try {
     const provider = new WebScraperDataProvider();
@@ -65,17 +72,21 @@ export async function runWebScraper({
       jobId: bull_job_id,
       mode: mode,
       urls: [url],
+      extractorOptions,
       crawlerOptions: crawlerOptions,
       pageOptions: pageOptions,
       bullJobId: bull_job_id,
+      priority,
     });
   } else {
     await provider.setOptions({
       jobId: bull_job_id,
       mode: mode,
       urls: url.split(","),
+      extractorOptions,
       crawlerOptions: crawlerOptions,
       pageOptions: pageOptions,
+      priority,
     });
   }
   const docs = (await provider.getDocuments(false, (progress: Progress) => {
@@ -97,8 +108,8 @@ export async function runWebScraper({
         return { url: doc.metadata.sourceURL };
       }
     })
-    : docs.filter((doc) => doc.content.trim().length > 0);
+    : docs;

   const billingResult = await billTeam(team_id, filteredDocs.length);

   if (!billingResult.success) {
@@ -111,7 +122,7 @@ export async function runWebScraper({
   }

   // This is where the returnvalue from the job is set
-  onSuccess(filteredDocs);
+  onSuccess(filteredDocs, mode);

   // this return doesn't matter too much for the job completion result
   return { success: true, message: "", docs: filteredDocs };
@@ -121,7 +132,7 @@ export async function runWebScraper({
   }
 }

-const saveJob = async (job: Job, result: any) => {
+const saveJob = async (job: Job, result: any, token: string, mode: string) => {
   try {
     if (process.env.USE_DB_AUTHENTICATION === "true") {
       const { data, error } = await supabase_service
@@ -130,17 +141,21 @@ const saveJob = async (job: Job, result: any) => {
         .eq("job_id", job.id);

       if (error) throw new Error(error.message);
-      try {
-        await job.moveToCompleted(null, false, false);
-      } catch (error) {
-        // I think the job won't exist here anymore
-      }
-    } else {
-      try {
-        await job.moveToCompleted(result, false, false);
-      } catch (error) {
-        // I think the job won't exist here anymore
-      }
+      // try {
+      //   if (mode === "crawl") {
+      //     await job.moveToCompleted(null, token, false);
+      //   } else {
+      //     await job.moveToCompleted(result, token, false);
+      //   }
+      // } catch (error) {
+      //   // I think the job won't exist here anymore
+      // }
+    // } else {
+    //   try {
+    //     await job.moveToCompleted(result, token, false);
+    //   } catch (error) {
+    //     // I think the job won't exist here anymore
+    //   }
     }
     ScrapeEvents.logJobEvent(job, "completed");
   } catch (error) {
apps/api/src/run-req.ts (new file, 175 lines)
@@ -0,0 +1,175 @@
import axios from "axios";
import { promises as fs } from "fs";
import { v4 as uuidV4 } from "uuid";

interface Result {
  start_url: string;
  job_id?: string;
  idempotency_key?: string;
  result_data_jsonb?: any;
}

async function sendCrawl(result: Result): Promise<string | undefined> {
  const idempotencyKey = uuidV4();
  const url = result.start_url;
  try {
    const response = await axios.post(
      "https://staging-firecrawl-scraper-js.fly.dev/v0/crawl",
      {
        url: url,
        crawlerOptions: {
          limit: 75,
        },
        pageOptions: {
          includeHtml: true,
          replaceAllPathsWithAbsolutePaths: true,
          waitFor: 1000,
        },
      },
      {
        headers: {
          "Content-Type": "application/json",
          Authorization: `Bearer `,
        },
      }
    );
    result.idempotency_key = idempotencyKey;
    return response.data.jobId;
  } catch (error) {
    console.error("Error sending crawl:", error);
    return undefined;
  }
}

async function getContent(result: Result): Promise<boolean> {
  let attempts = 0;
  while (attempts < 120) {
    // Reduce the number of attempts to speed up
    try {
      const response = await axios.get(
        `https://staging-firecrawl-scraper-js.fly.dev/v0/crawl/status/${result.job_id}`,
        {
          headers: {
            "Content-Type": "application/json",
            Authorization: `Bearer `,
          },
        }
      );
      if (response.data.status === "completed") {
        result.result_data_jsonb = response.data.data;
        // Job actually completed
        return true;
      }
    } catch (error) {
      console.error("Error getting content:", error);
    }
    const randomSleep = Math.floor(Math.random() * 15000) + 5000;
    await new Promise((resolve) => setTimeout(resolve, randomSleep)); // Reduce sleep time to 1.5 seconds
    attempts++;
  }
  // Set result as null if timed out
  result.result_data_jsonb = null;
  return false;
}

async function processResults(results: Result[]): Promise<void> {
  let processedCount = 0;
  let starterCount = 0;
  const queue: Result[] = [];
  const processedUrls = new Set<string>();

  // Initialize the queue with the first 1000 results
  for (let i = 0; i < Math.min(100, results.length); i++) {
    queue.push(results[i]);
    processedUrls.add(results[i].start_url);
  }

  // Function to process a single result
  const processSingleResult = async (result: Result) => {
    const jobId = await sendCrawl(result);
    if (jobId) {
      console.log(`Job requested count: ${starterCount}`);
      starterCount++;
      result.job_id = jobId;
      processedCount++;
      // Save the result to the file
      try {
        // Save job id along with the start_url
        const resultWithJobId = results.map(r => ({
          start_url: r.start_url,
          job_id: r.job_id,
        }));
        await fs.writeFile(
          "results_with_job_id_4000_6000.json",
          JSON.stringify(resultWithJobId, null, 4)
        );
      } catch (error) {
        console.error("Error writing to results_with_content.json:", error);
      }

      // Add a new result to the queue if there are more results to process
      // if (processedCount < results.length) {
      //   for (let i = queue.length; i < results.length; i++) {
      //     if (!processedUrls.has(results[i].start_url)) {
      //       const nextResult = results[i];
      //       console.log("Next result:", nextResult.start_url);
      //       queue.push(nextResult);
      //       processedUrls.add(nextResult.start_url);
      //       console.log(`Queue length: ${queue.length}`);
      //       processSingleResult(nextResult);
      //       break;
      //     }
      //   }
      // }
    }
  };

  // Start processing the initial queue concurrently
  // for (let i = 0; i < queue.length; i++) {
  //   processSingleResult(queue[i]);
  //   if ((i + 1) % 500 === 0) {
  //     console.log(`Processed ${i + 1} results, waiting for 1 minute before adding the next batch...`);
  //     await new Promise(resolve => setTimeout(resolve, 60 * 1000)); // Wait for 1 minute
  //   }
  // }
  // Start processing the initial queue concurrently
  // await Promise.all(queue.map(result => processSingleResult(result)));
  for (let i = 0; i < results.length; i += 100) {
    const batch = results.slice(i, i + 100);
    Promise.all(batch.map((result) => processSingleResult(result)))
      .then(() => {
        console.log(`Processed ${i + 100} results.`);
      })
      .catch((error) => {
        console.error(`Error processing batch starting at index ${i}:`, error);
      });
    await new Promise((resolve) => setTimeout(resolve, 60 * 1000)); // Wait for 1 minute
  }
}

// Example call

async function getStartUrls(): Promise<Result[]> {
  try {
    const data = await fs.readFile("starturls.json", "utf-8");
    return JSON.parse(data);
  } catch (error) {
    console.error("Error reading starturls.json:", error);
    return [];
  }
}

async function main() {
  const results: Result[] = (await getStartUrls()).slice(3999, 6000);
  // console.log(results.map((r) => r.start_url).slice(0, 3));

  processResults(results)
    .then(() => {
      console.log("All results processed.");
    })
    .catch((error) => {
      console.error("Error processing results:", error);
    });
}

main();
@@ -24,7 +24,7 @@ describe('scrapSingleUrl', () => {
 });

 it('should return a list of links on the firecrawl.ai page', async () => {
-  const url = 'https://example.com';
+  const url = 'https://flutterbricks.com';
   const pageOptions: PageOptions = { includeHtml: true };

   const result = await scrapSingleUrl("TEST", url, pageOptions);
@@ -33,5 +33,5 @@ it('should return a list of links on the firecrawl.ai page', async () => {
   expect(result.linksOnPage).toBeDefined();
   expect(Array.isArray(result.linksOnPage)).toBe(true);
   expect(result.linksOnPage.length).toBeGreaterThan(0);
-  expect(result.linksOnPage).toContain('https://www.iana.org/domains/example')
+  expect(result.linksOnPage).toContain('https://flutterbricks.com/features')
 }, 10000);
@@ -1,4 +1,4 @@
-import axios from "axios";
+import axios, { AxiosError } from "axios";
 import cheerio, { load } from "cheerio";
 import { URL } from "url";
 import { getLinksFromSitemap } from "./sitemap";
@@ -22,7 +22,7 @@ export class WebCrawler {
   private crawledUrls: Map<string, string> = new Map();
   private limit: number;
   private robotsTxtUrl: string;
-  private robots: any;
+  public robots: any;
   private generateImgAltText: boolean;
   private allowBackwardCrawling: boolean;
   private allowExternalContentLinks: boolean;
@@ -66,7 +66,7 @@ export class WebCrawler {
     this.allowExternalContentLinks = allowExternalContentLinks ?? false;
   }

-  private filterLinks(sitemapLinks: string[], limit: number, maxDepth: number): string[] {
+  public filterLinks(sitemapLinks: string[], limit: number, maxDepth: number): string[] {
     return sitemapLinks
       .filter((link) => {
         const url = new URL(link.trim(), this.baseUrl);
@@ -130,6 +130,25 @@ export class WebCrawler {
       .slice(0, limit);
   }

+  public async getRobotsTxt(): Promise<string> {
+    const response = await axios.get(this.robotsTxtUrl, { timeout: axiosTimeout });
+    return response.data;
+  }
+
+  public importRobotsTxt(txt: string) {
+    this.robots = robotsParser(this.robotsTxtUrl, txt);
+  }
+
+  public async tryGetSitemap(): Promise<{ url: string; html: string; }[] | null> {
+    Logger.debug(`Fetching sitemap links from ${this.initialUrl}`);
+    const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl);
+    if (sitemapLinks.length > 0) {
+      let filteredLinks = this.filterLinks(sitemapLinks, this.limit, this.maxCrawledDepth);
+      return filteredLinks.map(link => ({ url: link, html: "" }));
+    }
+    return null;
+  }
+
   public async start(
     inProgress?: (progress: Progress) => void,
     pageOptions?: PageOptions,
@@ -142,19 +161,17 @@ export class WebCrawler {
     Logger.debug(`Crawler starting with ${this.initialUrl}`);
     // Fetch and parse robots.txt
     try {
-      const response = await axios.get(this.robotsTxtUrl, { timeout: axiosTimeout });
-      this.robots = robotsParser(this.robotsTxtUrl, response.data);
+      const txt = await this.getRobotsTxt();
+      this.importRobotsTxt(txt);
       Logger.debug(`Crawler robots.txt fetched with ${this.robotsTxtUrl}`);
     } catch (error) {
       Logger.debug(`Failed to fetch robots.txt from ${this.robotsTxtUrl}`);
     }

     if (!crawlerOptions?.ignoreSitemap){
-      Logger.debug(`Fetching sitemap links from ${this.initialUrl}`);
-      const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl);
-      if (sitemapLinks.length > 0) {
-        let filteredLinks = this.filterLinks(sitemapLinks, limit, maxDepth);
-        return filteredLinks.map(link => ({ url: link, html: "" }));
+      const sm = await this.tryGetSitemap();
+      if (sm !== null) {
+        return sm;
       }
     }

@@ -241,6 +258,37 @@ export class WebCrawler {
     return Array.from(this.crawledUrls.entries()).map(([url, html]) => ({ url, html }));
   }

+  public filterURL(href: string, url: string): string | null {
+    let fullUrl = href;
+    if (!href.startsWith("http")) {
+      fullUrl = new URL(href, this.baseUrl).toString();
+    }
+    const urlObj = new URL(fullUrl);
+    const path = urlObj.pathname;
+
+    if (this.isInternalLink(fullUrl)) { // INTERNAL LINKS
+      if (this.isInternalLink(fullUrl) &&
+        this.noSections(fullUrl) &&
+        !this.matchesExcludes(path) &&
+        this.isRobotsAllowed(fullUrl)
+      ) {
+        return fullUrl;
+      }
+    } else { // EXTERNAL LINKS
+      if (
+        this.isInternalLink(url) &&
+        this.allowExternalContentLinks &&
+        !this.isSocialMediaOrEmail(fullUrl) &&
+        !this.matchesExcludes(fullUrl, true) &&
+        !this.isExternalMainPage(fullUrl)
+      ) {
+        return fullUrl;
+      }
+    }
+
+    return null;
+  }
+
   async crawl(url: string, pageOptions: PageOptions): Promise<{url: string, html: string, pageStatusCode?: number, pageError?: string}[]> {
     if (this.visited.has(url) || !this.robots.isAllowed(url, "FireCrawlAgent")) {
       return [];
@@ -287,31 +335,9 @@ export class WebCrawler {
       $("a").each((_, element) => {
         const href = $(element).attr("href");
         if (href) {
-          let fullUrl = href;
-          if (!href.startsWith("http")) {
-            fullUrl = new URL(href, this.baseUrl).toString();
-          }
-          const urlObj = new URL(fullUrl);
-          const path = urlObj.pathname;
-
-          if (this.isInternalLink(fullUrl)) { // INTERNAL LINKS
-            if (this.isInternalLink(fullUrl) &&
-              this.noSections(fullUrl) &&
-              !this.matchesExcludes(path) &&
-              this.isRobotsAllowed(fullUrl)
-            ) {
-              links.push({ url: fullUrl, html: content, pageStatusCode, pageError });
-            }
-          } else { // EXTERNAL LINKS
-            if (
-              this.isInternalLink(url) &&
-              this.allowExternalContentLinks &&
-              !this.isSocialMediaOrEmail(fullUrl) &&
-              !this.matchesExcludes(fullUrl, true) &&
-              !this.isExternalMainPage(fullUrl)
-            ) {
-              links.push({ url: fullUrl, html: content, pageStatusCode, pageError });
-            }
-          }
+          const u = this.filterURL(href, url);
+          if (u !== null) {
+            links.push({ url: u, html: content, pageStatusCode, pageError });
+          }
         }
       });
@@ -465,9 +491,13 @@ export class WebCrawler {
       }
     } catch (error) {
       Logger.debug(`Failed to fetch sitemap with axios from ${sitemapUrl}: ${error}`);
-      const response = await getLinksFromSitemap({ sitemapUrl, mode: 'fire-engine' });
-      if (response) {
-        sitemapLinks = response;
+      if (error instanceof AxiosError && error.response?.status === 404) {
+        // ignore 404
+      } else {
+        const response = await getLinksFromSitemap({ sitemapUrl, mode: 'fire-engine' });
+        if (response) {
+          sitemapLinks = response;
+        }
       }
     }
@@ -480,7 +510,11 @@ export class WebCrawler {
       }
     } catch (error) {
       Logger.debug(`Failed to fetch sitemap from ${baseUrlSitemap}: ${error}`);
-      sitemapLinks = await getLinksFromSitemap({ sitemapUrl: baseUrlSitemap, mode: 'fire-engine' });
+      if (error instanceof AxiosError && error.response?.status === 404) {
+        // ignore 404
+      } else {
+        sitemapLinks = await getLinksFromSitemap({ sitemapUrl: baseUrlSitemap, mode: 'fire-engine' });
+      }
     }
   }
@@ -16,7 +16,7 @@
   replacePathsWithAbsolutePaths,
 } from "./utils/replacePaths";
 import { generateCompletions } from "../../lib/LLM-extraction";
-import { getWebScraperQueue } from "../../../src/services/queue-service";
+import { getScrapeQueue } from "../../../src/services/queue-service";
 import { fetchAndProcessDocx } from "./utils/docxProcessor";
 import { getAdjustedMaxDepth, getURLDepth } from "./utils/maxDepthUtils";
 import { Logger } from "../../lib/logger";
@@ -44,6 +44,7 @@ export class WebScraperDataProvider {
   private crawlerMode: string = "default";
   private allowBackwardCrawling: boolean = false;
   private allowExternalContentLinks: boolean = false;
+  private priority?: number;

   authorize(): void {
     throw new Error("Method not implemented.");
@@ -72,7 +73,8 @@ export class WebScraperDataProvider {
           url,
           this.pageOptions,
           this.extractorOptions,
-          existingHTML
+          existingHTML,
+          this.priority,
         );
         processedUrls++;
         if (inProgress) {
@@ -88,21 +90,6 @@ export class WebScraperDataProvider {
           results[i + index] = result;
         })
       );
-      try {
-        if (this.mode === "crawl" && this.bullJobId) {
-          const job = await getWebScraperQueue().getJob(this.bullJobId);
-          const jobStatus = await job.getState();
-          if (jobStatus === "failed") {
-            Logger.info(
-              "Job has failed or has been cancelled by the user. Stopping the job..."
-            );
-            return [] as Document[];
-          }
-        }
-      } catch (error) {
-        Logger.error(error.message);
-        return [] as Document[];
-      }
     }
     return results.filter((result) => result !== null) as Document[];
   }
@@ -608,6 +595,7 @@ export class WebScraperDataProvider {
       options.crawlerOptions?.allowBackwardCrawling ?? false;
     this.allowExternalContentLinks =
       options.crawlerOptions?.allowExternalContentLinks ?? false;
+    this.priority = options.priority;

     // make sure all urls start with https://
     this.urls = this.urls.map((url) => {
@@ -26,6 +26,7 @@ export async function scrapWithFireEngine({
   fireEngineOptions = {},
   headers,
   options,
+  priority,
 }: {
   url: string;
   waitFor?: number;
@@ -35,6 +36,7 @@ export async function scrapWithFireEngine({
   fireEngineOptions?: FireEngineOptions;
   headers?: Record<string, string>;
   options?: any;
+  priority?: number;
 }): Promise<FireEngineResponse> {
   const logParams = {
     url,
@@ -78,6 +80,7 @@ export async function scrapWithFireEngine({
         fullPageScreenshot: fullPageScreenshotParam,
         headers: headers,
         pageOptions: pageOptions,
+        priority,
         ...fireEngineOptionsParam,
       },
       {
@@ -134,7 +134,8 @@ export async function scrapSingleUrl(
   extractorOptions: ExtractorOptions = {
     mode: "llm-extraction-from-markdown",
   },
-  existingHtml: string = ""
+  existingHtml: string = "",
+  priority?: number,
 ): Promise<Document> {
   urlToScrap = urlToScrap.trim();

@@ -177,7 +178,8 @@ export async function scrapSingleUrl(
         headers: pageOptions.headers,
         fireEngineOptions: {
           engine: engine,
-        }
+        },
+        priority,
       });
       scraperResponse.text = response.html;
       scraperResponse.screenshot = response.screenshot;
@@ -340,7 +342,7 @@ export async function scrapSingleUrl(
         Logger.debug(`⛏️ ${scraper}: Successfully scraped ${urlToScrap} with text length >= 100, breaking`);
         break;
       }
-      if (pageStatusCode && pageStatusCode == 404) {
+      if (pageStatusCode && (pageStatusCode == 404 || pageStatusCode == 500)) {
         Logger.debug(`⛏️ ${scraper}: Successfully scraped ${urlToScrap} with status code 404, breaking`);
         break;
       }
@@ -41,10 +41,10 @@ export function extractLinks(html: string, baseUrl: string): string[] {
       links.push(href);
     } else if (href.startsWith('/')) {
       // Relative URL starting with '/', append to origin
-      links.push(`${origin}${href}`);
+      links.push(new URL(href, baseUrl).href);
     } else if (!href.startsWith('#') && !href.startsWith('mailto:')) {
       // Relative URL not starting with '/', append to base URL
-      links.push(`${baseUrl}/${href}`);
+      links.push(new URL(href, baseUrl).href);
     } else if (href.startsWith('mailto:')) {
       // mailto: links, add as is
       links.push(href);
@@ -1,5 +1,5 @@
 import { Logger } from "../../../src/lib/logger";
-import { getWebScraperQueue } from "../queue-service";
+import { getScrapeQueue } from "../queue-service";
 import { sendSlackWebhook } from "./slack";

 export async function checkAlerts() {
@@ -13,8 +13,8 @@ export async function checkAlerts() {
     Logger.info("Initializing alerts");
     const checkActiveJobs = async () => {
       try {
-        const webScraperQueue = getWebScraperQueue();
-        const activeJobs = await webScraperQueue.getActiveCount();
+        const scrapeQueue = getScrapeQueue();
+        const activeJobs = await scrapeQueue.getActiveCount();
         if (activeJobs > Number(process.env.ALERT_NUM_ACTIVE_JOBS)) {
           Logger.warn(
             `Alert: Number of active jobs is over ${process.env.ALERT_NUM_ACTIVE_JOBS}. Current active jobs: ${activeJobs}.`
@@ -34,11 +34,10 @@ export async function checkAlerts() {
     };

     const checkWaitingQueue = async () => {
-      const webScraperQueue = getWebScraperQueue();
-      const waitingJobs = await webScraperQueue.getWaitingCount();
-      const paused = await webScraperQueue.getPausedCount();
+      const scrapeQueue = getScrapeQueue();
+      const waitingJobs = await scrapeQueue.getWaitingCount();

-      if (waitingJobs !== paused && waitingJobs > Number(process.env.ALERT_NUM_WAITING_JOBS)) {
+      if (waitingJobs > Number(process.env.ALERT_NUM_WAITING_JOBS)) {
         Logger.warn(
           `Alert: Number of waiting jobs is over ${process.env.ALERT_NUM_WAITING_JOBS}. Current waiting jobs: ${waitingJobs}.`
         );
@@ -40,10 +40,11 @@ export async function logJob(job: FirecrawlJob) {
         extractor_options: job.extractor_options,
         num_tokens: job.num_tokens,
         retry: !!job.retry,
+        crawl_id: job.crawl_id,
       },
     ]);

-    if (process.env.POSTHOG_API_KEY) {
+    if (process.env.POSTHOG_API_KEY && !job.crawl_id) {
       let phLog = {
         distinctId: "from-api", //* To identify this on the group level, setting distinctid to a static string per posthog docs: https://posthog.com/docs/product-analytics/group-analytics#advanced-server-side-only-capturing-group-events-without-a-user
         ...(job.team_id !== "preview" && {
@@ -1,17 +1,17 @@
-import { Job, Queue } from "bull";
-import {
-  getWebScraperQueue,
-} from "./queue-service";
+import { Job, Queue } from "bullmq";
+import { getScrapeQueue } from "./queue-service";
 import { v4 as uuidv4 } from "uuid";
 import { WebScraperOptions } from "../types";

-export async function addWebScraperJob(
+export async function addScrapeJob(
   webScraperOptions: WebScraperOptions,
-  options: any = {}
+  options: any = {},
+  jobId: string = uuidv4(),
 ): Promise<Job> {
-  return await getWebScraperQueue().add(webScraperOptions, {
+  return await getScrapeQueue().add(jobId, webScraperOptions, {
+    priority: webScraperOptions.crawl_id ? 20 : 10,
     ...options,
-    jobId: uuidv4(),
+    jobId,
   });
 }
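addScrapeJob now names the BullMQ job explicitly and defaults crawl-spawned jobs to priority 20 versus 10 for direct scrapes; since BullMQ serves lower priority numbers first, interactive scrapes jump ahead of crawl fan-out. A sketch of a direct-scrape caller; the wrapper function and option values here are illustrative assumptions, not part of this PR:

import { v4 as uuidv4 } from "uuid";
import { addScrapeJob } from "./queue-jobs"; // illustrative path

// Hypothetical: enqueue a one-off scrape and hand the job id back to the client for polling.
async function enqueueSingleScrape(url: string, team_id: string): Promise<string> {
  const jobId = uuidv4();
  await addScrapeJob(
    { url, mode: "single_urls", crawlerOptions: {}, pageOptions: { includeHtml: false }, team_id },
    {},    // extra BullMQ job options, none needed here
    jobId,
  );
  return jobId;
}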
@@ -1,23 +1,40 @@
-import Queue from "bull";
-import { Queue as BullQueue } from "bull";
+import { Queue } from "bullmq";
 import { Logger } from "../lib/logger";
+import IORedis from "ioredis";

-let webScraperQueue: BullQueue;
+let scrapeQueue: Queue;

-export function getWebScraperQueue() {
-  if (!webScraperQueue) {
-    webScraperQueue = new Queue("web-scraper", process.env.REDIS_URL, {
-      settings: {
-        lockDuration: 1 * 60 * 1000, // 1 minute in milliseconds,
-        lockRenewTime: 15 * 1000, // 15 seconds in milliseconds
-        stalledInterval: 30 * 1000,
-        maxStalledCount: 10,
-      },
-      defaultJobOptions:{
-        attempts: 2
-      }
-    });
+export const redisConnection = new IORedis(process.env.REDIS_URL, {
+  maxRetriesPerRequest: null,
+});
+
+export const scrapeQueueName = "{scrapeQueue}";
+
+export function getScrapeQueue() {
+  if (!scrapeQueue) {
+    scrapeQueue = new Queue(
+      scrapeQueueName,
+      {
+        connection: redisConnection,
+      }
+      // {
+      //   settings: {
+      //     lockDuration: 1 * 60 * 1000, // 1 minute in milliseconds,
+      //     lockRenewTime: 15 * 1000, // 15 seconds in milliseconds
+      //     stalledInterval: 30 * 1000,
+      //     maxStalledCount: 10,
+      //   },
+      //   defaultJobOptions:{
+      //     attempts: 5
+      //   }
+      // }
+    );
     Logger.info("Web scraper queue created");
   }
-  return webScraperQueue;
+  return scrapeQueue;
 }
+
+
+import { QueueEvents } from 'bullmq';
+
+export const scrapeQueueEvents = new QueueEvents(scrapeQueueName, { connection: redisConnection });
@@ -1,74 +1,273 @@
 import { CustomError } from "../lib/custom-error";
-import { getWebScraperQueue } from "./queue-service";
+import {
+  getScrapeQueue,
+  redisConnection,
+  scrapeQueueName,
+} from "./queue-service";
 import "dotenv/config";
 import { logtail } from "./logtail";
 import { startWebScraperPipeline } from "../main/runWebScraper";
 import { callWebhook } from "./webhook";
 import { logJob } from "./logging/log_job";
-import { initSDK } from '@hyperdx/node-opentelemetry';
-import { Job } from "bull";
+import { initSDK } from "@hyperdx/node-opentelemetry";
+import { Job } from "bullmq";
 import { Logger } from "../lib/logger";
-import { ScrapeEvents } from "../lib/scrape-events";
+import { Worker } from "bullmq";
+import systemMonitor from "./system-monitor";
+import { v4 as uuidv4 } from "uuid";
+import { addCrawlJob, addCrawlJobDone, crawlToCrawler, getCrawl, getCrawlJobs, isCrawlFinished, lockURL } from "../lib/crawl-redis";
+import { StoredCrawl } from "../lib/crawl-redis";
+import { addScrapeJob } from "./queue-jobs";
+import { supabaseGetJobById } from "../../src/lib/supabase-jobs";

-if (process.env.ENV === 'production') {
+if (process.env.ENV === "production") {
   initSDK({
     consoleCapture: true,
     additionalInstrumentations: [],
   });
 }
+const sleep = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms));

-const wsq = getWebScraperQueue();
+const workerLockDuration = Number(process.env.WORKER_LOCK_DURATION) || 60000;
+const workerStalledCheckInterval =
+  Number(process.env.WORKER_STALLED_CHECK_INTERVAL) || 30000;
+const jobLockExtendInterval =
+  Number(process.env.JOB_LOCK_EXTEND_INTERVAL) || 15000;
+const jobLockExtensionTime =
+  Number(process.env.JOB_LOCK_EXTENSION_TIME) || 60000;

-async function processJob(job: Job, done) {
+const cantAcceptConnectionInterval =
+  Number(process.env.CANT_ACCEPT_CONNECTION_INTERVAL) || 2000;
+const connectionMonitorInterval =
+  Number(process.env.CONNECTION_MONITOR_INTERVAL) || 10;
+const gotJobInterval = Number(process.env.CONNECTION_MONITOR_INTERVAL) || 20;
+
+const processJobInternal = async (token: string, job: Job) => {
+  const extendLockInterval = setInterval(async () => {
+    Logger.info(`🐂 Worker extending lock on job ${job.id}`);
+    await job.extendLock(token, jobLockExtensionTime);
+  }, jobLockExtendInterval);
+
+  try {
+    const result = await processJob(job, token);
+    try{
+      if (job.data.crawl_id && process.env.USE_DB_AUTHENTICATION === "true") {
+        await job.moveToCompleted(null, token, false);
+      } else {
+        await job.moveToCompleted(result.docs, token, false);
+      }
+    }catch(e){
+    }
+  } catch (error) {
+    console.log("Job failed, error:", error);
+
+    await job.moveToFailed(error, token, false);
+  } finally {
+    clearInterval(extendLockInterval);
+  }
+};
+
+let isShuttingDown = false;
+
+process.on("SIGINT", () => {
+  console.log("Received SIGINT. Shutting down gracefully...");
+  isShuttingDown = true;
+});
+
+const workerFun = async (queueName: string, processJobInternal: (token: string, job: Job) => Promise<void>) => {
+  const worker = new Worker(queueName, null, {
+    connection: redisConnection,
+    lockDuration: 1 * 60 * 1000, // 1 minute
+    // lockRenewTime: 15 * 1000, // 15 seconds
+    stalledInterval: 30 * 1000, // 30 seconds
+    maxStalledCount: 10, // 10 times
+  });
+
+  worker.startStalledCheckTimer();
+
+  const monitor = await systemMonitor;
+
+  while (true) {
+    if (isShuttingDown) {
+      console.log("No longer accepting new jobs. SIGINT");
+      break;
+    }
+    const token = uuidv4();
+    const canAcceptConnection = await monitor.acceptConnection();
+    if (!canAcceptConnection) {
+      console.log("Cant accept connection");
+      await sleep(cantAcceptConnectionInterval); // more sleep
+      continue;
+    }
+
+    const job = await worker.getNextJob(token);
+    if (job) {
+      processJobInternal(token, job);
+      await sleep(gotJobInterval);
+    } else {
+      await sleep(connectionMonitorInterval);
+    }
+  }
+};
+
+workerFun(scrapeQueueName, processJobInternal);
+async function processJob(job: Job, token: string) {
   Logger.info(`🐂 Worker taking job ${job.id}`);

   try {
-    job.progress({
+    job.updateProgress({
       current: 1,
       total: 100,
       current_step: "SCRAPING",
       current_url: "",
     });
     const start = Date.now();
-    const { success, message, docs } = await startWebScraperPipeline({ job });
+    const { success, message, docs } = await startWebScraperPipeline({
+      job,
+      token,
+    });
     const end = Date.now();
     const timeTakenInSeconds = (end - start) / 1000;

     const data = {
-      success: success,
+      success,
       result: {
         links: docs.map((doc) => {
-          return { content: doc, source: doc?.metadata?.sourceURL ?? doc?.url ?? "" };
+          return {
+            content: doc,
+            source: doc?.metadata?.sourceURL ?? doc?.url ?? "",
+          };
         }),
       },
       project_id: job.data.project_id,
       error: message /* etc... */,
+      docs,
     };

-    await callWebhook(job.data.team_id, job.id as string, data);
+    if (job.data.mode === "crawl") {
+      await callWebhook(job.data.team_id, job.id as string, data);
+    }
+
+    if (job.data.crawl_id) {
+      await logJob({
+        job_id: job.id as string,
+        success: success,
+        message: message,
+        num_docs: docs.length,
+        docs: docs,
+        time_taken: timeTakenInSeconds,
+        team_id: job.data.team_id,
+        mode: job.data.mode,
+        url: job.data.url,
+        crawlerOptions: job.data.crawlerOptions,
+        pageOptions: job.data.pageOptions,
+        origin: job.data.origin,
+        crawl_id: job.data.crawl_id,
+      });
+
+      await addCrawlJobDone(job.data.crawl_id, job.id);
+
+      const sc = await getCrawl(job.data.crawl_id) as StoredCrawl;
+
+      if (!job.data.sitemapped) {
+        if (!sc.cancelled) {
+          const crawler = crawlToCrawler(job.data.crawl_id, sc);
+
+          const links = crawler.filterLinks((data.docs[0].linksOnPage as string[])
+            .map(href => crawler.filterURL(href.trim(), sc.originUrl))
+            .filter(x => x !== null),
+            Infinity,
+            sc.crawlerOptions?.maxDepth ?? 10
+          )
+
+          for (const link of links) {
+            if (await lockURL(job.data.crawl_id, sc, link)) {
+              const newJob = await addScrapeJob({
+                url: link,
+                mode: "single_urls",
+                crawlerOptions: sc.crawlerOptions,
+                team_id: sc.team_id,
+                pageOptions: sc.pageOptions,
+                origin: job.data.origin,
+                crawl_id: job.data.crawl_id,
+              });
+
+              await addCrawlJob(job.data.crawl_id, newJob.id);
+            }
+          }
+        }
+      }
+
+      if (await isCrawlFinished(job.data.crawl_id)) {
+        const jobIDs = await getCrawlJobs(job.data.crawl_id);
+
+        const jobs = (await Promise.all(jobIDs.map(async x => {
+          if (x === job.id) {
+            return {
+              async getState() {
+                return "completed"
+              },
+              timestamp: Date.now(),
+              returnvalue: docs,
+            }
+          }
+
+          const j = await getScrapeQueue().getJob(x);
+
+          if (process.env.USE_DB_AUTHENTICATION === "true") {
+            const supabaseData = await supabaseGetJobById(j.id);
+
+            if (supabaseData) {
+              j.returnvalue = supabaseData.docs;
+            }
+          }
+
+          return j;
+        }))).sort((a, b) => a.timestamp - b.timestamp);
+        const jobStatuses = await Promise.all(jobs.map(x => x.getState()));
+        const jobStatus = sc.cancelled ? "failed" : jobStatuses.every(x => x === "completed") ? "completed" : jobStatuses.some(x => x === "failed") ? "failed" : "active";
+
+        const fullDocs = jobs.map(x => Array.isArray(x.returnvalue) ? x.returnvalue[0] : x.returnvalue);
+
+        await logJob({
+          job_id: job.data.crawl_id,
+          success: jobStatus === "completed",
+          message: message,
+          num_docs: fullDocs.length,
+          docs: [],
+          time_taken: (Date.now() - sc.createdAt) / 1000,
+          team_id: job.data.team_id,
+          mode: "crawl",
+          url: sc.originUrl,
+          crawlerOptions: sc.crawlerOptions,
+          pageOptions: sc.pageOptions,
+          origin: job.data.origin,
+        });
+
+        const data = {
+          success: jobStatus !== "failed",
+          result: {
+            links: fullDocs.map((doc) => {
+              return {
+                content: doc,
+                source: doc?.metadata?.sourceURL ?? doc?.url ?? "",
+              };
+            }),
+          },
+          project_id: job.data.project_id,
+          error: message /* etc... */,
+          docs: fullDocs,
+        };
+
+        await callWebhook(job.data.team_id, job.id as string, data);
+      }
+    }

-    await logJob({
-      job_id: job.id as string,
-      success: success,
-      message: message,
-      num_docs: docs.length,
-      docs: docs,
-      time_taken: timeTakenInSeconds,
-      team_id: job.data.team_id,
-      mode: "crawl",
-      url: job.data.url,
-      crawlerOptions: job.data.crawlerOptions,
-      pageOptions: job.data.pageOptions,
-      origin: job.data.origin,
-    });
     Logger.info(`🐂 Job done ${job.id}`);
-    done(null, data);
+    return data;
   } catch (error) {
     Logger.error(`🐂 Job errored ${job.id} - ${error}`);
-    if (await getWebScraperQueue().isPaused(false)) {
-      Logger.debug("🐂Queue is paused, ignoring");
-      return;
-    }

     if (error instanceof CustomError) {
       // Here we handle the error, then save the failed job
@@ -81,6 +280,9 @@ async function processJob(job: Job, done) {
       });
     }
     Logger.error(error);
+    if (error.stack) {
+      Logger.error(error.stack);
+    }

     logtail.error("Overall error ingesting", {
       job_id: job.id,
@@ -89,37 +291,69 @@ async function processJob(job: Job, done) {

     const data = {
       success: false,
+      docs: [],
       project_id: job.data.project_id,
       error:
         "Something went wrong... Contact help@mendable.ai or try again." /* etc... */,
     };
-    await callWebhook(job.data.team_id, job.id as string, data);
-    await logJob({
-      job_id: job.id as string,
-      success: false,
-      message: typeof error === 'string' ? error : (error.message ?? "Something went wrong... Contact help@mendable.ai"),
-      num_docs: 0,
-      docs: [],
-      time_taken: 0,
-      team_id: job.data.team_id,
-      mode: "crawl",
-      url: job.data.url,
-      crawlerOptions: job.data.crawlerOptions,
-      pageOptions: job.data.pageOptions,
-      origin: job.data.origin,
-    });
-    done(null, data);
+
+    if (job.data.mode === "crawl" || job.data.crawl_id) {
+      await callWebhook(job.data.team_id, job.data.crawl_id ?? job.id as string, data);
+    }
+
+    if (job.data.crawl_id) {
+      await logJob({
+        job_id: job.id as string,
+        success: false,
+        message:
+          typeof error === "string"
+            ? error
+            : error.message ?? "Something went wrong... Contact help@mendable.ai",
+        num_docs: 0,
+        docs: [],
+        time_taken: 0,
+        team_id: job.data.team_id,
+        mode: job.data.mode,
+        url: job.data.url,
+        crawlerOptions: job.data.crawlerOptions,
+        pageOptions: job.data.pageOptions,
+        origin: job.data.origin,
+        crawl_id: job.data.crawl_id,
+      });
+
+      const sc = await getCrawl(job.data.crawl_id);
+
+      await logJob({
+        job_id: job.data.crawl_id,
+        success: false,
+        message:
+          typeof error === "string"
+            ? error
+            : error.message ?? "Something went wrong... Contact help@mendable.ai",
+        num_docs: 0,
+        docs: [],
+        time_taken: 0,
+        team_id: job.data.team_id,
+        mode: "crawl",
+        url: sc ? sc.originUrl : job.data.url,
+        crawlerOptions: sc ? sc.crawlerOptions : job.data.crawlerOptions,
+        pageOptions: sc ? sc.pageOptions : job.data.pageOptions,
+        origin: job.data.origin,
+      });
+    }
+    // done(null, data);
+    return data;
   }
 }

-wsq.process(
-  Math.floor(Number(process.env.NUM_WORKERS_PER_QUEUE ?? 8)),
-  processJob
-);
+// wsq.process(
+//   Math.floor(Number(process.env.NUM_WORKERS_PER_QUEUE ?? 8)),
+//   processJob
+// );

-wsq.on("waiting", j => ScrapeEvents.logJobEvent(j, "waiting"));
-wsq.on("active", j => ScrapeEvents.logJobEvent(j, "active"));
-wsq.on("completed", j => ScrapeEvents.logJobEvent(j, "completed"));
-wsq.on("paused", j => ScrapeEvents.logJobEvent(j, "paused"));
-wsq.on("resumed", j => ScrapeEvents.logJobEvent(j, "resumed"));
-wsq.on("removed", j => ScrapeEvents.logJobEvent(j, "removed"));
+// wsq.on("waiting", j => ScrapeEvents.logJobEvent(j, "waiting"));
+// wsq.on("active", j => ScrapeEvents.logJobEvent(j, "active"));
+// wsq.on("completed", j => ScrapeEvents.logJobEvent(j, "completed"));
+// wsq.on("paused", j => ScrapeEvents.logJobEvent(j, "paused"));
+// wsq.on("resumed", j => ScrapeEvents.logJobEvent(j, "resumed"));
+// wsq.on("removed", j => ScrapeEvents.logJobEvent(j, "removed"));
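The worker above relies on BullMQ's manual processing mode: constructing Worker with a null processor leaves lock management and stalled-job checks to BullMQ while the loop fetches jobs itself with getNextJob(token) and settles them with moveToCompleted / moveToFailed under the same token, extending the lock while work is in flight. A stripped-down sketch of that pattern, reusing the queue-service exports; doWork is a placeholder, not a real function in this codebase:

import { Worker } from "bullmq";
import { v4 as uuidv4 } from "uuid";
import { redisConnection, scrapeQueueName } from "./queue-service"; // illustrative path

const worker = new Worker(scrapeQueueName, null, { connection: redisConnection });

// Pull and settle a single job manually; the token ties the lock, the completion
// and the failure call together.
async function pullOne(doWork: (data: any) => Promise<any>) {
  const token = uuidv4();
  const job = await worker.getNextJob(token); // locks the job under this token
  if (!job) return;
  try {
    const result = await doWork(job.data);
    await job.moveToCompleted(result, token, false);
  } catch (err) {
    await job.moveToFailed(err as Error, token, false);
  }
}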
81
apps/api/src/services/system-monitor.ts
Normal file
81
apps/api/src/services/system-monitor.ts
Normal file
@ -0,0 +1,81 @@
+import si from 'systeminformation';
+import { Mutex } from "async-mutex";
+
+const MAX_CPU = process.env.MAX_CPU ? parseFloat(process.env.MAX_CPU) : 0.8;
+const MAX_RAM = process.env.MAX_RAM ? parseFloat(process.env.MAX_RAM) : 0.8;
+const CACHE_DURATION = process.env.SYS_INFO_MAX_CACHE_DURATION ? parseFloat(process.env.SYS_INFO_MAX_CACHE_DURATION) : 150;
+
+class SystemMonitor {
+  private static instance: SystemMonitor;
+  private static instanceMutex = new Mutex();
+
+  private cpuUsageCache: number | null = null;
+  private memoryUsageCache: number | null = null;
+  private lastCpuCheck: number = 0;
+  private lastMemoryCheck: number = 0;
+
+  private constructor() {}
+
+  public static async getInstance(): Promise<SystemMonitor> {
+    if (SystemMonitor.instance) {
+      return SystemMonitor.instance;
+    }
+
+    await this.instanceMutex.runExclusive(async () => {
+      if (!SystemMonitor.instance) {
+        SystemMonitor.instance = new SystemMonitor();
+      }
+    });
+
+    return SystemMonitor.instance;
+  }
+
+  private async checkMemoryUsage() {
+    const now = Date.now();
+    if (this.memoryUsageCache !== null && (now - this.lastMemoryCheck) < CACHE_DURATION) {
+      return this.memoryUsageCache;
+    }
+
+    const memoryData = await si.mem();
+    const totalMemory = memoryData.total;
+    const availableMemory = memoryData.available;
+    const usedMemory = totalMemory - availableMemory;
+    const usedMemoryPercentage = (usedMemory / totalMemory);
+
+    this.memoryUsageCache = usedMemoryPercentage;
+    this.lastMemoryCheck = now;
+
+    return usedMemoryPercentage;
+  }
+
+  private async checkCpuUsage() {
+    const now = Date.now();
+    if (this.cpuUsageCache !== null && (now - this.lastCpuCheck) < CACHE_DURATION) {
+      return this.cpuUsageCache;
+    }
+
+    const cpuData = await si.currentLoad();
+    const cpuLoad = cpuData.currentLoad / 100;
+
+    this.cpuUsageCache = cpuLoad;
+    this.lastCpuCheck = now;
+
+    return cpuLoad;
+  }
+
+  public async acceptConnection() {
+    const cpuUsage = await this.checkCpuUsage();
+    const memoryUsage = await this.checkMemoryUsage();
+
+    return cpuUsage < MAX_CPU && memoryUsage < MAX_RAM;
+  }
+
+  public clearCache() {
+    this.cpuUsageCache = null;
+    this.memoryUsageCache = null;
+    this.lastCpuCheck = 0;
+    this.lastMemoryCheck = 0;
+  }
+}
+
+export default SystemMonitor.getInstance();
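
system-monitor.ts gives the queue a mutex-guarded singleton that samples CPU and RAM via systeminformation, caches each reading for CACHE_DURATION milliseconds (150 ms by default), and reports through acceptConnection() whether both are below MAX_CPU and MAX_RAM (0.8 by default). How the worker consults it is not shown in this excerpt; the following is only a hedged sketch of one plausible integration, with workerLoop, takeNextJob, and POLL_INTERVAL_MS as invented names:

import systemMonitor from "./system-monitor"; // the default export above is a Promise<SystemMonitor>

const POLL_INTERVAL_MS = 2000; // assumption: the real back-off interval may differ

async function workerLoop(takeNextJob: () => Promise<void>) {
  const monitor = await systemMonitor;

  while (true) {
    // Cheap call: readings are served from the cache for CACHE_DURATION ms.
    if (await monitor.acceptConnection()) {
      await takeNextJob();
    } else {
      // CPU or RAM is above its threshold; back off before checking again.
      await new Promise((resolve) => setTimeout(resolve, POLL_INTERVAL_MS));
    }
  }
}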
@@ -25,8 +25,11 @@ export interface WebScraperOptions {
   mode: Mode;
   crawlerOptions: any;
   pageOptions: any;
+  extractorOptions?: any;
   team_id: string;
   origin?: string;
+  crawl_id?: string;
+  sitemapped?: boolean;
 }
 
 export interface RunWebScraperParams {
@@ -34,11 +37,13 @@ export interface RunWebScraperParams {
   mode: Mode;
   crawlerOptions: any;
   pageOptions?: any;
+  extractorOptions?: any;
   inProgress: (progress: any) => void;
-  onSuccess: (result: any) => void;
+  onSuccess: (result: any, mode: string) => void;
   onError: (error: Error) => void;
   team_id: string;
   bull_job_id: string;
+  priority?: number;
 }
 
 export interface RunWebScraperResult {
@@ -63,6 +68,7 @@ export interface FirecrawlJob {
   extractor_options?: ExtractorOptions,
   num_tokens?: number,
   retry?: boolean,
+  crawl_id?: string;
 }
 
 export interface FirecrawlScrapeResponse {
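
The new optional fields (extractorOptions, crawl_id, sitemapped, priority) and the extra mode argument on onSuccess let callers attach crawl context and a queue priority when a scrape is enqueued. The PR's actual enqueue helper is not part of this excerpt, so the snippet below is only a hedged sketch of how such a job might be added with BullMQ; the queue name "web-scraper" and the addScrapeJob name are assumptions:

import { Queue } from "bullmq";
import IORedis from "ioredis";

const connection = new IORedis(process.env.REDIS_URL ?? "redis://localhost:6379", {
  maxRetriesPerRequest: null,
});
const scrapeQueue = new Queue("web-scraper", { connection }); // assumed queue name

async function addScrapeJob(
  data: {
    url: string;
    mode: string;
    crawlerOptions: any;
    pageOptions: any;
    extractorOptions?: any;
    team_id: string;
    origin?: string;
    crawl_id?: string;
    sitemapped?: boolean;
  },
  priority = 10
) {
  // In BullMQ, a lower priority number is served first.
  const job = await scrapeQueue.add("scrape", data, { priority });
  return job.id;
}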
@@ -24,6 +24,7 @@
   "devDependencies": {
     "@types/jest": "^29.5.12",
     "@types/supertest": "^6.0.2",
+    "artillery": "^2.0.19",
     "typescript": "^5.4.5"
   }
 }
6884 apps/test-suite/pnpm-lock.yaml (generated)
File diff suppressed because it is too large