Mirror of https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
Synced 2025-08-15 22:35:54 +08:00

Merge branch 'main' into mog/ongoing-crawls
Commit 782702d536
.github/workflows/deploy-playwright.yml (vendored, new file, 34 lines added)
@@ -0,0 +1,34 @@
name: Deploy Playwright to GHCR

env:
  DOTNET_VERSION: '6.0.x'

on:
  push:
    branches:
      - main
    paths:
      - apps/playwright-service-ts/**
  workflow_dispatch:

jobs:
  push-app-image:
    runs-on: ubuntu-latest
    defaults:
      run:
        working-directory: './apps/playwright-service-ts'
    steps:
      - name: 'Checkout GitHub Action'
        uses: actions/checkout@main

      - name: 'Login to GitHub Container Registry'
        uses: docker/login-action@v1
        with:
          registry: ghcr.io
          username: ${{github.actor}}
          password: ${{secrets.GITHUB_TOKEN}}

      - name: 'Build Inventory Image'
        run: |
          docker build . --tag ghcr.io/mendableai/playwright-service:latest
          docker push ghcr.io/mendableai/playwright-service:latest
.github/workflows/test-server-self-host.yml (vendored, 1 line added)
@@ -16,6 +16,7 @@ env:
   TEST_SUITE_SELF_HOSTED: true
   USE_GO_MARKDOWN_PARSER: true
   FIRECRAWL_DEBUG_FILTER_LINKS: true
+  SENTRY_ENVIRONMENT: dev

 jobs:
   test:
.github/workflows/test-server.yml (vendored, 9 lines changed)
@@ -21,6 +21,9 @@ env:
   SUPABASE_SERVICE_TOKEN: ${{ secrets.SUPABASE_SERVICE_TOKEN }}
   SUPABASE_URL: ${{ secrets.SUPABASE_URL }}
   SUPABASE_REPLICA_URL: ${{ secrets.SUPABASE_REPLICA_URL }}
+  INDEX_SUPABASE_SERVICE_TOKEN: ${{ secrets.INDEX_SUPABASE_SERVICE_TOKEN }}
+  INDEX_SUPABASE_ANON_TOKEN: ${{ secrets.INDEX_SUPABASE_ANON_TOKEN }}
+  INDEX_SUPABASE_URL: ${{ secrets.INDEX_SUPABASE_URL }}
   TEST_API_KEY: ${{ secrets.TEST_API_KEY }}
   FIRE_ENGINE_BETA_URL: ${{ secrets.FIRE_ENGINE_BETA_URL }}
   USE_DB_AUTHENTICATION: true
@@ -30,6 +33,7 @@ env:
   RUNPOD_MU_API_KEY: ${{ secrets.RUNPOD_MU_API_KEY }}
   GCS_CREDENTIALS: ${{ secrets.GCS_CREDENTIALS }}
   GCS_BUCKET_NAME: ${{ secrets.GCS_BUCKET_NAME }}
+  GCS_INDEX_BUCKET_NAME: ${{ secrets.GCS_INDEX_BUCKET_NAME }}
   GOOGLE_GENERATIVE_AI_API_KEY: ${{ secrets.GOOGLE_GENERATIVE_AI_API_KEY }}
   GROQ_API_KEY: ${{ secrets.GROQ_API_KEY }}
   ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
@@ -100,11 +104,11 @@ jobs:
        working-directory: ./apps/api
        id: start_workers
      - name: Start index worker
-       run: npm run index-worker &
+       run: npm run index-worker > index-worker.log 2>&1 &
        working-directory: ./apps/api
        id: start_index_worker
      - name: Wait for API
-       run: pnpx wait-on tcp:3002 -t 15s
+       run: pnpx wait-on tcp:3002 -t 30s
      - name: Run snippet tests
        run: |
          npm run test:snips
@@ -119,3 +123,4 @@ jobs:
      #     path: |
      #       ./apps/api/api.log
      #       ./apps/api/worker.log
+     #       ./apps/api/index-worker.log
@@ -124,7 +124,7 @@ pub unsafe extern "C" fn extract_metadata(html: *const libc::c_char) -> *mut lib
        let meta = meta.as_node().as_element().unwrap();
        let attrs = meta.attributes.borrow();

-       if let Some(name) = attrs.get("name").or_else(|| attrs.get("property")) {
+       if let Some(name) = attrs.get("name").or_else(|| attrs.get("property")).or_else(|| attrs.get("itemprop")) {
            if let Some(content) = attrs.get("content") {
                if let Some(v) = out.get(name) {
                    match v {
@@ -64,6 +64,8 @@ describe("Queue Concurrency Integration", () => {
    removeBase64Images: true,
    fastMode: false,
    blockAds: true,
+   maxAge: 0,
+   storeInCache: true,
  };

  beforeEach(() => {
@@ -1,197 +1,211 @@
-// import { batchScrape, crawl, creditUsage, extract, map, scrape, search, tokenUsage } from "./lib";
+import { batchScrape, crawl, creditUsage, extract, map, scrape, search, tokenUsage } from "./lib";

-// const sleep = (ms: number) => new Promise(x => setTimeout(() => x(true), ms));
+const sleep = (ms: number) => new Promise(x => setTimeout(() => x(true), ms));
-// const sleepForBatchBilling = () => sleep(20000);
+const sleepForBatchBilling = () => sleep(40000);

-// beforeAll(async () => {
+beforeAll(async () => {
-// // Wait for previous test runs to stop billing processing
+  // Wait for previous test runs to stop billing processing
-// if (!process.env.TEST_SUITE_SELF_HOSTED) {
+  if (!process.env.TEST_SUITE_SELF_HOSTED) {
-// await sleep(40000);
+    await sleep(40000);
-// }
+  }
-// }, 50000);
+}, 50000);

-// describe("Billing tests", () => {
+describe("Billing tests", () => {
-// if (process.env.TEST_SUITE_SELF_HOSTED) {
+  if (process.env.TEST_SUITE_SELF_HOSTED) {
-// it("dummy", () => {
+    it("dummy", () => {
[the remaining removed lines were the previously commented-out bodies of the billing tests below, plus the old "// temporarily disabled" note and its "is mocked" placeholder test wrapper]
      expect(true).toBe(true);
+    });
+  } else {
+    it("bills scrape correctly", async () => {
+      const rc1 = (await creditUsage()).remaining_credits;
+
+      // Run all scrape operations in parallel with Promise.all
+      await Promise.all([
+        // scrape 1: regular fc.dev scrape (1 credit)
+        scrape({
+          url: "https://firecrawl.dev"
+        }),
+
+        // scrape 1.1: regular fc.dev scrape (1 credit)
+        scrape({
+          url: "https://firecrawl.dev"
+        }),
+
+        // scrape 2: fc.dev with json (5 credits)
+        scrape({
+          url: "https://firecrawl.dev",
+          formats: ["json"],
+          jsonOptions: {
+            schema: {
+              type: "object",
+              properties: {
+                is_open_source: { type: "boolean" },
+              },
+              required: ["is_open_source"],
+            },
+          },
+        })
+      ]);
+
+      // sum: 7 credits
+
+      await sleepForBatchBilling();
+
+      const rc2 = (await creditUsage()).remaining_credits;
+
+      expect(rc1 - rc2).toBe(7);
+    }, 120000);
+
+    it("bills batch scrape correctly", async () => {
+      const rc1 = (await creditUsage()).remaining_credits;
+
+      // Run both scrape operations in parallel with Promise.all
+      const [scrape1, scrape2] = await Promise.all([
+        // scrape 1: regular batch scrape with failing domain (2 credits)
+        batchScrape({
+          urls: [
+            "https://firecrawl.dev",
+            "https://mendable.ai",
+            "https://thisdomaindoesnotexistandwillfail.fcr",
+          ],
+        }),
+
+        // scrape 2: batch scrape with json (10 credits)
+        batchScrape({
+          urls: [
+            "https://firecrawl.dev",
+            "https://mendable.ai",
+            "https://thisdomaindoesnotexistandwillfail.fcr",
+          ],
+          formats: ["json"],
+          jsonOptions: {
+            schema: {
+              type: "object",
+              properties: {
+                four_word_summary: { type: "string" },
+              },
+              required: ["four_word_summary"],
+            },
+          },
+        })
+      ]);
+
+      // sum: 12 credits
+
+      await sleepForBatchBilling();
+
+      const rc2 = (await creditUsage()).remaining_credits;
+
+      expect(rc1 - rc2).toBe(12);
+    }, 600000);
+
+    it("bills crawl correctly", async () => {
+      const rc1 = (await creditUsage()).remaining_credits;
+
+      // Run both crawl operations in parallel with Promise.all
+      const [crawl1, crawl2] = await Promise.all([
+        // crawl 1: regular fc.dev crawl (x credits)
+        crawl({
+          url: "https://firecrawl.dev",
+          limit: 10,
+        }),
+
+        // crawl 2: fc.dev crawl with json (5y credits)
+        crawl({
+          url: "https://firecrawl.dev",
+          scrapeOptions: {
+            formats: ["json"],
+            jsonOptions: {
+              schema: {
+                type: "object",
+                properties: {
+                  four_word_summary: { type: "string" },
+                },
+                required: ["four_word_summary"],
+              },
+            },
+          },
+          limit: 10,
+        })
+      ]);
+
+      expect(crawl1.success).toBe(true);
+      expect(crawl2.success).toBe(true);
+
+      // sum: x+5y credits
+
+      await sleepForBatchBilling();
+
+      const rc2 = (await creditUsage()).remaining_credits;
+
+      if (crawl1.success && crawl2.success) {
+        expect(rc1 - rc2).toBe(crawl1.completed + crawl2.completed * 5);
+      }
+    }, 600000);
+
+    it("bills map correctly", async () => {
+      const rc1 = (await creditUsage()).remaining_credits;
+      await map({ url: "https://firecrawl.dev" });
+      await sleepForBatchBilling();
+      const rc2 = (await creditUsage()).remaining_credits;
+      expect(rc1 - rc2).toBe(1);
+    }, 60000);
+
+    it("bills search correctly", async () => {
+      const rc1 = (await creditUsage()).remaining_credits;
+
+      const results = await search({
+        query: "firecrawl"
+      });
+
+      await sleepForBatchBilling();
+
+      const rc2 = (await creditUsage()).remaining_credits;
+
+      expect(rc1 - rc2).toBe(results.length);
+    }, 60000);
+
+    it("bills search with scrape correctly", async () => {
+      const rc1 = (await creditUsage()).remaining_credits;
+
+      const results = await search({
+        query: "firecrawl",
+        scrapeOptions: {
+          formats: ["markdown"],
+        },
+      });
+
+      await sleepForBatchBilling();
+
+      const rc2 = (await creditUsage()).remaining_credits;
+
+      expect(rc1 - rc2).toBe(results.length);
+    }, 600000);
+
+    it("bills extract correctly", async () => {
+      const rc1 = (await tokenUsage()).remaining_tokens;
+
+      await extract({
+        urls: ["https://firecrawl.dev"],
+        schema: {
+          "type": "object",
+          "properties": {
+            "is_open_source": {
+              "type": "boolean"
+            }
+          },
+          "required": [
+            "is_open_source"
+          ]
+        },
+        origin: "api-sdk",
+      });
+
+      await sleepForBatchBilling();
+
+      const rc2 = (await tokenUsage()).remaining_tokens;
+
+      expect(rc1 - rc2).toBe(305);
+    }, 300000);
+  }
});
@@ -1,4 +1,5 @@
import { scrape, scrapeStatus, scrapeWithFailure } from "./lib";
+import crypto from "crypto";

describe("Scrape tests", () => {
  it.concurrent("mocking works properly", async () => {
@@ -73,27 +74,71 @@ describe("Scrape tests", () => {

    expect(response.markdown).toContain("Firecrawl");

+    // Give time to propagate to read replica
+    await new Promise(resolve => setTimeout(resolve, 1000));
+
    const status = await scrapeStatus(response.metadata.scrapeId!);
    expect(JSON.stringify(status)).toBe(JSON.stringify(response));
  }, 60000);

-  describe("Ad blocking (f-e dependant)", () => {
-    it.concurrent("blocks ads by default", async () => {
-      const response = await scrape({
-        url: "https://www.allrecipes.com/recipe/18185/yum/",
-      });
-
-      expect(response.markdown).not.toContain(".g.doubleclick.net/");
-    }, 30000);
-
-    it.concurrent("doesn't block ads if explicitly disabled", async () => {
-      const response = await scrape({
-        url: "https://www.allrecipes.com/recipe/18185/yum/",
-        blockAds: false,
-      });
-
-      expect(response.markdown).toMatch(/(\.g\.doubleclick\.net|amazon-adsystem\.com)\//);
-    }, 30000);
-  });
+  // describe("Ad blocking (f-e dependant)", () => {
+  //   it.concurrent("blocks ads by default", async () => {
+  //     const response = await scrape({
+  //       url: "https://www.allrecipes.com/recipe/18185/yum/",
+  //     });
+
+  //     expect(response.markdown).not.toContain(".g.doubleclick.net/");
+  //   }, 30000);
+
+  //   it.concurrent("doesn't block ads if explicitly disabled", async () => {
+  //     const response = await scrape({
+  //       url: "https://www.allrecipes.com/recipe/18185/yum/",
+  //       blockAds: false,
+  //     });
+
+  //     expect(response.markdown).toMatch(/(\.g\.doubleclick\.net|amazon-adsystem\.com)\//);
+  //   }, 30000);
+  // });
+
+  describe("Index", () => {
+    it.concurrent("caches properly", async () => {
+      const id = crypto.randomUUID();
+      const url = "https://firecrawl.dev/?testId=" + id;
+
+      const response1 = await scrape({
+        url,
+        maxAge: 120000,
+        storeInCache: false,
+      });
+
+      expect(response1.metadata.cacheState).toBe("miss");
+
+      await new Promise(resolve => setTimeout(resolve, 17000));
+
+      const response2 = await scrape({
+        url,
+        maxAge: 120000,
+      });
+
+      expect(response2.metadata.cacheState).toBe("miss");
+
+      await new Promise(resolve => setTimeout(resolve, 17000));
+
+      const response3 = await scrape({
+        url,
+        maxAge: 120000,
+      });
+
+      expect(response3.metadata.cacheState).toBe("hit");
+      expect(response3.metadata.cachedAt).toBeDefined();
+
+      const response4 = await scrape({
+        url,
+        maxAge: 1,
+      });
+
+      expect(response4.metadata.cacheState).toBe("miss");
+    }, 150000 + 2 * 17000);
+  });

  describe("Change Tracking format", () => {
apps/api/src/controllers/v0/admin/index-queue-prometheus.ts (new file, 12 lines)
@@ -0,0 +1,12 @@
import type { Request, Response } from "express";
import { getIndexInsertQueueLength } from "../../../services";

export async function indexQueuePrometheus(req: Request, res: Response) {
  const queueLength = await getIndexInsertQueueLength();
  res.setHeader("Content-Type", "text/plain");
  res.send(`\
# HELP firecrawl_index_queue_length The number of items in the index insert queue
# TYPE firecrawl_index_queue_length gauge
firecrawl_index_queue_length ${queueLength}
`);
}
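For context, a minimal usage sketch (not part of the commit): this endpoint is registered later in the diff under /admin/${BULL_AUTH_KEY}/index-queue-prometheus, so it could be polled as below. The host, port, and key handling are illustrative assumptions, not anything the commit defines.

// Hypothetical poller for the new metric endpoint; host/port are placeholders.
async function readIndexQueueLength(): Promise<string> {
  const key = process.env.BULL_AUTH_KEY; // must match the server's BULL_AUTH_KEY
  const res = await fetch(`http://localhost:3002/admin/${key}/index-queue-prometheus`);
  // Prometheus text format, e.g. "firecrawl_index_queue_length 42"
  return await res.text();
}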
@@ -2,6 +2,7 @@ import { Request, Response } from "express";
import { RequestWithAuth } from "./types";
import { getACUCTeam } from "../auth";
import { logger } from "../../lib/logger";
+import { RateLimiterMode } from "../../types";

export async function creditUsageController(
  req: RequestWithAuth,
@@ -20,7 +21,7 @@ export async function creditUsageController(
  }

  // Otherwise fetch fresh data
- const chunk = await getACUCTeam(req.auth.team_id);
+ const chunk = await getACUCTeam(req.auth.team_id, false, false, RateLimiterMode.Scrape);
  if (!chunk) {
    res.status(404).json({
      success: false,
|
|||||||
import Redis from "ioredis";
|
import Redis from "ioredis";
|
||||||
import { querySitemapIndex } from "../../scraper/WebScraper/sitemap-index";
|
import { querySitemapIndex } from "../../scraper/WebScraper/sitemap-index";
|
||||||
import { getIndexQueue } from "../../services/queue-service";
|
import { getIndexQueue } from "../../services/queue-service";
|
||||||
|
import { queryIndexAtSplitLevel } from "../../services/index";
|
||||||
|
|
||||||
configDotenv();
|
configDotenv();
|
||||||
const redis = new Redis(process.env.REDIS_URL!);
|
const redis = new Redis(process.env.REDIS_URL!);
|
||||||
@ -43,6 +44,14 @@ interface MapResult {
|
|||||||
mapResults: MapDocument[];
|
mapResults: MapDocument[];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
async function queryIndex(url: string, limit: number, useIndex: boolean): Promise<string[]> {
|
||||||
|
if (!useIndex) {
|
||||||
|
return [];
|
||||||
|
}
|
||||||
|
|
||||||
|
return await queryIndexAtSplitLevel(url, limit);
|
||||||
|
}
|
||||||
|
|
||||||
export async function getMapResults({
|
export async function getMapResults({
|
||||||
url,
|
url,
|
||||||
search,
|
search,
|
||||||
@ -58,6 +67,7 @@ export async function getMapResults({
|
|||||||
mock,
|
mock,
|
||||||
filterByPath = true,
|
filterByPath = true,
|
||||||
flags,
|
flags,
|
||||||
|
useIndex = true,
|
||||||
}: {
|
}: {
|
||||||
url: string;
|
url: string;
|
||||||
search?: string;
|
search?: string;
|
||||||
@ -73,6 +83,7 @@ export async function getMapResults({
|
|||||||
mock?: string;
|
mock?: string;
|
||||||
filterByPath?: boolean;
|
filterByPath?: boolean;
|
||||||
flags: TeamFlags;
|
flags: TeamFlags;
|
||||||
|
useIndex?: boolean;
|
||||||
}): Promise<MapResult> {
|
}): Promise<MapResult> {
|
||||||
const id = uuidv4();
|
const id = uuidv4();
|
||||||
let links: string[] = [url];
|
let links: string[] = [url];
|
||||||
@ -165,11 +176,16 @@ export async function getMapResults({
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Parallelize sitemap index query with search results
|
// Parallelize sitemap index query with search results
|
||||||
const [sitemapIndexResult, ...searchResults] = await Promise.all([
|
const [sitemapIndexResult, indexResults, ...searchResults] = await Promise.all([
|
||||||
querySitemapIndex(url, abort),
|
querySitemapIndex(url, abort),
|
||||||
|
queryIndex(url, limit, useIndex),
|
||||||
...(cachedResult ? [] : pagePromises),
|
...(cachedResult ? [] : pagePromises),
|
||||||
]);
|
]);
|
||||||
|
|
||||||
|
if (indexResults.length > 0) {
|
||||||
|
links.push(...indexResults);
|
||||||
|
}
|
||||||
|
|
||||||
const twoDaysAgo = new Date();
|
const twoDaysAgo = new Date();
|
||||||
twoDaysAgo.setDate(twoDaysAgo.getDate() - 2);
|
twoDaysAgo.setDate(twoDaysAgo.getDate() - 2);
|
||||||
|
|
||||||
@ -333,6 +349,7 @@ export async function mapController(
|
|||||||
mock: req.body.useMock,
|
mock: req.body.useMock,
|
||||||
filterByPath: req.body.filterByPath !== false,
|
filterByPath: req.body.filterByPath !== false,
|
||||||
flags: req.acuc?.flags ?? null,
|
flags: req.acuc?.flags ?? null,
|
||||||
|
useIndex: req.body.useIndex,
|
||||||
}),
|
}),
|
||||||
...(req.body.timeout !== undefined ? [
|
...(req.body.timeout !== undefined ? [
|
||||||
new Promise((resolve, reject) => setTimeout(() => {
|
new Promise((resolve, reject) => setTimeout(() => {
|
||||||
|
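A hedged usage sketch of the new useIndex map option wired up above (not part of the commit); the /v1/map path, local port, and auth header are assumptions about a self-hosted instance.

// Hypothetical request that skips index lookups entirely for this map call.
async function mapWithoutIndex(apiKey: string) {
  const res = await fetch("http://localhost:3002/v1/map", {
    method: "POST",
    headers: { "Content-Type": "application/json", Authorization: `Bearer ${apiKey}` },
    body: JSON.stringify({ url: "https://firecrawl.dev", useIndex: false }),
  });
  // With useIndex: false, the queryIndex() helper above short-circuits to [].
  return await res.json();
}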
@@ -42,6 +42,8 @@ export async function scrapeController(
  });
  //

+ const isDirectToBullMQ = process.env.SEARCH_PREVIEW_TOKEN !== undefined && process.env.SEARCH_PREVIEW_TOKEN === req.body.__searchPreviewToken;
+
  await addScrapeJob(
    {
      url: req.body.url,
@@ -52,6 +54,8 @@ export async function scrapeController(
      teamId: req.auth.team_id,
      saveScrapeResultToGCS: process.env.GCS_FIRE_ENGINE_BUCKET_NAME ? true : false,
      unnormalizedSourceURL: preNormalizedBody.url,
+     useCache: req.body.__experimental_cache ? true : false,
+     bypassBilling: isDirectToBullMQ,
    },
    origin: req.body.origin,
    startTime,
@@ -59,6 +63,7 @@ export async function scrapeController(
    {},
    jobId,
    jobPriority,
+   isDirectToBullMQ,
  );

  const totalWait =
@@ -130,6 +135,7 @@ export async function scrapeController(
    }
  }

+
  return res.status(200).json({
    success: true,
    data: doc,
@@ -40,23 +40,23 @@ export async function searchAndScrapeSearchResult(
  try {
    const searchResults = await search({
      query,
-     num_results: 5
+     num_results: 5,
    });

    const documents = await Promise.all(
-     searchResults.map(result =>
+     searchResults.map((result) =>
        scrapeSearchResult(
          {
            url: result.url,
            title: result.title,
-           description: result.description
+           description: result.description,
          },
          options,
          logger,
          costTracking,
-         flags
-       )
-     )
+         flags,
+       ),
+     ),
    );

    return documents;
@@ -77,6 +77,7 @@ async function scrapeSearchResult(
  costTracking: CostTracking,
  flags: TeamFlags,
  directToBullMQ: boolean = false,
+ isSearchPreview: boolean = false,
): Promise<Document> {
  const jobId = uuidv4();
  const jobPriority = await getJobPriority({
@@ -100,7 +101,7 @@ async function scrapeSearchResult(
      mode: "single_urls" as Mode,
      team_id: options.teamId,
      scrapeOptions: options.scrapeOptions,
-     internalOptions: { teamId: options.teamId, useCache: true },
+     internalOptions: { teamId: options.teamId, useCache: true, bypassBilling: true },
      origin: options.origin,
      is_scrape: true,
      startTime: Date.now(),
@@ -171,6 +172,7 @@ export async function searchController(
  };
  const startTime = new Date().getTime();
  const costTracking = new CostTracking();
+ const isSearchPreview = process.env.SEARCH_PREVIEW_TOKEN !== undefined && process.env.SEARCH_PREVIEW_TOKEN === req.body.__searchPreviewToken;

  try {
    req.body = searchRequestSchema.parse(req.body);
@@ -199,7 +201,9 @@ export async function searchController(
    });

    if (req.body.ignoreInvalidURLs) {
-     searchResults = searchResults.filter((result) => !isUrlBlocked(result.url, req.acuc?.flags ?? null));
+     searchResults = searchResults.filter(
+       (result) => !isUrlBlocked(result.url, req.acuc?.flags ?? null),
+     );
    }

    logger.info("Searching completed", {
@@ -226,12 +230,20 @@ export async function searchController(
    } else {
      logger.info("Scraping search results");
      const scrapePromises = searchResults.map((result) =>
-       scrapeSearchResult(result, {
+       scrapeSearchResult(
+         result,
+         {
            teamId: req.auth.team_id,
            origin: req.body.origin,
            timeout: req.body.timeout,
            scrapeOptions: req.body.scrapeOptions,
-       }, logger, costTracking, req.acuc?.flags ?? null, (req.acuc?.price_credits ?? 0) <= 3000),
+         },
+         logger,
+         costTracking,
+         req.acuc?.flags ?? null,
+         (req.acuc?.price_credits ?? 0) <= 3000,
+         isSearchPreview,
+       ),
      );

      const docs = await Promise.all(scrapePromises);
@@ -257,17 +269,23 @@ export async function searchController(
      }

      // Bill team once for all successful results
-     billTeam(req.auth.team_id, req.acuc?.sub_id, responseData.data.reduce((a,x) => {
+     if (!isSearchPreview) {
+       billTeam(
+         req.auth.team_id,
+         req.acuc?.sub_id,
+         responseData.data.reduce((a, x) => {
            if (x.metadata?.numPages !== undefined && x.metadata.numPages > 0) {
              return a + x.metadata.numPages;
            } else {
              return a + 1;
            }
-     }, 0)).catch((error) => {
+         }, 0),
+       ).catch((error) => {
        logger.error(
          `Failed to bill team ${req.auth.team_id} for ${responseData.data.length} credits: ${error}`,
        );
      });
+     }

    const endTime = new Date().getTime();
    const timeTakenInSeconds = (endTime - startTime) / 1000;
@@ -277,7 +295,8 @@ export async function searchController(
      time_taken: timeTakenInSeconds,
    });

-   logJob({
+   logJob(
+     {
      job_id: jobId,
      success: true,
      num_docs: responseData.data.length,
@@ -289,10 +308,12 @@ export async function searchController(
      scrapeOptions: req.body.scrapeOptions,
      origin: req.body.origin,
      cost_tracking: costTracking,
-   });
+     },
+     false,
+     isSearchPreview,
+   );

    return res.status(200).json(responseData);

  } catch (error) {
    if (
      error instanceof Error &&
@@ -21,7 +21,7 @@ export async function tokenUsageController(
  }

  // Otherwise fetch fresh data
- const chunk = await getACUCTeam(req.auth.team_id, false, true, RateLimiterMode.Extract);
+ const chunk = await getACUCTeam(req.auth.team_id, false, false, RateLimiterMode.Extract);
  if (!chunk) {
    res.status(404).json({
      success: false,
@@ -309,6 +309,10 @@ const baseScrapeOptions = z
    useMock: z.string().optional(),
    blockAds: z.boolean().default(true),
    proxy: z.enum(["basic", "stealth", "auto"]).optional(),
+   maxAge: z.number().int().gte(0).safe().default(0),
+   storeInCache: z.boolean().default(true),
+   __experimental_cache: z.boolean().default(false).optional(),
+   __searchPreviewToken: z.string().optional(),
  })
  .strict(strictMessage);

@@ -656,6 +660,7 @@ export const mapRequestSchema = crawlerOptions
    timeout: z.number().positive().finite().optional(),
    useMock: z.string().optional(),
    filterByPath: z.boolean().default(true),
+   useIndex: z.boolean().default(true),
  })
  .strict(strictMessage);

@@ -752,6 +757,8 @@ export type Document = {
    numPages?: number;
    contentType?: string;
    proxyUsed: "basic" | "stealth";
+   cacheState?: "hit" | "miss";
+   cachedAt?: string;
    // [key: string]: string | string[] | number | { smartScrape: number; other: number; total: number } | undefined;
  };
  serpResults?: {
@@ -1198,6 +1205,7 @@ export const searchRequestSchema = z
    origin: z.string().optional().default("api"),
    timeout: z.number().int().positive().finite().safe().default(60000),
    ignoreInvalidURLs: z.boolean().optional().default(false),
+   __searchPreviewToken: z.string().optional(),
    scrapeOptions: baseScrapeOptions
      .extend({
        formats: z
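A hedged sketch of how the new maxAge/storeInCache options and the cacheState/cachedAt metadata fields added above fit together, mirroring the Index caching test earlier in this commit; the /v1/scrape path, local port, and auth header are assumptions, not part of the diff.

// Hypothetical cached scrape: accept index entries up to 2 minutes old, and
// (storeInCache defaults to true) write this result back to the index.
async function cachedScrape(apiKey: string, url: string) {
  const res = await fetch("http://localhost:3002/v1/scrape", {
    method: "POST",
    headers: { "Content-Type": "application/json", Authorization: `Bearer ${apiKey}` },
    body: JSON.stringify({ url, maxAge: 120000, storeInCache: true }),
  });
  const { data } = await res.json();
  // New metadata fields from this commit: "hit"/"miss" and the index entry's timestamp.
  return { cacheState: data.metadata.cacheState, cachedAt: data.metadata.cachedAt };
}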
@@ -131,6 +131,9 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
      removeBase64Images: false,
      fastMode: false,
      blockAds: false,
+     maxAge: 0,
+     storeInCache: true,
+     __experimental_cache: true,
    },
  }, logger, costTracking, acuc?.flags ?? null);
  return response.length > 0 ? response : [];
@@ -10,6 +10,7 @@ import { wrap } from "./v1";
import { acucCacheClearController } from "../controllers/v0/admin/acuc-cache-clear";
import { checkFireEngine } from "../controllers/v0/admin/check-fire-engine";
import { cclogController } from "../controllers/v0/admin/cclog";
+import { indexQueuePrometheus } from "../controllers/v0/admin/index-queue-prometheus";

export const adminRouter = express.Router();

@@ -49,3 +50,8 @@ adminRouter.get(
  `/admin/${process.env.BULL_AUTH_KEY}/cclog`,
  wrap(cclogController),
);
+
+adminRouter.get(
+  `/admin/${process.env.BULL_AUTH_KEY}/index-queue-prometheus`,
+  wrap(indexQueuePrometheus),
+);
|
|||||||
import { cacheKey, getEntryFromCache } from "../../../../lib/cache";
|
import { cacheKey, getEntryFromCache } from "../../../../lib/cache";
|
||||||
import { EngineScrapeResult } from "..";
|
import { EngineScrapeResult } from "..";
|
||||||
import { Meta } from "../..";
|
import { Meta } from "../..";
|
||||||
import { EngineError } from "../../error";
|
import { EngineError, IndexMissError } from "../../error";
|
||||||
|
|
||||||
export async function scrapeCache(meta: Meta): Promise<EngineScrapeResult> {
|
export async function scrapeCache(meta: Meta): Promise<EngineScrapeResult> {
|
||||||
const key = cacheKey(meta.url, meta.options, meta.internalOptions);
|
const key = cacheKey(meta.url, meta.options, meta.internalOptions);
|
||||||
if (key === null) throw new EngineError("Scrape not eligible for caching");
|
if (key === null) throw new EngineError("Scrape not eligible for caching");
|
||||||
|
|
||||||
const entry = await getEntryFromCache(key);
|
const entry = await getEntryFromCache(key);
|
||||||
if (entry === null) throw new EngineError("Cache missed");
|
if (entry === null) throw new IndexMissError();
|
||||||
|
|
||||||
if (!entry.html) {
|
if (!entry.html) {
|
||||||
throw new EngineError("Cache hit but HTML is missing");
|
throw new EngineError("Cache hit but HTML is missing");
|
||||||
|
@@ -10,6 +10,8 @@ import { scrapePDF } from "./pdf";
import { scrapeURLWithFetch } from "./fetch";
import { scrapeURLWithPlaywright } from "./playwright";
import { scrapeCache } from "./cache";
+import { scrapeURLWithIndex } from "./index/index";
+import { useIndex } from "../../../services";

export type Engine =
  | "fire-engine;chrome-cdp"
@@ -24,7 +26,9 @@ export type Engine =
  | "fetch"
  | "pdf"
  | "docx"
- | "cache";
+ | "cache"
+ | "index"
+ | "index;documents";

const useFireEngine =
  process.env.FIRE_ENGINE_BETA_URL !== "" &&
@@ -38,6 +42,7 @@ const useCache =

export const engines: Engine[] = [
  ...(useCache ? ["cache" as const] : []),
+ ...(useIndex ? ["index" as const, "index;documents" as const] : []),
  ...(useFireEngine
    ? [
        "fire-engine;chrome-cdp" as const,
@@ -114,6 +119,10 @@ export type EngineScrapeResult = {

  numPages?: number;

+ cacheInfo?: {
+   created_at: Date;
+ };
+
  contentType?: string;
};

@@ -124,6 +133,8 @@ const engineHandlers: {
  ) => Promise<EngineScrapeResult>;
} = {
  cache: scrapeCache,
+ index: scrapeURLWithIndex,
+ "index;documents": scrapeURLWithIndex,
  "fire-engine;chrome-cdp": scrapeURLWithFireEngineChromeCDP,
  "fire-engine(retry);chrome-cdp": scrapeURLWithFireEngineChromeCDP,
  "fire-engine;chrome-cdp;stealth": scrapeURLWithFireEngineChromeCDP,
@@ -166,6 +177,24 @@ export const engineOptions: {
    },
    quality: 1000, // cache should always be tried first
  },
+ index: {
+   features: {
+     actions: false,
+     waitFor: true,
+     screenshot: true,
+     "screenshot@fullScreen": true,
+     pdf: false,
+     docx: false,
+     atsv: false,
+     mobile: true,
+     location: true,
+     skipTlsVerification: true,
+     useFastMode: true,
+     stealthProxy: false,
+     disableAdblock: false,
+   },
+   quality: 999, // index should always be tried second ? - MG
+ },
  "fire-engine;chrome-cdp": {
    features: {
      actions: true,
@@ -202,6 +231,24 @@ export const engineOptions: {
    },
    quality: 45,
  },
+ "index;documents": {
+   features: {
+     actions: false,
+     waitFor: true,
+     screenshot: true,
+     "screenshot@fullScreen": true,
+     pdf: true,
+     docx: true,
+     atsv: false,
+     location: true,
+     mobile: true,
+     skipTlsVerification: true,
+     useFastMode: true,
+     stealthProxy: false,
+     disableAdblock: false,
+   },
+   quality: -1,
+ },
  "fire-engine;chrome-cdp;stealth": {
    features: {
      actions: true,
@@ -218,7 +265,7 @@ export const engineOptions: {
      stealthProxy: true,
      disableAdblock: false,
    },
-   quality: -1,
+   quality: -2,
  },
  "fire-engine(retry);chrome-cdp;stealth": {
    features: {
@@ -402,6 +449,41 @@ export function buildFallbackList(meta: Meta): {
    }
  }

+ const shouldUseIndex =
+   useIndex
+   && process.env.FIRECRAWL_INDEX_WRITE_ONLY !== "true"
+   && !meta.options.formats.includes("changeTracking")
+   && meta.options.maxAge !== 0
+   && (
+     meta.options.headers === undefined
+     || Object.keys(meta.options.headers).length === 0
+   )
+   && (
+     meta.options.actions === undefined
+     || meta.options.actions.length === 0
+   )
+   && meta.options.proxy !== "stealth";
+
+ meta.logger.warn("shouldUseIndex", {
+   shouldUseIndex,
+   formatsNoChangeTracking: !meta.options.formats.includes("changeTracking"),
+   maxAge: meta.options.maxAge !== 0,
+   headers: meta.options.headers === undefined || Object.keys(meta.options.headers).length === 0,
+   actions: meta.options.actions === undefined || meta.options.actions.length === 0,
+   proxy: meta.options.proxy !== "stealth",
+ });
+
+ if (!shouldUseIndex) {
+   const indexIndex = _engines.indexOf("index");
+   if (indexIndex !== -1) {
+     _engines.splice(indexIndex, 1);
+   }
+   const indexDocumentsIndex = _engines.indexOf("index;documents");
+   if (indexDocumentsIndex !== -1) {
+     _engines.splice(indexDocumentsIndex, 1);
+   }
+ }
+
  const prioritySum = [...meta.featureFlags].reduce(
    (a, x) => a + featureFlagOptions[x].priority,
    0,
apps/api/src/scraper/scrapeURL/engines/index/index.ts
Normal file
165
apps/api/src/scraper/scrapeURL/engines/index/index.ts
Normal file
@ -0,0 +1,165 @@
|
|||||||
|
import { Document } from "../../../../controllers/v1/types";
|
||||||
|
import { EngineScrapeResult } from "..";
|
||||||
|
import { Meta } from "../..";
|
||||||
|
import { getIndexFromGCS, hashURL, index_supabase_service, normalizeURLForIndex, saveIndexToGCS, generateURLSplits, addIndexInsertJob } from "../../../../services";
|
||||||
|
import { EngineError, IndexMissError } from "../../error";
|
||||||
|
import crypto from "crypto";
|
||||||
|
|
||||||
|
export async function sendDocumentToIndex(meta: Meta, document: Document) {
|
||||||
|
const shouldCache = meta.options.storeInCache
|
||||||
|
&& meta.winnerEngine !== "cache"
|
||||||
|
&& meta.winnerEngine !== "index"
|
||||||
|
&& meta.winnerEngine !== "index;documents"
|
||||||
|
&& !meta.featureFlags.has("actions")
|
||||||
|
&& (
|
||||||
|
meta.options.headers === undefined
|
||||||
|
|| Object.keys(meta.options.headers).length === 0
|
||||||
|
);
|
||||||
|
|
||||||
|
if (!shouldCache) {
|
||||||
|
return document;
|
||||||
|
}
|
||||||
|
|
||||||
|
(async () => {
|
||||||
|
try {
|
||||||
|
const normalizedURL = normalizeURLForIndex(meta.url);
|
||||||
|
const urlHash = await hashURL(normalizedURL);
|
||||||
|
|
||||||
|
const urlSplits = generateURLSplits(normalizedURL);
|
||||||
|
const urlSplitsHash = await Promise.all(urlSplits.map(split => hashURL(split)));
|
||||||
|
|
||||||
|
const indexId = crypto.randomUUID();
|
||||||
|
|
||||||
|
try {
|
||||||
|
await saveIndexToGCS(indexId, {
|
||||||
|
url: normalizedURL,
|
||||||
|
html: document.rawHtml!,
|
||||||
|
statusCode: document.metadata.statusCode,
|
||||||
|
error: document.metadata.error,
|
||||||
|
screenshot: document.screenshot,
|
||||||
|
numPages: document.metadata.numPages,
|
||||||
|
});
|
||||||
|
} catch (error) {
|
||||||
|
meta.logger.error("Failed to save document to index", {
|
||||||
|
error,
|
||||||
|
});
|
||||||
|
return document;
|
||||||
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
|
await addIndexInsertJob({
|
||||||
|
id: indexId,
|
||||||
|
url: normalizedURL,
|
||||||
|
url_hash: urlHash,
|
||||||
|
url_splits: urlSplits,
|
||||||
|
url_splits_hash: urlSplitsHash,
|
||||||
|
original_url: document.metadata.sourceURL ?? meta.url,
|
||||||
|
resolved_url: document.metadata.url ?? document.metadata.sourceURL ?? meta.url,
|
||||||
|
has_screenshot: document.screenshot !== undefined && meta.featureFlags.has("screenshot"),
|
||||||
|
has_screenshot_fullscreen: document.screenshot !== undefined && meta.featureFlags.has("screenshot@fullScreen"),
|
||||||
|
is_mobile: meta.options.mobile,
|
||||||
|
block_ads: meta.options.blockAds,
|
||||||
|
location_country: meta.options.location?.country ?? null,
|
||||||
|
location_languages: meta.options.location?.languages ?? null,
|
||||||
|
status: document.metadata.statusCode,
|
||||||
|
...(urlSplitsHash.slice(0, 10).reduce((a,x,i) => ({
|
||||||
|
...a,
|
||||||
|
[`url_split_${i}_hash`]: x,
|
||||||
|
}), {})),
|
||||||
|
});
|
||||||
|
} catch (error) {
|
||||||
|
meta.logger.error("Failed to add document to index insert queue", {
|
||||||
|
error,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
} catch (error) {
|
||||||
|
meta.logger.error("Failed to save document to index (outer)", {
|
||||||
|
error,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
})();
|
||||||
|
|
||||||
|
return document;
|
||||||
|
}
|
||||||
|
|
||||||
|
const errorCountToRegister = 3;
|
||||||
|
|
||||||
|
export async function scrapeURLWithIndex(meta: Meta): Promise<EngineScrapeResult> {
|
||||||
|
const normalizedURL = normalizeURLForIndex(meta.url);
|
||||||
|
const urlHash = await hashURL(normalizedURL);
|
||||||
|
|
||||||
|
let selector = index_supabase_service
|
||||||
|
.from("index")
|
||||||
|
.select("id, created_at, status")
|
||||||
|
.eq("url_hash", urlHash)
|
||||||
|
.gte("created_at", new Date(Date.now() - meta.options.maxAge).toISOString())
|
||||||
|
.eq("is_mobile", meta.options.mobile)
|
||||||
|
.eq("block_ads", meta.options.blockAds);
|
||||||
|
|
||||||
|
if (meta.featureFlags.has("screenshot")) {
|
||||||
|
selector = selector.eq("has_screenshot", true);
|
||||||
|
}
|
||||||
|
if (meta.featureFlags.has("screenshot@fullScreen")) {
|
||||||
|
selector = selector.eq("has_screenshot_fullscreen", true);
|
||||||
|
}
|
||||||
|
if (meta.options.location?.country) {
|
||||||
|
selector = selector.eq("location_country", meta.options.location.country);
|
||||||
|
} else {
|
||||||
|
selector = selector.is("location_country", null);
|
||||||
|
}
|
||||||
|
if (meta.options.location?.languages) {
|
||||||
|
selector = selector.eq("location_languages", meta.options.location.languages);
|
||||||
|
} else {
|
||||||
|
selector = selector.is("location_languages", null);
|
||||||
|
}
|
||||||
|
|
||||||
|
const { data, error } = await selector
|
||||||
|
.order("created_at", { ascending: false })
|
||||||
|
.limit(5);
|
||||||
|
|
||||||
|
if (error) {
|
||||||
|
throw new EngineError("Failed to retrieve URL from DB index", {
|
||||||
|
cause: error,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
let selectedRow: {
|
||||||
|
id: string;
|
||||||
|
created_at: string;
|
||||||
|
status: number;
|
||||||
|
} | null = null;
|
||||||
|
|
||||||
|
if (data.length > 0) {
|
||||||
|
const newest200Index = data.findIndex(x => x.status >= 200 && x.status < 300);
|
||||||
|
// If the newest 200 index is further back than the allowed error count, we should display the errored index entry
|
||||||
|
if (newest200Index >= errorCountToRegister || newest200Index === -1) {
|
||||||
|
selectedRow = data[0];
|
||||||
|
} else {
|
||||||
|
selectedRow = data[newest200Index];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (selectedRow === null || selectedRow === undefined) {
|
||||||
|
throw new IndexMissError();
|
||||||
|
}
|
||||||
|
|
||||||
|
const id = data[0].id;
|
||||||
|
|
||||||
|
const doc = await getIndexFromGCS(id + ".json");
|
||||||
|
if (!doc) {
|
||||||
|
throw new EngineError("Document not found in GCS");
|
||||||
|
}
|
||||||
|
|
||||||
|
return {
|
||||||
|
url: doc.url,
|
||||||
|
html: doc.html,
|
||||||
|
statusCode: doc.statusCode,
|
||||||
|
error: doc.error,
|
||||||
|
screenshot: doc.screenshot,
|
||||||
|
numPages: doc.numPages,
|
||||||
|
|
||||||
|
cacheInfo: {
|
||||||
|
created_at: new Date(data[0].created_at),
|
||||||
|
}
|
||||||
|
};
|
||||||
|
}
|
@@ -92,3 +92,9 @@ export class PDFInsufficientTimeError extends Error {
    super(`Insufficient time to process PDF of ${pageCount} pages. Please increase the timeout parameter in your scrape request to at least ${minTimeout}ms.`);
  }
}
+
+export class IndexMissError extends Error {
+  constructor() {
+    super("Index doesn't have the page we're looking for");
+  }
+}
@ -23,6 +23,7 @@ import {
|
|||||||
UnsupportedFileError,
|
UnsupportedFileError,
|
||||||
SSLError,
|
SSLError,
|
||||||
PDFInsufficientTimeError,
|
PDFInsufficientTimeError,
|
||||||
|
IndexMissError,
|
||||||
} from "./error";
|
} from "./error";
|
||||||
import { executeTransformers } from "./transformers";
|
import { executeTransformers } from "./transformers";
|
||||||
import { LLMRefusalError } from "./transformers/llmExtract";
|
import { LLMRefusalError } from "./transformers/llmExtract";
|
||||||
@ -59,6 +60,7 @@ export type Meta = {
|
|||||||
status: number;
|
status: number;
|
||||||
} | null | undefined; // undefined: no prefetch yet, null: prefetch came back empty
|
} | null | undefined; // undefined: no prefetch yet, null: prefetch came back empty
|
||||||
costTracking: CostTracking;
|
costTracking: CostTracking;
|
||||||
|
winnerEngine?: Engine;
|
||||||
};
|
};
|
||||||
|
|
||||||
function buildFeatureFlags(
|
function buildFeatureFlags(
|
||||||
@ -189,6 +191,7 @@ export type InternalOptions = {
|
|||||||
unnormalizedSourceURL?: string;
|
unnormalizedSourceURL?: string;
|
||||||
|
|
||||||
saveScrapeResultToGCS?: boolean; // Passed along to fire-engine
|
saveScrapeResultToGCS?: boolean; // Passed along to fire-engine
|
||||||
|
bypassBilling?: boolean;
|
||||||
};
|
};
|
||||||
|
|
||||||
export type EngineResultsTracker = {
|
export type EngineResultsTracker = {
|
||||||
@ -295,11 +298,23 @@ async function scrapeURLLoop(meta: Meta): Promise<ScrapeUrlResponse> {
        unsupportedFeatures,
        result: engineResult as EngineScrapeResult & { markdown: string },
      };
      meta.winnerEngine = engine;
      break;
    }
  } catch (error) {
    if (error instanceof EngineError) {
      meta.logger.warn("Engine " + engine + " could not scrape the page.", {
        error,
      });
      results[engine] = {
        state: "error",
        error: safeguardCircularError(error),
        unexpected: false,
        startedAt,
        finishedAt: Date.now(),
      };
    } else if (error instanceof IndexMissError) {
      meta.logger.info("Engine " + engine + " could not find the page in the index.", {
        error,
      });
      results[engine] = {
@ -385,6 +400,14 @@ async function scrapeURLLoop(meta: Meta): Promise<ScrapeUrlResponse> {
      numPages: result.result.numPages,
      contentType: result.result.contentType,
      proxyUsed: meta.featureFlags.has("stealthProxy") ? "stealth" : "basic",
      ...(results["index"] ? (
        result.result.cacheInfo ? {
          cacheState: "hit",
          cachedAt: result.result.cacheInfo.created_at.toISOString(),
        } : {
          cacheState: "miss",
        }
      ) : {})
    },
  };

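For illustration only (not part of this commit), the spread added above surfaces cache information on the returned document metadata when the index engine produced the result; the values below are made up:

// Hypothetical shape of the added fields when results["index"] exists:
const exampleMetadata = {
  proxyUsed: "basic",
  cacheState: "hit",                     // "miss" when the index row carried no cacheInfo
  cachedAt: "2025-05-01T12:00:00.000Z",  // cacheInfo.created_at serialized via toISOString()
};
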
@ -133,7 +133,7 @@ export async function extractMetadata(
  // Extract all meta tags for custom metadata
  soup("meta").each((i, elem) => {
    try {
      const name = soup(elem).attr("name") || soup(elem).attr("property") || soup(elem).attr("itemprop");
      const content = soup(elem).attr("content");

      if (name && content) {
@ -11,6 +11,9 @@ import { saveToCache } from "./cache";
import { performAgent } from "./agent";

import { deriveDiff } from "./diff";
import { useIndex } from "../../../services/index";
import { sendDocumentToIndex } from "../engines/index/index";

export type Transformer = (
  meta: Meta,
  document: Document,

@ -205,6 +208,7 @@ export const transformerStack: Transformer[] = [
  deriveLinksFromHTML,
  deriveMetadataFromRawHTML,
  uploadScreenshot,
  ...(useIndex ? [sendDocumentToIndex] : []),
  performLLMExtract,
  performAgent,
  deriveDiff,
241
apps/api/src/services/index.ts
Normal file
@ -0,0 +1,241 @@
import { createClient, SupabaseClient } from "@supabase/supabase-js";
import { logger } from "../lib/logger";
import { configDotenv } from "dotenv";
import { Storage } from "@google-cloud/storage";
import crypto from "crypto";
import { redisEvictConnection } from "./redis";
configDotenv();

// SupabaseService class initializes the Supabase client conditionally based on environment variables.
class IndexSupabaseService {
  private client: SupabaseClient | null = null;

  constructor() {
    const supabaseUrl = process.env.INDEX_SUPABASE_URL;
    const supabaseServiceToken = process.env.INDEX_SUPABASE_SERVICE_TOKEN;
    // Only initialize the Supabase client if both URL and Service Token are provided.
    if (!supabaseUrl || !supabaseServiceToken) {
      // Warn the user that Authentication is disabled by setting the client to null
      logger.warn(
        "Index supabase client will not be initialized.",
      );
      this.client = null;
    } else {
      this.client = createClient(supabaseUrl, supabaseServiceToken);
    }
  }

  // Provides access to the initialized Supabase client, if available.
  getClient(): SupabaseClient | null {
    return this.client;
  }
}

const serv = new IndexSupabaseService();

// Using a Proxy to handle dynamic access to the Supabase client or service methods.
// This approach ensures that if Supabase is not configured, any attempt to use it will result in a clear error.
export const index_supabase_service: SupabaseClient = new Proxy(
  serv,
  {
    get: function (target, prop, receiver) {
      const client = target.getClient();
      // If the Supabase client is not initialized, intercept property access to provide meaningful error feedback.
      if (client === null) {
        return () => {
          throw new Error("Index supabase client is not configured.");
        };
      }
      // Direct access to SupabaseService properties takes precedence.
      if (prop in target) {
        return Reflect.get(target, prop, receiver);
      }
      // Otherwise, delegate access to the Supabase client.
      return Reflect.get(client, prop, receiver);
    },
  },
) as unknown as SupabaseClient;

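As a usage sketch only (not part of the new file): once INDEX_SUPABASE_URL and INDEX_SUPABASE_SERVICE_TOKEN are set, the proxied export behaves like a normal supabase-js client; the guard on useIndex matters because the proxy throws when the client is unconfigured. The table and column names below are taken from the queries elsewhere in this diff, and the import path depends on the caller's location.

// Sketch: querying the index table through the proxied client.
import { index_supabase_service, useIndex } from "../services/index";

async function recentResolvedURLs(): Promise<string[]> {
  if (!useIndex) return []; // the proxy throws if the INDEX_SUPABASE_* vars are missing
  const { data, error } = await index_supabase_service
    .from("index")
    .select("resolved_url")
    .gte("created_at", new Date(Date.now() - 24 * 60 * 60 * 1000).toISOString())
    .limit(10);
  return error ? [] : (data ?? []).map((x) => x.resolved_url);
}
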
const credentials = process.env.GCS_CREDENTIALS ? JSON.parse(atob(process.env.GCS_CREDENTIALS)) : undefined;

export async function getIndexFromGCS(url: string): Promise<any | null> {
  // logger.info(`Getting f-engine document from GCS`, {
  //   url,
  // });
  try {
    if (!process.env.GCS_INDEX_BUCKET_NAME) {
      return null;
    }

    const storage = new Storage({ credentials });
    const bucket = storage.bucket(process.env.GCS_INDEX_BUCKET_NAME);
    const blob = bucket.file(`${url}`);
    const [exists] = await blob.exists();
    if (!exists) {
      return null;
    }
    const [blobContent] = await blob.download();
    const parsed = JSON.parse(blobContent.toString());
    return parsed;
  } catch (error) {
    logger.error(`Error getting f-engine document from GCS`, {
      error,
      url,
    });
    return null;
  }
}

export async function saveIndexToGCS(id: string, doc: {
  url: string;
  html: string;
  statusCode: number;
  error?: string;
  screenshot?: string;
  numPages?: number;
}): Promise<void> {
  try {
    if (!process.env.GCS_INDEX_BUCKET_NAME) {
      return;
    }

    const storage = new Storage({ credentials });
    const bucket = storage.bucket(process.env.GCS_INDEX_BUCKET_NAME);
    const blob = bucket.file(`${id}.json`);
    for (let i = 0; i < 3; i++) {
      try {
        await blob.save(JSON.stringify(doc), {
          contentType: "application/json",
        });
        break;
      } catch (error) {
        if (i === 2) {
          throw error;
        } else {
          logger.error(`Error saving index document to GCS, retrying`, {
            error,
            indexId: id,
            i,
          });
        }
      }
    }
  } catch (error) {
    throw new Error("Error saving index document to GCS", {
      cause: error,
    });
  }
}

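A round-trip sketch (not part of the commit) tying the two GCS helpers together; it assumes GCS_INDEX_BUCKET_NAME is configured, and a random UUID stands in for the index row id that the real callers use:

// Illustrative only: store a document under <id>.json and read it back the way
// the index engine does (getIndexFromGCS is called with id + ".json").
import { randomUUID } from "crypto";
import { saveIndexToGCS, getIndexFromGCS } from "../services/index";

async function roundTripExample() {
  const id = randomUUID();
  await saveIndexToGCS(id, {
    url: "https://example.com/docs",
    html: "<html><body>Hello</body></html>",
    statusCode: 200,
  });
  const doc = await getIndexFromGCS(id + ".json");
  return doc?.html; // null is returned when the bucket is unset or the object is missing
}
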
export const useIndex =
  process.env.INDEX_SUPABASE_URL !== "" &&
  process.env.INDEX_SUPABASE_URL !== undefined;

export function normalizeURLForIndex(url: string): string {
  const urlObj = new URL(url);
  urlObj.hash = "";
  urlObj.protocol = "https";

  if (urlObj.port === "80" || urlObj.port === "443") {
    urlObj.port = "";
  }

  if (urlObj.hostname.startsWith("www.")) {
    urlObj.hostname = urlObj.hostname.slice(4);
  }

  if (urlObj.pathname.endsWith("/index.html")) {
    urlObj.pathname = urlObj.pathname.slice(0, -10);
  } else if (urlObj.pathname.endsWith("/index.php")) {
    urlObj.pathname = urlObj.pathname.slice(0, -9);
  } else if (urlObj.pathname.endsWith("/index.htm")) {
    urlObj.pathname = urlObj.pathname.slice(0, -9);
  } else if (urlObj.pathname.endsWith("/index.shtml")) {
    urlObj.pathname = urlObj.pathname.slice(0, -11);
  } else if (urlObj.pathname.endsWith("/index.xml")) {
    urlObj.pathname = urlObj.pathname.slice(0, -9);
  }

  if (urlObj.pathname.endsWith("/")) {
    urlObj.pathname = urlObj.pathname.slice(0, -1);
  }

  return urlObj.toString();
}

export async function hashURL(url: string): Promise<string> {
  return "\\x" + crypto.createHash("sha256").update(url).digest("hex");
}

export function generateURLSplits(url: string): string[] {
  const urls: string[] = [];
  const urlObj = new URL(url);
  urlObj.hash = "";
  urlObj.search = "";
  const pathnameParts = urlObj.pathname.split("/");

  for (let i = 0; i <= pathnameParts.length; i++) {
    urlObj.pathname = pathnameParts.slice(0, i).join("/");
    urls.push(urlObj.href);
  }

  urls.push(url);

  return [...new Set(urls.map(x => normalizeURLForIndex(x)))];
}

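For orientation (not part of the file), rough expected outputs of the URL helpers above, worked out by hand from the code; the example URLs are made up. hashURL prefixes the hex digest with "\x", which reads as a Postgres bytea literal.

import { normalizeURLForIndex, generateURLSplits, hashURL } from "../services/index";

async function urlHelperExamples() {
  normalizeURLForIndex("http://www.example.com/docs/index.html#intro");
  // -> "https://example.com/docs"  (https forced; www., hash, index.html and trailing slash stripped)

  generateURLSplits("https://example.com/docs/guide");
  // -> ["https://example.com/", "https://example.com/docs", "https://example.com/docs/guide"]

  return await hashURL("https://example.com/docs");
  // -> "\x" followed by the 64-character sha-256 hex digest of the URL
}
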
const INDEX_INSERT_QUEUE_KEY = "index-insert-queue";
const INDEX_INSERT_BATCH_SIZE = 1000;

export async function addIndexInsertJob(data: any) {
  await redisEvictConnection.rpush(INDEX_INSERT_QUEUE_KEY, JSON.stringify(data));
}

export async function getIndexInsertJobs(): Promise<any[]> {
  const jobs = (await redisEvictConnection.lpop(INDEX_INSERT_QUEUE_KEY, INDEX_INSERT_BATCH_SIZE)) ?? [];
  return jobs.map(x => JSON.parse(x));
}

export async function processIndexInsertJobs() {
  const jobs = await getIndexInsertJobs();
  if (jobs.length === 0) {
    return;
  }
  logger.info(`Index inserter found jobs to insert`, { jobCount: jobs.length });
  try {
    await index_supabase_service.from("index").insert(jobs);
    logger.info(`Index inserter inserted jobs`, { jobCount: jobs.length });
  } catch (error) {
    logger.error(`Index inserter failed to insert jobs`, { error, jobCount: jobs.length });
  }
}

export async function getIndexInsertQueueLength(): Promise<number> {
  return await redisEvictConnection.llen(INDEX_INSERT_QUEUE_KEY) ?? 0;
}

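A producer-side sketch (not part of the commit): jobs pushed here are raw rows for the "index" table, drained in batches of up to INDEX_INSERT_BATCH_SIZE by processIndexInsertJobs, which the queue worker later in this diff runs on a 15-second interval. The field names in the payload are illustrative assumptions, not the real schema.

import { addIndexInsertJob, getIndexInsertQueueLength } from "../services/index";

async function enqueueExampleRow() {
  // Hypothetical payload; the actual "index" table columns are not spelled out in this diff.
  await addIndexInsertJob({
    resolved_url: "https://example.com/docs",
    status: 200,
  });
  return await getIndexInsertQueueLength(); // rows still waiting in Redis
}
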
export async function queryIndexAtSplitLevel(url: string, limit: number): Promise<string[]> {
  if (!useIndex || process.env.FIRECRAWL_INDEX_WRITE_ONLY === "true") {
    return [];
  }

  const urlObj = new URL(url);
  urlObj.search = "";

  const urlSplitsHash = generateURLSplits(urlObj.href).map(x => hashURL(x));

  const { data, error } = await index_supabase_service
    .from("index")
    .select("resolved_url")
    .eq("url_split_" + (urlSplitsHash.length - 1) + "_hash", urlSplitsHash[urlSplitsHash.length - 1])
    .gte("created_at", new Date(Date.now() - 2 * 24 * 60 * 60 * 1000).toISOString())
    .limit(limit)

  if (error) {
    logger.warn("Error querying index", { error, url, limit });
    return [];
  }

  return [...new Set((data ?? []).map((x) => x.resolved_url))];
}

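Finally, a caller-side sketch (not part of the commit) of the query helper; the crawl kickoff code later in this diff uses it the same way to seed a crawl from previously indexed URLs:

import { queryIndexAtSplitLevel } from "../services/index";

async function seedFromIndex() {
  // Up to 100 recently indexed URLs under the given prefix; [] when the index
  // is disabled or FIRECRAWL_INDEX_WRITE_ONLY is set.
  return await queryIndexAtSplitLevel("https://example.com/blog", 100);
}
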
@ -14,6 +14,7 @@ import { saveCrawlMap } from "./crawl-maps-index";
import { processBillingBatch, queueBillingOperation, startBillingBatchProcessing } from "../billing/batch_billing";
import systemMonitor from "../system-monitor";
import { v4 as uuidv4 } from "uuid";
import { processIndexInsertJobs } from "..";

const workerLockDuration = Number(process.env.WORKER_LOCK_DURATION) || 60000;
const workerStalledCheckInterval =
@ -226,6 +227,8 @@ const workerFun = async (queue: Queue, jobProcessor: (token: string, job: Job) =
  process.exit(0);
};

const INDEX_INSERT_INTERVAL = 15000;

// Start the workers
(async () => {
  // Start index worker

@ -235,6 +238,16 @@ const workerFun = async (queue: Queue, jobProcessor: (token: string, job: Job) =
  startBillingBatchProcessing();
  const billingWorkerPromise = workerFun(getBillingQueue(), processBillingJobInternal);

  const indexInserterInterval = setInterval(async () => {
    if (isShuttingDown) {
      return;
    }

    await processIndexInsertJobs();
  }, INDEX_INSERT_INTERVAL);

  // Wait for both workers to complete (which should only happen on shutdown)
  await Promise.all([indexWorkerPromise, billingWorkerPromise]);

  clearInterval(indexInserterInterval);
})();
@ -21,13 +21,14 @@ function cleanOfNull<T>(x: T): T {
  }
}

export async function logJob(job: FirecrawlJob, force: boolean = false, bypassLogging: boolean = false) {
  try {
    const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === "true";
    if (!useDbAuthentication) {
      return;
    }

    // Redact any pages that have an authorization header
    // actually, Don't. we use the db to retrieve results now. this breaks authed crawls - mogery
    // if (
@ -70,6 +71,10 @@ export async function logJob(job: FirecrawlJob, force: boolean = false) {
      await saveJobToGCS(job);
    }

    if (bypassLogging) {
      return;
    }

    if (force) {
      let i = 0,
        done = false;
@ -194,7 +194,7 @@ export async function addScrapeJob(
      },
    );
  } else {
    await addScrapeJobRaw(webScraperOptions, options, jobId, jobPriority, directToBullMQ);
  }
}

@ -86,6 +86,8 @@ import { robustFetch } from "../scraper/scrapeURL/lib/fetch";
import { RateLimiterMode } from "../types";
import { calculateCreditsToBeBilled } from "../lib/scrape-billing";
import { redisEvictConnection } from "./redis";
import { generateURLSplits, queryIndexAtSplitLevel } from "./index";
import { WebCrawler } from "../scraper/WebScraper/crawler";
import type { Logger } from "winston";

configDotenv();
@ -319,7 +321,7 @@ async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) {
      scrapeOptions: sc.scrapeOptions,
      crawlerOptions: sc.crawlerOptions,
      origin: job.data.origin,
    }, false, job.data.internalOptions?.bypassBilling ?? false);
    logger.info("Logged crawl!");

    const data = {
@ -371,8 +373,10 @@ async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) {
        origin: job.data.origin,
      },
      true,
      job.data.internalOptions?.bypassBilling ?? false,
    );

    // v1 web hooks, call when done with no data, but with event completed
    if (job.data.v1 && job.data.webhook) {
      callWebhook(
@ -911,6 +915,29 @@ const workerFun = async (
  }
};

async function kickoffGetIndexLinks(sc: StoredCrawl, crawler: WebCrawler, url: string) {
  if (sc.crawlerOptions.ignoreSitemap) {
    return [];
  }

  const trimmedURL = new URL(url);
  trimmedURL.search = "";

  const index = await queryIndexAtSplitLevel(
    sc.crawlerOptions.allowBackwardCrawling ? generateURLSplits(trimmedURL.href)[0] : trimmedURL.href,
    sc.crawlerOptions.limit ?? 100,
  );

  const validIndexLinks = crawler.filterLinks(
    index.filter(x => crawler.filterURL(x, trimmedURL.href) !== null),
    sc.crawlerOptions.limit ?? 100,
    sc.crawlerOptions.maxDepth ?? 10,
    false,
  );

  return validIndexLinks;
}

async function processKickoffJob(job: Job & { id: string }, token: string) {
  const logger = _logger.child({
    module: "queue-worker",
@ -1028,6 +1055,61 @@ async function processKickoffJob(job: Job & { id: string }, token: string) {
      });
    }

  const indexLinks = await kickoffGetIndexLinks(sc, crawler, job.data.url);

  if (indexLinks.length > 0) {
    logger.debug("Using index links of length " + indexLinks.length, {
      indexLinksLength: indexLinks.length,
    });

    let jobPriority = await getJobPriority({
      team_id: job.data.team_id,
      basePriority: 21,
    });
    logger.debug("Using job priority " + jobPriority, { jobPriority });

    const jobs = indexLinks.map((url) => {
      const uuid = uuidv4();
      return {
        name: uuid,
        data: {
          url,
          mode: "single_urls" as const,
          team_id: job.data.team_id,
          crawlerOptions: job.data.crawlerOptions,
          scrapeOptions: job.data.scrapeOptions,
          internalOptions: sc.internalOptions,
          origin: job.data.origin,
          crawl_id: job.data.crawl_id,
          sitemapped: true,
          webhook: job.data.webhook,
          v1: job.data.v1,
        },
        opts: {
          jobId: uuid,
          priority: 20,
        },
      };
    });

    logger.debug("Locking URLs...");
    const lockedIds = await lockURLsIndividually(
      job.data.crawl_id,
      sc,
      jobs.map((x) => ({ id: x.opts.jobId, url: x.data.url })),
    );
    const lockedJobs = jobs.filter((x) =>
      lockedIds.find((y) => y.id === x.opts.jobId),
    );
    logger.debug("Adding scrape jobs to Redis...");
    await addCrawlJobs(
      job.data.crawl_id,
      lockedJobs.map((x) => x.opts.jobId),
    );
    logger.debug("Adding scrape jobs to BullMQ...");
    await addScrapeJobs(lockedJobs);
  }

  logger.debug("Done queueing jobs!");

  await finishCrawlKickoff(job.data.crawl_id);
@ -1048,7 +1130,7 @@ async function processKickoffJob(job: Job & { id: string }, token: string) {
async function billScrapeJob(job: Job & { id: string }, document: Document, logger: Logger, costTracking?: CostTracking) {
  let creditsToBeBilled: number | null = null;

  if (job.data.is_scrape !== true && !job.data.internalOptions?.bypassBilling) {
    creditsToBeBilled = await calculateCreditsToBeBilled(job.data.scrapeOptions, document, job.id, costTracking);

    if (
@ -1378,6 +1460,7 @@ async function processJob(job: Job & { id: string }, token: string) {
          credits_billed,
        },
        true,
        job.data.internalOptions?.bypassBilling ?? false,
      );

      if (job.data.webhook && job.data.mode !== "crawl" && job.data.v1) {
@ -1424,7 +1507,7 @@ async function processJob(job: Job & { id: string }, token: string) {
        cost_tracking: costTracking,
        pdf_num_pages: doc.metadata.numPages,
        credits_billed,
      }, false, job.data.internalOptions?.bypassBilling ?? false);
    }

    logger.info(`🐂 Job done ${job.id}`);
@ -1523,6 +1606,7 @@ async function processJob(job: Job & { id: string }, token: string) {
        cost_tracking: costTracking,
      },
      true,
      job.data.internalOptions?.bypassBilling ?? false,
    );
    return data;
  }

@ -1,6 +1,6 @@
{
  "name": "@mendable/firecrawl-js",
  "version": "1.25.3",
  "description": "JavaScript SDK for Firecrawl API",
  "main": "dist/index.js",
  "types": "dist/index.d.ts",

@ -120,6 +120,7 @@ export interface CrawlScrapeOptions {
  removeBase64Images?: boolean;
  blockAds?: boolean;
  proxy?: "basic" | "stealth" | "auto";
  storeInCache?: boolean;
}

export type Action = {

@ -46,6 +46,9 @@ x-common-env: &common-env
services:
  playwright-service:
    # NOTE: If you don't want to build the service locally,
    # uncomment the build: statement and comment out the image: statement
    # image: ghcr.io/mendableai/playwright-service:latest
    build: apps/playwright-service-ts
    environment:
      PORT: 3000