import axios from "axios";
import { promises as fs } from "fs";
import { v4 as uuidV4 } from "uuid";

interface Result {
  start_url: string;
  job_id?: string;
  idempotency_key?: string;
  result_data_jsonb?: any;
}

async function sendCrawl(result: Result): Promise<string | undefined> {
  // Generated per request and recorded on the result; note that it is not
  // currently sent to the API with the request itself.
  const idempotencyKey = uuidV4();
  const url = result.start_url;
  try {
    const response = await axios.post(
      "https://staging-firecrawl-scraper-js.fly.dev/v0/crawl",
      {
        url: url,
        crawlerOptions: {
          limit: 75,
        },
        pageOptions: {
          includeHtml: true,
          replaceAllPathsWithAbsolutePaths: true,
          waitFor: 1000,
        },
      },
      {
        headers: {
          "Content-Type": "application/json",
          // The API key is assumed to be supplied via the environment;
          // the original source had an empty bearer token here.
          Authorization: `Bearer ${process.env.FIRECRAWL_API_KEY ?? ""}`,
        },
      }
    );
    result.idempotency_key = idempotencyKey;
    return response.data.jobId;
  } catch (error) {
    console.error("Error sending crawl:", error);
    return undefined;
  }
}
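
// Sketch (not wired in above): if the crawl endpoint accepts an idempotency
// header, the generated key could be sent along with the request. The header
// name "x-idempotency-key" is an assumption, not confirmed by this file.
//
//   const response = await axios.post(url, body, {
//     headers: {
//       "Content-Type": "application/json",
//       Authorization: `Bearer ${process.env.FIRECRAWL_API_KEY ?? ""}`,
//       "x-idempotency-key": idempotencyKey,
//     },
//   });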

async function getContent(result: Result): Promise<boolean> {
  let attempts = 0;
  // Poll the job status for up to 120 attempts.
  while (attempts < 120) {
    try {
      const response = await axios.get(
        `https://staging-firecrawl-scraper-js.fly.dev/v0/crawl/status/${result.job_id}`,
        {
          headers: {
            "Content-Type": "application/json",
            // The API key is assumed to be supplied via the environment.
            Authorization: `Bearer ${process.env.FIRECRAWL_API_KEY ?? ""}`,
          },
        }
      );
      if (response.data.status === "completed") {
        result.result_data_jsonb = response.data.data;
        // Job actually completed
        return true;
      }
    } catch (error) {
      console.error("Error getting content:", error);
    }
    // Sleep for a random 5-20 seconds between polls to spread out requests.
    const randomSleep = Math.floor(Math.random() * 15000) + 5000;
    await new Promise((resolve) => setTimeout(resolve, randomSleep));
    attempts++;
  }
  // Set result data to null if the job timed out.
  result.result_data_jsonb = null;
  return false;
}
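
// Alternative sketch: a deterministic exponential backoff could replace the
// random sleep in getContent (purely illustrative, capped at 20 seconds):
//
//   const backoff = Math.min(1000 * 2 ** attempts, 20000);
//   await new Promise((resolve) => setTimeout(resolve, backoff));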

async function processResults(results: Result[]): Promise<void> {
  let processedCount = 0;
  let starterCount = 0;
  const queue: Result[] = [];
  const processedUrls = new Set<string>();

  // Seed the queue with the first 100 results. The queue is only consumed by
  // the commented-out refill logic below; the active path further down
  // processes `results` directly in batches.
  for (let i = 0; i < Math.min(100, results.length); i++) {
    queue.push(results[i]);
    processedUrls.add(results[i].start_url);
  }

  // Process a single result: request the crawl, record the job id, and
  // checkpoint progress to disk.
  const processSingleResult = async (result: Result) => {
    const jobId = await sendCrawl(result);
    if (jobId) {
      starterCount++;
      console.log(`Job requested count: ${starterCount}`);
      result.job_id = jobId;
      processedCount++;
      // Save the start_url/job_id pairs collected so far to the file.
      try {
        const resultsWithJobIds = results.map((r) => ({
          start_url: r.start_url,
          job_id: r.job_id,
        }));
        await fs.writeFile(
          "results_with_job_id_4000_6000.json",
          JSON.stringify(resultsWithJobIds, null, 4)
        );
      } catch (error) {
        console.error(
          "Error writing to results_with_job_id_4000_6000.json:",
          error
        );
      }

      // Alternative (disabled): pull the next unprocessed result into the
      // queue and kick it off as soon as this one finishes.
      // if (processedCount < results.length) {
      //   for (let i = queue.length; i < results.length; i++) {
      //     if (!processedUrls.has(results[i].start_url)) {
      //       const nextResult = results[i];
      //       console.log("Next result:", nextResult.start_url);
      //       queue.push(nextResult);
      //       processedUrls.add(nextResult.start_url);
      //       console.log(`Queue length: ${queue.length}`);
      //       processSingleResult(nextResult);
      //       break;
      //     }
      //   }
      // }
    }
  };

  // Alternative (disabled): start the initial queue concurrently, pausing for
  // a minute after every 500 results.
  // for (let i = 0; i < queue.length; i++) {
  //   processSingleResult(queue[i]);
  //   if ((i + 1) % 500 === 0) {
  //     console.log(`Processed ${i + 1} results, waiting for 1 minute before adding the next batch...`);
  //     await new Promise(resolve => setTimeout(resolve, 60 * 1000)); // Wait for 1 minute
  //   }
  // }
  // await Promise.all(queue.map((result) => processSingleResult(result)));

  // Active path: dispatch results in batches of 100, waiting 1 minute between
  // dispatches. Batches are not awaited, so consecutive batches overlap.
  for (let i = 0; i < results.length; i += 100) {
    const batch = results.slice(i, i + 100);
    Promise.all(batch.map((result) => processSingleResult(result)))
      .then(() => {
        console.log(`Processed ${Math.min(i + 100, results.length)} results.`);
      })
      .catch((error) => {
        console.error(`Error processing batch starting at index ${i}:`, error);
      });
    await new Promise((resolve) => setTimeout(resolve, 60 * 1000)); // Wait for 1 minute
  }
}
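
// Hypothetical follow-up (never called in this script): once jobs have been
// dispatched, getContent could collect their output. The output filename is
// illustrative only.
async function collectResults(results: Result[]): Promise<void> {
  await Promise.all(
    results.filter((r) => r.job_id).map((r) => getContent(r))
  );
  await fs.writeFile(
    "results_with_content_4000_6000.json",
    JSON.stringify(results, null, 4)
  );
}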

async function getStartUrls(): Promise<Result[]> {
  try {
    const data = await fs.readFile("starturls.json", "utf-8");
    return JSON.parse(data);
  } catch (error) {
    console.error("Error reading starturls.json:", error);
    return [];
  }
}

async function main() {
  // Process entries 4000-6000 of the start-url list (zero-based slice),
  // matching the output filename used in processResults.
  const results: Result[] = (await getStartUrls()).slice(3999, 6000);
  // console.log(results.map((r) => r.start_url).slice(0, 3));

  processResults(results)
    .then(() => {
      console.log("All results processed.");
    })
    .catch((error) => {
      console.error("Error processing results:", error);
    });
}

main();