Nick: resolved conflicts
commit 4d5477f357
@@ -37,4 +37,6 @@ COPY --from=go-base /app/src/lib/go-html-to-md/html-to-markdown.so /app/dist/src
 
 # Start the server by default, this can be overwritten at runtime
 EXPOSE 8080
 ENV PUPPETEER_EXECUTABLE_PATH="/usr/bin/chromium"
+
+ENTRYPOINT "/app/docker-entrypoint.sh"
19  apps/api/docker-entrypoint.sh  Executable file
@@ -0,0 +1,19 @@
+#!/bin/bash -e
+
+if [ $UID -eq 0 ]; then
+    ulimit -n 65535
+    echo "NEW ULIMIT: $(ulimit -n)"
+else
+    echo ENTRYPOINT DID NOT RUN AS ROOT
+fi
+
+if [ $FLY_PROCESS_GROUP = "app" ]; then
+    echo "RUNNING app"
+    node --max-old-space-size=8192 dist/src/index.js
+elif [ $FLY_PROCESS_GROUP = "worker" ]; then
+    echo "RUNNING worker"
+    node --max-old-space-size=8192 dist/src/services/queue-worker.js
+else
+    echo "NO FLY PROCESS GROUP"
+    node --max-old-space-size=8192 dist/src/index.js
+fi
@@ -67,6 +67,7 @@
     "async": "^3.2.5",
     "async-mutex": "^0.5.0",
     "axios": "^1.3.4",
+    "axios-retry": "^4.5.0",
     "bottleneck": "^2.19.5",
     "bullmq": "^5.11.0",
     "cacheable-lookup": "^6.1.0",
21  apps/api/pnpm-lock.yaml  generated
@@ -65,6 +65,9 @@ importers:
       axios:
         specifier: ^1.3.4
         version: 1.7.2
+      axios-retry:
+        specifier: ^4.5.0
+        version: 4.5.0(axios@1.7.2)
       bottleneck:
         specifier: ^2.19.5
         version: 2.19.5
@@ -1903,6 +1906,11 @@ packages:
   axios-retry@3.9.1:
     resolution: {integrity: sha512-8PJDLJv7qTTMMwdnbMvrLYuvB47M81wRtxQmEdV5w4rgbTXTt+vtPkXwajOfOdSyv/wZICJOC+/UhXH4aQ/R+w==}
 
+  axios-retry@4.5.0:
+    resolution: {integrity: sha512-aR99oXhpEDGo0UuAlYcn2iGRds30k366Zfa05XWScR9QaQD4JYiP3/1Qt1u7YlefUOK+cn0CcwoL1oefavQUlQ==}
+    peerDependencies:
+      axios: 0.x || 1.x
+
   axios@0.26.1:
     resolution: {integrity: sha512-fPwcX4EvnSHuInCMItEhAGnaSEXRBjtzh9fOtsE6E1G6p7vl7edEeZe11QHf18+6+9gR5PbKV/sGKNaD8YaMeA==}
 
@@ -4518,8 +4526,8 @@ packages:
     engines: {node: '>=14.17'}
     hasBin: true
 
-  typescript@5.5.4:
-    resolution: {integrity: sha512-Mtq29sKDAEYP7aljRgtPOpTvOfbwRWlS6dPRzwjdE+C0R4brX/GUyhHSecbHMFLNBLcJIPt9nl9yG5TZ1weH+Q==}
+  typescript@5.6.2:
+    resolution: {integrity: sha512-NW8ByodCSNCwZeghjN3o+JX5OFH0Ojg6sadjEKY4huZ52TqbJTJnDo5+Tw98lSy63NZvi4n+ez5m2u5d4PkZyw==}
     engines: {node: '>=14.17'}
     hasBin: true
 
@@ -6950,6 +6958,11 @@ snapshots:
       '@babel/runtime': 7.24.6
       is-retry-allowed: 2.2.0
 
+  axios-retry@4.5.0(axios@1.7.2):
+    dependencies:
+      axios: 1.7.2
+      is-retry-allowed: 2.2.0
+
   axios@0.26.1:
     dependencies:
       follow-redirects: 1.15.6
@@ -9195,7 +9208,7 @@ snapshots:
       csv-parse: 5.5.6
       gpt3-tokenizer: 1.1.5
       openai: 3.3.0
-      typescript: 5.5.4
+      typescript: 5.6.2
       uuid: 9.0.1
       zod: 3.23.8
     transitivePeerDependencies:
@@ -9793,7 +9806,7 @@ snapshots:
 
   typescript@5.4.5: {}
 
-  typescript@5.5.4: {}
+  typescript@5.6.2: {}
 
   typesense@1.8.2(@babel/runtime@7.24.6):
     dependencies:
@@ -61,11 +61,10 @@ export async function setCachedACUC(api_key: string, acuc: AuthCreditUsageChunk
 
       // Cache for 10 minutes. This means that changing subscription tier could have
       // a maximum of 10 minutes of a delay. - mogery
-      await setValue(cacheKeyACUC, JSON.stringify(acuc), 600);
+      await setValue(cacheKeyACUC, JSON.stringify(acuc), 600, true);
     });
   } catch (error) {
-    Logger.error(`Error updating cached ACUC: ${error}`);
-    Sentry.captureException(error);
+    Logger.error(`Error updating cached ACUC ${cacheKeyACUC}: ${error}`);
   }
 }
 
@@ -60,8 +60,8 @@ export async function scrapeHelper(
     mode: "single_urls",
     crawlerOptions,
     team_id,
-    plan,
     pageOptions,
+    plan,
     extractorOptions,
     origin: req.body.origin ?? defaultOrigin,
     is_scrape: true,
@@ -197,7 +197,7 @@ export async function scrapeController(req: Request, res: Response) {
       await checkTeamCredits(chunk, team_id, 1);
       if (!creditsCheckSuccess) {
         earlyReturn = true;
-        return res.status(402).json({ error: "Insufficient credits" });
+        return res.status(402).json({ error: "Insufficient credits. For more credits, you can upgrade your plan at https://firecrawl.dev/pricing" });
       }
     } catch (error) {
       Logger.error(error);
@@ -37,7 +37,12 @@ export async function searchHelper(
 
   const tbs = searchOptions.tbs ?? null;
   const filter = searchOptions.filter ?? null;
-  const num_results = searchOptions.limit ?? 7;
+  let num_results = Math.min(searchOptions.limit ?? 7, 10);
+
+  if (team_id === "d97c4ceb-290b-4957-8432-2b2a02727d95") {
+    num_results = 1;
+  }
+
   const num_results_buffer = Math.floor(num_results * 1.5);
 
   let res = await search({
@@ -98,7 +103,7 @@ export async function searchHelper(
   if (Sentry.isInitialized()) {
     for (const job of jobDatas) {
       // add with sentry instrumentation
-      jobs.push(await addScrapeJob(job.data as any, {}, job.opts.jobId));
+      jobs.push(await addScrapeJob(job.data as any, {}, job.opts.jobId, job.opts.priority));
     }
   } else {
     jobs = await getScrapeQueue().addBulk(jobDatas);
@@ -170,7 +175,7 @@ export async function searchController(req: Request, res: Response) {
       jobId,
       req,
       team_id,
-      chunk.sub_id,
+      chunk?.sub_id,
       crawlerOptions,
       pageOptions,
       searchOptions,
@@ -86,8 +86,8 @@ export async function crawlStatusController(req: RequestWithAuth<CrawlStatusPara
       }
     }
 
-    // if we ran over the bytes limit, remove the last document
-    if (bytes > bytesLimit) {
+    // if we ran over the bytes limit, remove the last document, except if it's the only document
+    if (bytes > bytesLimit && doneJobs.length !== 1) {
       doneJobs.splice(doneJobs.length - 1, 1);
     }
   } else {
@@ -152,7 +152,7 @@ export async function mapController(
   // remove duplicates that could be due to http/https or www
   links = removeDuplicateUrls(links);
 
-  billTeam(req.auth.team_id, req.acuc.sub_id, 1).catch((error) => {
+  billTeam(req.auth.team_id, req.acuc?.sub_id, 1).catch((error) => {
     Logger.error(
       `Failed to bill team ${req.auth.team_id} for 1 credit: ${error}`
     );
@@ -109,7 +109,7 @@ export async function scrapeController(
     creditsToBeBilled = 5;
   }
 
-  billTeam(req.auth.team_id, req.acuc.sub_id, creditsToBeBilled).catch(error => {
+  billTeam(req.auth.team_id, req.acuc?.sub_id, creditsToBeBilled).catch(error => {
     Logger.error(`Failed to bill team ${req.auth.team_id} for ${creditsToBeBilled} credits: ${error}`);
     // Optionally, you could notify an admin or add to a retry queue here
   });
@@ -390,6 +390,7 @@ export function legacyCrawlerOptions(x: CrawlerOptions) {
     generateImgAltText: false,
     allowBackwardCrawling: x.allowBackwardLinks,
     allowExternalContentLinks: x.allowExternalLinks,
+    ignoreSitemap: x.ignoreSitemap,
   };
 }
 
@@ -37,6 +37,10 @@ export async function getJobPriority({
   team_id: string;
   basePriority?: number;
 }): Promise<number> {
+  if (team_id === "d97c4ceb-290b-4957-8432-2b2a02727d95") {
+    return 50;
+  }
+
   try {
     const setKey = SET_KEY_PREFIX + team_id;
 
@@ -19,7 +19,7 @@ export class Logger {
   };
 
   static log (message: string, level: LogLevel) {
-    const logLevel: LogLevel = LogLevel[process.env.LOGGING_LEVEL as keyof typeof LogLevel] || LogLevel.INFO;
+    const logLevel: LogLevel = LogLevel[process.env.LOGGING_LEVEL as keyof typeof LogLevel] || LogLevel.TRACE;
     const levels = [LogLevel.NONE, LogLevel.ERROR, LogLevel.WARN, LogLevel.INFO, LogLevel.DEBUG, LogLevel.TRACE];
     const currentLevelIndex = levels.indexOf(logLevel);
     const messageLevelIndex = levels.indexOf(level);
@@ -35,7 +35,7 @@ function checkCreditsMiddleware(minimum?: number): (req: RequestWithAuth, res: R
     if (!success) {
       Logger.error(`Insufficient credits: ${JSON.stringify({ team_id: req.auth.team_id, minimum, remainingCredits })}`);
       if (!res.headersSent) {
-        return res.status(402).json({ success: false, error: "Insufficient credits" });
+        return res.status(402).json({ success: false, error: "Insufficient credits. For more credits, you can upgrade your plan at https://firecrawl.dev/pricing." });
       }
     }
     req.account = { remainingCredits };
@@ -6,7 +6,11 @@ import { fetchAndProcessPdf } from "../utils/pdfProcessor";
 import { universalTimeout } from "../global";
 import { Logger } from "../../../lib/logger";
 import * as Sentry from "@sentry/node";
+import axiosRetry from 'axios-retry';
 
+axiosRetry(axios, { retries: 3 , onRetry:()=>{
+    console.log("Retrying (fire-engine)...");
+}, retryDelay: axiosRetry.exponentialDelay});
 /**
  * Scrapes a URL with Fire-Engine
  * @param url The URL to scrape
@@ -203,10 +207,10 @@ export async function scrapWithFireEngine({
     }
   } catch (error) {
     if (error.code === "ECONNABORTED") {
-      Logger.debug(`⛏️ Fire-Engine: Request timed out for ${url}`);
+      Logger.debug(`⛏️ Fire-Engine (catch block): Request timed out for ${url}`);
       logParams.error_message = "Request timed out";
     } else {
-      Logger.debug(`⛏️ Fire-Engine: Failed to fetch url: ${url} | Error: ${error}`);
+      Logger.debug(`⛏️ Fire-Engine(catch block): Failed to fetch url: ${url} | Error: ${error}`);
       logParams.error_message = error.message || error;
     }
     return { html: "", pageStatusCode: null, pageError: logParams.error_message };
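Side note (not part of the commit): the axios-retry wiring added above attaches the interceptor to the global axios object. A minimal TypeScript sketch of the same idea on a dedicated instance follows, assuming axios and axios-retry are installed; the fireEngineClient name and timeout value are illustrative only, not code from this repository.

import axios from "axios";
import axiosRetry from "axios-retry";

// Hypothetical dedicated client, so the retry policy does not affect other axios callers.
const fireEngineClient = axios.create({ timeout: 30_000 });

axiosRetry(fireEngineClient, {
  retries: 3,                               // same retry count as the diff above
  retryDelay: axiosRetry.exponentialDelay,  // exponential backoff between attempts
  onRetry: (retryCount) => {
    console.log(`Retrying (fire-engine)... attempt ${retryCount}`);
  },
});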
@@ -29,8 +29,8 @@ const useFireEngine = process.env.FIRE_ENGINE_BETA_URL !== '' && process.env.FIR
 
 export const baseScrapers = [
   useFireEngine ? "fire-engine;chrome-cdp" : undefined,
-  useScrapingBee ? "scrapingBee" : undefined,
   useFireEngine ? "fire-engine" : undefined,
+  useScrapingBee ? "scrapingBee" : undefined,
   useFireEngine ? undefined : "playwright",
   useScrapingBee ? "scrapingBeeLoad" : undefined,
   "fetch",
@@ -95,8 +95,8 @@ function getScrapingFallbackOrder(
 
   let defaultOrder = [
     useFireEngine ? "fire-engine;chrome-cdp" : undefined,
-    useScrapingBee ? "scrapingBee" : undefined,
     useFireEngine ? "fire-engine" : undefined,
+    useScrapingBee ? "scrapingBee" : undefined,
     useScrapingBee ? "scrapingBeeLoad" : undefined,
     useFireEngine ? undefined : "playwright",
     "fetch",
@@ -424,7 +424,7 @@ export async function scrapSingleUrl(
         Logger.debug(`⛏️ ${scraper}: Successfully scraped ${urlToScrap} with text length >= 100 or screenshot, breaking`);
         break;
       }
-      if (pageStatusCode && (pageStatusCode == 404 || pageStatusCode == 500)) {
+      if (pageStatusCode && (pageStatusCode == 404)) {
        Logger.debug(`⛏️ ${scraper}: Successfully scraped ${urlToScrap} with status code 404, breaking`);
         break;
       }
File diff suppressed because one or more lines are too long
@@ -1,30 +1,31 @@
-import cheerio, { AnyNode, Cheerio } from "cheerio";
+import { AnyNode, Cheerio, load } from "cheerio";
 import { PageOptions } from "../../../lib/entities";
 import { excludeNonMainTags } from "./excludeTags";
 
 export const removeUnwantedElements = (
   html: string,
-  pageOptions: PageOptions
+  pageOptions: PageOptions,
 ) => {
-  const soup = cheerio.load(html);
+  let soup = load(html);
 
   if (
     pageOptions.onlyIncludeTags &&
     pageOptions.onlyIncludeTags.length > 0 &&
-    pageOptions.onlyIncludeTags[0] !== ''
+    pageOptions.onlyIncludeTags[0] !== ""
   ) {
     if (typeof pageOptions.onlyIncludeTags === "string") {
       pageOptions.onlyIncludeTags = [pageOptions.onlyIncludeTags];
     }
     if (pageOptions.onlyIncludeTags.length !== 0) {
       // Create a new root element to hold the tags to keep
-      const newRoot = cheerio.load("<div></div>")("div");
+      const newRoot = load("<div></div>")("div");
       pageOptions.onlyIncludeTags.forEach((tag) => {
         soup(tag).each((index, element) => {
           newRoot.append(soup(element).clone());
         });
       });
-      return newRoot.html();
+
+      soup = load(newRoot.html());
     }
   }
 
@@ -33,7 +34,7 @@ export const removeUnwantedElements = (
   if (
     pageOptions.removeTags &&
     pageOptions.removeTags.length > 0 &&
-    pageOptions.removeTags[0] !== ''
+    pageOptions.removeTags[0] !== ""
   ) {
     if (typeof pageOptions.removeTags === "string") {
       pageOptions.removeTags = [pageOptions.removeTags];
@@ -51,11 +52,11 @@ export const removeUnwantedElements = (
         const attributes = element.attribs;
         const tagNameMatches = regexPattern.test(element.name);
         const attributesMatch = Object.keys(attributes).some((attr) =>
-          regexPattern.test(`${attr}="${attributes[attr]}"`)
+          regexPattern.test(`${attr}="${attributes[attr]}"`),
         );
         if (tag.startsWith("*.")) {
           classMatch = Object.keys(attributes).some((attr) =>
-            regexPattern.test(`class="${attributes[attr]}"`)
+            regexPattern.test(`class="${attributes[attr]}"`),
           );
         }
         return tagNameMatches || attributesMatch || classMatch;
@@ -66,7 +66,7 @@ export async function supaCheckTeamCredits(chunk: AuthCreditUsageChunk, team_id:
       chunk.sub_current_period_start,
       chunk.sub_current_period_end
     );
-    return { success: false, message: "Insufficient credits, please upgrade!", remainingCredits: chunk.remaining_credits, chunk };
+    return { success: false, message: "Insufficient credits. For more credits, you can upgrade your plan at https://firecrawl.dev/pricing.", remainingCredits: chunk.remaining_credits, chunk };
   } else if (creditUsagePercentage >= 0.8 && creditUsagePercentage < 1) {
     // Send email notification for approaching credit limit
     sendNotification(
@@ -35,12 +35,15 @@ redisRateLimitClient.on("connect", (err) => {
  * @param {string} value The value to store.
  * @param {number} [expire] Optional expiration time in seconds.
  */
-const setValue = async (key: string, value: string, expire?: number) => {
-  if (expire) {
+const setValue = async (key: string, value: string, expire?: number, nx = false) => {
+  if (expire && !nx) {
     await redisRateLimitClient.set(key, value, "EX", expire);
   } else {
     await redisRateLimitClient.set(key, value);
   }
+  if (expire && nx) {
+    await redisRateLimitClient.expire(key, expire, "NX");
+  }
 };
 
 /**
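Side note (not part of the commit): a minimal TypeScript sketch of how the new nx flag on setValue behaves at the setCachedACUC call site above (setValue(cacheKeyACUC, ..., 600, true)). It assumes ioredis and Redis 7.0+, where EXPIRE accepts an NX option that only sets a TTL when the key has none; the key, value, and connection URL here are illustrative only.

import Redis from "ioredis";

const redis = new Redis(process.env.REDIS_RATE_LIMIT_URL ?? "redis://localhost:6379");

async function demo() {
  // With nx = true, the value is written with a plain SET (no EX argument)...
  await redis.set("example:acuc:key", JSON.stringify({ remaining_credits: 100 }));
  // ...and the 10-minute TTL is attached separately with EXPIRE ... NX,
  // which per the Redis docs only applies when the key has no expiry yet (Redis 7.0+).
  await redis.expire("example:acuc:key", 600, "NX");
}

demo().finally(() => redis.disconnect());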
283  examples/o1_job_recommender/o1_job_recommender.py  Normal file
@@ -0,0 +1,283 @@
+# %%
+# %%
+import os
+import requests
+import json
+from dotenv import load_dotenv
+from openai import OpenAI
+
+# ANSI color codes
+class Colors:
+    CYAN = '\033[96m'
+    YELLOW = '\033[93m'
+    GREEN = '\033[92m'
+    RED = '\033[91m'
+    MAGENTA = '\033[95m'
+    BLUE = '\033[94m'
+    RESET = '\033[0m'
+# Load environment variables
+load_dotenv()
+
+# Initialize the FirecrawlApp with your API key
+firecrawl_api_key = os.getenv("FIRECRAWL_API_KEY")
+client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
+
+# Set the jobs page URL
+jobs_page_url = "https://openai.com/careers/search"
+
+# Resume
+resume_paste = """"
+Eric Ciarla
+Co-Founder @ Firecrawl
+San Francisco, California, United States
+Summary
+Building…
+Experience
+Firecrawl
+Co-Founder
+April 2024 - Present (6 months)
+San Francisco, California, United States
+Firecrawl by Mendable. Building data extraction infrastructure for AI. Used by
+Amazon, Zapier, and Nvidia (YC S22)
+Mendable
+2 years 7 months
+Co-Founder @ Mendable.ai
+March 2022 - Present (2 years 7 months)
+San Francisco, California, United States
+- Built an AI powered search platform that that served millions of queries for
+hundreds of customers (YC S22)
+- We were one of the first LLM powered apps adopted by industry leaders like
+Coinbase, Snap, DoorDash, and MongoDB
+Co-Founder @ SideGuide
+March 2022 - Present (2 years 7 months)
+San Francisco, California, United States
+- Built and scaled an online course platform with a community of over 50,000
+developers
+- Selected for Y Combinator S22 batch, 2% acceptance rate
+Fracta
+Data Engineer
+2022 - 2022 (less than a year)
+Palo Alto, California, United States
+- Demoed tool during sales calls and provided technical support during the
+entire customer lifecycle
+Page 1 of 2
+- Mined, wrangled, & visualized geospatial and water utility data for predictive
+analytics & ML workflows (Python, QGIS)
+Ford Motor Company
+Data Scientist
+2021 - 2021 (less than a year)
+Dearborn, Michigan, United States
+- Extracted, cleaned, and joined data from multiple sources using SQL,
+Hadoop, and Alteryx
+- Used Bayesian Network Structure Learning (BNLearn, R) to uncover the
+relationships between survey free response verbatim topics (derived from
+natural language processing models) and numerical customer experience
+scores
+MDRemindME
+Co-Founder
+2018 - 2020 (2 years)
+Durham, New Hampshire, United States
+- Founded and led a healthtech startup aimed at improving patient adherence
+to treatment plans through an innovative engagement and retention tool
+- Piloted the product with healthcare providers and patients, gathering critical
+insights to refine functionality and enhance user experience
+- Secured funding through National Science Foundation I-CORPS Grant and
+UNH Entrepreneurship Center Seed Grant
+Education
+Y Combinator
+S22
+University of New Hampshire
+Economics and Philosophy
+"""
+
+# First, scrape the jobs page using Firecrawl
+try:
+    response = requests.post(
+        "https://api.firecrawl.dev/v1/scrape",
+        headers={
+            "Content-Type": "application/json",
+            "Authorization": f"Bearer {firecrawl_api_key}"
+        },
+        json={
+            "url": jobs_page_url,
+            "formats": ["markdown"]
+        }
+    )
+
+    if response.status_code == 200:
+        result = response.json()
+        if result.get('success'):
+            html_content = result['data']['markdown']
+            # Define the O1 prompt for extracting apply links
+            prompt = f"""
+            Extract up to 30 job application links from the given markdown content.
+            Return the result as a JSON object with a single key 'apply_links' containing an array of strings (the links).
+            The output should be a valid JSON object, with no additional text.
+            Do not include any JSON markdown formatting or code block indicators.
+            Provide only the raw JSON object as the response.
+
+            Example of the expected format:
+            {{"apply_links": ["https://example.com/job1", "https://example.com/job2", ...]}}
+
+            Markdown content:
+            {html_content[:100000]}
+            """
+            print(f"{Colors.GREEN}Successfully scraped the jobs page{Colors.RESET}")
+        else:
+            print(f"{Colors.RED}Failed to scrape the jobs page: {result.get('message', 'Unknown error')}{Colors.RESET}")
+            html_content = ""
+    else:
+        print(f"{Colors.RED}Error {response.status_code}: {response.text}{Colors.RESET}")
+        html_content = ""
+except requests.RequestException as e:
+    print(f"{Colors.RED}An error occurred while scraping: {str(e)}{Colors.RESET}")
+    html_content = ""
+except json.JSONDecodeError as e:
+    print(f"{Colors.RED}Error decoding JSON response: {str(e)}{Colors.RESET}")
+    html_content = ""
+except Exception as e:
+    print(f"{Colors.RED}An unexpected error occurred while scraping: {str(e)}{Colors.RESET}")
+    html_content = ""
+
+# Extract apply links from the scraped HTML using O1
+apply_links = []
+if html_content:
+    try:
+        completion = client.chat.completions.create(
+            model="gpt-4o",
+            messages=[
+                {
+                    "role": "user",
+                    "content": prompt
+                }
+            ]
+        )
+
+        if completion.choices:
+            print(completion.choices[0].message.content)
+            result = json.loads(completion.choices[0].message.content.strip())
+
+            apply_links = result['apply_links']
+            print(f"{Colors.GREEN}Successfully extracted {len(apply_links)} apply links{Colors.RESET}")
+        else:
+            print(f"{Colors.RED}No apply links extracted{Colors.RESET}")
+    except json.JSONDecodeError as e:
+        print(f"{Colors.RED}Error decoding JSON from OpenAI response: {str(e)}{Colors.RESET}")
+    except KeyError as e:
+        print(f"{Colors.RED}Expected key not found in OpenAI response: {str(e)}{Colors.RESET}")
+    except Exception as e:
+        print(f"{Colors.RED}An unexpected error occurred during extraction: {str(e)}{Colors.RESET}")
+else:
+    print(f"{Colors.RED}No HTML content to process{Colors.RESET}")
+
+# Initialize a list to store the extracted data
+extracted_data = []
+
+
+# %%
+print(f"{Colors.CYAN}Apply links:{Colors.RESET}")
+for link in apply_links:
+    print(f"{Colors.YELLOW}{link}{Colors.RESET}")
+
+# %%
+# Process each apply link
+for index, link in enumerate(apply_links):
+    try:
+        response = requests.post(
+            "https://api.firecrawl.dev/v1/scrape",
+            headers={
+                "Content-Type": "application/json",
+                "Authorization": f"Bearer {firecrawl_api_key}"
+            },
+            json={
+                "url": link,
+                "formats": ["extract"],
+                "actions": [{
+                    "type": "click",
+                    "selector": "#job-overview"
+                }],
+                "extract": {
+                    "schema": {
+                        "type": "object",
+                        "properties": {
+                            "job_title": {"type": "string"},
+                            "sub_division_of_organization": {"type": "string"},
+                            "key_skills": {"type": "array", "items": {"type": "string"}},
+                            "compensation": {"type": "string"},
+                            "location": {"type": "string"},
+                            "apply_link": {"type": "string"}
+                        },
+                        "required": ["job_title", "sub_division_of_organization", "key_skills", "compensation", "location", "apply_link"]
+                    }
+                }
+            }
+        )
+
+        if response.status_code == 200:
+            result = response.json()
+            if result.get('success'):
+                extracted_data.append(result['data']['extract'])
+                print(f"{Colors.GREEN}Data extracted for job {index}{Colors.RESET}")
+            else:
+                print(f"")
+        else:
+            print(f"")
+    except Exception as e:
+        print(f"")
+
+
+# %%
+# %%
+# Print the extracted data
+print(f"{Colors.CYAN}Extracted data:{Colors.RESET}")
+for job in extracted_data:
+    print(json.dumps(job, indent=2))
+    print(f"{Colors.MAGENTA}{'-' * 50}{Colors.RESET}")
+
+
+# %%
+
+
+
+
+# Use o1-preview to choose which jobs should be applied to based on the resume
+prompt = f"""
+Please analyze the resume and job listings, and return a JSON list of the top 3 roles that best fit the candidate's experience and skills. Include only the job title, compensation, and apply link for each recommended role. The output should be a valid JSON array of objects in the following format, with no additional text:
+
+[
+  {{
+    "job_title": "Job Title",
+    "compensation": "Compensation (if available, otherwise empty string)",
+    "apply_link": "Application URL"
+  }},
+  ...
+]
+
+Based on the following resume:
+{resume_paste}
+
+And the following job listings:
+{json.dumps(extracted_data, indent=2)}
+"""
+
+completion = client.chat.completions.create(
+    model="o1-preview",
+    messages=[
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "text",
+                    "text": prompt
+                }
+            ]
+        }
+    ]
+)
+
+recommended_jobs = json.loads(completion.choices[0].message.content.strip())
+
+print(f"{Colors.CYAN}Recommended jobs:{Colors.RESET}")
+print(json.dumps(recommended_jobs, indent=2))