Nick: all tests passing
This commit is contained in:
parent 5205c5f005
commit ba5279eafc
File diff suppressed because it is too large
@@ -41,9 +41,10 @@ export async function mapController(
   const crawler = crawlToCrawler(id, sc);
 
-  const sitemap = sc.crawlerOptions.ignoreSitemap
-    ? null
-    : await crawler.tryGetSitemap();
+  const sitemap =
+    sc.crawlerOptions.ignoreSitemap || req.body.search
+      ? null
+      : await crawler.tryGetSitemap();
 
   if (sitemap !== null) {
     sitemap.map((x) => {
@@ -51,13 +52,23 @@ export async function mapController(
     });
   }
 
-  const mapResults = await fireEngineMap(`site:${req.body.url}`, {
+  let mapUrl = req.body.search
+    ? `"${req.body.search}" site:${req.body.url}`
+    : `site:${req.body.url}`;
+
+  console.log(mapUrl);
+
+  // www. seems to exclude subdomains in some cases
+  const mapResults = await fireEngineMap(mapUrl, {
     numResults: 50,
   });
 
+  console.log(mapResults);
+
   if (mapResults.length > 0) {
     mapResults.map((x) => {
-      links.push(x.url);
+      if (req.body.search) {
+        links.unshift(x.url);
+      } else {
+        links.push(x.url);
+      }
     });
   }
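For readers following the change above: the v1 map controller now accepts an optional `search` string on the request body. When it is set, the sitemap lookup is skipped, the fireEngineMap query becomes `"<search>" site:<url>`, and the hits are unshifted to the front of `links`. Below is a minimal client-side sketch of exercising this; the endpoint path and bearer auth are assumptions, only the request and response shapes come from this commit.

// Hypothetical client call; the https://api.firecrawl.dev/v1/map path and the auth header are assumptions.
async function mapSite(apiKey: string, url: string, search?: string): Promise<string[]> {
  const res = await fetch("https://api.firecrawl.dev/v1/map", {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      Authorization: `Bearer ${apiKey}`,
    },
    body: JSON.stringify({ url, search }), // `search` replaces the old `searchEngine` field
  });
  const body = await res.json();
  if (!body.success) {
    throw new Error("map request failed"); // ErrorResponse branch (success: false)
  }
  return body.links as string[]; // search hits are unshifted to the front by the controller
}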
@@ -3,22 +3,46 @@ import { z } from "zod";
 import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
 import { PageOptions } from "../../lib/entities";
 
-export type Format = "markdown" | "html" | "rawHtml" | "links" | "screenshot" | "screenshot@fullPage";
+export type Format =
+  | "markdown"
+  | "html"
+  | "rawHtml"
+  | "links"
+  | "screenshot"
+  | "screenshot@fullPage";
 
-const url = z.preprocess(x => {
-  if (typeof x === "string" && !/^([^.:]+:\/\/)/.test(x)) {
-    if (x.startsWith("://")) {
-      return "http" + x;
-    } else {
-      return "http://" + x;
-    }
-  } else {
-    return x;
-  }
-}, z.string().url().regex(/^https?:\/\//, "URL uses unsupported protocol").refine(x => !isUrlBlocked(x), "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."));
+const url = z.preprocess(
+  (x) => {
+    if (typeof x === "string" && !/^([^.:]+:\/\/)/.test(x)) {
+      if (x.startsWith("://")) {
+        return "http" + x;
+      } else {
+        return "http://" + x;
+      }
+    } else {
+      return x;
+    }
+  },
+  z
+    .string()
+    .url()
+    .regex(/^https?:\/\//, "URL uses unsupported protocol")
+    .refine(
+      (x) => !isUrlBlocked(x),
+      "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."
+    )
+);
 
 export const scrapeOptions = z.object({
-  formats: z.enum(["markdown", "html", "rawHtml", "links", "screenshot", "screenshot@fullPage"])
+  formats: z
+    .enum([
+      "markdown",
+      "html",
+      "rawHtml",
+      "links",
+      "screenshot",
+      "screenshot@fullPage",
+    ])
     .array()
     .optional()
     .default(["markdown"]),
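The `url` schema in the hunk above is only reformatted, so its behavior is unchanged: a string with no scheme gets `http://` prepended (just `http` if the input already starts with `://`), and the result must then pass the `.url()`, protocol, and blocklist checks. A quick illustration of the normalization; these calls assume direct access to the `url` schema, which in the actual module is a file-private const.

// Illustrative parses mirroring the preprocess logic shown above.
url.parse("example.com/docs");    // -> "http://example.com/docs"
url.parse("://example.com");      // -> "http://example.com"
url.parse("https://example.com"); // already has a scheme, returned unchanged
url.parse("ftp://example.com");   // throws: "URL uses unsupported protocol"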
@@ -34,7 +58,7 @@ export const scrapeOptions = z.object({
 export type ScrapeOptions = z.infer<typeof scrapeOptions>;
 
 export const scrapeRequestSchema = scrapeOptions.extend({
-  url: z.string().url(),
+  url,
   origin: z.string().optional().default("api"),
 });
 
@@ -90,10 +114,10 @@ export const crawlRequestSchema = z.object({
 export type CrawlRequest = z.infer<typeof crawlRequestSchema>;
 
 export const mapRequestSchema = crawlerOptions.extend({
-  url,
+  url: z.string().url(),
   origin: z.string().optional().default("api"),
   includeSubdomains: z.boolean().default(false),
-  searchEngine: z.string().optional(),
+  search: z.string().optional(),
 });
 
 // export type MapRequest = {
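Together with the controller change, the renamed `search` field is what arrives as `req.body.search`. A small sketch of validating a map request body against `mapRequestSchema`, assuming the remaining `crawlerOptions` fields are optional or defaulted (they are not visible in this diff):

// Hypothetical parse; only the fields visible in this hunk are used.
const parsed = mapRequestSchema.parse({
  url: "https://firecrawl.dev", // plain z.string().url() here, so a scheme is required
  search: "docs",               // renamed from `searchEngine` in this commit
});
// Defaults fill in the rest: parsed.origin === "api", parsed.includeSubdomains === false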
@@ -104,11 +128,11 @@ export const mapRequestSchema = crawlerOptions.extend({
 export type MapRequest = z.infer<typeof mapRequestSchema>;
 
 export type Document = {
-  markdown?: string,
-  html?: string,
-  rawHtml?: string,
-  links?: string[],
-  screenshot?: string,
+  markdown?: string;
+  html?: string;
+  rawHtml?: string;
+  links?: string[];
+  screenshot?: string;
   metadata: {
     title?: string;
     description?: string;
@@ -142,8 +166,8 @@ export type Document = {
     sourceURL?: string;
     statusCode?: number;
     error?: string;
-  },
-}
+  };
+};
 
 export type ErrorResponse = {
   success: false;
|
|||||||
details?: any;
|
details?: any;
|
||||||
};
|
};
|
||||||
|
|
||||||
export type ScrapeResponse = ErrorResponse | {
|
export type ScrapeResponse =
|
||||||
success: true;
|
| ErrorResponse
|
||||||
warning?: string;
|
| {
|
||||||
data: Document;
|
success: true;
|
||||||
};
|
warning?: string;
|
||||||
|
data: Document;
|
||||||
|
};
|
||||||
|
|
||||||
export interface ScrapeResponseRequestTest {
|
export interface ScrapeResponseRequestTest {
|
||||||
statusCode: number;
|
statusCode: number;
|
||||||
@ -163,40 +189,54 @@ export interface ScrapeResponseRequestTest {
|
|||||||
error?: string;
|
error?: string;
|
||||||
}
|
}
|
||||||
|
|
||||||
export type CrawlResponse = ErrorResponse | {
|
export type CrawlResponse =
|
||||||
success: true;
|
| ErrorResponse
|
||||||
id: string;
|
| {
|
||||||
url: string;
|
success: true;
|
||||||
}
|
id: string;
|
||||||
|
url: string;
|
||||||
|
};
|
||||||
|
|
||||||
export type MapResponse = ErrorResponse | {
|
export type MapResponse =
|
||||||
success: true;
|
| ErrorResponse
|
||||||
links: string[];
|
| {
|
||||||
}
|
success: true;
|
||||||
|
links: string[];
|
||||||
|
};
|
||||||
|
|
||||||
export type CrawlStatusParams = {
|
export type CrawlStatusParams = {
|
||||||
jobId: string;
|
jobId: string;
|
||||||
}
|
};
|
||||||
|
|
||||||
export type CrawlStatusResponse = ErrorResponse | {
|
export type CrawlStatusResponse =
|
||||||
status: "scraping" | "completed" | "failed" | "cancelled",
|
| ErrorResponse
|
||||||
totalCount: number;
|
| {
|
||||||
creditsUsed: number;
|
status: "scraping" | "completed" | "failed" | "cancelled";
|
||||||
expiresAt: string;
|
totalCount: number;
|
||||||
next?: string;
|
creditsUsed: number;
|
||||||
data: Document[];
|
expiresAt: string;
|
||||||
}
|
next?: string;
|
||||||
|
data: Document[];
|
||||||
|
};
|
||||||
|
|
||||||
type AuthObject = {
|
type AuthObject = {
|
||||||
team_id: string;
|
team_id: string;
|
||||||
plan: string;
|
plan: string;
|
||||||
}
|
};
|
||||||
|
|
||||||
export interface RequestWithMaybeAuth<ReqParams = {}, ReqBody = undefined, ResBody = undefined> extends Request<ReqParams, ReqBody, ResBody> {
|
export interface RequestWithMaybeAuth<
|
||||||
|
ReqParams = {},
|
||||||
|
ReqBody = undefined,
|
||||||
|
ResBody = undefined
|
||||||
|
> extends Request<ReqParams, ReqBody, ResBody> {
|
||||||
auth?: AuthObject;
|
auth?: AuthObject;
|
||||||
}
|
}
|
||||||
|
|
||||||
export interface RequestWithAuth<ReqParams = {}, ReqBody = undefined, ResBody = undefined> extends Request<ReqParams, ReqBody, ResBody> {
|
export interface RequestWithAuth<
|
||||||
|
ReqParams = {},
|
||||||
|
ReqBody = undefined,
|
||||||
|
ResBody = undefined
|
||||||
|
> extends Request<ReqParams, ReqBody, ResBody> {
|
||||||
auth: AuthObject;
|
auth: AuthObject;
|
||||||
}
|
}
|
||||||
|
|
||||||
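The response types stay plain unions with `ErrorResponse`, so callers narrow on the `success` flag before touching the payload. A minimal sketch (the handler name is illustrative, not from this diff):

function handleMapResponse(res: MapResponse): string[] {
  if (res.success === false) {
    // ErrorResponse branch; `details` may carry extra diagnostics
    throw new Error("map failed");
  }
  return res.links; // narrowed to { success: true; links: string[] }
}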
@@ -225,7 +265,7 @@ export function legacyScrapeOptions(x: ScrapeOptions): PageOptions {
     includeLinks: x.formats.includes("links"),
     screenshot: x.formats.includes("screenshot"),
     fullPageScreenshot: x.formats.includes("screenshot@fullPage"),
-    parsePDF: x.parsePDF
+    parsePDF: x.parsePDF,
   };
 }
 
@@ -243,5 +283,5 @@ export function legacyDocumentConverter(doc: any): Document {
       error: doc.metadata.pageError,
       statusCode: doc.metadata.pageStatusCode,
     },
-  }
+  };
 }