Nick: all tests passing

This commit is contained in:
Nicolas 2024-08-16 19:55:44 -04:00
parent 5205c5f005
commit ba5279eafc
3 changed files with 888 additions and 568 deletions

File diff suppressed because it is too large Load Diff

View File

@ -41,9 +41,10 @@ export async function mapController(
const crawler = crawlToCrawler(id, sc); const crawler = crawlToCrawler(id, sc);
const sitemap = sc.crawlerOptions.ignoreSitemap const sitemap =
? null sc.crawlerOptions.ignoreSitemap || req.body.search
: await crawler.tryGetSitemap(); ? null
: await crawler.tryGetSitemap();
if (sitemap !== null) { if (sitemap !== null) {
sitemap.map((x) => { sitemap.map((x) => {
@ -51,13 +52,23 @@ export async function mapController(
}); });
} }
const mapResults = await fireEngineMap(`site:${req.body.url}`, { let mapUrl = req.body.search
? `"${req.body.search}" site:${req.body.url}`
: `site:${req.body.url}`;
console.log(mapUrl);
// www. seems to exclude subdomains in some cases
const mapResults = await fireEngineMap(mapUrl, {
numResults: 50, numResults: 50,
}); });
console.log(mapResults);
if (mapResults.length > 0) { if (mapResults.length > 0) {
mapResults.map((x) => { mapResults.map((x) => {
links.push(x.url); if (req.body.search) {
links.unshift(x.url);
} else {
links.push(x.url);
}
}); });
} }

View File

@ -3,22 +3,46 @@ import { z } from "zod";
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist"; import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
import { PageOptions } from "../../lib/entities"; import { PageOptions } from "../../lib/entities";
export type Format = "markdown" | "html" | "rawHtml" | "links" | "screenshot" | "screenshot@fullPage"; export type Format =
| "markdown"
| "html"
| "rawHtml"
| "links"
| "screenshot"
| "screenshot@fullPage";
const url = z.preprocess(x => { const url = z.preprocess(
if (typeof x === "string" && !/^([^.:]+:\/\/)/.test(x)) { (x) => {
if (x.startsWith("://")) { if (typeof x === "string" && !/^([^.:]+:\/\/)/.test(x)) {
return "http" + x; if (x.startsWith("://")) {
return "http" + x;
} else {
return "http://" + x;
}
} else { } else {
return "http://" + x; return x;
} }
} else { },
return x; z
} .string()
}, z.string().url().regex(/^https?:\/\//, "URL uses unsupported protocol").refine(x => !isUrlBlocked(x), "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.")); .url()
.regex(/^https?:\/\//, "URL uses unsupported protocol")
.refine(
(x) => !isUrlBlocked(x),
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."
)
);
export const scrapeOptions = z.object({ export const scrapeOptions = z.object({
formats: z.enum(["markdown", "html", "rawHtml", "links", "screenshot", "screenshot@fullPage"]) formats: z
.enum([
"markdown",
"html",
"rawHtml",
"links",
"screenshot",
"screenshot@fullPage",
])
.array() .array()
.optional() .optional()
.default(["markdown"]), .default(["markdown"]),
@ -34,7 +58,7 @@ export const scrapeOptions = z.object({
export type ScrapeOptions = z.infer<typeof scrapeOptions>; export type ScrapeOptions = z.infer<typeof scrapeOptions>;
export const scrapeRequestSchema = scrapeOptions.extend({ export const scrapeRequestSchema = scrapeOptions.extend({
url: z.string().url(), url,
origin: z.string().optional().default("api"), origin: z.string().optional().default("api"),
}); });
@ -90,10 +114,10 @@ export const crawlRequestSchema = z.object({
export type CrawlRequest = z.infer<typeof crawlRequestSchema>; export type CrawlRequest = z.infer<typeof crawlRequestSchema>;
export const mapRequestSchema = crawlerOptions.extend({ export const mapRequestSchema = crawlerOptions.extend({
url, url: z.string().url(),
origin: z.string().optional().default("api"), origin: z.string().optional().default("api"),
includeSubdomains: z.boolean().default(false), includeSubdomains: z.boolean().default(false),
searchEngine: z.string().optional(), search: z.string().optional(),
}); });
// export type MapRequest = { // export type MapRequest = {
@ -104,11 +128,11 @@ export const mapRequestSchema = crawlerOptions.extend({
export type MapRequest = z.infer<typeof mapRequestSchema>; export type MapRequest = z.infer<typeof mapRequestSchema>;
export type Document = { export type Document = {
markdown?: string, markdown?: string;
html?: string, html?: string;
rawHtml?: string, rawHtml?: string;
links?: string[], links?: string[];
screenshot?: string, screenshot?: string;
metadata: { metadata: {
title?: string; title?: string;
description?: string; description?: string;
@ -142,8 +166,8 @@ export type Document = {
sourceURL?: string; sourceURL?: string;
statusCode?: number; statusCode?: number;
error?: string; error?: string;
}, };
} };
export type ErrorResponse = { export type ErrorResponse = {
success: false; success: false;
@ -151,11 +175,13 @@ export type ErrorResponse = {
details?: any; details?: any;
}; };
export type ScrapeResponse = ErrorResponse | { export type ScrapeResponse =
success: true; | ErrorResponse
warning?: string; | {
data: Document; success: true;
}; warning?: string;
data: Document;
};
export interface ScrapeResponseRequestTest { export interface ScrapeResponseRequestTest {
statusCode: number; statusCode: number;
@ -163,40 +189,54 @@ export interface ScrapeResponseRequestTest {
error?: string; error?: string;
} }
export type CrawlResponse = ErrorResponse | { export type CrawlResponse =
success: true; | ErrorResponse
id: string; | {
url: string; success: true;
} id: string;
url: string;
};
export type MapResponse = ErrorResponse | { export type MapResponse =
success: true; | ErrorResponse
links: string[]; | {
} success: true;
links: string[];
};
export type CrawlStatusParams = { export type CrawlStatusParams = {
jobId: string; jobId: string;
} };
export type CrawlStatusResponse = ErrorResponse | { export type CrawlStatusResponse =
status: "scraping" | "completed" | "failed" | "cancelled", | ErrorResponse
totalCount: number; | {
creditsUsed: number; status: "scraping" | "completed" | "failed" | "cancelled";
expiresAt: string; totalCount: number;
next?: string; creditsUsed: number;
data: Document[]; expiresAt: string;
} next?: string;
data: Document[];
};
type AuthObject = { type AuthObject = {
team_id: string; team_id: string;
plan: string; plan: string;
} };
export interface RequestWithMaybeAuth<ReqParams = {}, ReqBody = undefined, ResBody = undefined> extends Request<ReqParams, ReqBody, ResBody> { export interface RequestWithMaybeAuth<
ReqParams = {},
ReqBody = undefined,
ResBody = undefined
> extends Request<ReqParams, ReqBody, ResBody> {
auth?: AuthObject; auth?: AuthObject;
} }
export interface RequestWithAuth<ReqParams = {}, ReqBody = undefined, ResBody = undefined> extends Request<ReqParams, ReqBody, ResBody> { export interface RequestWithAuth<
ReqParams = {},
ReqBody = undefined,
ResBody = undefined
> extends Request<ReqParams, ReqBody, ResBody> {
auth: AuthObject; auth: AuthObject;
} }
@ -225,7 +265,7 @@ export function legacyScrapeOptions(x: ScrapeOptions): PageOptions {
includeLinks: x.formats.includes("links"), includeLinks: x.formats.includes("links"),
screenshot: x.formats.includes("screenshot"), screenshot: x.formats.includes("screenshot"),
fullPageScreenshot: x.formats.includes("screenshot@fullPage"), fullPageScreenshot: x.formats.includes("screenshot@fullPage"),
parsePDF: x.parsePDF parsePDF: x.parsePDF,
}; };
} }
@ -243,5 +283,5 @@ export function legacyDocumentConverter(doc: any): Document {
error: doc.metadata.pageError, error: doc.metadata.pageError,
statusCode: doc.metadata.pageStatusCode, statusCode: doc.metadata.pageStatusCode,
}, },
} };
} }