mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-12 04:59:02 +08:00
feat(api/tests/scrape): Playwright test improvements (#1626)
* feat(api/tests/scrape): verify that proxy works on Playwright
* debug: logs
* remove logs
* feat(playwright): add contentType relaying
* fix tests
* debug
* fix json
This commit is contained in:
parent
95f204aab7
commit
8dd5bf7bd9
@ -39,7 +39,16 @@ describe("Scrape tests", () => {
|
|||||||
url: "https://icanhazip.com"
|
url: "https://icanhazip.com"
|
||||||
});
|
});
|
||||||
|
|
||||||
expect(response.markdown?.trim()).toBe(process.env.PROXY_SERVER!.split("://").slice(-1)[0].split(":")[0]);
|
expect(response.markdown?.trim()).toContain(process.env.PROXY_SERVER!.split("://").slice(-1)[0].split(":")[0]);
|
||||||
|
}, 30000);
|
||||||
|
|
||||||
|
it.concurrent("self-hosted proxy works on playwright", async () => {
|
||||||
|
const response = await scrape({
|
||||||
|
url: "https://icanhazip.com",
|
||||||
|
waitFor: 100,
|
||||||
|
});
|
||||||
|
|
||||||
|
expect(response.markdown?.trim()).toContain(process.env.PROXY_SERVER!.split("://").slice(-1)[0].split(":")[0]);
|
||||||
}, 30000);
|
}, 30000);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -3,6 +3,7 @@ import { EngineScrapeResult } from "..";
|
|||||||
import { Meta } from "../..";
|
import { Meta } from "../..";
|
||||||
import { TimeoutError } from "../../error";
|
import { TimeoutError } from "../../error";
|
||||||
import { robustFetch } from "../../lib/fetch";
|
import { robustFetch } from "../../lib/fetch";
|
||||||
|
import { getInnerJSON } from "../../../../lib/html-transformer";
|
||||||
|
|
||||||
export async function scrapeURLWithPlaywright(
|
export async function scrapeURLWithPlaywright(
|
||||||
meta: Meta,
|
meta: Meta,
|
||||||
@ -28,6 +29,7 @@ export async function scrapeURLWithPlaywright(
|
|||||||
content: z.string(),
|
content: z.string(),
|
||||||
pageStatusCode: z.number(),
|
pageStatusCode: z.number(),
|
||||||
pageError: z.string().optional(),
|
pageError: z.string().optional(),
|
||||||
|
contentType: z.string().optional(),
|
||||||
}),
|
}),
|
||||||
mock: meta.mock,
|
mock: meta.mock,
|
||||||
abort: AbortSignal.timeout(timeout),
|
abort: AbortSignal.timeout(timeout),
|
||||||
@ -41,10 +43,15 @@ export async function scrapeURLWithPlaywright(
|
|||||||
})(),
|
})(),
|
||||||
]);
|
]);
|
||||||
|
|
||||||
|
if (response.contentType?.includes("application/json")) {
|
||||||
|
response.content = await getInnerJSON(response.content);
|
||||||
|
}
|
||||||
|
|
||||||
return {
|
return {
|
||||||
url: meta.url, // TODO: impove redirect following
|
url: meta.url, // TODO: impove redirect following
|
||||||
html: response.content,
|
html: response.content,
|
||||||
statusCode: response.pageStatusCode,
|
statusCode: response.pageStatusCode,
|
||||||
error: response.pageError,
|
error: response.pageError,
|
||||||
|
contentType: response.contentType,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
@ -136,9 +136,10 @@ const scrapePage = async (page: Page, url: string, waitUntil: 'load' | 'networki
|
|||||||
}
|
}
|
||||||
|
|
||||||
let headers = null, content = await page.content();
|
let headers = null, content = await page.content();
|
||||||
|
let ct: string | undefined = undefined;
|
||||||
if (response) {
|
if (response) {
|
||||||
headers = await response.allHeaders();
|
headers = await response.allHeaders();
|
||||||
const ct = Object.entries(headers).find(x => x[0].toLowerCase() === "content-type");
|
ct = Object.entries(headers).find(x => x[0].toLowerCase() === "content-type")?.[1];
|
||||||
if (ct && (ct[1].includes("application/json") || ct[1].includes("text/plain"))) {
|
if (ct && (ct[1].includes("application/json") || ct[1].includes("text/plain"))) {
|
||||||
content = (await response.body()).toString("utf8"); // TODO: determine real encoding
|
content = (await response.body()).toString("utf8"); // TODO: determine real encoding
|
||||||
}
|
}
|
||||||
@ -148,6 +149,7 @@ const scrapePage = async (page: Page, url: string, waitUntil: 'load' | 'networki
|
|||||||
content,
|
content,
|
||||||
status: response ? response.status() : null,
|
status: response ? response.status() : null,
|
||||||
headers,
|
headers,
|
||||||
|
contentType: ct,
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -214,6 +216,7 @@ app.post('/scrape', async (req: Request, res: Response) => {
|
|||||||
res.json({
|
res.json({
|
||||||
content: result.content,
|
content: result.content,
|
||||||
pageStatusCode: result.status,
|
pageStatusCode: result.status,
|
||||||
|
contentType: result.contentType,
|
||||||
...(pageError && { pageError })
|
...(pageError && { pageError })
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
Loading…
x
Reference in New Issue
Block a user