feat(api/tests/scrape): Playwright test improvements (#1626)

* feat(api/tests/scrape): verify that proxy works on Playwright

* debug: logs

* remove logs

* feat(playwright): add contentType relaying

* fix tests

* debug

* fix json
This commit is contained in:
Gergő Móricz 2025-06-04 01:24:19 +02:00 committed by GitHub
parent 95f204aab7
commit 8dd5bf7bd9
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 21 additions and 2 deletions

View File

@ -39,7 +39,16 @@ describe("Scrape tests", () => {
url: "https://icanhazip.com"
});
expect(response.markdown?.trim()).toBe(process.env.PROXY_SERVER!.split("://").slice(-1)[0].split(":")[0]);
expect(response.markdown?.trim()).toContain(process.env.PROXY_SERVER!.split("://").slice(-1)[0].split(":")[0]);
}, 30000);
it.concurrent("self-hosted proxy works on playwright", async () => {
const response = await scrape({
url: "https://icanhazip.com",
waitFor: 100,
});
expect(response.markdown?.trim()).toContain(process.env.PROXY_SERVER!.split("://").slice(-1)[0].split(":")[0]);
}, 30000);
}

View File

@ -3,6 +3,7 @@ import { EngineScrapeResult } from "..";
import { Meta } from "../..";
import { TimeoutError } from "../../error";
import { robustFetch } from "../../lib/fetch";
import { getInnerJSON } from "../../../../lib/html-transformer";
export async function scrapeURLWithPlaywright(
meta: Meta,
@ -28,6 +29,7 @@ export async function scrapeURLWithPlaywright(
content: z.string(),
pageStatusCode: z.number(),
pageError: z.string().optional(),
contentType: z.string().optional(),
}),
mock: meta.mock,
abort: AbortSignal.timeout(timeout),
@ -41,10 +43,15 @@ export async function scrapeURLWithPlaywright(
})(),
]);
if (response.contentType?.includes("application/json")) {
response.content = await getInnerJSON(response.content);
}
return {
url: meta.url, // TODO: improve redirect following
html: response.content,
statusCode: response.pageStatusCode,
error: response.pageError,
contentType: response.contentType,
};
}

View File

@ -136,9 +136,10 @@ const scrapePage = async (page: Page, url: string, waitUntil: 'load' | 'networki
}
let headers = null, content = await page.content();
let ct: string | undefined = undefined;
if (response) {
headers = await response.allHeaders();
const ct = Object.entries(headers).find(x => x[0].toLowerCase() === "content-type");
ct = Object.entries(headers).find(x => x[0].toLowerCase() === "content-type")?.[1];
if (ct && (ct.includes("application/json") || ct.includes("text/plain"))) {
content = (await response.body()).toString("utf8"); // TODO: determine real encoding
}
@ -148,6 +149,7 @@ const scrapePage = async (page: Page, url: string, waitUntil: 'load' | 'networki
content,
status: response ? response.status() : null,
headers,
contentType: ct,
};
};
@ -214,6 +216,7 @@ app.post('/scrape', async (req: Request, res: Response) => {
res.json({
content: result.content,
pageStatusCode: result.status,
contentType: result.contentType,
...(pageError && { pageError })
});
});