From 8dd5bf7bd9ee1a4b0e115a96ce0aeabd69934bde Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Wed, 4 Jun 2025 01:24:19 +0200 Subject: [PATCH] feat(api/tests/scrape): Playwright test improvements (#1626) * feat(api/tests/scrape): verify that proxy works on Playwright * debug: logs * remove logs * feat(playwright): add contentType relaying * fix tests * debug * fix json --- apps/api/src/__tests__/snips/scrape.test.ts | 11 ++++++++++- .../src/scraper/scrapeURL/engines/playwright/index.ts | 7 +++++++ apps/playwright-service-ts/api.ts | 5 ++++- 3 files changed, 21 insertions(+), 2 deletions(-) diff --git a/apps/api/src/__tests__/snips/scrape.test.ts b/apps/api/src/__tests__/snips/scrape.test.ts index ccb38712..f135c0b5 100644 --- a/apps/api/src/__tests__/snips/scrape.test.ts +++ b/apps/api/src/__tests__/snips/scrape.test.ts @@ -39,7 +39,16 @@ describe("Scrape tests", () => { url: "https://icanhazip.com" }); - expect(response.markdown?.trim()).toBe(process.env.PROXY_SERVER!.split("://").slice(-1)[0].split(":")[0]); + expect(response.markdown?.trim()).toContain(process.env.PROXY_SERVER!.split("://").slice(-1)[0].split(":")[0]); + }, 30000); + + it.concurrent("self-hosted proxy works on playwright", async () => { + const response = await scrape({ + url: "https://icanhazip.com", + waitFor: 100, + }); + + expect(response.markdown?.trim()).toContain(process.env.PROXY_SERVER!.split("://").slice(-1)[0].split(":")[0]); }, 30000); } diff --git a/apps/api/src/scraper/scrapeURL/engines/playwright/index.ts b/apps/api/src/scraper/scrapeURL/engines/playwright/index.ts index 259f4938..57ae8f6b 100644 --- a/apps/api/src/scraper/scrapeURL/engines/playwright/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/playwright/index.ts @@ -3,6 +3,7 @@ import { EngineScrapeResult } from ".."; import { Meta } from "../.."; import { TimeoutError } from "../../error"; import { robustFetch } from "../../lib/fetch"; +import { getInnerJSON } from 
"../../../../lib/html-transformer"; export async function scrapeURLWithPlaywright( meta: Meta, @@ -28,6 +29,7 @@ export async function scrapeURLWithPlaywright( content: z.string(), pageStatusCode: z.number(), pageError: z.string().optional(), + contentType: z.string().optional(), }), mock: meta.mock, abort: AbortSignal.timeout(timeout), @@ -41,10 +43,15 @@ export async function scrapeURLWithPlaywright( })(), ]); + if (response.contentType?.includes("application/json")) { + response.content = await getInnerJSON(response.content); + } + return { url: meta.url, // TODO: impove redirect following html: response.content, statusCode: response.pageStatusCode, error: response.pageError, + contentType: response.contentType, }; } diff --git a/apps/playwright-service-ts/api.ts b/apps/playwright-service-ts/api.ts index f6dd25a1..aabcf3bf 100644 --- a/apps/playwright-service-ts/api.ts +++ b/apps/playwright-service-ts/api.ts @@ -136,9 +136,10 @@ const scrapePage = async (page: Page, url: string, waitUntil: 'load' | 'networki } let headers = null, content = await page.content(); + let ct: string | undefined = undefined; if (response) { headers = await response.allHeaders(); - const ct = Object.entries(headers).find(x => x[0].toLowerCase() === "content-type"); + ct = Object.entries(headers).find(x => x[0].toLowerCase() === "content-type")?.[1]; - if (ct && (ct[1].includes("application/json") || ct[1].includes("text/plain"))) { + if (ct && (ct.includes("application/json") || ct.includes("text/plain"))) { content = (await response.body()).toString("utf8"); // TODO: determine real encoding } @@ -148,6 +149,7 @@ const scrapePage = async (page: Page, url: string, waitUntil: 'load' | 'networki content, status: response ? response.status() : null, headers, + contentType: ct, }; }; @@ -214,6 +216,7 @@ app.post('/scrape', async (req: Request, res: Response) => { res.json({ content: result.content, pageStatusCode: result.status, + contentType: result.contentType, ...(pageError && { pageError }) }); });