Fixed Google Drive PDF matching in custom scraping

It now works for both the `https://getgc.ai/privacy` and `https://prairie.cards/products/wood-designs` use cases.
This commit is contained in:
rafaelsideguide 2024-07-02 11:25:17 -03:00
parent 96de948d6b
commit 0175152577

View File

@ -29,16 +29,16 @@ export async function handleCustomScraping(
}; };
} }
// Check for Google Drive PDF links in the raw HTML // Check for Google Drive PDF links in meta tags
const googleDrivePdfPattern = const googleDriveMetaPattern = /<meta itemprop="url" content="(https:\/\/drive\.google\.com\/file\/d\/[^"]+)"/;
/https:\/\/drive\.google\.com\/file\/d\/([^\/]+)\/view/; const googleDriveMetaMatch = text.match(googleDriveMetaPattern);
const googleDrivePdfLink = url.match(googleDrivePdfPattern); if (googleDriveMetaMatch) {
if (googleDrivePdfLink) { const url = googleDriveMetaMatch[1];
console.log( console.log(`Google Drive PDF link detected: ${url}`);
`Google Drive PDF link detected for ${url}: ${googleDrivePdfLink[0]}`
);
const fileId = googleDrivePdfLink[1]; const fileIdMatch = url.match(/https:\/\/drive\.google\.com\/file\/d\/([^\/]+)\/view/);
if (fileIdMatch) {
const fileId = fileIdMatch[1];
const pdfUrl = `https://drive.google.com/uc?export=download&id=${fileId}`; const pdfUrl = `https://drive.google.com/uc?export=download&id=${fileId}`;
return { return {
@ -46,6 +46,7 @@ export async function handleCustomScraping(
url: pdfUrl url: pdfUrl
}; };
} }
}
return null; return null;
} }