mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-16 15:45:59 +08:00
Fixed PDF matching in custom scraping
It now works for both the `https://getgc.ai/privacy` and `https://prairie.cards/products/wood-designs` use cases.
This commit is contained in:
parent
96de948d6b
commit
0175152577
@ -29,16 +29,16 @@ export async function handleCustomScraping(
|
|||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
// Check for Google Drive PDF links in the raw HTML
|
// Check for Google Drive PDF links in meta tags
|
||||||
const googleDrivePdfPattern =
|
const googleDriveMetaPattern = /<meta itemprop="url" content="(https:\/\/drive\.google\.com\/file\/d\/[^"]+)"/;
|
||||||
/https:\/\/drive\.google\.com\/file\/d\/([^\/]+)\/view/;
|
const googleDriveMetaMatch = text.match(googleDriveMetaPattern);
|
||||||
const googleDrivePdfLink = url.match(googleDrivePdfPattern);
|
if (googleDriveMetaMatch) {
|
||||||
if (googleDrivePdfLink) {
|
const url = googleDriveMetaMatch[1];
|
||||||
console.log(
|
console.log(`Google Drive PDF link detected: ${url}`);
|
||||||
`Google Drive PDF link detected for ${url}: ${googleDrivePdfLink[0]}`
|
|
||||||
);
|
|
||||||
|
|
||||||
const fileId = googleDrivePdfLink[1];
|
const fileIdMatch = url.match(/https:\/\/drive\.google\.com\/file\/d\/([^\/]+)\/view/);
|
||||||
|
if (fileIdMatch) {
|
||||||
|
const fileId = fileIdMatch[1];
|
||||||
const pdfUrl = `https://drive.google.com/uc?export=download&id=${fileId}`;
|
const pdfUrl = `https://drive.google.com/uc?export=download&id=${fileId}`;
|
||||||
|
|
||||||
return {
|
return {
|
||||||
@ -46,6 +46,7 @@ export async function handleCustomScraping(
|
|||||||
url: pdfUrl
|
url: pdfUrl
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user