Merge pull request #436 from mendableai/mog/fix-infinite-regex

fix(WebScraper): catastrophic regex backtracking causing fly.io instances to hang
This commit is contained in:
Nicolas 2024-07-18 13:32:44 -04:00 committed by GitHub
commit d338b05446
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -6,13 +6,13 @@ export const replacePathsWithAbsolutePaths = (documents: Document[]): Document[]
const baseUrl = new URL(document.metadata.sourceURL).origin;
const paths =
document.content.match(
/(!?\[.*?\])\(((?:[^()]+|\((?:[^()]+|\([^()]*\))*\))*)\)|href="([^"]+)"/g
/!?\[.*?\]\(.*?\)|href=".+?"/g
) || [];
paths.forEach((path: string) => {
try {
const isImage = path.startsWith("!");
let matchedUrl = path.match(/\(([^)]+)\)/) || path.match(/href="([^"]+)"/);
let matchedUrl = path.match(/\((.*?)\)/) || path.match(/href="([^"]+)"/);
let url = matchedUrl[1];
if (!url.startsWith("data:") && !url.startsWith("http")) {
@ -50,11 +50,11 @@ export const replaceImgPathsWithAbsolutePaths = (documents: Document[]): Documen
const baseUrl = new URL(document.metadata.sourceURL).origin;
const images =
document.content.match(
/!\[.*?\]\(((?:[^()]+|\((?:[^()]+|\([^()]*\))*\))*)\)/g
/!\[.*?\]\(.*?\)/g
) || [];
images.forEach((image: string) => {
let imageUrl = image.match(/\(([^)]+)\)/)[1];
let imageUrl = image.match(/\((.*?)\)/)[1];
let altText = image.match(/\[(.*?)\]/)[1];
if (!imageUrl.startsWith("data:image")) {