mirror of
https://git.mirrors.martin98.com/https://github.com/jina-ai/reader.git
synced 2025-08-19 04:16:02 +08:00
fix: fail early on special cookie redirects
This commit is contained in:
parent
26f6202f79
commit
e551695d17
@ -294,7 +294,8 @@ export class CurlControl extends AsyncService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
async urlToFile(urlToCrawl: URL, crawlOpts?: CURLScrappingOptions) {
|
async urlToFile(urlToCrawl: URL, crawlOpts?: CURLScrappingOptions) {
|
||||||
let leftRedirection = 10;
|
let leftRedirection = 6;
|
||||||
|
let cookieRedirects = 0;
|
||||||
let opts = { ...crawlOpts };
|
let opts = { ...crawlOpts };
|
||||||
let nextHopUrl = urlToCrawl;
|
let nextHopUrl = urlToCrawl;
|
||||||
const fakeHeaderInfos: HeaderInfo[] = [];
|
const fakeHeaderInfos: HeaderInfo[] = [];
|
||||||
@ -312,10 +313,16 @@ export class CurlControl extends AsyncService {
|
|||||||
if (parsed.length) {
|
if (parsed.length) {
|
||||||
opts.cookies = [...(opts.cookies || []), ...parsed];
|
opts.cookies = [...(opts.cookies || []), ...parsed];
|
||||||
}
|
}
|
||||||
|
if (!location) {
|
||||||
|
cookieRedirects += 1;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!location && !setCookieHeader) {
|
if (!location && !setCookieHeader) {
|
||||||
throw new AssertionFailureError(`Failed to access ${urlToCrawl}: Bad redirection from ${nextHopUrl}`);
|
throw new ServiceBadAttemptError(`Failed to access ${urlToCrawl}: Bad redirection from ${nextHopUrl}`);
|
||||||
|
}
|
||||||
|
if (!location && cookieRedirects > 1) {
|
||||||
|
throw new ServiceBadAttemptError(`Failed to access ${urlToCrawl}: Browser required to solve complex cookie preconditions.`);
|
||||||
}
|
}
|
||||||
|
|
||||||
nextHopUrl = new URL(location || '', nextHopUrl);
|
nextHopUrl = new URL(location || '', nextHopUrl);
|
||||||
@ -331,7 +338,7 @@ export class CurlControl extends AsyncService {
|
|||||||
};
|
};
|
||||||
} while (leftRedirection > 0);
|
} while (leftRedirection > 0);
|
||||||
|
|
||||||
throw new AssertionFailureError(`Failed to access ${urlToCrawl}: Too many redirections.`);
|
throw new ServiceBadAttemptError(`Failed to access ${urlToCrawl}: Too many redirections.`);
|
||||||
}
|
}
|
||||||
|
|
||||||
async sideLoad(targetUrl: URL, crawlOpts?: CURLScrappingOptions) {
|
async sideLoad(targetUrl: URL, crawlOpts?: CURLScrappingOptions) {
|
||||||
|
Loading…
x
Reference in New Issue
Block a user