From 7680ac25179aed4d48815e178aa22ac8399c6381 Mon Sep 17 00:00:00 2001 From: Classic298 <27028174+Classic298@users.noreply.github.com> Date: Mon, 5 May 2025 19:57:06 +0200 Subject: [PATCH 01/15] Update youtube.py --- .../open_webui/retrieval/loaders/youtube.py | 107 +++++++++++------- 1 file changed, 63 insertions(+), 44 deletions(-) diff --git a/backend/open_webui/retrieval/loaders/youtube.py b/backend/open_webui/retrieval/loaders/youtube.py index f59dd7df5..337436960 100644 --- a/backend/open_webui/retrieval/loaders/youtube.py +++ b/backend/open_webui/retrieval/loaders/youtube.py @@ -70,48 +70,67 @@ class YoutubeLoader: self.language = language def load(self) -> List[Document]: - """Load YouTube transcripts into `Document` objects.""" - try: - from youtube_transcript_api import ( - NoTranscriptFound, - TranscriptsDisabled, - YouTubeTranscriptApi, - ) - except ImportError: - raise ImportError( - 'Could not import "youtube_transcript_api" Python package. ' - "Please install it with `pip install youtube-transcript-api`." - ) - - if self.proxy_url: - youtube_proxies = { - "http": self.proxy_url, - "https": self.proxy_url, - } - # Don't log complete URL because it might contain secrets - log.debug(f"Using proxy URL: {self.proxy_url[:14]}...") - else: - youtube_proxies = None - - try: - transcript_list = YouTubeTranscriptApi.list_transcripts( - self.video_id, proxies=youtube_proxies - ) - except Exception as e: - log.exception("Loading YouTube transcript failed") - return [] - - try: - transcript = transcript_list.find_transcript(self.language) - except NoTranscriptFound: - transcript = transcript_list.find_transcript(["en"]) - - transcript_pieces: List[Dict[str, Any]] = transcript.fetch() - - transcript = " ".join( - map( - lambda transcript_piece: transcript_piece.text.strip(" "), - transcript_pieces, - ) + """Load YouTube transcripts into `Document` objects.""" + try: + from youtube_transcript_api import ( + NoTranscriptFound, + TranscriptsDisabled, + YouTubeTranscriptApi, ) - return [Document(page_content=transcript, metadata=self._metadata)] + except ImportError: + raise ImportError( + 'Could not import "youtube_transcript_api" Python package. ' + "Please install it with `pip install youtube-transcript-api`." + ) + + if self.proxy_url: + youtube_proxies = { + "http": self.proxy_url, + "https": self.proxy_url, + } + # Don't log complete URL because it might contain secrets + log.debug(f"Using proxy URL: {self.proxy_url[:14]}...") + else: + youtube_proxies = None + + try: + transcript_list = YouTubeTranscriptApi.list_transcripts( + self.video_id, proxies=youtube_proxies + ) + except Exception as e: + log.exception("Loading YouTube transcript failed") + return [] + + # Try each language in order of priority + last_exception = None + for lang in self.language: + try: + log.debug(f"Attempting to find transcript for language '{lang}'") + transcript = transcript_list.find_transcript([lang]) + log.info(f"Found transcript for language '{lang}'") + + transcript_pieces: List[Dict[str, Any]] = transcript.fetch() + transcript_text = " ".join( + map( + lambda transcript_piece: transcript_piece.text.strip(" "), + transcript_pieces, + ) + ) + return [Document(page_content=transcript_text, metadata=self._metadata)] + except NoTranscriptFound as e: + log.debug(f"No transcript found for language '{lang}'") + last_exception = e + continue + except Exception as e: + # If we hit any other type of exception, log it and re-raise + log.exception(f"Error finding transcript for language '{lang}'") + raise e + + # If all specified languages fail, raise the last exception + # This maintains compatibility with the error handling in the rest of the application + if last_exception: + log.warning(f"No transcript found for any of the specified languages: {', '.join(self.language)}") + raise last_exception + + # This should never happen (we'd have raised an exception above) + return [] From 0a845db8eca7554d6310b7fad4d7360e2db66b91 Mon Sep 17 00:00:00 2001 From: Classic298 <27028174+Classic298@users.noreply.github.com> Date: Mon, 5 May 2025 19:57:21 +0200 Subject: [PATCH 02/15] Update youtube.py --- .../open_webui/retrieval/loaders/youtube.py | 122 +++++++++--------- 1 file changed, 61 insertions(+), 61 deletions(-) diff --git a/backend/open_webui/retrieval/loaders/youtube.py b/backend/open_webui/retrieval/loaders/youtube.py index 337436960..c1c8669f1 100644 --- a/backend/open_webui/retrieval/loaders/youtube.py +++ b/backend/open_webui/retrieval/loaders/youtube.py @@ -70,67 +70,67 @@ class YoutubeLoader: self.language = language def load(self) -> List[Document]: - """Load YouTube transcripts into `Document` objects.""" - try: - from youtube_transcript_api import ( - NoTranscriptFound, - TranscriptsDisabled, - YouTubeTranscriptApi, - ) - except ImportError: - raise ImportError( - 'Could not import "youtube_transcript_api" Python package. ' - "Please install it with `pip install youtube-transcript-api`." - ) - - if self.proxy_url: - youtube_proxies = { - "http": self.proxy_url, - "https": self.proxy_url, - } - # Don't log complete URL because it might contain secrets - log.debug(f"Using proxy URL: {self.proxy_url[:14]}...") - else: - youtube_proxies = None - - try: - transcript_list = YouTubeTranscriptApi.list_transcripts( - self.video_id, proxies=youtube_proxies - ) - except Exception as e: - log.exception("Loading YouTube transcript failed") - return [] - - # Try each language in order of priority - last_exception = None - for lang in self.language: + """Load YouTube transcripts into `Document` objects.""" try: - log.debug(f"Attempting to find transcript for language '{lang}'") - transcript = transcript_list.find_transcript([lang]) - log.info(f"Found transcript for language '{lang}'") - - transcript_pieces: List[Dict[str, Any]] = transcript.fetch() - transcript_text = " ".join( - map( - lambda transcript_piece: transcript_piece.text.strip(" "), - transcript_pieces, - ) + from youtube_transcript_api import ( + NoTranscriptFound, + TranscriptsDisabled, + YouTubeTranscriptApi, + ) + except ImportError: + raise ImportError( + 'Could not import "youtube_transcript_api" Python package. ' + "Please install it with `pip install youtube-transcript-api`." ) - return [Document(page_content=transcript_text, metadata=self._metadata)] - except NoTranscriptFound as e: - log.debug(f"No transcript found for language '{lang}'") - last_exception = e - continue - except Exception as e: - # If we hit any other type of exception, log it and re-raise - log.exception(f"Error finding transcript for language '{lang}'") - raise e - - # If all specified languages fail, raise the last exception - # This maintains compatibility with the error handling in the rest of the application - if last_exception: - log.warning(f"No transcript found for any of the specified languages: {', '.join(self.language)}") - raise last_exception - # This should never happen (we'd have raised an exception above) - return [] + if self.proxy_url: + youtube_proxies = { + "http": self.proxy_url, + "https": self.proxy_url, + } + # Don't log complete URL because it might contain secrets + log.debug(f"Using proxy URL: {self.proxy_url[:14]}...") + else: + youtube_proxies = None + + try: + transcript_list = YouTubeTranscriptApi.list_transcripts( + self.video_id, proxies=youtube_proxies + ) + except Exception as e: + log.exception("Loading YouTube transcript failed") + return [] + + # Try each language in order of priority + last_exception = None + for lang in self.language: + try: + log.debug(f"Attempting to find transcript for language '{lang}'") + transcript = transcript_list.find_transcript([lang]) + log.info(f"Found transcript for language '{lang}'") + + transcript_pieces: List[Dict[str, Any]] = transcript.fetch() + transcript_text = " ".join( + map( + lambda transcript_piece: transcript_piece.text.strip(" "), + transcript_pieces, + ) + ) + return [Document(page_content=transcript_text, metadata=self._metadata)] + except NoTranscriptFound as e: + log.debug(f"No transcript found for language '{lang}'") + last_exception = e + continue + except Exception as e: + # If we hit any other type of exception, log it and re-raise + log.exception(f"Error finding transcript for language '{lang}'") + raise e + + # If all specified languages fail, raise the last exception + # This maintains compatibility with the error handling in the rest of the application + if last_exception: + log.warning(f"No transcript found for any of the specified languages: {', '.join(self.language)}") + raise last_exception + + # This should never happen (we'd have raised an exception above) + return [] From 0a3817ed860b2f1d1db190ec6a539b037d1f0701 Mon Sep 17 00:00:00 2001 From: Classic298 <27028174+Classic298@users.noreply.github.com> Date: Mon, 5 May 2025 20:00:10 +0200 Subject: [PATCH 03/15] Update youtube.py --- backend/open_webui/retrieval/loaders/youtube.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/backend/open_webui/retrieval/loaders/youtube.py b/backend/open_webui/retrieval/loaders/youtube.py index c1c8669f1..48347aa09 100644 --- a/backend/open_webui/retrieval/loaders/youtube.py +++ b/backend/open_webui/retrieval/loaders/youtube.py @@ -88,8 +88,7 @@ class YoutubeLoader: "http": self.proxy_url, "https": self.proxy_url, } - # Don't log complete URL because it might contain secrets - log.debug(f"Using proxy URL: {self.proxy_url[:14]}...") + log.debug(f"Using proxy URL: {self.proxy_url}...") else: youtube_proxies = None @@ -105,9 +104,8 @@ class YoutubeLoader: last_exception = None for lang in self.language: try: - log.debug(f"Attempting to find transcript for language '{lang}'") transcript = transcript_list.find_transcript([lang]) - log.info(f"Found transcript for language '{lang}'") + log.debug(f"Found transcript for language '{lang}'") transcript_pieces: List[Dict[str, Any]] = transcript.fetch() transcript_text = " ".join( @@ -127,10 +125,8 @@ class YoutubeLoader: raise e # If all specified languages fail, raise the last exception - # This maintains compatibility with the error handling in the rest of the application if last_exception: log.warning(f"No transcript found for any of the specified languages: {', '.join(self.language)}") raise last_exception - # This should never happen (we'd have raised an exception above) return [] From 1a30b3746ed05e9888b038e025075b6e1c17767a Mon Sep 17 00:00:00 2001 From: Classic298 <27028174+Classic298@users.noreply.github.com> Date: Mon, 5 May 2025 20:03:00 +0200 Subject: [PATCH 04/15] Update youtube.py --- .../open_webui/retrieval/loaders/youtube.py | 38 ++++++++++++++----- 1 file changed, 28 insertions(+), 10 deletions(-) diff --git a/backend/open_webui/retrieval/loaders/youtube.py b/backend/open_webui/retrieval/loaders/youtube.py index 48347aa09..0bd286bca 100644 --- a/backend/open_webui/retrieval/loaders/youtube.py +++ b/backend/open_webui/retrieval/loaders/youtube.py @@ -88,7 +88,8 @@ class YoutubeLoader: "http": self.proxy_url, "https": self.proxy_url, } - log.debug(f"Using proxy URL: {self.proxy_url}...") + # Don't log complete URL because it might contain secrets + log.debug(f"Using proxy URL: {self.proxy_url[:14]}...") else: youtube_proxies = None @@ -101,12 +102,10 @@ class YoutubeLoader: return [] # Try each language in order of priority - last_exception = None for lang in self.language: try: transcript = transcript_list.find_transcript([lang]) log.debug(f"Found transcript for language '{lang}'") - transcript_pieces: List[Dict[str, Any]] = transcript.fetch() transcript_text = " ".join( map( @@ -115,18 +114,37 @@ class YoutubeLoader: ) ) return [Document(page_content=transcript_text, metadata=self._metadata)] - except NoTranscriptFound as e: + except NoTranscriptFound: log.debug(f"No transcript found for language '{lang}'") - last_exception = e continue except Exception as e: # If we hit any other type of exception, log it and re-raise log.exception(f"Error finding transcript for language '{lang}'") raise e - # If all specified languages fail, raise the last exception - if last_exception: - log.warning(f"No transcript found for any of the specified languages: {', '.join(self.language)}") - raise last_exception + # If all specified languages fail, fall back to English (unless English was already tried) + if "en" not in self.language: + try: + log.debug("Falling back to English transcript") + transcript = transcript_list.find_transcript(["en"]) + transcript_pieces: List[Dict[str, Any]] = transcript.fetch() + transcript_text = " ".join( + map( + lambda transcript_piece: transcript_piece.text.strip(" "), + transcript_pieces, + ) + ) + return [Document(page_content=transcript_text, metadata=self._metadata)] + except NoTranscriptFound: + log.warning("No English transcript found as fallback") + except Exception as e: + log.exception("Error finding English transcript fallback") + raise e - return [] + # If we get here, all languages failed including the English fallback + languages_tried = ", ".join(self.language) + if "en" not in self.language: + languages_tried += ", en (fallback)" + + log.warning(f"No transcript found for any of the specified languages: {languages_tried}") + raise NoTranscriptFound(f"No transcript found for any supported language") From b0d74a59f14d8f9c8fbe6aa2676039523a45ef62 Mon Sep 17 00:00:00 2001 From: Classic298 <27028174+Classic298@users.noreply.github.com> Date: Mon, 5 May 2025 20:07:37 +0200 Subject: [PATCH 05/15] Update youtube.py --- backend/open_webui/retrieval/loaders/youtube.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/backend/open_webui/retrieval/loaders/youtube.py b/backend/open_webui/retrieval/loaders/youtube.py index 0bd286bca..958dcfd61 100644 --- a/backend/open_webui/retrieval/loaders/youtube.py +++ b/backend/open_webui/retrieval/loaders/youtube.py @@ -118,8 +118,7 @@ class YoutubeLoader: log.debug(f"No transcript found for language '{lang}'") continue except Exception as e: - # If we hit any other type of exception, log it and re-raise - log.exception(f"Error finding transcript for language '{lang}'") + log.warning(f"Error finding transcript for language '{lang}'") raise e # If all specified languages fail, fall back to English (unless English was already tried) @@ -141,7 +140,7 @@ class YoutubeLoader: log.exception("Error finding English transcript fallback") raise e - # If we get here, all languages failed including the English fallback + # All languages failed languages_tried = ", ".join(self.language) if "en" not in self.language: languages_tried += ", en (fallback)" From 9cf33813813f92dc97ce33c4b89e79dcdc3f3a13 Mon Sep 17 00:00:00 2001 From: Classic298 <27028174+Classic298@users.noreply.github.com> Date: Mon, 5 May 2025 20:07:52 +0200 Subject: [PATCH 06/15] Update youtube.py --- backend/open_webui/retrieval/loaders/youtube.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/backend/open_webui/retrieval/loaders/youtube.py b/backend/open_webui/retrieval/loaders/youtube.py index 958dcfd61..ea8983b31 100644 --- a/backend/open_webui/retrieval/loaders/youtube.py +++ b/backend/open_webui/retrieval/loaders/youtube.py @@ -88,8 +88,7 @@ class YoutubeLoader: "http": self.proxy_url, "https": self.proxy_url, } - # Don't log complete URL because it might contain secrets - log.debug(f"Using proxy URL: {self.proxy_url[:14]}...") + log.debug(f"Using proxy URL: {self.proxy_url}...") else: youtube_proxies = None From 791dd24ace6054d1822c4ad76f272c3228337d8c Mon Sep 17 00:00:00 2001 From: Classic298 <27028174+Classic298@users.noreply.github.com> Date: Mon, 5 May 2025 20:08:25 +0200 Subject: [PATCH 07/15] Update youtube.py --- backend/open_webui/retrieval/loaders/youtube.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/backend/open_webui/retrieval/loaders/youtube.py b/backend/open_webui/retrieval/loaders/youtube.py index ea8983b31..958dcfd61 100644 --- a/backend/open_webui/retrieval/loaders/youtube.py +++ b/backend/open_webui/retrieval/loaders/youtube.py @@ -88,7 +88,8 @@ class YoutubeLoader: "http": self.proxy_url, "https": self.proxy_url, } - log.debug(f"Using proxy URL: {self.proxy_url}...") + # Don't log complete URL because it might contain secrets + log.debug(f"Using proxy URL: {self.proxy_url[:14]}...") else: youtube_proxies = None From 67a612fe2404edd7819717005981070339043932 Mon Sep 17 00:00:00 2001 From: Classic298 <27028174+Classic298@users.noreply.github.com> Date: Mon, 5 May 2025 20:40:48 +0200 Subject: [PATCH 08/15] Update youtube.py --- backend/open_webui/retrieval/loaders/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/open_webui/retrieval/loaders/youtube.py b/backend/open_webui/retrieval/loaders/youtube.py index 958dcfd61..1f78131e2 100644 --- a/backend/open_webui/retrieval/loaders/youtube.py +++ b/backend/open_webui/retrieval/loaders/youtube.py @@ -118,7 +118,7 @@ class YoutubeLoader: log.debug(f"No transcript found for language '{lang}'") continue except Exception as e: - log.warning(f"Error finding transcript for language '{lang}'") + log.info(f"Error finding transcript for language '{lang}'") raise e # If all specified languages fail, fall back to English (unless English was already tried) From 5e1cb76b93ea3b632ca0ddf9cbe308fd8ecd1d4d Mon Sep 17 00:00:00 2001 From: Classic298 <27028174+Classic298@users.noreply.github.com> Date: Tue, 6 May 2025 16:16:58 +0200 Subject: [PATCH 09/15] Update youtube.py --- .../open_webui/retrieval/loaders/youtube.py | 36 ++++++------------- 1 file changed, 11 insertions(+), 25 deletions(-) diff --git a/backend/open_webui/retrieval/loaders/youtube.py b/backend/open_webui/retrieval/loaders/youtube.py index 1f78131e2..67b3715fd 100644 --- a/backend/open_webui/retrieval/loaders/youtube.py +++ b/backend/open_webui/retrieval/loaders/youtube.py @@ -101,8 +101,16 @@ class YoutubeLoader: log.exception("Loading YouTube transcript failed") return [] + # Make a copy of the language list to avoid modifying the original + languages_to_try = list(self.language) + + # Add English as fallback, if not already in the list + if "en" not in languages_to_try: + log.debug("Adding English as fallback language") + languages_to_try.append("en") + # Try each language in order of priority - for lang in self.language: + for lang in languages_to_try: try: transcript = transcript_list.find_transcript([lang]) log.debug(f"Found transcript for language '{lang}'") @@ -120,30 +128,8 @@ class YoutubeLoader: except Exception as e: log.info(f"Error finding transcript for language '{lang}'") raise e - - # If all specified languages fail, fall back to English (unless English was already tried) - if "en" not in self.language: - try: - log.debug("Falling back to English transcript") - transcript = transcript_list.find_transcript(["en"]) - transcript_pieces: List[Dict[str, Any]] = transcript.fetch() - transcript_text = " ".join( - map( - lambda transcript_piece: transcript_piece.text.strip(" "), - transcript_pieces, - ) - ) - return [Document(page_content=transcript_text, metadata=self._metadata)] - except NoTranscriptFound: - log.warning("No English transcript found as fallback") - except Exception as e: - log.exception("Error finding English transcript fallback") - raise e - - # All languages failed - languages_tried = ", ".join(self.language) - if "en" not in self.language: - languages_tried += ", en (fallback)" + # If we get here, all languages failed including the English fallback + languages_tried = ", ".join(languages_to_try) log.warning(f"No transcript found for any of the specified languages: {languages_tried}") raise NoTranscriptFound(f"No transcript found for any supported language") From a129e0954ec7be642d57df816c98bd8a05c99d87 Mon Sep 17 00:00:00 2001 From: Classic298 <27028174+Classic298@users.noreply.github.com> Date: Tue, 6 May 2025 16:22:40 +0200 Subject: [PATCH 10/15] Update youtube.py --- backend/open_webui/retrieval/loaders/youtube.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/backend/open_webui/retrieval/loaders/youtube.py b/backend/open_webui/retrieval/loaders/youtube.py index 67b3715fd..88938d0f2 100644 --- a/backend/open_webui/retrieval/loaders/youtube.py +++ b/backend/open_webui/retrieval/loaders/youtube.py @@ -64,6 +64,7 @@ class YoutubeLoader: self._metadata = {"source": video_id} self.language = language self.proxy_url = proxy_url + # If language is string, convert to list if isinstance(language, str): self.language = [language] else: @@ -100,7 +101,7 @@ class YoutubeLoader: except Exception as e: log.exception("Loading YouTube transcript failed") return [] - + # Make a copy of the language list to avoid modifying the original languages_to_try = list(self.language) @@ -129,7 +130,7 @@ class YoutubeLoader: log.info(f"Error finding transcript for language '{lang}'") raise e - # If we get here, all languages failed including the English fallback + # If we get here, all languages failed languages_tried = ", ".join(languages_to_try) log.warning(f"No transcript found for any of the specified languages: {languages_tried}") - raise NoTranscriptFound(f"No transcript found for any supported language") + raise NoTranscriptFound(f"No transcript found for any supported language. Add additional supported languages and verify whether the video has any transcripts.") From c69278c13c366777806a6272d83bd0851992c340 Mon Sep 17 00:00:00 2001 From: Classic298 <27028174+Classic298@users.noreply.github.com> Date: Tue, 6 May 2025 16:24:27 +0200 Subject: [PATCH 11/15] Update youtube.py --- backend/open_webui/retrieval/loaders/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/open_webui/retrieval/loaders/youtube.py b/backend/open_webui/retrieval/loaders/youtube.py index 88938d0f2..17b1fad60 100644 --- a/backend/open_webui/retrieval/loaders/youtube.py +++ b/backend/open_webui/retrieval/loaders/youtube.py @@ -64,7 +64,7 @@ class YoutubeLoader: self._metadata = {"source": video_id} self.language = language self.proxy_url = proxy_url - # If language is string, convert to list + # Ensure language is a list if isinstance(language, str): self.language = [language] else: From f65dc715f91ce94750934e457ac14dbebab084e9 Mon Sep 17 00:00:00 2001 From: Classic298 <27028174+Classic298@users.noreply.github.com> Date: Tue, 6 May 2025 16:30:18 +0200 Subject: [PATCH 12/15] Update youtube.py --- backend/open_webui/retrieval/loaders/youtube.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/backend/open_webui/retrieval/loaders/youtube.py b/backend/open_webui/retrieval/loaders/youtube.py index 17b1fad60..7fa0247da 100644 --- a/backend/open_webui/retrieval/loaders/youtube.py +++ b/backend/open_webui/retrieval/loaders/youtube.py @@ -83,7 +83,7 @@ class YoutubeLoader: 'Could not import "youtube_transcript_api" Python package. ' "Please install it with `pip install youtube-transcript-api`." ) - + if self.proxy_url: youtube_proxies = { "http": self.proxy_url, @@ -93,7 +93,7 @@ class YoutubeLoader: log.debug(f"Using proxy URL: {self.proxy_url[:14]}...") else: youtube_proxies = None - + try: transcript_list = YouTubeTranscriptApi.list_transcripts( self.video_id, proxies=youtube_proxies @@ -101,11 +101,11 @@ class YoutubeLoader: except Exception as e: log.exception("Loading YouTube transcript failed") return [] - + # Make a copy of the language list to avoid modifying the original languages_to_try = list(self.language) - # Add English as fallback, if not already in the list + # Add English as fallback if not already in the list if "en" not in languages_to_try: log.debug("Adding English as fallback language") languages_to_try.append("en") @@ -129,8 +129,8 @@ class YoutubeLoader: except Exception as e: log.info(f"Error finding transcript for language '{lang}'") raise e - + # If we get here, all languages failed languages_tried = ", ".join(languages_to_try) - log.warning(f"No transcript found for any of the specified languages: {languages_tried}") - raise NoTranscriptFound(f"No transcript found for any supported language. Add additional supported languages and verify whether the video has any transcripts.") + log.warning(f"No transcript found for any of the specified languages: {languages_tried}. Verify if the video has transcripts, add more languages if needed.") + raise NoTranscriptFound(f"No transcript found for any supported language. Verify if the video has transcripts, add more languages if needed.") From d7927506f12be656bcc1c452281c8f8733ea7baa Mon Sep 17 00:00:00 2001 From: Classic298 <27028174+Classic298@users.noreply.github.com> Date: Tue, 6 May 2025 17:06:21 +0200 Subject: [PATCH 13/15] Update youtube.py --- .../open_webui/retrieval/loaders/youtube.py | 28 ++++++++----------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/backend/open_webui/retrieval/loaders/youtube.py b/backend/open_webui/retrieval/loaders/youtube.py index 7fa0247da..1fa2b635c 100644 --- a/backend/open_webui/retrieval/loaders/youtube.py +++ b/backend/open_webui/retrieval/loaders/youtube.py @@ -62,13 +62,17 @@ class YoutubeLoader: _video_id = _parse_video_id(video_id) self.video_id = _video_id if _video_id is not None else video_id self._metadata = {"source": video_id} - self.language = language self.proxy_url = proxy_url + # Ensure language is a list if isinstance(language, str): self.language = [language] else: - self.language = language + self.language = list(language) # Make a copy to avoid modifying the original + + # Add English as fallback if not already in the list + if "en" not in self.language: + self.language.append("en") def load(self) -> List[Document]: """Load YouTube transcripts into `Document` objects.""" @@ -83,7 +87,7 @@ class YoutubeLoader: 'Could not import "youtube_transcript_api" Python package. ' "Please install it with `pip install youtube-transcript-api`." ) - + if self.proxy_url: youtube_proxies = { "http": self.proxy_url, @@ -102,16 +106,8 @@ class YoutubeLoader: log.exception("Loading YouTube transcript failed") return [] - # Make a copy of the language list to avoid modifying the original - languages_to_try = list(self.language) - - # Add English as fallback if not already in the list - if "en" not in languages_to_try: - log.debug("Adding English as fallback language") - languages_to_try.append("en") - # Try each language in order of priority - for lang in languages_to_try: + for lang in self.language: try: transcript = transcript_list.find_transcript([lang]) log.debug(f"Found transcript for language '{lang}'") @@ -129,8 +125,8 @@ class YoutubeLoader: except Exception as e: log.info(f"Error finding transcript for language '{lang}'") raise e - + # If we get here, all languages failed - languages_tried = ", ".join(languages_to_try) - log.warning(f"No transcript found for any of the specified languages: {languages_tried}. Verify if the video has transcripts, add more languages if needed.") - raise NoTranscriptFound(f"No transcript found for any supported language. Verify if the video has transcripts, add more languages if needed.") + languages_tried = ", ".join(self.language) + log.warning(f"No transcript found for any of the specified languages: {languages_tried}") + raise NoTranscriptFound(f"No transcript found for any supported language") From 87dcbd198c3aed00e22c11dcc0e591f72126a057 Mon Sep 17 00:00:00 2001 From: Classic298 <27028174+Classic298@users.noreply.github.com> Date: Tue, 6 May 2025 17:11:03 +0200 Subject: [PATCH 14/15] Update youtube.py --- backend/open_webui/retrieval/loaders/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/open_webui/retrieval/loaders/youtube.py b/backend/open_webui/retrieval/loaders/youtube.py index 1fa2b635c..70153f8cf 100644 --- a/backend/open_webui/retrieval/loaders/youtube.py +++ b/backend/open_webui/retrieval/loaders/youtube.py @@ -68,7 +68,7 @@ class YoutubeLoader: if isinstance(language, str): self.language = [language] else: - self.language = list(language) # Make a copy to avoid modifying the original + self.language = list(language) # Add English as fallback if not already in the list if "en" not in self.language: From 1dcbec71ec054f79f570cd95da5a4031568a63fe Mon Sep 17 00:00:00 2001 From: Classic298 <27028174+Classic298@users.noreply.github.com> Date: Tue, 6 May 2025 17:14:00 +0200 Subject: [PATCH 15/15] Update youtube.py --- backend/open_webui/retrieval/loaders/youtube.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/backend/open_webui/retrieval/loaders/youtube.py b/backend/open_webui/retrieval/loaders/youtube.py index 70153f8cf..763d73094 100644 --- a/backend/open_webui/retrieval/loaders/youtube.py +++ b/backend/open_webui/retrieval/loaders/youtube.py @@ -128,5 +128,5 @@ class YoutubeLoader: # If we get here, all languages failed languages_tried = ", ".join(self.language) - log.warning(f"No transcript found for any of the specified languages: {languages_tried}") - raise NoTranscriptFound(f"No transcript found for any supported language") + log.warning(f"No transcript found for any of the specified languages: {languages_tried}. Verify if the video has transcripts, add more languages if needed.") + raise NoTranscriptFound(f"No transcript found for any supported language. Verify if the video has transcripts, add more languages if needed.")