From adb0a93d955ca5d6117c871a77d0c3eadb02fa6b Mon Sep 17 00:00:00 2001
From: Kevin Hu
Date: Tue, 22 Oct 2024 14:16:44 +0800
Subject: [PATCH] add component invoke (#2967)

### What problem does this PR solve?

#2908

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
---
 agent/component/crawler.py | 23 +++++------
 agent/component/invoke.py  | 84 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 95 insertions(+), 12 deletions(-)
 create mode 100644 agent/component/invoke.py

diff --git a/agent/component/crawler.py b/agent/component/crawler.py
index 4168e96a1..f0012d71d 100644
--- a/agent/component/crawler.py
+++ b/agent/component/crawler.py
@@ -18,6 +18,7 @@ import asyncio
 from crawl4ai import AsyncWebCrawler
 from agent.component.base import ComponentBase, ComponentParamBase
 
+
 class CrawlerParam(ComponentParamBase):
     """
     Define the Crawler component parameters.
@@ -25,9 +26,11 @@ class CrawlerParam(ComponentParamBase):
 
     def __init__(self):
         super().__init__()
+        self.proxy = None
+        self.extract_type = "markdown"
 
     def check(self):
-        return True
+        self.check_valid_value(self.extract_type, "Type of content from the crawler", ['html', 'markdown', 'content'])
 
 
 class Crawler(ComponentBase, ABC):
@@ -46,7 +49,6 @@ class Crawler(ComponentBase, ABC):
         except Exception as e:
             return Crawler.be_output(f"An unexpected error occurred: {str(e)}")
 
-
     async def get_web(self, url):
         proxy = self._param.proxy if self._param.proxy else None
         async with AsyncWebCrawler(verbose=True, proxy=proxy) as crawler:
@@ -55,16 +57,13 @@ class Crawler(ComponentBase, ABC):
                 bypass_cache=True
             )
 
-            match self._param.extract_type:
-                case 'html':
-                    return result.cleaned_html
-                case 'markdown':
-                    return result.markdown
-                case 'content':
-                    return result.extracted_content
-                case _:
-                    return result.markdown
-            # print(result.markdown)
+            if self._param.extract_type == 'html':
+                return result.cleaned_html
+            elif self._param.extract_type == 'markdown':
+                return result.markdown
+            elif self._param.extract_type == 'content':
+                return result.extracted_content
+            return result.markdown
 
 
 
diff --git a/agent/component/invoke.py b/agent/component/invoke.py
new file mode 100644
index 000000000..1078d35a9
--- /dev/null
+++ b/agent/component/invoke.py
@@ -0,0 +1,84 @@
+#
+# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import json
+from abc import ABC
+
+import requests
+
+from agent.component.base import ComponentBase, ComponentParamBase
+
+
+class InvokeParam(ComponentParamBase):
+    """
+    Define the Invoke component parameters.
+    """
+
+    def __init__(self):
+        super().__init__()
+        self.proxy = None
+        self.headers = ""
+        self.method = "get"
+        self.variables = []
+        self.url = ""
+        self.timeout = 60
+
+    def check(self):
+        self.check_valid_value(self.method.lower(), "HTTP method", ['get', 'post', 'put'])
+        self.check_empty(self.url, "Endpoint URL")
+        self.check_positive_integer(self.timeout, "Timeout in seconds")
+
+
+class Invoke(ComponentBase, ABC):
+    component_name = "Invoke"
+
+    def _run(self, history, **kwargs):
+        args = {}
+        for para in self._param.variables:
+            if para.get("component_id"):
+                cpn = self._canvas.get_component(para["component_id"])["obj"]
+                _, out = cpn.output(allow_partial=False)
+                args[para["key"]] = "\n".join(out["content"])
+            else:
+                args[para["key"]] = "\n".join(para["value"])
+
+        url = self._param.url.strip()
+        if url.find("http") != 0:
+            url = "http://" + url
+
+        method = self._param.method.lower()
+        headers = {}
+        if self._param.headers:
+            headers = json.loads(self._param.headers)
+        proxies = None
+        if self._param.proxy:
+            proxies = {"http": self._param.proxy, "https": self._param.proxy}
+
+        if method == 'get':
+            response = requests.get(url=url,
+                                    params=args,
+                                    headers=headers,
+                                    proxies=proxies,
+                                    timeout=self._param.timeout)
+            return Invoke.be_output(response.text)
+
+        if method == 'put':
+            response = requests.put(url=url,
+                                    data=args,
+                                    headers=headers,
+                                    proxies=proxies,
+                                    timeout=self._param.timeout)
+
+            return Invoke.be_output(response.text)
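
Note: `check()` accepts `'post'`, but `_run()` above only implements the `'get'` and `'put'` branches, so a component configured with `method = "post"` falls through and returns `None`. A minimal sketch of the missing branch, mirroring the existing ones (it is not part of this patch, and sending `args` as JSON rather than form data is an assumption):

```python
        # Sketch only, not part of this patch: handle the 'post' method that
        # check() already permits. Sending args as JSON is an assumption;
        # use data=args instead if the endpoint expects form-encoded bodies.
        if method == 'post':
            response = requests.post(url=url,
                                     json=args,
                                     headers=headers,
                                     proxies=proxies,
                                     timeout=self._param.timeout)
            return Invoke.be_output(response.text)
```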
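For context, a hypothetical standalone configuration of `InvokeParam` (all values are illustrative; in practice the canvas instantiates and fills the parameters). Since fixed `variables` values are flattened with `"\n".join(...)`, a list of strings is the natural shape for `value`:

```python
# Illustrative values only; the endpoint, header, and variable names are made up.
param = InvokeParam()
param.url = "example.com/api/v1/answer"   # "http://" is prepended by _run() if missing
param.method = "get"
param.headers = '{"Authorization": "Bearer <token>"}'  # parsed with json.loads
param.variables = [{"key": "question", "value": ["What is RAGFlow?"]}]  # joined with "\n"
param.timeout = 30
param.check()  # validates method, endpoint URL, and timeout
```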