From 4bdf3fd48e27164839862fbc56c0f4b443bee733 Mon Sep 17 00:00:00 2001 From: wwwlll Date: Mon, 21 Oct 2024 11:38:41 +0800 Subject: [PATCH] Add agent component for web crawler (#2878) ### What problem does this PR solve? Add agent component for web crawler ### Type of change - [x] New Feature (non-breaking change which adds functionality) --- agent/component/__init__.py | 1 + agent/component/crawler.py | 71 +++++++++++++++++++ web/src/assets/svg/crawler.svg | 1 + web/src/locales/en.ts | 10 +++ web/src/locales/zh-traditional.ts | 9 +++ web/src/locales/zh.ts | 9 +++ web/src/pages/flow/constant.tsx | 12 ++++ web/src/pages/flow/flow-drawer/index.tsx | 2 + .../pages/flow/form/crawler-form/index.tsx | 37 ++++++++++ 9 files changed, 152 insertions(+) create mode 100644 agent/component/crawler.py create mode 100644 web/src/assets/svg/crawler.svg create mode 100644 web/src/pages/flow/form/crawler-form/index.tsx diff --git a/agent/component/__init__.py b/agent/component/__init__.py index bc3a0c026..2e85a3e09 100644 --- a/agent/component/__init__.py +++ b/agent/component/__init__.py @@ -28,6 +28,7 @@ from .wencai import WenCai, WenCaiParam from .jin10 import Jin10, Jin10Param from .tushare import TuShare, TuShareParam from .akshare import AkShare, AkShareParam +from .crawler import Crawler, CrawlerParam def component_class(class_name): diff --git a/agent/component/crawler.py b/agent/component/crawler.py new file mode 100644 index 000000000..4168e96a1 --- /dev/null +++ b/agent/component/crawler.py @@ -0,0 +1,71 @@ +# +# Copyright 2024 The InfiniFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+from abc import ABC
+import asyncio
+from crawl4ai import AsyncWebCrawler
+from agent.component.base import ComponentBase, ComponentParamBase
+
+
+class CrawlerParam(ComponentParamBase):
+    """
+    Define the Crawler component parameters.
+
+    Attributes:
+        proxy: Proxy address handed to the crawler. ``None`` or an empty
+            string means "no proxy".
+        extract_type: Which field of the crawl result is returned:
+            ``'html'``, ``'markdown'`` or ``'content'``. Any other value
+            falls back to markdown.
+    """
+
+    def __init__(self):
+        super().__init__()
+        # Declare both fields with defaults: Crawler.get_web reads them, and
+        # a flow whose configuration omits one must not hit AttributeError.
+        self.proxy = None
+        self.extract_type = "markdown"
+
+    def check(self):
+        # Unknown extract_type values are tolerated here on purpose:
+        # Crawler.get_web falls back to markdown for them.
+        return True
+
+
+class Crawler(ComponentBase, ABC):
+    """Agent component that crawls a URL and outputs the page content."""
+
+    component_name = "Crawler"
+
+    def _run(self, history, **kwargs):
+        """
+        Crawl the URL taken from the component input.
+
+        The "content" entries of the input are joined with " - " and used
+        as the URL. Returns ``Crawler.be_output("")`` when there is no
+        input; otherwise the output wraps the crawled text, or an error
+        message if anything raised while crawling.
+        """
+        ans = self.get_input()
+        url = " - ".join(ans["content"]) if "content" in ans else ""
+        if not url:
+            return Crawler.be_output("")
+        try:
+            result = asyncio.run(self.get_web(url))
+            return Crawler.be_output(result)
+        except Exception as e:
+            # Best effort: a failed crawl is reported as the component's
+            # output text instead of propagating to the caller.
+            return Crawler.be_output(f"An unexpected error occurred: {str(e)}")
+
+    async def get_web(self, url):
+        """
+        Fetch ``url`` with AsyncWebCrawler and return the configured field.
+
+        Returns ``cleaned_html``, ``markdown`` or ``extracted_content`` of
+        the crawl result, selected by the ``extract_type`` parameter.
+        """
+        # Maps each supported extract_type to the crawl-result attribute.
+        result_fields = {
+            "html": "cleaned_html",
+            "markdown": "markdown",
+            "content": "extracted_content",
+        }
+        # getattr with defaults keeps this method usable even when the
+        # parameter object was built without these fields.
+        proxy = getattr(self._param, "proxy", None) or None
+        extract_type = getattr(self._param, "extract_type", "markdown")
+        field = result_fields.get(extract_type, "markdown")
+
+        async with AsyncWebCrawler(verbose=True, proxy=proxy) as crawler:
+            result = await crawler.arun(url=url, bypass_cache=True)
+            return getattr(result, field)
diff --git a/web/src/assets/svg/crawler.svg b/web/src/assets/svg/crawler.svg
new file mode 100644
index 000000000..b61773fbc
--- /dev/null
+++ b/web/src/assets/svg/crawler.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/web/src/locales/en.ts b/web/src/locales/en.ts
index d843eabda..cbc0bc874 100644
---
a/web/src/locales/en.ts +++ b/web/src/locales/en.ts @@ -928,6 +928,16 @@ The above is the content you need to summarize.`, yahooFinance: 'YahooFinance', yahooFinanceDescription: 'The component queries information about the company based on the provided ticker symbol.', + crawler: 'Web Crawler', + crawlerDescription: + 'This component can be used to crawl HTML source code from a specified URL.', + proxy: 'Proxy', + crawlerResultOptions: { + html: 'Html', + markdown: 'Markdown', + content: 'Content', + }, + extractType: 'extractType', info: 'Info', history: 'History', financials: 'Financials', diff --git a/web/src/locales/zh-traditional.ts b/web/src/locales/zh-traditional.ts index fc2eda46d..5edf21d0f 100644 --- a/web/src/locales/zh-traditional.ts +++ b/web/src/locales/zh-traditional.ts @@ -877,6 +877,15 @@ export default { akShareDescription: '此組件可用於從東方財富網取得對應股票的新聞資訊。', yahooFinance: '雅虎財經', yahooFinanceDescription: '該組件根據提供的股票代碼查詢有關公司的資訊。', + crawler: '網頁爬蟲', + crawlerDescription: '該組件可用於從指定url爬取HTML源碼。', + proxy: '代理', + crawlerResultOptions: { + html: 'Html', + markdown: 'Markdown', + content: '文本', + }, + extractType: '提取類型', info: '訊息', history: '歷史', financials: '財務', diff --git a/web/src/locales/zh.ts b/web/src/locales/zh.ts index dc59ac43e..36e39e126 100644 --- a/web/src/locales/zh.ts +++ b/web/src/locales/zh.ts @@ -897,6 +897,15 @@ export default { akShareDescription: '该组件可用于从东方财富网站获取相应股票的新闻信息。', yahooFinance: '雅虎财经', yahooFinanceDescription: '该组件根据提供的股票代码查询有关公司的信息。', + crawler: '网页爬虫', + crawlerDescription: '该组件可用于从指定url爬取html源码。', + proxy: '代理', + crawlerResultOptions: { + html: 'Html', + markdown: 'Markdown', + content: '文本', + }, + extractType: '提取类型', info: '信息', history: '历史', financials: '财务', diff --git a/web/src/pages/flow/constant.tsx b/web/src/pages/flow/constant.tsx index ad5fbe62a..7aae71a68 100644 --- a/web/src/pages/flow/constant.tsx +++ b/web/src/pages/flow/constant.tsx @@ -4,6 +4,7 @@ import { ReactComponent as baiduFanyiIcon } from 
'@/assets/svg/baidu-fanyi.svg'; import { ReactComponent as BaiduIcon } from '@/assets/svg/baidu.svg'; import { ReactComponent as BingIcon } from '@/assets/svg/bing.svg'; import { ReactComponent as ConcentratorIcon } from '@/assets/svg/concentrator.svg'; +import { ReactComponent as CrawlerIcon } from '@/assets/svg/crawler.svg'; import { ReactComponent as DeepLIcon } from '@/assets/svg/deepl.svg'; import { ReactComponent as DuckIcon } from '@/assets/svg/duck.svg'; import { ReactComponent as ExeSqlIcon } from '@/assets/svg/exesql.svg'; @@ -73,6 +74,7 @@ export enum Operator { Concentrator = 'Concentrator', TuShare = 'TuShare', Note = 'Note', + Crawler = 'Crawler', } export const CommonOperatorList = Object.values(Operator).filter( @@ -110,6 +112,7 @@ export const operatorIconMap = { [Operator.Concentrator]: ConcentratorIcon, [Operator.TuShare]: TuShareIcon, [Operator.Note]: NoteIcon, + [Operator.Crawler]: CrawlerIcon, }; export const operatorMap: Record< @@ -233,6 +236,9 @@ export const operatorMap: Record< }, [Operator.TuShare]: { backgroundColor: '#f8cfa0' }, [Operator.Note]: { backgroundColor: '#f8cfa0' }, + [Operator.Crawler]: { + backgroundColor: '#dee0e2', + }, }; export const componentMenuList = [ @@ -323,6 +329,9 @@ export const componentMenuList = [ { name: Operator.TuShare, }, + { + name: Operator.Crawler, + }, ]; export const initialRetrievalValues = { @@ -572,6 +581,7 @@ export const RestrictedUpstreamMap = { [Operator.Jin10]: [Operator.Begin], [Operator.Concentrator]: [Operator.Begin], [Operator.TuShare]: [Operator.Begin], + [Operator.Crawler]: [Operator.Begin], }; export const NodeMap = { @@ -605,6 +615,7 @@ export const NodeMap = { [Operator.Jin10]: 'ragNode', [Operator.TuShare]: 'ragNode', [Operator.Note]: 'noteNode', + [Operator.Crawler]: 'ragNode', }; export const LanguageOptions = [ @@ -2791,3 +2802,4 @@ export const TuShareSrcOptions = [ 'fenghuang', 'jinrongjie', ]; +export const CrawlerResultOptions = ['markdown', 'html', 'content']; diff --git 
a/web/src/pages/flow/flow-drawer/index.tsx b/web/src/pages/flow/flow-drawer/index.tsx index 639675ec3..a505bfbb1 100644 --- a/web/src/pages/flow/flow-drawer/index.tsx +++ b/web/src/pages/flow/flow-drawer/index.tsx @@ -12,6 +12,7 @@ import BaiduForm from '../form/baidu-form'; import BeginForm from '../form/begin-form'; import BingForm from '../form/bing-form'; import CategorizeForm from '../form/categorize-form'; +import CrawlerForm from '../form/crawler-form'; import DeepLForm from '../form/deepl-form'; import DuckDuckGoForm from '../form/duckduckgo-form'; import ExeSQLForm from '../form/exesql-form'; @@ -70,6 +71,7 @@ const FormMap = { [Operator.YahooFinance]: YahooFinanceForm, [Operator.Jin10]: Jin10Form, [Operator.TuShare]: TuShareForm, + [Operator.Crawler]: CrawlerForm, }; const EmptyContent = () =>
empty
; diff --git a/web/src/pages/flow/form/crawler-form/index.tsx b/web/src/pages/flow/form/crawler-form/index.tsx new file mode 100644 index 000000000..57c761f75 --- /dev/null +++ b/web/src/pages/flow/form/crawler-form/index.tsx @@ -0,0 +1,37 @@ +import { useTranslate } from '@/hooks/common-hooks'; +import { Form, Input, Select } from 'antd'; +import { useMemo } from 'react'; +import { CrawlerResultOptions } from '../../constant'; +import { IOperatorForm } from '../../interface'; +const CrawlerForm = ({ onValuesChange, form }: IOperatorForm) => { + const { t } = useTranslate('flow'); + const crawlerResultOptions = useMemo(() => { + return CrawlerResultOptions.map((x) => ({ + value: x, + label: t(`crawlerResultOptions.${x}`), + })); + }, [t]); + return ( +
+ + + + + + +
+ ); +}; + +export default CrawlerForm;