Add agent component for web crawler (#2878)

### What problem does this PR solve?

Add agent component for web crawler

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
wwwlll 2024-10-21 11:38:41 +08:00 committed by GitHub
parent c1d0473f49
commit 4bdf3fd48e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
9 changed files with 152 additions and 0 deletions

View File

@ -28,6 +28,7 @@ from .wencai import WenCai, WenCaiParam
from .jin10 import Jin10, Jin10Param from .jin10 import Jin10, Jin10Param
from .tushare import TuShare, TuShareParam from .tushare import TuShare, TuShareParam
from .akshare import AkShare, AkShareParam from .akshare import AkShare, AkShareParam
from .crawler import Crawler, CrawlerParam
def component_class(class_name): def component_class(class_name):

View File

@ -0,0 +1,71 @@
#
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from abc import ABC
import asyncio
from crawl4ai import AsyncWebCrawler
from agent.component.base import ComponentBase, ComponentParamBase
class CrawlerParam(ComponentParamBase):
    """
    Define the Crawler component parameters.

    Attributes:
        proxy: optional proxy URL (e.g. "http://127.0.0.1:8888") forwarded to
            the underlying crawler; None disables proxying.
        extract_type: which representation of the crawled page to return;
            one of "html", "markdown" or "content". Defaults to "markdown",
            matching the UI form's initial value.
    """

    def __init__(self):
        super().__init__()
        # These must be declared here: Crawler.get_web reads
        # self._param.proxy and self._param.extract_type, and the frontend
        # form (crawler-form.tsx) submits fields with these exact names.
        self.proxy = None
        self.extract_type = "markdown"

    def check(self):
        # proxy is free-form text; only constrain extract_type to the
        # representations the component knows how to produce.
        if self.extract_type not in ("html", "markdown", "content"):
            raise ValueError(f"Invalid extract_type: {self.extract_type}")
        return True
class Crawler(ComponentBase, ABC):
component_name = "Crawler"
def _run(self, history, **kwargs):
ans = self.get_input()
ans = " - ".join(ans["content"]) if "content" in ans else ""
if not ans:
return Crawler.be_output("")
try:
result = asyncio.run(self.get_web(ans))
return Crawler.be_output(result)
except Exception as e:
return Crawler.be_output(f"An unexpected error occurred: {str(e)}")
async def get_web(self, url):
proxy = self._param.proxy if self._param.proxy else None
async with AsyncWebCrawler(verbose=True, proxy=proxy) as crawler:
result = await crawler.arun(
url=url,
bypass_cache=True
)
match self._param.extract_type:
case 'html':
return result.cleaned_html
case 'markdown':
return result.markdown
case 'content':
return result.extracted_content
case _:
return result.markdown
# print(result.markdown)

View File

@ -0,0 +1 @@
<?xml version="1.0" standalone="no"?><!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd"><svg class="icon" width="200px" height="200.00px" viewBox="0 0 1024 1024" version="1.1" xmlns="http://www.w3.org/2000/svg"><path d="M 777.121 313.158 a 265.121 265.121 0 0 0 -530.243 0 Z m 165.7 265.121 H 843.402 V 425.836 l 84.176 -84.176 a 33.1402 33.1402 0 1 0 -47.0591 -47.0591 l -66.2803 66.2803 h -596.524 l -66.2803 -66.2803 a 33.1402 33.1402 0 1 0 -47.0591 47.0591 L 180.598 425.836 V 578.281 H 81.177 a 33.1402 33.1402 0 0 0 0 66.2803 H 180.598 v 33.1402 a 294.285 294.285 0 0 0 39.7682 145.817 l -96.1069 95.4441 a 33.1402 33.1402 0 0 0 47.0591 47.0591 l 88.8157 -88.1529 A 296.937 296.937 0 0 0 478.859 975.959 h 3.97682 V 401.974 h 66.2803 V 975.959 a 296.937 296.937 0 0 0 215.411 -98.0944 l 88.8157 88.8157 a 33.1402 33.1402 0 0 0 47.0591 -47.0591 l -93.4554 -93.4554 a 293.621 293.621 0 0 0 36.4542 -149.131 V 644.561 h 99.4209 a 33.1402 33.1402 0 0 0 0 -66.2803 Z" fill="#1B69FD" /></svg>

After

Width:  |  Height:  |  Size: 1.0 KiB

View File

@ -928,6 +928,16 @@ The above is the content you need to summarize.`,
yahooFinance: 'YahooFinance', yahooFinance: 'YahooFinance',
yahooFinanceDescription: yahooFinanceDescription:
'The component queries information about the company based on the provided ticker symbol.', 'The component queries information about the company based on the provided ticker symbol.',
crawler: 'Web Crawler',
crawlerDescription:
'This component can be used to crawl HTML source code from a specified URL.',
proxy: 'Proxy',
crawlerResultOptions: {
html: 'Html',
markdown: 'Markdown',
content: 'Content',
},
extractType: 'extractType',
info: 'Info', info: 'Info',
history: 'History', history: 'History',
financials: 'Financials', financials: 'Financials',

View File

@ -877,6 +877,15 @@ export default {
akShareDescription: '此組件可用於從東方財富網取得對應股票的新聞資訊。', akShareDescription: '此組件可用於從東方財富網取得對應股票的新聞資訊。',
yahooFinance: '雅虎財經', yahooFinance: '雅虎財經',
yahooFinanceDescription: '該組件根據提供的股票代碼查詢有關公司的資訊。', yahooFinanceDescription: '該組件根據提供的股票代碼查詢有關公司的資訊。',
crawler: '網頁爬蟲',
crawlerDescription: '該組件可用於從指定url爬取HTML源碼。',
proxy: '代理',
crawlerResultOptions: {
html: 'Html',
markdown: 'Markdown',
content: '文本',
},
extractType: '提取類型',
info: '訊息', info: '訊息',
history: '歷史', history: '歷史',
financials: '財務', financials: '財務',

View File

@ -897,6 +897,15 @@ export default {
akShareDescription: '该组件可用于从东方财富网站获取相应股票的新闻信息。', akShareDescription: '该组件可用于从东方财富网站获取相应股票的新闻信息。',
yahooFinance: '雅虎财经', yahooFinance: '雅虎财经',
yahooFinanceDescription: '该组件根据提供的股票代码查询有关公司的信息。', yahooFinanceDescription: '该组件根据提供的股票代码查询有关公司的信息。',
crawler: '网页爬虫',
crawlerDescription: '该组件可用于从指定url爬取html源码。',
proxy: '代理',
crawlerResultOptions: {
html: 'Html',
markdown: 'Markdown',
content: '文本',
},
extractType: '提取类型',
info: '信息', info: '信息',
history: '历史', history: '历史',
financials: '财务', financials: '财务',

View File

@ -4,6 +4,7 @@ import { ReactComponent as baiduFanyiIcon } from '@/assets/svg/baidu-fanyi.svg';
import { ReactComponent as BaiduIcon } from '@/assets/svg/baidu.svg'; import { ReactComponent as BaiduIcon } from '@/assets/svg/baidu.svg';
import { ReactComponent as BingIcon } from '@/assets/svg/bing.svg'; import { ReactComponent as BingIcon } from '@/assets/svg/bing.svg';
import { ReactComponent as ConcentratorIcon } from '@/assets/svg/concentrator.svg'; import { ReactComponent as ConcentratorIcon } from '@/assets/svg/concentrator.svg';
import { ReactComponent as CrawlerIcon } from '@/assets/svg/crawler.svg';
import { ReactComponent as DeepLIcon } from '@/assets/svg/deepl.svg'; import { ReactComponent as DeepLIcon } from '@/assets/svg/deepl.svg';
import { ReactComponent as DuckIcon } from '@/assets/svg/duck.svg'; import { ReactComponent as DuckIcon } from '@/assets/svg/duck.svg';
import { ReactComponent as ExeSqlIcon } from '@/assets/svg/exesql.svg'; import { ReactComponent as ExeSqlIcon } from '@/assets/svg/exesql.svg';
@ -73,6 +74,7 @@ export enum Operator {
Concentrator = 'Concentrator', Concentrator = 'Concentrator',
TuShare = 'TuShare', TuShare = 'TuShare',
Note = 'Note', Note = 'Note',
Crawler = 'Crawler',
} }
export const CommonOperatorList = Object.values(Operator).filter( export const CommonOperatorList = Object.values(Operator).filter(
@ -110,6 +112,7 @@ export const operatorIconMap = {
[Operator.Concentrator]: ConcentratorIcon, [Operator.Concentrator]: ConcentratorIcon,
[Operator.TuShare]: TuShareIcon, [Operator.TuShare]: TuShareIcon,
[Operator.Note]: NoteIcon, [Operator.Note]: NoteIcon,
[Operator.Crawler]: CrawlerIcon,
}; };
export const operatorMap: Record< export const operatorMap: Record<
@ -233,6 +236,9 @@ export const operatorMap: Record<
}, },
[Operator.TuShare]: { backgroundColor: '#f8cfa0' }, [Operator.TuShare]: { backgroundColor: '#f8cfa0' },
[Operator.Note]: { backgroundColor: '#f8cfa0' }, [Operator.Note]: { backgroundColor: '#f8cfa0' },
[Operator.Crawler]: {
backgroundColor: '#dee0e2',
},
}; };
export const componentMenuList = [ export const componentMenuList = [
@ -323,6 +329,9 @@ export const componentMenuList = [
{ {
name: Operator.TuShare, name: Operator.TuShare,
}, },
{
name: Operator.Crawler,
},
]; ];
export const initialRetrievalValues = { export const initialRetrievalValues = {
@ -572,6 +581,7 @@ export const RestrictedUpstreamMap = {
[Operator.Jin10]: [Operator.Begin], [Operator.Jin10]: [Operator.Begin],
[Operator.Concentrator]: [Operator.Begin], [Operator.Concentrator]: [Operator.Begin],
[Operator.TuShare]: [Operator.Begin], [Operator.TuShare]: [Operator.Begin],
[Operator.Crawler]: [Operator.Begin],
}; };
export const NodeMap = { export const NodeMap = {
@ -605,6 +615,7 @@ export const NodeMap = {
[Operator.Jin10]: 'ragNode', [Operator.Jin10]: 'ragNode',
[Operator.TuShare]: 'ragNode', [Operator.TuShare]: 'ragNode',
[Operator.Note]: 'noteNode', [Operator.Note]: 'noteNode',
[Operator.Crawler]: 'ragNode',
}; };
export const LanguageOptions = [ export const LanguageOptions = [
@ -2791,3 +2802,4 @@ export const TuShareSrcOptions = [
'fenghuang', 'fenghuang',
'jinrongjie', 'jinrongjie',
]; ];
export const CrawlerResultOptions = ['markdown', 'html', 'content'];

View File

@ -12,6 +12,7 @@ import BaiduForm from '../form/baidu-form';
import BeginForm from '../form/begin-form'; import BeginForm from '../form/begin-form';
import BingForm from '../form/bing-form'; import BingForm from '../form/bing-form';
import CategorizeForm from '../form/categorize-form'; import CategorizeForm from '../form/categorize-form';
import CrawlerForm from '../form/crawler-form';
import DeepLForm from '../form/deepl-form'; import DeepLForm from '../form/deepl-form';
import DuckDuckGoForm from '../form/duckduckgo-form'; import DuckDuckGoForm from '../form/duckduckgo-form';
import ExeSQLForm from '../form/exesql-form'; import ExeSQLForm from '../form/exesql-form';
@ -70,6 +71,7 @@ const FormMap = {
[Operator.YahooFinance]: YahooFinanceForm, [Operator.YahooFinance]: YahooFinanceForm,
[Operator.Jin10]: Jin10Form, [Operator.Jin10]: Jin10Form,
[Operator.TuShare]: TuShareForm, [Operator.TuShare]: TuShareForm,
[Operator.Crawler]: CrawlerForm,
}; };
const EmptyContent = () => <div>empty</div>; const EmptyContent = () => <div>empty</div>;

View File

@ -0,0 +1,37 @@
import { useTranslate } from '@/hooks/common-hooks';
import { Form, Input, Select } from 'antd';
import { useMemo } from 'react';
import { CrawlerResultOptions } from '../../constant';
import { IOperatorForm } from '../../interface';
const CrawlerForm = ({ onValuesChange, form }: IOperatorForm) => {
  const { t } = useTranslate('flow');

  // Build the <Select> options from the shared constant so the form stays in
  // sync with the extraction modes the backend component supports.
  const resultTypeOptions = useMemo(
    () =>
      CrawlerResultOptions.map((value) => ({
        value,
        label: t(`crawlerResultOptions.${value}`),
      })),
    [t],
  );

  return (
    <Form
      name="basic"
      labelCol={{ span: 6 }}
      wrapperCol={{ span: 18 }}
      autoComplete="off"
      form={form}
      onValuesChange={onValuesChange}
    >
      {/* Optional HTTP proxy forwarded to the crawler backend. */}
      <Form.Item label={t('proxy')} name={'proxy'}>
        <Input placeholder="like: http://127.0.0.1:8888" />
      </Form.Item>
      {/* Output representation: html | markdown | content. */}
      <Form.Item
        label={t('extractType')}
        name={'extract_type'}
        initialValue="markdown"
      >
        <Select options={resultTypeOptions} />
      </Form.Item>
    </Form>
  );
};
export default CrawlerForm;