Add agent component for web crawler (#2878)

### What problem does this PR solve?

Add an agent component for a web crawler.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
wwwlll 2024-10-21 11:38:41 +08:00 committed by GitHub
parent c1d0473f49
commit 4bdf3fd48e
9 changed files with 152 additions and 0 deletions

View File

@@ -28,6 +28,7 @@ from .wencai import WenCai, WenCaiParam
from .jin10 import Jin10, Jin10Param
from .tushare import TuShare, TuShareParam
from .akshare import AkShare, AkShareParam
from .crawler import Crawler, CrawlerParam
def component_class(class_name):

View File

@@ -0,0 +1,71 @@
#
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from abc import ABC
import asyncio
from crawl4ai import AsyncWebCrawler
from agent.component.base import ComponentBase, ComponentParamBase


class CrawlerParam(ComponentParamBase):
    """
    Define the Crawler component parameters.
    """

    def __init__(self):
        super().__init__()
        self.proxy = None
        self.extract_type = "markdown"

    def check(self):
        self.check_valid_value(self.extract_type, "Type of content from the crawler", ['html', 'markdown', 'content'])


class Crawler(ComponentBase, ABC):
    component_name = "Crawler"

    def _run(self, history, **kwargs):
        # Join the upstream component's output rows into a single URL string.
        ans = self.get_input()
        ans = " - ".join(ans["content"]) if "content" in ans else ""
        if not ans:
            return Crawler.be_output("")

        try:
            result = asyncio.run(self.get_web(ans))
            return Crawler.be_output(result)
        except Exception as e:
            return Crawler.be_output(f"An unexpected error occurred: {str(e)}")

    async def get_web(self, url):
        proxy = self._param.proxy if self._param.proxy else None
        async with AsyncWebCrawler(verbose=True, proxy=proxy) as crawler:
            result = await crawler.arun(
                url=url,
                bypass_cache=True
            )

        # Return the crawled page in the format selected by extract_type,
        # falling back to markdown.
        match self._param.extract_type:
            case 'html':
                return result.cleaned_html
            case 'markdown':
                return result.markdown
            case 'content':
                return result.extracted_content
            case _:
                return result.markdown
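
For reference, the crawl-and-extract flow above can be exercised outside the agent runtime. Here is a minimal standalone sketch (not part of this commit), assuming `crawl4ai` is installed and the target URL is reachable; `fetch_markdown` is an illustrative helper name:

```python
# Standalone sketch of the same crawl4ai flow used by Crawler.get_web:
# open a crawler session, fetch the page, and return its markdown rendering.
import asyncio

from crawl4ai import AsyncWebCrawler


async def fetch_markdown(url: str, proxy: str | None = None) -> str:
    async with AsyncWebCrawler(verbose=True, proxy=proxy) as crawler:
        result = await crawler.arun(url=url, bypass_cache=True)
    return result.markdown


if __name__ == "__main__":
    # Illustrative usage; prints the page content as markdown.
    print(asyncio.run(fetch_markdown("https://example.com")))
```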

View File

@@ -0,0 +1 @@
<?xml version="1.0" standalone="no"?><!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd"><svg class="icon" width="200px" height="200.00px" viewBox="0 0 1024 1024" version="1.1" xmlns="http://www.w3.org/2000/svg"><path d="M 777.121 313.158 a 265.121 265.121 0 0 0 -530.243 0 Z m 165.7 265.121 H 843.402 V 425.836 l 84.176 -84.176 a 33.1402 33.1402 0 1 0 -47.0591 -47.0591 l -66.2803 66.2803 h -596.524 l -66.2803 -66.2803 a 33.1402 33.1402 0 1 0 -47.0591 47.0591 L 180.598 425.836 V 578.281 H 81.177 a 33.1402 33.1402 0 0 0 0 66.2803 H 180.598 v 33.1402 a 294.285 294.285 0 0 0 39.7682 145.817 l -96.1069 95.4441 a 33.1402 33.1402 0 0 0 47.0591 47.0591 l 88.8157 -88.1529 A 296.937 296.937 0 0 0 478.859 975.959 h 3.97682 V 401.974 h 66.2803 V 975.959 a 296.937 296.937 0 0 0 215.411 -98.0944 l 88.8157 88.8157 a 33.1402 33.1402 0 0 0 47.0591 -47.0591 l -93.4554 -93.4554 a 293.621 293.621 0 0 0 36.4542 -149.131 V 644.561 h 99.4209 a 33.1402 33.1402 0 0 0 0 -66.2803 Z" fill="#1B69FD" /></svg>


View File

@@ -928,6 +928,16 @@ The above is the content you need to summarize.`,
yahooFinance: 'YahooFinance',
yahooFinanceDescription:
'The component queries information about the company based on the provided ticker symbol.',
crawler: 'Web Crawler',
crawlerDescription:
'This component can be used to crawl HTML source code from a specified URL.',
proxy: 'Proxy',
crawlerResultOptions: {
html: 'HTML',
markdown: 'Markdown',
content: 'Content',
},
extractType: 'Extract Type',
info: 'Info',
history: 'History',
financials: 'Financials',

View File

@@ -877,6 +877,15 @@ export default {
akShareDescription: '此組件可用於從東方財富網取得對應股票的新聞資訊。',
yahooFinance: '雅虎財經',
yahooFinanceDescription: '該組件根據提供的股票代碼查詢有關公司的資訊。',
crawler: '網頁爬蟲',
crawlerDescription: '該組件可用於從指定 URL 爬取 HTML 源碼。',
proxy: '代理',
crawlerResultOptions: {
html: 'HTML',
markdown: 'Markdown',
content: '文本',
},
extractType: '提取類型',
info: '訊息',
history: '歷史',
financials: '財務',

View File

@@ -897,6 +897,15 @@ export default {
akShareDescription: '该组件可用于从东方财富网站获取相应股票的新闻信息。',
yahooFinance: '雅虎财经',
yahooFinanceDescription: '该组件根据提供的股票代码查询有关公司的信息。',
crawler: '网页爬虫',
crawlerDescription: '该组件可用于从指定 URL 爬取 HTML 源码。',
proxy: '代理',
crawlerResultOptions: {
html: 'HTML',
markdown: 'Markdown',
content: '文本',
},
extractType: '提取类型',
info: '信息',
history: '历史',
financials: '财务',

View File

@@ -4,6 +4,7 @@ import { ReactComponent as baiduFanyiIcon } from '@/assets/svg/baidu-fanyi.svg';
import { ReactComponent as BaiduIcon } from '@/assets/svg/baidu.svg';
import { ReactComponent as BingIcon } from '@/assets/svg/bing.svg';
import { ReactComponent as ConcentratorIcon } from '@/assets/svg/concentrator.svg';
import { ReactComponent as CrawlerIcon } from '@/assets/svg/crawler.svg';
import { ReactComponent as DeepLIcon } from '@/assets/svg/deepl.svg';
import { ReactComponent as DuckIcon } from '@/assets/svg/duck.svg';
import { ReactComponent as ExeSqlIcon } from '@/assets/svg/exesql.svg';
@@ -73,6 +74,7 @@ export enum Operator {
Concentrator = 'Concentrator',
TuShare = 'TuShare',
Note = 'Note',
Crawler = 'Crawler',
}
export const CommonOperatorList = Object.values(Operator).filter(
@@ -110,6 +112,7 @@ export const operatorIconMap = {
[Operator.Concentrator]: ConcentratorIcon,
[Operator.TuShare]: TuShareIcon,
[Operator.Note]: NoteIcon,
[Operator.Crawler]: CrawlerIcon,
};
export const operatorMap: Record<
@@ -233,6 +236,9 @@ export const operatorMap: Record<
},
[Operator.TuShare]: { backgroundColor: '#f8cfa0' },
[Operator.Note]: { backgroundColor: '#f8cfa0' },
[Operator.Crawler]: {
backgroundColor: '#dee0e2',
},
};
export const componentMenuList = [
@@ -323,6 +329,9 @@ export const componentMenuList = [
{
name: Operator.TuShare,
},
{
name: Operator.Crawler,
},
];
export const initialRetrievalValues = {
@@ -572,6 +581,7 @@ export const RestrictedUpstreamMap = {
[Operator.Jin10]: [Operator.Begin],
[Operator.Concentrator]: [Operator.Begin],
[Operator.TuShare]: [Operator.Begin],
[Operator.Crawler]: [Operator.Begin],
};
export const NodeMap = {
@@ -605,6 +615,7 @@ export const NodeMap = {
[Operator.Jin10]: 'ragNode',
[Operator.TuShare]: 'ragNode',
[Operator.Note]: 'noteNode',
[Operator.Crawler]: 'ragNode',
};
export const LanguageOptions = [
@@ -2791,3 +2802,4 @@ export const TuShareSrcOptions = [
'fenghuang',
'jinrongjie',
];
export const CrawlerResultOptions = ['markdown', 'html', 'content'];

View File

@@ -12,6 +12,7 @@ import BaiduForm from '../form/baidu-form';
import BeginForm from '../form/begin-form';
import BingForm from '../form/bing-form';
import CategorizeForm from '../form/categorize-form';
import CrawlerForm from '../form/crawler-form';
import DeepLForm from '../form/deepl-form';
import DuckDuckGoForm from '../form/duckduckgo-form';
import ExeSQLForm from '../form/exesql-form';
@@ -70,6 +71,7 @@ const FormMap = {
[Operator.YahooFinance]: YahooFinanceForm,
[Operator.Jin10]: Jin10Form,
[Operator.TuShare]: TuShareForm,
[Operator.Crawler]: CrawlerForm,
};
const EmptyContent = () => <div>empty</div>;

View File

@@ -0,0 +1,37 @@
import { useTranslate } from '@/hooks/common-hooks';
import { Form, Input, Select } from 'antd';
import { useMemo } from 'react';
import { CrawlerResultOptions } from '../../constant';
import { IOperatorForm } from '../../interface';
const CrawlerForm = ({ onValuesChange, form }: IOperatorForm) => {
const { t } = useTranslate('flow');
const crawlerResultOptions = useMemo(() => {
return CrawlerResultOptions.map((x) => ({
value: x,
label: t(`crawlerResultOptions.${x}`),
}));
}, [t]);
return (
<Form
name="basic"
labelCol={{ span: 6 }}
wrapperCol={{ span: 18 }}
autoComplete="off"
form={form}
onValuesChange={onValuesChange}
>
<Form.Item label={t('proxy')} name={'proxy'}>
<Input placeholder="e.g. http://127.0.0.1:8888"></Input>
</Form.Item>
<Form.Item
label={t('extractType')}
name={'extract_type'}
initialValue="markdown"
>
<Select options={crawlerResultOptions}></Select>
</Form.Item>
</Form>
);
};
export default CrawlerForm;