mirror of
https://git.mirrors.martin98.com/https://github.com/infiniflow/ragflow.git
synced 2025-08-12 07:09:01 +08:00
Add agent component for web crawler (#2878)
### What problem does this PR solve? Add agent component for web crawler ### Type of change - [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
parent
c1d0473f49
commit
4bdf3fd48e
@ -28,6 +28,7 @@ from .wencai import WenCai, WenCaiParam
|
||||
from .jin10 import Jin10, Jin10Param
|
||||
from .tushare import TuShare, TuShareParam
|
||||
from .akshare import AkShare, AkShareParam
|
||||
from .crawler import Crawler, CrawlerParam
|
||||
|
||||
|
||||
def component_class(class_name):
|
||||
|
71
agent/component/crawler.py
Normal file
71
agent/component/crawler.py
Normal file
@ -0,0 +1,71 @@
|
||||
#
|
||||
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
from abc import ABC
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler
|
||||
from agent.component.base import ComponentBase, ComponentParamBase
|
||||
|
||||
class CrawlerParam(ComponentParamBase):
    """
    Define the Crawler component parameters.

    Attributes:
        proxy: Optional HTTP(S) proxy URL forwarded to crawl4ai,
            e.g. "http://127.0.0.1:8888". None means direct connection.
        extract_type: Which representation of the crawled page to return:
            'html', 'markdown' or 'content'. Defaults to 'markdown'.
    """

    def __init__(self):
        super().__init__()
        # These two fields are read by Crawler.get_web(); without them the
        # component fails with AttributeError on every run.
        self.proxy = None
        self.extract_type = "markdown"

    def check(self):
        # Reject anything outside the representations get_web() can produce.
        self.check_valid_value(self.extract_type, "Type of content from the crawler",
                               ['html', 'markdown', 'content'])
|
||||
class Crawler(ComponentBase, ABC):
    """
    Agent component that fetches a web page and returns its content.

    The upstream input is interpreted as a URL; the page is crawled with
    crawl4ai's AsyncWebCrawler and returned as HTML, markdown or extracted
    content depending on the ``extract_type`` parameter.
    """
    component_name = "Crawler"

    def _run(self, history, **kwargs):
        # Join all upstream "content" cells into a single URL string.
        ans = self.get_input()
        ans = " - ".join(ans["content"]) if "content" in ans else ""
        if not ans:
            return Crawler.be_output("")

        try:
            result = asyncio.run(self.get_web(ans))
            return Crawler.be_output(result)
        except Exception as e:
            # Crawling is best-effort: surface the failure as the component's
            # output instead of aborting the whole canvas run.
            return Crawler.be_output(f"An unexpected error occurred: {str(e)}")

    async def get_web(self, url):
        """Crawl *url* and return it in the configured representation."""
        # getattr guards against parameter objects saved before these fields
        # existed (older canvases), which would otherwise raise AttributeError.
        proxy = getattr(self._param, "proxy", None) or None
        async with AsyncWebCrawler(verbose=True, proxy=proxy) as crawler:
            result = await crawler.arun(
                url=url,
                bypass_cache=True
            )

            # Dispatch on the requested representation; unknown values fall
            # back to markdown (the original duplicated this in 'case _').
            extractors = {
                'html': result.cleaned_html,
                'markdown': result.markdown,
                'content': result.extracted_content,
            }
            extract_type = getattr(self._param, "extract_type", "markdown")
            return extractors.get(extract_type, result.markdown)
||||
|
1
web/src/assets/svg/crawler.svg
Normal file
1
web/src/assets/svg/crawler.svg
Normal file
@ -0,0 +1 @@
|
||||
<?xml version="1.0" standalone="no"?><!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd"><svg class="icon" width="200px" height="200.00px" viewBox="0 0 1024 1024" version="1.1" xmlns="http://www.w3.org/2000/svg"><path d="M 777.121 313.158 a 265.121 265.121 0 0 0 -530.243 0 Z m 165.7 265.121 H 843.402 V 425.836 l 84.176 -84.176 a 33.1402 33.1402 0 1 0 -47.0591 -47.0591 l -66.2803 66.2803 h -596.524 l -66.2803 -66.2803 a 33.1402 33.1402 0 1 0 -47.0591 47.0591 L 180.598 425.836 V 578.281 H 81.177 a 33.1402 33.1402 0 0 0 0 66.2803 H 180.598 v 33.1402 a 294.285 294.285 0 0 0 39.7682 145.817 l -96.1069 95.4441 a 33.1402 33.1402 0 0 0 47.0591 47.0591 l 88.8157 -88.1529 A 296.937 296.937 0 0 0 478.859 975.959 h 3.97682 V 401.974 h 66.2803 V 975.959 a 296.937 296.937 0 0 0 215.411 -98.0944 l 88.8157 88.8157 a 33.1402 33.1402 0 0 0 47.0591 -47.0591 l -93.4554 -93.4554 a 293.621 293.621 0 0 0 36.4542 -149.131 V 644.561 h 99.4209 a 33.1402 33.1402 0 0 0 0 -66.2803 Z" fill="#1B69FD" /></svg>
|
After Width: | Height: | Size: 1.0 KiB |
@ -928,6 +928,16 @@ The above is the content you need to summarize.`,
|
||||
yahooFinance: 'YahooFinance',
|
||||
yahooFinanceDescription:
|
||||
'The component queries information about the company based on the provided ticker symbol.',
|
||||
crawler: 'Web Crawler',
|
||||
crawlerDescription:
|
||||
'This component can be used to crawl HTML source code from a specified URL.',
|
||||
proxy: 'Proxy',
|
||||
crawlerResultOptions: {
|
||||
html: 'Html',
|
||||
markdown: 'Markdown',
|
||||
content: 'Content',
|
||||
},
|
||||
extractType: 'extractType',
|
||||
info: 'Info',
|
||||
history: 'History',
|
||||
financials: 'Financials',
|
||||
|
@ -877,6 +877,15 @@ export default {
|
||||
akShareDescription: '此組件可用於從東方財富網取得對應股票的新聞資訊。',
|
||||
yahooFinance: '雅虎財經',
|
||||
yahooFinanceDescription: '該組件根據提供的股票代碼查詢有關公司的資訊。',
|
||||
crawler: '網頁爬蟲',
|
||||
crawlerDescription: '該組件可用於從指定url爬取HTML源碼。',
|
||||
proxy: '代理',
|
||||
crawlerResultOptions: {
|
||||
html: 'Html',
|
||||
markdown: 'Markdown',
|
||||
content: '文本',
|
||||
},
|
||||
extractType: '提取類型',
|
||||
info: '訊息',
|
||||
history: '歷史',
|
||||
financials: '財務',
|
||||
|
@ -897,6 +897,15 @@ export default {
|
||||
akShareDescription: '该组件可用于从东方财富网站获取相应股票的新闻信息。',
|
||||
yahooFinance: '雅虎财经',
|
||||
yahooFinanceDescription: '该组件根据提供的股票代码查询有关公司的信息。',
|
||||
crawler: '网页爬虫',
|
||||
crawlerDescription: '该组件可用于从指定url爬取html源码。',
|
||||
proxy: '代理',
|
||||
crawlerResultOptions: {
|
||||
html: 'Html',
|
||||
markdown: 'Markdown',
|
||||
content: '文本',
|
||||
},
|
||||
extractType: '提取类型',
|
||||
info: '信息',
|
||||
history: '历史',
|
||||
financials: '财务',
|
||||
|
@ -4,6 +4,7 @@ import { ReactComponent as baiduFanyiIcon } from '@/assets/svg/baidu-fanyi.svg';
|
||||
import { ReactComponent as BaiduIcon } from '@/assets/svg/baidu.svg';
|
||||
import { ReactComponent as BingIcon } from '@/assets/svg/bing.svg';
|
||||
import { ReactComponent as ConcentratorIcon } from '@/assets/svg/concentrator.svg';
|
||||
import { ReactComponent as CrawlerIcon } from '@/assets/svg/crawler.svg';
|
||||
import { ReactComponent as DeepLIcon } from '@/assets/svg/deepl.svg';
|
||||
import { ReactComponent as DuckIcon } from '@/assets/svg/duck.svg';
|
||||
import { ReactComponent as ExeSqlIcon } from '@/assets/svg/exesql.svg';
|
||||
@ -73,6 +74,7 @@ export enum Operator {
|
||||
Concentrator = 'Concentrator',
|
||||
TuShare = 'TuShare',
|
||||
Note = 'Note',
|
||||
Crawler = 'Crawler',
|
||||
}
|
||||
|
||||
export const CommonOperatorList = Object.values(Operator).filter(
|
||||
@ -110,6 +112,7 @@ export const operatorIconMap = {
|
||||
[Operator.Concentrator]: ConcentratorIcon,
|
||||
[Operator.TuShare]: TuShareIcon,
|
||||
[Operator.Note]: NoteIcon,
|
||||
[Operator.Crawler]: CrawlerIcon,
|
||||
};
|
||||
|
||||
export const operatorMap: Record<
|
||||
@ -233,6 +236,9 @@ export const operatorMap: Record<
|
||||
},
|
||||
[Operator.TuShare]: { backgroundColor: '#f8cfa0' },
|
||||
[Operator.Note]: { backgroundColor: '#f8cfa0' },
|
||||
[Operator.Crawler]: {
|
||||
backgroundColor: '#dee0e2',
|
||||
},
|
||||
};
|
||||
|
||||
export const componentMenuList = [
|
||||
@ -323,6 +329,9 @@ export const componentMenuList = [
|
||||
{
|
||||
name: Operator.TuShare,
|
||||
},
|
||||
{
|
||||
name: Operator.Crawler,
|
||||
},
|
||||
];
|
||||
|
||||
export const initialRetrievalValues = {
|
||||
@ -572,6 +581,7 @@ export const RestrictedUpstreamMap = {
|
||||
[Operator.Jin10]: [Operator.Begin],
|
||||
[Operator.Concentrator]: [Operator.Begin],
|
||||
[Operator.TuShare]: [Operator.Begin],
|
||||
[Operator.Crawler]: [Operator.Begin],
|
||||
};
|
||||
|
||||
export const NodeMap = {
|
||||
@ -605,6 +615,7 @@ export const NodeMap = {
|
||||
[Operator.Jin10]: 'ragNode',
|
||||
[Operator.TuShare]: 'ragNode',
|
||||
[Operator.Note]: 'noteNode',
|
||||
[Operator.Crawler]: 'ragNode',
|
||||
};
|
||||
|
||||
export const LanguageOptions = [
|
||||
@ -2791,3 +2802,4 @@ export const TuShareSrcOptions = [
|
||||
'fenghuang',
|
||||
'jinrongjie',
|
||||
];
|
||||
export const CrawlerResultOptions = ['markdown', 'html', 'content'];
|
||||
|
@ -12,6 +12,7 @@ import BaiduForm from '../form/baidu-form';
|
||||
import BeginForm from '../form/begin-form';
|
||||
import BingForm from '../form/bing-form';
|
||||
import CategorizeForm from '../form/categorize-form';
|
||||
import CrawlerForm from '../form/crawler-form';
|
||||
import DeepLForm from '../form/deepl-form';
|
||||
import DuckDuckGoForm from '../form/duckduckgo-form';
|
||||
import ExeSQLForm from '../form/exesql-form';
|
||||
@ -70,6 +71,7 @@ const FormMap = {
|
||||
[Operator.YahooFinance]: YahooFinanceForm,
|
||||
[Operator.Jin10]: Jin10Form,
|
||||
[Operator.TuShare]: TuShareForm,
|
||||
[Operator.Crawler]: CrawlerForm,
|
||||
};
|
||||
|
||||
const EmptyContent = () => <div>empty</div>;
|
||||
|
37
web/src/pages/flow/form/crawler-form/index.tsx
Normal file
37
web/src/pages/flow/form/crawler-form/index.tsx
Normal file
@ -0,0 +1,37 @@
|
||||
import { useTranslate } from '@/hooks/common-hooks';
|
||||
import { Form, Input, Select } from 'antd';
|
||||
import { useMemo } from 'react';
|
||||
import { CrawlerResultOptions } from '../../constant';
|
||||
import { IOperatorForm } from '../../interface';
|
||||
const CrawlerForm = ({ onValuesChange, form }: IOperatorForm) => {
|
||||
const { t } = useTranslate('flow');
|
||||
const crawlerResultOptions = useMemo(() => {
|
||||
return CrawlerResultOptions.map((x) => ({
|
||||
value: x,
|
||||
label: t(`crawlerResultOptions.${x}`),
|
||||
}));
|
||||
}, [t]);
|
||||
return (
|
||||
<Form
|
||||
name="basic"
|
||||
labelCol={{ span: 6 }}
|
||||
wrapperCol={{ span: 18 }}
|
||||
autoComplete="off"
|
||||
form={form}
|
||||
onValuesChange={onValuesChange}
|
||||
>
|
||||
<Form.Item label={t('proxy')} name={'proxy'}>
|
||||
<Input placeholder="like: http://127.0.0.1:8888"></Input>
|
||||
</Form.Item>
|
||||
<Form.Item
|
||||
label={t('extractType')}
|
||||
name={'extract_type'}
|
||||
initialValue="markdown"
|
||||
>
|
||||
<Select options={crawlerResultOptions}></Select>
|
||||
</Form.Item>
|
||||
</Form>
|
||||
);
|
||||
};
|
||||
|
||||
export default CrawlerForm;
|
Loading…
x
Reference in New Issue
Block a user