# ---------------------------------------------------------------------------
# api/apps/document_app.py -- additions introduced by this change
# ---------------------------------------------------------------------------
from api.utils.web_utils import html2pdf, is_valid_url


@manager.route('/web_crawl', methods=['POST'])
@login_required
def web_crawl():
    """Render a web page to PDF and register it as a knowledge-base document.

    Expected form fields:
        kb_id: id of the target knowledge base.
        name:  document name; ".pdf" is appended before de-duplication.
        url:   address of the page to render.

    Returns:
        ``get_json_result(data=True)`` on success, or a ``data=False`` result
        with ``RetCode.ARGUMENT_ERROR`` (bad input) or
        ``RetCode.SERVER_ERROR`` (rendering / storage failure).

    Raises:
        LookupError: if the knowledge base cannot be found.
    """
    kb_id = request.form.get("kb_id")
    if not kb_id:
        return get_json_result(
            data=False, retmsg='Lack of "KB ID"', retcode=RetCode.ARGUMENT_ERROR)
    name = request.form.get("name")
    url = request.form.get("url")
    if not name:
        return get_json_result(
            data=False, retmsg='Lack of "name"', retcode=RetCode.ARGUMENT_ERROR)
    if not url:
        return get_json_result(
            data=False, retmsg='Lack of "url"', retcode=RetCode.ARGUMENT_ERROR)
    if not is_valid_url(url):
        return get_json_result(
            data=False, retmsg='The URL format is invalid', retcode=RetCode.ARGUMENT_ERROR)
    # SECURITY: the URL comes straight from the request and is opened by a
    # browser running on the server.  is_valid_url() also accepts file:// and
    # ftp://, which would let a caller turn local files into documents, so
    # only web schemes are allowed here.
    # NOTE(review): http(s) URLs can still point at internal hosts (SSRF);
    # consider a host allow/deny list.
    if not url.lower().startswith(("http://", "https://")):
        return get_json_result(
            data=False, retmsg='Only "http" and "https" URLs are supported',
            retcode=RetCode.ARGUMENT_ERROR)

    found, kb = KnowledgebaseService.get_by_id(kb_id)
    if not found:
        raise LookupError("Can't find this knowledgebase!")

    root_folder = FileService.get_root_folder(current_user.id)
    pf_id = root_folder["id"]
    FileService.init_knowledgebase_docs(pf_id, current_user.id)
    kb_root_folder = FileService.get_kb_folder(current_user.id)
    kb_folder = FileService.new_a_file_from_kb(
        kb.tenant_id, kb.name, kb_root_folder["id"])

    try:
        filename = duplicate_name(
            DocumentService.query,
            name=name + ".pdf",
            kb_id=kb.id)
        filetype = filename_type(filename)
        if filetype == FileType.OTHER.value:
            raise RuntimeError("This type of file has not been supported yet!")

        # Pick an object key that is not already used in this bucket.
        location = filename
        while MINIO.obj_exist(kb_id, location):
            location += "_"
        blob = html2pdf(url)
        MINIO.put(kb_id, location, blob)
        doc = {
            "id": get_uuid(),
            "kb_id": kb.id,
            "parser_id": kb.parser_id,
            "parser_config": kb.parser_config,
            "created_by": current_user.id,
            "type": filetype,
            "name": filename,
            "location": location,
            "size": len(blob),
            "thumbnail": thumbnail(filename, blob)
        }
        # Compare against the enum *value*, consistent with the
        # FileType.OTHER check above (filetype is compared as a value there).
        if doc["type"] == FileType.VISUAL.value:
            doc["parser_id"] = ParserType.PICTURE.value
        if re.search(r"\.(ppt|pptx|pages)$", filename):
            doc["parser_id"] = ParserType.PRESENTATION.value
        DocumentService.insert(doc)
        FileService.add_file_from_kb(doc, kb_folder["id"], kb.tenant_id)
    except Exception as exc:
        # retmsg must be a plain string: the exception object itself is not
        # JSON serialisable.
        return get_json_result(
            data=False, retmsg=str(exc), retcode=RetCode.SERVER_ERROR)
    return get_json_result(data=True)


# ---------------------------------------------------------------------------
# api/utils/web_utils.py -- new module
# ---------------------------------------------------------------------------
import re
import json
import base64

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support.expected_conditions import staleness_of
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By


def html2pdf(
    source: str,
    timeout: int = 2,
    install_driver: bool = True,
    print_options: dict = None,
):
    """Render *source* in headless Chrome and return the page as PDF bytes.

    Args:
        source: URL handed to the browser.
        timeout: seconds to let the page settle before printing.
        install_driver: when true, obtain chromedriver through
            ``ChromeDriverManager().install()``; otherwise let Selenium
            locate a driver itself.
        print_options: extra ``Page.printToPDF`` parameters; they override
            the defaults used by this module.

    Returns:
        The decoded PDF document as ``bytes``.
    """
    return __get_pdf_from_html(
        source, timeout, install_driver, print_options or {})


def __send_devtools(driver, cmd, params=None):
    """Run a Chrome DevTools Protocol command and return its result.

    Uses Selenium's ``execute_cdp_cmd`` rather than posting to the driver's
    private ``_url`` / ``_request`` attributes.
    """
    return driver.execute_cdp_cmd(cmd, params or {})


def __get_pdf_from_html(
    path: str,
    timeout: int,
    install_driver: bool,
    print_options: dict
):
    """Open *path* in headless Chrome, print it to PDF and return the bytes.

    The browser is always shut down, including when loading or printing
    fails.

    Raises:
        RuntimeError: if Chrome answers without any PDF data.
    """
    webdriver_options = Options()
    for flag in ("--headless", "--disable-gpu", "--no-sandbox",
                 "--disable-dev-shm-usage"):
        webdriver_options.add_argument(flag)
    # NOTE(review): presumably turns image loading off -- confirm the meaning
    # of the value 2 against Chrome's content-settings documentation.
    webdriver_options.add_experimental_option(
        "prefs", {"profile.default_content_settings": {"images": 2}})

    if install_driver:
        service = Service(ChromeDriverManager().install())
        driver = webdriver.Chrome(service=service, options=webdriver_options)
    else:
        driver = webdriver.Chrome(options=webdriver_options)

    try:
        driver.get(path)

        # Used as a "let the page settle" delay: wait up to `timeout` seconds
        # for the current <html> element to be replaced.  Timing out is the
        # normal outcome; either way the page is printed afterwards.
        try:
            WebDriverWait(driver, timeout).until(
                staleness_of(driver.find_element(by=By.TAG_NAME, value="html"))
            )
        except TimeoutException:
            pass

        calculated_print_options = {
            "landscape": False,
            "displayHeaderFooter": False,
            "printBackground": True,
            "preferCSSPageSize": True,
        }
        calculated_print_options.update(print_options or {})
        result = __send_devtools(
            driver, "Page.printToPDF", calculated_print_options)
        data = (result or {}).get("data")
        if not data:
            raise RuntimeError(
                "Chrome returned no PDF data for %s" % path)
        return base64.b64decode(data)
    finally:
        # Always release the browser process, even on failure.
        driver.quit()


def is_valid_url(url: str) -> bool:
    """Return True when *url* looks like an http(s)/ftp/file URL.

    The check is syntactic only.  Non-string input and anything containing
    whitespace is rejected; the pattern is only anchored at the start, so
    without the whitespace test ``"http://a b"`` would be accepted.
    """
    if not isinstance(url, str) or re.search(r"\s", url):
        return False
    return bool(re.match(r"(https?|ftp|file)://[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]", url))


# The third-party packages imported above (selenium, webdriver-manager) are
# pinned in requirements.txt by the same change.
+selenium==4.21.0 +webdriver-manager==4.0.1 diff --git a/web/src/hooks/documentHooks.ts b/web/src/hooks/documentHooks.ts index ffa524ff3..212b5167c 100644 --- a/web/src/hooks/documentHooks.ts +++ b/web/src/hooks/documentHooks.ts @@ -1,13 +1,13 @@ -import { IChunk, IKnowledgeFile } from '@/interfaces/database/knowledge'; -import { IChangeParserConfigRequestBody } from '@/interfaces/request/document'; -import { api_host } from '@/utils/api'; -import { buildChunkHighlights } from '@/utils/documentUtils'; -import { UploadFile } from 'antd'; -import { useCallback, useMemo, useState } from 'react'; -import { IHighlight } from 'react-pdf-highlighter'; -import { useDispatch, useSelector } from 'umi'; -import { useGetKnowledgeSearchParams } from './routeHook'; -import { useOneNamespaceEffectsLoading } from './storeHooks'; +import {IChunk, IKnowledgeFile} from '@/interfaces/database/knowledge'; +import {IChangeParserConfigRequestBody} from '@/interfaces/request/document'; +import {api_host} from '@/utils/api'; +import {buildChunkHighlights} from '@/utils/documentUtils'; +import {UploadFile} from 'antd'; +import {useCallback, useMemo, useState} from 'react'; +import {IHighlight} from 'react-pdf-highlighter'; +import {useDispatch, useSelector} from 'umi'; +import {useGetKnowledgeSearchParams} from './routeHook'; +import {useOneNamespaceEffectsLoading} from './storeHooks'; export const useGetDocumentUrl = (documentId?: string) => { const getDocumentUrl = useCallback( @@ -207,6 +207,28 @@ export const useUploadDocument = () => { return uploadDocument; }; +export const useWebCrawl = () => { + const dispatch = useDispatch(); + const { knowledgeId } = useGetKnowledgeSearchParams(); + return useCallback( + (name: string, url: string) => { + try { + return dispatch({ + type: 'kFModel/web_crawl', + payload: { + name, + url, + kb_id: knowledgeId, + }, + }); + } catch (errorInfo) { + console.log('Failed:', errorInfo); + } + }, + [dispatch], + ); +}; + export const useRunDocument = () => 
{ const dispatch = useDispatch(); diff --git a/web/src/locales/en.ts b/web/src/locales/en.ts index d31634493..4a89e0a8d 100644 --- a/web/src/locales/en.ts +++ b/web/src/locales/en.ts @@ -81,6 +81,7 @@ export default { searchFiles: 'Search your files', localFiles: 'Local files', emptyFiles: 'Create empty file', + webCrawl: 'Web Crawl', chunkNumber: 'Chunk Number', uploadDate: 'Upload Date', chunkMethod: 'Chunk Method', diff --git a/web/src/locales/zh-traditional.ts b/web/src/locales/zh-traditional.ts index 07c95be96..3b96cf4da 100644 --- a/web/src/locales/zh-traditional.ts +++ b/web/src/locales/zh-traditional.ts @@ -80,6 +80,7 @@ export default { searchFiles: '搜索文件', localFiles: '本地文件', emptyFiles: '新建空文件', + webCrawl: '網頁抓取', chunkNumber: '分塊數', uploadDate: '上傳日期', chunkMethod: '解析方法', diff --git a/web/src/locales/zh.ts b/web/src/locales/zh.ts index acdffb5c6..920d15127 100644 --- a/web/src/locales/zh.ts +++ b/web/src/locales/zh.ts @@ -80,6 +80,7 @@ export default { searchFiles: '搜索文件', localFiles: '本地文件', emptyFiles: '新建空文件', + webCrawl: '网页抓取', chunkNumber: '分块数', uploadDate: '上传日期', chunkMethod: '解析方法', diff --git a/web/src/pages/add-knowledge/components/knowledge-file/document-toolbar.tsx b/web/src/pages/add-knowledge/components/knowledge-file/document-toolbar.tsx index f964c7f9c..11ddeb779 100644 --- a/web/src/pages/add-knowledge/components/knowledge-file/document-toolbar.tsx +++ b/web/src/pages/add-knowledge/components/knowledge-file/document-toolbar.tsx @@ -29,13 +29,15 @@ import styles from './index.less'; interface IProps { selectedRowKeys: string[]; showCreateModal(): void; + showWebCrawlModal(): void; showDocumentUploadModal(): void; } const DocumentToolbar = ({ - selectedRowKeys, - showCreateModal, - showDocumentUploadModal, + selectedRowKeys, + showCreateModal, + showWebCrawlModal, + showDocumentUploadModal, }: IProps) => { const { t } = useTranslate('knowledgeDetails'); const { fetchDocumentList } = useFetchDocumentListOnMount(); @@ -66,6 +68,19 @@ 
const DocumentToolbar = ({ { type: 'divider' }, { key: '2', + onClick: showWebCrawlModal, + label: ( +
+ +
+ ), + }, + { type: 'divider' }, + { + key: '3', onClick: showCreateModal, label: (
@@ -77,7 +92,7 @@ const DocumentToolbar = ({ ), }, ]; - }, [showDocumentUploadModal, showCreateModal, t]); + }, [showDocumentUploadModal, showWebCrawlModal, showCreateModal, t]); const handleDelete = useCallback(() => { showDeleteConfirm({ diff --git a/web/src/pages/add-knowledge/components/knowledge-file/hooks.ts b/web/src/pages/add-knowledge/components/knowledge-file/hooks.ts index be0649161..0c58103a3 100644 --- a/web/src/pages/add-knowledge/components/knowledge-file/hooks.ts +++ b/web/src/pages/add-knowledge/components/knowledge-file/hooks.ts @@ -7,6 +7,7 @@ import { useSelectRunDocumentLoading, useSetDocumentParser, useUploadDocument, + useWebCrawl, } from '@/hooks/documentHooks'; import { useGetKnowledgeSearchParams } from '@/hooks/routeHook'; import { useOneNamespaceEffectsLoading } from '@/hooks/storeHooks'; @@ -286,6 +287,37 @@ export const useHandleUploadDocument = () => { }; }; +export const useHandleWebCrawl = () => { + const { + visible: webCrawlUploadVisible, + hideModal: hideWebCrawlUploadModal, + showModal: showWebCrawlUploadModal, + } = useSetModalState(); + const webCrawl = useWebCrawl(); + + const onWebCrawlUploadOk = useCallback( + async (name: string, url: string ) => { + const ret = await webCrawl(name, url); + if (ret === 0) { + hideWebCrawlUploadModal(); + return 0 + } + return -1 + }, + [webCrawl, hideWebCrawlUploadModal], + ); + + const loading = useOneNamespaceEffectsLoading('kFModel', ['web_crawl']); + + return { + webCrawlUploadLoading: loading, + onWebCrawlUploadOk, + webCrawlUploadVisible, + hideWebCrawlUploadModal, + showWebCrawlUploadModal, + }; +}; + export const useHandleRunDocumentByIds = (id: string) => { const loading = useSelectRunDocumentLoading(); const runDocumentByIds = useRunDocument(); diff --git a/web/src/pages/add-knowledge/components/knowledge-file/index.tsx b/web/src/pages/add-knowledge/components/knowledge-file/index.tsx index 6ad33210a..f0b2e0e35 100644 --- 
a/web/src/pages/add-knowledge/components/knowledge-file/index.tsx +++ b/web/src/pages/add-knowledge/components/knowledge-file/index.tsx @@ -12,6 +12,7 @@ import { Divider, Flex, Switch, Table, Typography } from 'antd'; import type { ColumnsType } from 'antd/es/table'; import { useTranslation } from 'react-i18next'; import CreateFileModal from './create-file-modal'; +import WebCrawlModal from './web-crawl-modal'; import DocumentToolbar from './document-toolbar'; import { useChangeDocumentParser, @@ -19,7 +20,7 @@ import { useFetchDocumentListOnMount, useGetPagination, useGetRowSelection, - useHandleUploadDocument, + useHandleUploadDocument, useHandleWebCrawl, useNavigateToOtherPage, useRenameDocument, } from './hooks'; @@ -69,6 +70,13 @@ const KnowledgeFile = () => { onDocumentUploadOk, documentUploadLoading, } = useHandleUploadDocument(); + const { + webCrawlUploadVisible, + hideWebCrawlUploadModal, + showWebCrawlUploadModal, + onWebCrawlUploadOk, + webCrawlUploadLoading, + } = useHandleWebCrawl(); const { t } = useTranslation('translation', { keyPrefix: 'knowledgeDetails', }); @@ -170,6 +178,7 @@ const KnowledgeFile = () => { { loading={documentUploadLoading} onOk={onDocumentUploadOk} > + ); }; diff --git a/web/src/pages/add-knowledge/components/knowledge-file/model.ts b/web/src/pages/add-knowledge/components/knowledge-file/model.ts index 743fd58ef..f248e9a24 100644 --- a/web/src/pages/add-knowledge/components/knowledge-file/model.ts +++ b/web/src/pages/add-knowledge/components/knowledge-file/model.ts @@ -232,6 +232,27 @@ const model: DvaModel = { } return data; }, + *web_crawl({ payload = {} }, { call, put }) { + const formData = new FormData(); + formData.append('name', payload.name); + formData.append('url', payload.url); + formData.append('kb_id', payload.kb_id); + + const { data } = yield call(kbService.web_crawl, formData); + + const succeed = data.retcode === 0; + + if (succeed) { + message.success(i18n.t('message.uploaded')); + } + if (succeed || 
data.retcode === 500) { + yield put({ + type: 'getKfList', + payload: { kb_id: payload.kb_id }, + }); + } + return data.retcode; + }, }, subscriptions: { setup({ dispatch, history }) { diff --git a/web/src/pages/add-knowledge/components/knowledge-file/web-crawl-modal.tsx b/web/src/pages/add-knowledge/components/knowledge-file/web-crawl-modal.tsx new file mode 100644 index 000000000..a5170570a --- /dev/null +++ b/web/src/pages/add-knowledge/components/knowledge-file/web-crawl-modal.tsx @@ -0,0 +1,54 @@ +import { IModalManagerChildrenProps } from '@/components/modal-manager'; +import { Form, Input, Modal } from 'antd'; +import React from 'react'; +import {useTranslate} from "@/hooks/commonHooks"; + + +interface IProps extends Omit { + loading: boolean; + onOk: (name: string, url: string) => void; + showModal?(): void; +} + +const WebCrawlModal: React.FC = ({ visible, hideModal, onOk }) => { + const [form] = Form.useForm(); + const { t } = useTranslate('knowledgeDetails'); + const handleOk = async () => { + const values = await form.validateFields(); + onOk(values.name, values.url); + }; + + return ( + +
+ + + + + + + +
+ ); +}; +export default WebCrawlModal; diff --git a/web/src/services/kbService.ts b/web/src/services/kbService.ts index decb8c527..d3e73e338 100644 --- a/web/src/services/kbService.ts +++ b/web/src/services/kbService.ts @@ -26,6 +26,7 @@ const { document_run, get_document_file, document_upload, + web_crawl, } = api; const methods = { @@ -87,6 +88,10 @@ const methods = { url: document_upload, method: 'post', }, + web_crawl: { + url: web_crawl, + method: 'post', + }, // chunk管理 chunk_list: { url: chunk_list, diff --git a/web/src/utils/api.ts b/web/src/utils/api.ts index a05c2fee2..bcd201cad 100644 --- a/web/src/utils/api.ts +++ b/web/src/utils/api.ts @@ -48,6 +48,7 @@ export default { document_thumbnails: `${api_host}/document/thumbnails`, get_document_file: `${api_host}/document/get`, document_upload: `${api_host}/document/upload`, + web_crawl: `${api_host}/document/web_crawl`, // chat setDialog: `${api_host}/dialog/set`,