Supports obtaining PDF documents from web pages (#1107)

### What problem does this PR solve?

Knowledge base management now supports crawling content from a web page and storing it in the knowledge base as a generated PDF document.
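
The new endpoint is registered at `/document/web_crawl` (see the route and `api.ts` changes below) and takes `kb_id`, `name`, and `url` as form fields. A minimal sketch of calling it directly with Python `requests`; the base URL and the login step are assumptions about the deployment, since the route is protected by `@login_required`:

```python
import requests

# Hypothetical values: host/port and the authentication step depend on the deployment.
BASE_URL = "http://localhost:9380/v1"

session = requests.Session()
# ... log in here so the session carries valid credentials (omitted) ...

resp = session.post(
    f"{BASE_URL}/document/web_crawl",
    data={
        "kb_id": "<knowledge-base-id>",  # target knowledge base
        "name": "example-page",          # stored as "example-page.pdf"
        "url": "https://example.com",    # page to crawl and render to PDF
    },
)
print(resp.json())  # expected: {'data': True, 'retcode': 0, ...} on success
```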

### Type of change
- [x] New Feature (Support creating documents from web pages)
This commit is contained in:
Fakai Zhao 2024-06-11 10:45:19 +08:00 committed by GitHub
parent 68a698655a
commit 7eb69fe6d9
14 changed files with 336 additions and 17 deletions

View File

@ -39,6 +39,7 @@ from api.settings import RetCode
from api.utils.api_utils import get_json_result
from rag.utils.minio_conn import MINIO
from api.utils.file_utils import filename_type, thumbnail
from api.utils.web_utils import html2pdf, is_valid_url
@manager.route('/upload', methods=['POST'])
@ -289,7 +290,7 @@ def run():
return get_data_error_result(retmsg="Tenant not found!")
ELASTICSEARCH.deleteByQuery(
Q("match", doc_id=id), idxnm=search.index_name(tenant_id))
if str(req["run"]) == TaskStatus.RUNNING.value:
TaskService.filter_delete([Task.doc_id == id])
e, doc = DocumentService.get_by_id(id)
@ -416,3 +417,69 @@ def get_image(image_id):
return response
except Exception as e:
return server_error_response(e)
@manager.route('/web_crawl', methods=['POST'])
@login_required
def web_crawl():
kb_id = request.form.get("kb_id")
if not kb_id:
return get_json_result(
data=False, retmsg='Lack of "KB ID"', retcode=RetCode.ARGUMENT_ERROR)
name = request.form.get("name")
url = request.form.get("url")
if not name:
return get_json_result(
data=False, retmsg='Lack of "name"', retcode=RetCode.ARGUMENT_ERROR)
if not url:
return get_json_result(
data=False, retmsg='Lack of "url"', retcode=RetCode.ARGUMENT_ERROR)
if not is_valid_url(url):
return get_json_result(
data=False, retmsg='The URL format is invalid', retcode=RetCode.ARGUMENT_ERROR)
e, kb = KnowledgebaseService.get_by_id(kb_id)
if not e:
raise LookupError("Can't find this knowledgebase!")
root_folder = FileService.get_root_folder(current_user.id)
pf_id = root_folder["id"]
FileService.init_knowledgebase_docs(pf_id, current_user.id)
kb_root_folder = FileService.get_kb_folder(current_user.id)
kb_folder = FileService.new_a_file_from_kb(kb.tenant_id, kb.name, kb_root_folder["id"])
try:
filename = duplicate_name(
DocumentService.query,
name=name+".pdf",
kb_id=kb.id)
filetype = filename_type(filename)
if filetype == FileType.OTHER.value:
raise RuntimeError("This type of file has not been supported yet!")
location = filename
while MINIO.obj_exist(kb_id, location):
location += "_"
blob = html2pdf(url)
MINIO.put(kb_id, location, blob)
doc = {
"id": get_uuid(),
"kb_id": kb.id,
"parser_id": kb.parser_id,
"parser_config": kb.parser_config,
"created_by": current_user.id,
"type": filetype,
"name": filename,
"location": location,
"size": len(blob),
"thumbnail": thumbnail(filename, blob)
}
if doc["type"] == FileType.VISUAL:
doc["parser_id"] = ParserType.PICTURE.value
if re.search(r"\.(ppt|pptx|pages)$", filename):
doc["parser_id"] = ParserType.PRESENTATION.value
DocumentService.insert(doc)
FileService.add_file_from_kb(doc, kb_folder["id"], kb.tenant_id)
except Exception as e:
return get_json_result(
data=False, retmsg=str(e), retcode=RetCode.SERVER_ERROR)
return get_json_result(data=True)
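
For reference, the object-key collision handling above simply appends `_` to the MinIO location until it no longer clashes with an existing object. A standalone sketch of that loop, with a hypothetical `exists` callable standing in for `MINIO.obj_exist(kb_id, location)`:

```python
def unique_location(name: str, exists) -> str:
    """Mirrors the loop above: append '_' until the storage key is unused."""
    location = name
    while exists(location):
        location += "_"
    return location

# Example with an in-memory stand-in for the object store:
taken = {"page.pdf", "page.pdf_"}
print(unique_location("page.pdf", taken.__contains__))  # -> page.pdf__
```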

api/utils/web_utils.py Normal file
View File

@ -0,0 +1,82 @@
import re
import json
import base64
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support.expected_conditions import staleness_of
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
def html2pdf(
source: str,
timeout: int = 2,
install_driver: bool = True,
print_options: dict = {},
):
result = __get_pdf_from_html(source, timeout, install_driver, print_options)
return result
def __send_devtools(driver, cmd, params={}):
resource = "/session/%s/chromium/send_command_and_get_result" % driver.session_id
url = driver.command_executor._url + resource
body = json.dumps({"cmd": cmd, "params": params})
response = driver.command_executor._request("POST", url, body)
if not response:
raise Exception(response.get("value"))
return response.get("value")
def __get_pdf_from_html(
path: str,
timeout: int,
install_driver: bool,
print_options: dict
):
webdriver_options = Options()
webdriver_prefs = {}
webdriver_options.add_argument("--headless")
webdriver_options.add_argument("--disable-gpu")
webdriver_options.add_argument("--no-sandbox")
webdriver_options.add_argument("--disable-dev-shm-usage")
webdriver_options.experimental_options["prefs"] = webdriver_prefs
webdriver_prefs["profile.default_content_settings"] = {"images": 2}
if install_driver:
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=webdriver_options)
else:
driver = webdriver.Chrome(options=webdriver_options)
driver.get(path)
try:
WebDriverWait(driver, timeout).until(
staleness_of(driver.find_element(by=By.TAG_NAME, value="html"))
)
except TimeoutException:
calculated_print_options = {
"landscape": False,
"displayHeaderFooter": False,
"printBackground": True,
"preferCSSPageSize": True,
}
calculated_print_options.update(print_options)
result = __send_devtools(
driver, "Page.printToPDF", calculated_print_options)
driver.quit()
return base64.b64decode(result["data"])
def is_valid_url(url: str) -> bool:
return bool(re.match(r"(https?|ftp|file)://[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]", url))
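
A quick usage sketch for the two helpers, assuming the `api` package is importable and a local Chrome install is available (with `install_driver=True`, webdriver-manager downloads a matching chromedriver):

```python
from api.utils.web_utils import html2pdf, is_valid_url

url = "https://example.com"
if is_valid_url(url):
    pdf_bytes = html2pdf(url)  # rendered page returned as raw PDF bytes
    with open("example.pdf", "wb") as f:
        f.write(pdf_bytes)
else:
    print("URL rejected by is_valid_url")
```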

View File

@ -138,4 +138,6 @@ umap-learn
fasttext==0.9.2
volcengine==1.0.141
readability-lxml==0.8.1
html_text==0.6.2
selenium==4.21.0
webdriver-manager==4.0.1

View File

@ -1,13 +1,13 @@
import { IChunk, IKnowledgeFile } from '@/interfaces/database/knowledge';
import { IChangeParserConfigRequestBody } from '@/interfaces/request/document';
import { api_host } from '@/utils/api';
import { buildChunkHighlights } from '@/utils/documentUtils';
import { UploadFile } from 'antd';
import { useCallback, useMemo, useState } from 'react';
import { IHighlight } from 'react-pdf-highlighter';
import { useDispatch, useSelector } from 'umi';
import { useGetKnowledgeSearchParams } from './routeHook';
import { useOneNamespaceEffectsLoading } from './storeHooks';
import {IChunk, IKnowledgeFile} from '@/interfaces/database/knowledge';
import {IChangeParserConfigRequestBody} from '@/interfaces/request/document';
import {api_host} from '@/utils/api';
import {buildChunkHighlights} from '@/utils/documentUtils';
import {UploadFile} from 'antd';
import {useCallback, useMemo, useState} from 'react';
import {IHighlight} from 'react-pdf-highlighter';
import {useDispatch, useSelector} from 'umi';
import {useGetKnowledgeSearchParams} from './routeHook';
import {useOneNamespaceEffectsLoading} from './storeHooks';
export const useGetDocumentUrl = (documentId?: string) => {
const getDocumentUrl = useCallback(
@ -207,6 +207,28 @@ export const useUploadDocument = () => {
return uploadDocument;
};
export const useWebCrawl = () => {
const dispatch = useDispatch();
const { knowledgeId } = useGetKnowledgeSearchParams();
return useCallback(
(name: string, url: string) => {
try {
return dispatch<any>({
type: 'kFModel/web_crawl',
payload: {
name,
url,
kb_id: knowledgeId,
},
});
} catch (errorInfo) {
console.log('Failed:', errorInfo);
}
},
[dispatch],
);
};
export const useRunDocument = () => {
const dispatch = useDispatch();

View File

@ -81,6 +81,7 @@ export default {
searchFiles: 'Search your files',
localFiles: 'Local files',
emptyFiles: 'Create empty file',
webCrawl: 'Web Crawl',
chunkNumber: 'Chunk Number',
uploadDate: 'Upload Date',
chunkMethod: 'Chunk Method',

View File

@ -80,6 +80,7 @@ export default {
searchFiles: '搜索文件',
localFiles: '本地文件',
emptyFiles: '新建空文件',
webCrawl: '網頁抓取',
chunkNumber: '分塊數',
uploadDate: '上傳日期',
chunkMethod: '解析方法',

View File

@ -80,6 +80,7 @@ export default {
searchFiles: '搜索文件',
localFiles: '本地文件',
emptyFiles: '新建空文件',
webCrawl: '网页抓取',
chunkNumber: '分块数',
uploadDate: '上传日期',
chunkMethod: '解析方法',

View File

@ -29,13 +29,15 @@ import styles from './index.less';
interface IProps {
selectedRowKeys: string[];
showCreateModal(): void;
showWebCrawlModal(): void;
showDocumentUploadModal(): void;
}
const DocumentToolbar = ({
selectedRowKeys,
showCreateModal,
showDocumentUploadModal,
selectedRowKeys,
showCreateModal,
showWebCrawlModal,
showDocumentUploadModal,
}: IProps) => {
const { t } = useTranslate('knowledgeDetails');
const { fetchDocumentList } = useFetchDocumentListOnMount();
@ -66,6 +68,19 @@ const DocumentToolbar = ({
{ type: 'divider' },
{
key: '2',
onClick: showWebCrawlModal,
label: (
<div>
<Button type="link">
<FileTextOutlined />
{t('webCrawl')}
</Button>
</div>
),
},
{ type: 'divider' },
{
key: '3',
onClick: showCreateModal,
label: (
<div>
@ -77,7 +92,7 @@ const DocumentToolbar = ({
),
},
];
}, [showDocumentUploadModal, showCreateModal, t]);
}, [showDocumentUploadModal, showWebCrawlModal, showCreateModal, t]);
const handleDelete = useCallback(() => {
showDeleteConfirm({

View File

@ -7,6 +7,7 @@ import {
useSelectRunDocumentLoading,
useSetDocumentParser,
useUploadDocument,
useWebCrawl,
} from '@/hooks/documentHooks';
import { useGetKnowledgeSearchParams } from '@/hooks/routeHook';
import { useOneNamespaceEffectsLoading } from '@/hooks/storeHooks';
@ -286,6 +287,37 @@ export const useHandleUploadDocument = () => {
};
};
export const useHandleWebCrawl = () => {
const {
visible: webCrawlUploadVisible,
hideModal: hideWebCrawlUploadModal,
showModal: showWebCrawlUploadModal,
} = useSetModalState();
const webCrawl = useWebCrawl();
const onWebCrawlUploadOk = useCallback(
async (name: string, url: string ) => {
const ret = await webCrawl(name, url);
if (ret === 0) {
hideWebCrawlUploadModal();
return 0
}
return -1
},
[webCrawl, hideWebCrawlUploadModal],
);
const loading = useOneNamespaceEffectsLoading('kFModel', ['web_crawl']);
return {
webCrawlUploadLoading: loading,
onWebCrawlUploadOk,
webCrawlUploadVisible,
hideWebCrawlUploadModal,
showWebCrawlUploadModal,
};
};
export const useHandleRunDocumentByIds = (id: string) => {
const loading = useSelectRunDocumentLoading();
const runDocumentByIds = useRunDocument();

View File

@ -12,6 +12,7 @@ import { Divider, Flex, Switch, Table, Typography } from 'antd';
import type { ColumnsType } from 'antd/es/table';
import { useTranslation } from 'react-i18next';
import CreateFileModal from './create-file-modal';
import WebCrawlModal from './web-crawl-modal';
import DocumentToolbar from './document-toolbar';
import {
useChangeDocumentParser,
@ -19,7 +20,7 @@ import {
useFetchDocumentListOnMount,
useGetPagination,
useGetRowSelection,
useHandleUploadDocument,
useHandleUploadDocument, useHandleWebCrawl,
useNavigateToOtherPage,
useRenameDocument,
} from './hooks';
@ -69,6 +70,13 @@ const KnowledgeFile = () => {
onDocumentUploadOk,
documentUploadLoading,
} = useHandleUploadDocument();
const {
webCrawlUploadVisible,
hideWebCrawlUploadModal,
showWebCrawlUploadModal,
onWebCrawlUploadOk,
webCrawlUploadLoading,
} = useHandleWebCrawl();
const { t } = useTranslation('translation', {
keyPrefix: 'knowledgeDetails',
});
@ -170,6 +178,7 @@ const KnowledgeFile = () => {
<DocumentToolbar
selectedRowKeys={rowSelection.selectedRowKeys as string[]}
showCreateModal={showCreateModal}
showWebCrawlModal={showWebCrawlUploadModal}
showDocumentUploadModal={showDocumentUploadModal}
></DocumentToolbar>
<Table
@ -211,6 +220,12 @@ const KnowledgeFile = () => {
loading={documentUploadLoading}
onOk={onDocumentUploadOk}
></FileUploadModal>
<WebCrawlModal
visible={webCrawlUploadVisible}
hideModal={hideWebCrawlUploadModal}
loading={webCrawlUploadLoading}
onOk={onWebCrawlUploadOk}
></WebCrawlModal>
</div>
);
};

View File

@ -232,6 +232,27 @@ const model: DvaModel<KFModelState> = {
}
return data;
},
*web_crawl({ payload = {} }, { call, put }) {
const formData = new FormData();
formData.append('name', payload.name);
formData.append('url', payload.url);
formData.append('kb_id', payload.kb_id);
const { data } = yield call(kbService.web_crawl, formData);
const succeed = data.retcode === 0;
if (succeed) {
message.success(i18n.t('message.uploaded'));
}
if (succeed || data.retcode === 500) {
yield put({
type: 'getKfList',
payload: { kb_id: payload.kb_id },
});
}
return data.retcode;
},
},
subscriptions: {
setup({ dispatch, history }) {

View File

@ -0,0 +1,54 @@
import { IModalManagerChildrenProps } from '@/components/modal-manager';
import { Form, Input, Modal } from 'antd';
import React from 'react';
import { useTranslate } from '@/hooks/commonHooks';
interface IProps extends Omit<IModalManagerChildrenProps, 'showModal'> {
loading: boolean;
onOk: (name: string, url: string) => void;
showModal?(): void;
}
const WebCrawlModal: React.FC<IProps> = ({ visible, hideModal, onOk }) => {
const [form] = Form.useForm();
const { t } = useTranslate('knowledgeDetails');
const handleOk = async () => {
const values = await form.validateFields();
onOk(values.name, values.url);
};
return (
<Modal
title={t('webCrawl')}
open={visible}
onOk={handleOk}
onCancel={hideModal}
>
<Form
form={form}
name="validateOnly"
labelCol={{ span: 4 }}
wrapperCol={{ span: 20 }}
style={{ maxWidth: 600 }}
autoComplete="off"
>
<Form.Item
label="Name"
name="name"
rules={[{ required: true, message: 'Please input name!' }, { max: 128, message: 'The maximum length of name is 128 characters' }]}
>
<Input placeholder="Document name" />
</Form.Item>
<Form.Item
label="URL"
name="url"
rules={[{ required: true, message: 'Please input url!' },{pattern: new RegExp('(https?|ftp|file)://[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]'), message: 'Please enter a valid URL!'}]}
>
<Input placeholder="https://www.baidu.com" />
</Form.Item>
</Form>
</Modal>
);
};
export default WebCrawlModal;

View File

@ -26,6 +26,7 @@ const {
document_run,
get_document_file,
document_upload,
web_crawl,
} = api;
const methods = {
@ -87,6 +88,10 @@ const methods = {
url: document_upload,
method: 'post',
},
web_crawl: {
url: web_crawl,
method: 'post',
},
// chunk管理
chunk_list: {
url: chunk_list,

View File

@ -48,6 +48,7 @@ export default {
document_thumbnails: `${api_host}/document/thumbnails`,
get_document_file: `${api_host}/document/get`,
document_upload: `${api_host}/document/upload`,
web_crawl: `${api_host}/document/web_crawl`,
// chat
setDialog: `${api_host}/dialog/set`,