fix: document truncation and loss in notion document sync (#5631)

Co-authored-by: Aurelius Huang <cm.huang@aftership.com>
This commit is contained in:
Aurelius Huang 2024-07-05 11:48:17 +08:00 committed by GitHub
parent f8aaa57f31
commit f546db5437
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@@ -140,11 +140,10 @@ class NotionExtractor(BaseExtractor):
 def _get_notion_block_data(self, page_id: str) -> list[str]:
 result_lines_arr = []
-cur_block_id = page_id
+start_cursor = None
+block_url = BLOCK_CHILD_URL_TMPL.format(block_id=page_id)
 while True:
-block_url = BLOCK_CHILD_URL_TMPL.format(block_id=cur_block_id)
-query_dict: dict[str, Any] = {}
+query_dict: dict[str, Any] = {} if not start_cursor else {'start_cursor': start_cursor}
 res = requests.request(
 "GET",
 block_url,
@@ -153,7 +152,7 @@ class NotionExtractor(BaseExtractor):
 "Content-Type": "application/json",
 "Notion-Version": "2022-06-28",
 },
-json=query_dict
+params=query_dict
 )
 data = res.json()
 for result in data["results"]:
@@ -191,16 +190,16 @@ class NotionExtractor(BaseExtractor):
 if data["next_cursor"] is None:
 break
 else:
-cur_block_id = data["next_cursor"]
+start_cursor = data["next_cursor"]
 return result_lines_arr
 def _read_block(self, block_id: str, num_tabs: int = 0) -> str:
 """Read a block."""
 result_lines_arr = []
-cur_block_id = block_id
+start_cursor = None
+block_url = BLOCK_CHILD_URL_TMPL.format(block_id=block_id)
 while True:
-block_url = BLOCK_CHILD_URL_TMPL.format(block_id=cur_block_id)
-query_dict: dict[str, Any] = {}
+query_dict: dict[str, Any] = {} if not start_cursor else {'start_cursor': start_cursor}
 res = requests.request(
 "GET",
@@ -210,7 +209,7 @@ class NotionExtractor(BaseExtractor):
 "Content-Type": "application/json",
 "Notion-Version": "2022-06-28",
 },
-json=query_dict
+params=query_dict
 )
 data = res.json()
 if 'results' not in data or data["results"] is None:
@@ -249,7 +248,7 @@ class NotionExtractor(BaseExtractor):
 if data["next_cursor"] is None:
 break
 else:
-cur_block_id = data["next_cursor"]
+start_cursor = data["next_cursor"]
 result_lines = "\n".join(result_lines_arr)
 return result_lines
@@ -258,10 +257,10 @@ class NotionExtractor(BaseExtractor):
 """Read table rows."""
 done = False
 result_lines_arr = []
-cur_block_id = block_id
+start_cursor = None
+block_url = BLOCK_CHILD_URL_TMPL.format(block_id=block_id)
 while not done:
-block_url = BLOCK_CHILD_URL_TMPL.format(block_id=cur_block_id)
-query_dict: dict[str, Any] = {}
+query_dict: dict[str, Any] = {} if not start_cursor else {'start_cursor': start_cursor}
 res = requests.request(
 "GET",
@@ -271,7 +270,7 @@ class NotionExtractor(BaseExtractor):
 "Content-Type": "application/json",
 "Notion-Version": "2022-06-28",
 },
-json=query_dict
+params=query_dict
 )
 data = res.json()
 # get table headers text
@@ -300,7 +299,7 @@ class NotionExtractor(BaseExtractor):
 done = True
 break
 else:
-cur_block_id = data["next_cursor"]
+start_cursor = data["next_cursor"]
 result_lines = "\n".join(result_lines_arr)
 return result_lines