From 88a9da33e46f03d831c1a2aa20c7f75f4e6e89ca Mon Sep 17 00:00:00 2001 From: Yanlong Wang Date: Tue, 17 Sep 2024 17:21:34 +0800 Subject: [PATCH] wip --- backend/functions/src/services/html-to-md.ts | 102 ++++++++++++++----- thinapps-shared | 2 +- 2 files changed, 77 insertions(+), 27 deletions(-) diff --git a/backend/functions/src/services/html-to-md.ts b/backend/functions/src/services/html-to-md.ts index 8602f9c..208f9c3 100644 --- a/backend/functions/src/services/html-to-md.ts +++ b/backend/functions/src/services/html-to-md.ts @@ -1,9 +1,12 @@ import { AsyncService, AutoCastable, Prop } from 'civkit'; import { Logger } from '../shared/services/logger'; +const pLinkedom = import('linkedom'); export class HTMLtoMarkdown extends AsyncService { + linkedom!: Awaited; + constructor( protected logger: Logger, ) { @@ -14,20 +17,22 @@ export class HTMLtoMarkdown extends AsyncService { override async init() { await this.dependencyReady(); + this.linkedom = await pLinkedom; + this.emit('ready'); } } -class MarkdownASTNode extends AutoCastable { +export class MarkdownASTNode extends AutoCastable { @Prop({ required: true }) type!: string; } -class MDCode extends MarkdownASTNode { +export class MDCode extends MarkdownASTNode { @Prop({ default: 'code' }) @@ -43,7 +48,7 @@ class MDCode extends MarkdownASTNode { text!: string; } -class MDHTML extends MarkdownASTNode { +export class MDHTML extends MarkdownASTNode { @Prop({ default: 'html' }) @@ -57,7 +62,7 @@ class MDHTML extends MarkdownASTNode { } -class MarkdownASTParentNode extends MarkdownASTNode { +export class MarkdownASTParentNode extends MarkdownASTNode { @Prop({ default: [], @@ -67,21 +72,21 @@ class MarkdownASTParentNode extends MarkdownASTNode { } -class MarkdownASTRoot extends MarkdownASTParentNode { +export class MarkdownASTRoot extends MarkdownASTParentNode { @Prop({ default: 'root' }) override type!: 'root'; } -class MDParagraph extends MarkdownASTParentNode { +export class MDParagraph extends MarkdownASTParentNode { @Prop({ default: 'paragraph' }) override type!: 'paragraph'; } -class MDHeading extends MarkdownASTParentNode { +export class MDHeading extends MarkdownASTParentNode { @Prop({ default: 'heading' }) @@ -97,7 +102,7 @@ class MDHeading extends MarkdownASTParentNode { level!: 1 | 2 | 3 | 4 | 5 | 6; } -class MDList extends MarkdownASTParentNode { +export class MDList extends MarkdownASTParentNode { @Prop({ default: 'list' }) @@ -109,7 +114,7 @@ class MDList extends MarkdownASTParentNode { ordered!: boolean; } -class MDListItem extends MarkdownASTParentNode { +export class MDListItem extends MarkdownASTParentNode { @Prop({ default: 'listItem' }) @@ -130,7 +135,7 @@ class MDListItem extends MarkdownASTParentNode { override children!: MarkdownASTNode[]; } -class MDLink extends MarkdownASTParentNode { +export class MDLink extends MarkdownASTParentNode { @Prop({ default: 'link' }) @@ -151,7 +156,7 @@ class MDLink extends MarkdownASTParentNode { override children!: MarkdownASTNode[]; } -class MDStrong extends MarkdownASTParentNode { +export class MDStrong extends MarkdownASTParentNode { @Prop({ default: 'strong' }) @@ -164,7 +169,7 @@ class MDStrong extends MarkdownASTParentNode { override children!: MarkdownASTNode[]; } -class MDEmphasis extends MarkdownASTParentNode { +export class MDEmphasis extends MarkdownASTParentNode { @Prop({ default: 'emphasis' }) @@ -177,7 +182,7 @@ class MDEmphasis extends MarkdownASTParentNode { override children!: MarkdownASTNode[]; } -class MDDelete extends MarkdownASTParentNode { +export class MDDelete extends MarkdownASTParentNode { @Prop({ default: 'delete' }) @@ -191,7 +196,7 @@ class MDDelete extends MarkdownASTParentNode { } -class MDLiteral extends MarkdownASTNode { +export class MDLiteral extends MarkdownASTNode { @Prop({ default: 'literal' }) @@ -204,21 +209,21 @@ class MDLiteral extends MarkdownASTNode { text!: string; } -class MDLineBreak extends MarkdownASTNode { +export class MDLineBreak extends MarkdownASTNode { @Prop({ default: 'break' }) override type!: 'break'; } -class MDThematicBreak extends MarkdownASTNode { +export class MDThematicBreak extends MarkdownASTNode { @Prop({ default: 'thematicBreak' }) override type!: 'thematicBreak'; } -class MDImage extends MarkdownASTNode { +export class MDImage extends MarkdownASTNode { @Prop({ default: 'image' }) @@ -236,7 +241,7 @@ class MDImage extends MarkdownASTNode { title?: string; } -class MDInlineCode extends MarkdownASTNode { +export class MDInlineCode extends MarkdownASTNode { @Prop({ default: 'inlineCode' }) @@ -249,7 +254,7 @@ class MDInlineCode extends MarkdownASTNode { text!: string; } -class MDMath extends MarkdownASTNode { +export class MDMath extends MarkdownASTNode { @Prop({ default: 'math' }) @@ -265,7 +270,7 @@ class MDMath extends MarkdownASTNode { text!: string; } -class MDInlineMath extends MarkdownASTNode { +export class MDInlineMath extends MarkdownASTNode { @Prop({ default: 'inlineMath' }) @@ -282,7 +287,7 @@ class MDInlineMath extends MarkdownASTNode { } -class MDTableHeading extends MarkdownASTNode { +export class MDTableHeading extends MarkdownASTNode { @Prop({ default: 'tableHeading' }) @@ -302,7 +307,7 @@ class MDTableHeading extends MarkdownASTNode { align?: 'left' | 'center' | 'right'; } -class MDTableHeader extends MarkdownASTParentNode { +export class MDTableHeader extends MarkdownASTParentNode { @Prop({ default: 'tableHeader' }) @@ -314,7 +319,7 @@ class MDTableHeader extends MarkdownASTParentNode { }) override children!: MDTableHeading[]; } -class MDTableCell extends MarkdownASTParentNode { +export class MDTableCell extends MarkdownASTParentNode { @Prop({ default: 'tableCell' }) @@ -327,7 +332,7 @@ class MDTableCell extends MarkdownASTParentNode { override children!: MarkdownASTNode[]; } -class MDTableRow extends MarkdownASTParentNode { +export class MDTableRow extends MarkdownASTParentNode { @Prop({ default: 'tableRow' }) @@ -340,7 +345,7 @@ class MDTableRow extends MarkdownASTParentNode { override children!: MDTableCell[]; } -class MDTable extends MarkdownASTParentNode { +export class MDTable extends MarkdownASTParentNode { @Prop({ default: 'table' }) @@ -353,15 +358,60 @@ class MDTable extends MarkdownASTParentNode { override children!: (MDTableHeader | MDTableRow)[]; } -class MDBlockQuote extends MarkdownASTParentNode { +export class MDBlockQuote extends MarkdownASTParentNode { @Prop({ default: 'blockquote' }) override type!: 'blockquote'; } +export const flowContents = [MDBlockQuote, MDCode, MDHeading, MDHTML, MDList, MDThematicBreak, MDParagraph, MDMath, MDTable]; +export const phrasingContent = [MDLineBreak, MDEmphasis, MDStrong, MDHTML, MDImage, MDInlineCode, MDInlineMath, MDLink, MDLiteral, MDDelete]; + +export const childrenAllowedNodes = new Map([ + [MDBlockQuote, flowContents], + [MDHeading, phrasingContent], + [MDList, [MDListItem]], + [MDListItem, flowContents], + [MDParagraph, phrasingContent], + [MDTable, [MDTableHeader, MDTableRow]], + [MDTableHeader, [MDTableHeading]], + [MDTableRow, [MDTableCell]], + +]); + export class HTMLToMarkdownJob { + root = new MarkdownASTRoot(); + stack: MarkdownASTParentNode[] = [this.root]; + ptr: MarkdownASTNode = this.root; + metadata: Record = {}; + + constructor(public dom: Document) { + } + + restFlow() { + this.ptr = this.root; + } + + walk() { + const tw = this.dom.createTreeWalker( + this.dom.documentElement, + 1 | 4, + { + acceptNode: (node) => { + const tagName = node.nodeName.toLowerCase(); + if (['script', 'style', 'link'].includes(tagName)) { + return NodeFilter.FILTER_REJECT; // Ignore these nodes + } + + return NodeFilter.FILTER_ACCEPT; // Accept everything else + } + } + ); + + + } } diff --git a/thinapps-shared b/thinapps-shared index d287049..4532694 160000 --- a/thinapps-shared +++ b/thinapps-shared @@ -1 +1 @@ -Subproject commit d287049d46781bff2032b02a2bd4322239145c95 +Subproject commit 4532694d769f75aabffa465565d6427a544c0d6a