This commit is contained in:
Yanlong Wang 2024-09-17 17:21:34 +08:00
parent b3647492a8
commit 88a9da33e4
No known key found for this signature in database
GPG Key ID: C0A623C0BADF9F37
2 changed files with 77 additions and 27 deletions

View File

@ -1,9 +1,12 @@
import { AsyncService, AutoCastable, Prop } from 'civkit'; import { AsyncService, AutoCastable, Prop } from 'civkit';
import { Logger } from '../shared/services/logger'; import { Logger } from '../shared/services/logger';
const pLinkedom = import('linkedom');
export class HTMLtoMarkdown extends AsyncService { export class HTMLtoMarkdown extends AsyncService {
linkedom!: Awaited<typeof pLinkedom>;
constructor( constructor(
protected logger: Logger, protected logger: Logger,
) { ) {
@ -14,20 +17,22 @@ export class HTMLtoMarkdown extends AsyncService {
override async init() { override async init() {
await this.dependencyReady(); await this.dependencyReady();
this.linkedom = await pLinkedom;
this.emit('ready'); this.emit('ready');
} }
} }
/**
 * Common base for the Markdown AST node classes declared in this file.
 *
 * It contributes only the `type` tag; subclasses re-declare `type` with a
 * literal type and a matching default.
 */
export class MarkdownASTNode extends AutoCastable {
    /** Node kind tag, registered as a required property. */
    @Prop({ required: true })
    type!: string;
}
class MDCode extends MarkdownASTNode { export class MDCode extends MarkdownASTNode {
@Prop({ @Prop({
default: 'code' default: 'code'
}) })
@ -43,7 +48,7 @@ class MDCode extends MarkdownASTNode {
text!: string; text!: string;
} }
class MDHTML extends MarkdownASTNode { export class MDHTML extends MarkdownASTNode {
@Prop({ @Prop({
default: 'html' default: 'html'
}) })
@ -57,7 +62,7 @@ class MDHTML extends MarkdownASTNode {
} }
class MarkdownASTParentNode extends MarkdownASTNode { export class MarkdownASTParentNode extends MarkdownASTNode {
@Prop({ @Prop({
default: [], default: [],
@ -67,21 +72,21 @@ class MarkdownASTParentNode extends MarkdownASTNode {
} }
/** Parent node whose `type` tag defaults to `'root'`. */
export class MarkdownASTRoot extends MarkdownASTParentNode {
    @Prop({ default: 'root' })
    override type!: 'root';
}
/** Parent node whose `type` tag defaults to `'paragraph'`. */
export class MDParagraph extends MarkdownASTParentNode {
    @Prop({ default: 'paragraph' })
    override type!: 'paragraph';
}
class MDHeading extends MarkdownASTParentNode { export class MDHeading extends MarkdownASTParentNode {
@Prop({ @Prop({
default: 'heading' default: 'heading'
}) })
@ -97,7 +102,7 @@ class MDHeading extends MarkdownASTParentNode {
level!: 1 | 2 | 3 | 4 | 5 | 6; level!: 1 | 2 | 3 | 4 | 5 | 6;
} }
class MDList extends MarkdownASTParentNode { export class MDList extends MarkdownASTParentNode {
@Prop({ @Prop({
default: 'list' default: 'list'
}) })
@ -109,7 +114,7 @@ class MDList extends MarkdownASTParentNode {
ordered!: boolean; ordered!: boolean;
} }
class MDListItem extends MarkdownASTParentNode { export class MDListItem extends MarkdownASTParentNode {
@Prop({ @Prop({
default: 'listItem' default: 'listItem'
}) })
@ -130,7 +135,7 @@ class MDListItem extends MarkdownASTParentNode {
override children!: MarkdownASTNode[]; override children!: MarkdownASTNode[];
} }
class MDLink extends MarkdownASTParentNode { export class MDLink extends MarkdownASTParentNode {
@Prop({ @Prop({
default: 'link' default: 'link'
}) })
@ -151,7 +156,7 @@ class MDLink extends MarkdownASTParentNode {
override children!: MarkdownASTNode[]; override children!: MarkdownASTNode[];
} }
class MDStrong extends MarkdownASTParentNode { export class MDStrong extends MarkdownASTParentNode {
@Prop({ @Prop({
default: 'strong' default: 'strong'
}) })
@ -164,7 +169,7 @@ class MDStrong extends MarkdownASTParentNode {
override children!: MarkdownASTNode[]; override children!: MarkdownASTNode[];
} }
class MDEmphasis extends MarkdownASTParentNode { export class MDEmphasis extends MarkdownASTParentNode {
@Prop({ @Prop({
default: 'emphasis' default: 'emphasis'
}) })
@ -177,7 +182,7 @@ class MDEmphasis extends MarkdownASTParentNode {
override children!: MarkdownASTNode[]; override children!: MarkdownASTNode[];
} }
class MDDelete extends MarkdownASTParentNode { export class MDDelete extends MarkdownASTParentNode {
@Prop({ @Prop({
default: 'delete' default: 'delete'
}) })
@ -191,7 +196,7 @@ class MDDelete extends MarkdownASTParentNode {
} }
class MDLiteral extends MarkdownASTNode { export class MDLiteral extends MarkdownASTNode {
@Prop({ @Prop({
default: 'literal' default: 'literal'
}) })
@ -204,21 +209,21 @@ class MDLiteral extends MarkdownASTNode {
text!: string; text!: string;
} }
/** Childless node whose `type` tag defaults to `'break'`. */
export class MDLineBreak extends MarkdownASTNode {
    @Prop({ default: 'break' })
    override type!: 'break';
}
/** Childless node whose `type` tag defaults to `'thematicBreak'`. */
export class MDThematicBreak extends MarkdownASTNode {
    @Prop({ default: 'thematicBreak' })
    override type!: 'thematicBreak';
}
class MDImage extends MarkdownASTNode { export class MDImage extends MarkdownASTNode {
@Prop({ @Prop({
default: 'image' default: 'image'
}) })
@ -236,7 +241,7 @@ class MDImage extends MarkdownASTNode {
title?: string; title?: string;
} }
class MDInlineCode extends MarkdownASTNode { export class MDInlineCode extends MarkdownASTNode {
@Prop({ @Prop({
default: 'inlineCode' default: 'inlineCode'
}) })
@ -249,7 +254,7 @@ class MDInlineCode extends MarkdownASTNode {
text!: string; text!: string;
} }
class MDMath extends MarkdownASTNode { export class MDMath extends MarkdownASTNode {
@Prop({ @Prop({
default: 'math' default: 'math'
}) })
@ -265,7 +270,7 @@ class MDMath extends MarkdownASTNode {
text!: string; text!: string;
} }
class MDInlineMath extends MarkdownASTNode { export class MDInlineMath extends MarkdownASTNode {
@Prop({ @Prop({
default: 'inlineMath' default: 'inlineMath'
}) })
@ -282,7 +287,7 @@ class MDInlineMath extends MarkdownASTNode {
} }
class MDTableHeading extends MarkdownASTNode { export class MDTableHeading extends MarkdownASTNode {
@Prop({ @Prop({
default: 'tableHeading' default: 'tableHeading'
}) })
@ -302,7 +307,7 @@ class MDTableHeading extends MarkdownASTNode {
align?: 'left' | 'center' | 'right'; align?: 'left' | 'center' | 'right';
} }
class MDTableHeader extends MarkdownASTParentNode { export class MDTableHeader extends MarkdownASTParentNode {
@Prop({ @Prop({
default: 'tableHeader' default: 'tableHeader'
}) })
@ -314,7 +319,7 @@ class MDTableHeader extends MarkdownASTParentNode {
}) })
override children!: MDTableHeading[]; override children!: MDTableHeading[];
} }
class MDTableCell extends MarkdownASTParentNode { export class MDTableCell extends MarkdownASTParentNode {
@Prop({ @Prop({
default: 'tableCell' default: 'tableCell'
}) })
@ -327,7 +332,7 @@ class MDTableCell extends MarkdownASTParentNode {
override children!: MarkdownASTNode[]; override children!: MarkdownASTNode[];
} }
class MDTableRow extends MarkdownASTParentNode { export class MDTableRow extends MarkdownASTParentNode {
@Prop({ @Prop({
default: 'tableRow' default: 'tableRow'
}) })
@ -340,7 +345,7 @@ class MDTableRow extends MarkdownASTParentNode {
override children!: MDTableCell[]; override children!: MDTableCell[];
} }
class MDTable extends MarkdownASTParentNode { export class MDTable extends MarkdownASTParentNode {
@Prop({ @Prop({
default: 'table' default: 'table'
}) })
@ -353,15 +358,60 @@ class MDTable extends MarkdownASTParentNode {
override children!: (MDTableHeader | MDTableRow)[]; override children!: (MDTableHeader | MDTableRow)[];
} }
/** Parent node whose `type` tag defaults to `'blockquote'`. */
export class MDBlockQuote extends MarkdownASTParentNode {
    @Prop({ default: 'blockquote' })
    override type!: 'blockquote';
}
/** Node classes grouped as "flow" content. */
export const flowContents = [
    MDBlockQuote,
    MDCode,
    MDHeading,
    MDHTML,
    MDList,
    MDThematicBreak,
    MDParagraph,
    MDMath,
    MDTable,
];

/** Node classes grouped as "phrasing" content. */
export const phrasingContent = [
    MDLineBreak,
    MDEmphasis,
    MDStrong,
    MDHTML,
    MDImage,
    MDInlineCode,
    MDInlineMath,
    MDLink,
    MDLiteral,
    MDDelete,
];

/**
 * Lookup from a parent node class to a list of node classes.
 *
 * Presumably the listed classes are the ones permitted as children of the
 * key class (going by the name) — confirm against the consumer. The shared
 * `flowContents` / `phrasingContent` arrays are stored by reference, not copied.
 */
export const childrenAllowedNodes = new Map<Function, Function[]>()
    .set(MDBlockQuote, flowContents)
    .set(MDHeading, phrasingContent)
    .set(MDList, [MDListItem])
    .set(MDListItem, flowContents)
    .set(MDParagraph, phrasingContent)
    .set(MDTable, [MDTableHeader, MDTableRow])
    .set(MDTableHeader, [MDTableHeading])
    .set(MDTableRow, [MDTableCell]);
/**
 * State holder for turning one DOM document into a Markdown AST.
 *
 * NOTE(review): in the code visible here `walk()` only constructs a
 * TreeWalker; nothing traverses it or appends to `root` yet.
 */
export class HTMLToMarkdownJob {
    /** Root of the AST for this job. */
    root = new MarkdownASTRoot();

    /** Stack of parent nodes, seeded with the root. */
    stack: MarkdownASTParentNode[] = [this.root];

    /** Current node pointer; starts at the root. */
    ptr: MarkdownASTNode = this.root;

    /** Free-form metadata bag; nothing in the visible code writes to it. */
    metadata: Record<string, any> = {};

    /**
     * @param dom Document whose `documentElement` is used as the walker root.
     */
    constructor(public dom: Document) {
    }

    /**
     * Points `ptr` back at the root node.
     *
     * NOTE(review): the name looks like a typo for `resetFlow`; it is kept
     * as-is so existing callers keep working.
     */
    restFlow(): void {
        this.ptr = this.root;
    }

    /**
     * Builds a TreeWalker over elements and text nodes of the document,
     * rejecting `script`, `style` and `link` nodes.
     *
     * The DOM-standard numeric constants are spelled out locally instead of
     * being read from the `NodeFilter` global: that global is not provided by
     * Node.js, so referencing it inside the callback would throw a
     * ReferenceError once the walker is actually traversed there.
     */
    walk(): void {
        // NodeFilter.SHOW_ELEMENT / NodeFilter.SHOW_TEXT bit flags.
        const SHOW_ELEMENT = 0x1;
        const SHOW_TEXT = 0x4;
        // NodeFilter.FILTER_ACCEPT / NodeFilter.FILTER_REJECT return codes.
        const FILTER_ACCEPT = 1;
        const FILTER_REJECT = 2;
        // Hoisted so the lookup set is not rebuilt for every visited node.
        const ignoredTags = new Set(['script', 'style', 'link']);

        // TODO(review): the walker is created but never consumed here.
        const tw = this.dom.createTreeWalker(
            this.dom.documentElement,
            SHOW_ELEMENT | SHOW_TEXT,
            {
                acceptNode: (node) => {
                    const tagName = node.nodeName.toLowerCase();

                    // FILTER_REJECT also excludes the node's descendants.
                    return ignoredTags.has(tagName) ? FILTER_REJECT : FILTER_ACCEPT;
                }
            }
        );
    }
}

@ -1 +1 @@
Subproject commit d287049d46781bff2032b02a2bd4322239145c95 Subproject commit 4532694d769f75aabffa465565d6427a544c0d6a