mirror of
https://git.mirrors.martin98.com/https://github.com/jina-ai/reader.git
synced 2025-08-19 04:55:54 +08:00
wip
This commit is contained in:
parent
b3647492a8
commit
88a9da33e4
@ -1,9 +1,12 @@
|
|||||||
import { AsyncService, AutoCastable, Prop } from 'civkit';
|
import { AsyncService, AutoCastable, Prop } from 'civkit';
|
||||||
import { Logger } from '../shared/services/logger';
|
import { Logger } from '../shared/services/logger';
|
||||||
|
|
||||||
|
const pLinkedom = import('linkedom');
|
||||||
|
|
||||||
export class HTMLtoMarkdown extends AsyncService {
|
export class HTMLtoMarkdown extends AsyncService {
|
||||||
|
|
||||||
|
linkedom!: Awaited<typeof pLinkedom>;
|
||||||
|
|
||||||
constructor(
|
constructor(
|
||||||
protected logger: Logger,
|
protected logger: Logger,
|
||||||
) {
|
) {
|
||||||
@ -14,20 +17,22 @@ export class HTMLtoMarkdown extends AsyncService {
|
|||||||
override async init() {
|
override async init() {
|
||||||
await this.dependencyReady();
|
await this.dependencyReady();
|
||||||
|
|
||||||
|
this.linkedom = await pLinkedom;
|
||||||
|
|
||||||
this.emit('ready');
|
this.emit('ready');
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
 * Base class shared by every markdown AST node in this module.
 *
 * The only field declared here is `type`, which is registered through `@Prop`
 * as a required string; subclasses narrow it to a literal and supply a default.
 */
export class MarkdownASTNode extends AutoCastable {
    /** Discriminator naming the kind of node. */
    @Prop({ required: true })
    type!: string;
}
|
||||||
|
|
||||||
class MDCode extends MarkdownASTNode {
|
export class MDCode extends MarkdownASTNode {
|
||||||
@Prop({
|
@Prop({
|
||||||
default: 'code'
|
default: 'code'
|
||||||
})
|
})
|
||||||
@ -43,7 +48,7 @@ class MDCode extends MarkdownASTNode {
|
|||||||
text!: string;
|
text!: string;
|
||||||
}
|
}
|
||||||
|
|
||||||
class MDHTML extends MarkdownASTNode {
|
export class MDHTML extends MarkdownASTNode {
|
||||||
@Prop({
|
@Prop({
|
||||||
default: 'html'
|
default: 'html'
|
||||||
})
|
})
|
||||||
@ -57,7 +62,7 @@ class MDHTML extends MarkdownASTNode {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
class MarkdownASTParentNode extends MarkdownASTNode {
|
export class MarkdownASTParentNode extends MarkdownASTNode {
|
||||||
|
|
||||||
@Prop({
|
@Prop({
|
||||||
default: [],
|
default: [],
|
||||||
@ -67,21 +72,21 @@ class MarkdownASTParentNode extends MarkdownASTNode {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
 * Top-level node of a markdown AST.
 *
 * Extends the parent-node base class; its `type` is the literal `'root'`,
 * which is also the declared `@Prop` default.
 */
export class MarkdownASTRoot extends MarkdownASTParentNode {
    /** Always `'root'`. */
    @Prop({ default: 'root' })
    override type!: 'root';
}
|
||||||
|
|
||||||
/**
 * Paragraph node.
 *
 * Extends the parent-node base class; its `type` is the literal `'paragraph'`,
 * which is also the declared `@Prop` default.
 */
export class MDParagraph extends MarkdownASTParentNode {
    /** Always `'paragraph'`. */
    @Prop({ default: 'paragraph' })
    override type!: 'paragraph';
}
|
||||||
|
|
||||||
class MDHeading extends MarkdownASTParentNode {
|
export class MDHeading extends MarkdownASTParentNode {
|
||||||
@Prop({
|
@Prop({
|
||||||
default: 'heading'
|
default: 'heading'
|
||||||
})
|
})
|
||||||
@ -97,7 +102,7 @@ class MDHeading extends MarkdownASTParentNode {
|
|||||||
level!: 1 | 2 | 3 | 4 | 5 | 6;
|
level!: 1 | 2 | 3 | 4 | 5 | 6;
|
||||||
}
|
}
|
||||||
|
|
||||||
class MDList extends MarkdownASTParentNode {
|
export class MDList extends MarkdownASTParentNode {
|
||||||
@Prop({
|
@Prop({
|
||||||
default: 'list'
|
default: 'list'
|
||||||
})
|
})
|
||||||
@ -109,7 +114,7 @@ class MDList extends MarkdownASTParentNode {
|
|||||||
ordered!: boolean;
|
ordered!: boolean;
|
||||||
}
|
}
|
||||||
|
|
||||||
class MDListItem extends MarkdownASTParentNode {
|
export class MDListItem extends MarkdownASTParentNode {
|
||||||
@Prop({
|
@Prop({
|
||||||
default: 'listItem'
|
default: 'listItem'
|
||||||
})
|
})
|
||||||
@ -130,7 +135,7 @@ class MDListItem extends MarkdownASTParentNode {
|
|||||||
override children!: MarkdownASTNode[];
|
override children!: MarkdownASTNode[];
|
||||||
}
|
}
|
||||||
|
|
||||||
class MDLink extends MarkdownASTParentNode {
|
export class MDLink extends MarkdownASTParentNode {
|
||||||
@Prop({
|
@Prop({
|
||||||
default: 'link'
|
default: 'link'
|
||||||
})
|
})
|
||||||
@ -151,7 +156,7 @@ class MDLink extends MarkdownASTParentNode {
|
|||||||
override children!: MarkdownASTNode[];
|
override children!: MarkdownASTNode[];
|
||||||
}
|
}
|
||||||
|
|
||||||
class MDStrong extends MarkdownASTParentNode {
|
export class MDStrong extends MarkdownASTParentNode {
|
||||||
@Prop({
|
@Prop({
|
||||||
default: 'strong'
|
default: 'strong'
|
||||||
})
|
})
|
||||||
@ -164,7 +169,7 @@ class MDStrong extends MarkdownASTParentNode {
|
|||||||
override children!: MarkdownASTNode[];
|
override children!: MarkdownASTNode[];
|
||||||
}
|
}
|
||||||
|
|
||||||
class MDEmphasis extends MarkdownASTParentNode {
|
export class MDEmphasis extends MarkdownASTParentNode {
|
||||||
@Prop({
|
@Prop({
|
||||||
default: 'emphasis'
|
default: 'emphasis'
|
||||||
})
|
})
|
||||||
@ -177,7 +182,7 @@ class MDEmphasis extends MarkdownASTParentNode {
|
|||||||
override children!: MarkdownASTNode[];
|
override children!: MarkdownASTNode[];
|
||||||
}
|
}
|
||||||
|
|
||||||
class MDDelete extends MarkdownASTParentNode {
|
export class MDDelete extends MarkdownASTParentNode {
|
||||||
@Prop({
|
@Prop({
|
||||||
default: 'delete'
|
default: 'delete'
|
||||||
})
|
})
|
||||||
@ -191,7 +196,7 @@ class MDDelete extends MarkdownASTParentNode {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
class MDLiteral extends MarkdownASTNode {
|
export class MDLiteral extends MarkdownASTNode {
|
||||||
@Prop({
|
@Prop({
|
||||||
default: 'literal'
|
default: 'literal'
|
||||||
})
|
})
|
||||||
@ -204,21 +209,21 @@ class MDLiteral extends MarkdownASTNode {
|
|||||||
text!: string;
|
text!: string;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
 * Line-break node.
 *
 * Extends the plain (non-parent) node base class; its `type` is the literal
 * `'break'`, which is also the declared `@Prop` default.
 */
export class MDLineBreak extends MarkdownASTNode {
    /** Always `'break'`. */
    @Prop({ default: 'break' })
    override type!: 'break';
}
|
||||||
|
|
||||||
/**
 * Thematic-break node.
 *
 * Extends the plain (non-parent) node base class; its `type` is the literal
 * `'thematicBreak'`, which is also the declared `@Prop` default.
 */
export class MDThematicBreak extends MarkdownASTNode {
    /** Always `'thematicBreak'`. */
    @Prop({ default: 'thematicBreak' })
    override type!: 'thematicBreak';
}
|
||||||
|
|
||||||
class MDImage extends MarkdownASTNode {
|
export class MDImage extends MarkdownASTNode {
|
||||||
@Prop({
|
@Prop({
|
||||||
default: 'image'
|
default: 'image'
|
||||||
})
|
})
|
||||||
@ -236,7 +241,7 @@ class MDImage extends MarkdownASTNode {
|
|||||||
title?: string;
|
title?: string;
|
||||||
}
|
}
|
||||||
|
|
||||||
class MDInlineCode extends MarkdownASTNode {
|
export class MDInlineCode extends MarkdownASTNode {
|
||||||
@Prop({
|
@Prop({
|
||||||
default: 'inlineCode'
|
default: 'inlineCode'
|
||||||
})
|
})
|
||||||
@ -249,7 +254,7 @@ class MDInlineCode extends MarkdownASTNode {
|
|||||||
text!: string;
|
text!: string;
|
||||||
}
|
}
|
||||||
|
|
||||||
class MDMath extends MarkdownASTNode {
|
export class MDMath extends MarkdownASTNode {
|
||||||
@Prop({
|
@Prop({
|
||||||
default: 'math'
|
default: 'math'
|
||||||
})
|
})
|
||||||
@ -265,7 +270,7 @@ class MDMath extends MarkdownASTNode {
|
|||||||
text!: string;
|
text!: string;
|
||||||
}
|
}
|
||||||
|
|
||||||
class MDInlineMath extends MarkdownASTNode {
|
export class MDInlineMath extends MarkdownASTNode {
|
||||||
@Prop({
|
@Prop({
|
||||||
default: 'inlineMath'
|
default: 'inlineMath'
|
||||||
})
|
})
|
||||||
@ -282,7 +287,7 @@ class MDInlineMath extends MarkdownASTNode {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
class MDTableHeading extends MarkdownASTNode {
|
export class MDTableHeading extends MarkdownASTNode {
|
||||||
@Prop({
|
@Prop({
|
||||||
default: 'tableHeading'
|
default: 'tableHeading'
|
||||||
})
|
})
|
||||||
@ -302,7 +307,7 @@ class MDTableHeading extends MarkdownASTNode {
|
|||||||
align?: 'left' | 'center' | 'right';
|
align?: 'left' | 'center' | 'right';
|
||||||
}
|
}
|
||||||
|
|
||||||
class MDTableHeader extends MarkdownASTParentNode {
|
export class MDTableHeader extends MarkdownASTParentNode {
|
||||||
@Prop({
|
@Prop({
|
||||||
default: 'tableHeader'
|
default: 'tableHeader'
|
||||||
})
|
})
|
||||||
@ -314,7 +319,7 @@ class MDTableHeader extends MarkdownASTParentNode {
|
|||||||
})
|
})
|
||||||
override children!: MDTableHeading[];
|
override children!: MDTableHeading[];
|
||||||
}
|
}
|
||||||
class MDTableCell extends MarkdownASTParentNode {
|
export class MDTableCell extends MarkdownASTParentNode {
|
||||||
@Prop({
|
@Prop({
|
||||||
default: 'tableCell'
|
default: 'tableCell'
|
||||||
})
|
})
|
||||||
@ -327,7 +332,7 @@ class MDTableCell extends MarkdownASTParentNode {
|
|||||||
override children!: MarkdownASTNode[];
|
override children!: MarkdownASTNode[];
|
||||||
}
|
}
|
||||||
|
|
||||||
class MDTableRow extends MarkdownASTParentNode {
|
export class MDTableRow extends MarkdownASTParentNode {
|
||||||
@Prop({
|
@Prop({
|
||||||
default: 'tableRow'
|
default: 'tableRow'
|
||||||
})
|
})
|
||||||
@ -340,7 +345,7 @@ class MDTableRow extends MarkdownASTParentNode {
|
|||||||
override children!: MDTableCell[];
|
override children!: MDTableCell[];
|
||||||
}
|
}
|
||||||
|
|
||||||
class MDTable extends MarkdownASTParentNode {
|
export class MDTable extends MarkdownASTParentNode {
|
||||||
@Prop({
|
@Prop({
|
||||||
default: 'table'
|
default: 'table'
|
||||||
})
|
})
|
||||||
@ -353,15 +358,60 @@ class MDTable extends MarkdownASTParentNode {
|
|||||||
override children!: (MDTableHeader | MDTableRow)[];
|
override children!: (MDTableHeader | MDTableRow)[];
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
 * Block-quote node.
 *
 * Extends the parent-node base class; its `type` is the literal
 * `'blockquote'`, which is also the declared `@Prop` default.
 */
export class MDBlockQuote extends MarkdownASTParentNode {
    /** Always `'blockquote'`. */
    @Prop({ default: 'blockquote' })
    override type!: 'blockquote';
}
|
||||||
|
|
||||||
|
/** Node classes grouped as "flow" content. */
export const flowContents = [
    MDBlockQuote,
    MDCode,
    MDHeading,
    MDHTML,
    MDList,
    MDThematicBreak,
    MDParagraph,
    MDMath,
    MDTable,
];

/** Node classes grouped as "phrasing" content. `MDHTML` is listed in both groups. */
export const phrasingContent = [
    MDLineBreak,
    MDEmphasis,
    MDStrong,
    MDHTML,
    MDImage,
    MDInlineCode,
    MDInlineMath,
    MDLink,
    MDLiteral,
    MDDelete,
];

/**
 * Lookup from a parent node class to the list of node classes registered as
 * its permitted children.
 *
 * NOTE(review): several parent classes (e.g. the root, link, strong, emphasis,
 * delete and table-cell nodes) have no entry here; confirm whether that is
 * intentional or simply not written yet.
 */
export const childrenAllowedNodes = new Map<Function, Function[]>([
    // Containers holding flow content.
    [MDBlockQuote, flowContents],
    // Containers holding phrasing content.
    [MDHeading, phrasingContent],
    // Lists.
    [MDList, [MDListItem]],
    [MDListItem, flowContents],
    [MDParagraph, phrasingContent],
    // Tables.
    [MDTable, [MDTableHeader, MDTableRow]],
    [MDTableHeader, [MDTableHeading]],
    [MDTableRow, [MDTableCell]],
]);
|
||||||
|
|
||||||
/**
 * Mutable state for one HTML-document-to-markdown-AST conversion.
 *
 * Holds the AST root being built, a stack of open parent nodes, a pointer to
 * the current node and a free-form metadata bag.
 *
 * Work in progress: `walk()` currently only constructs the tree walker; it
 * does not yet visit nodes or append anything to `root`.
 */
export class HTMLToMarkdownJob {

    /** Lower-cased node names whose subtree is rejected by the walker filter. */
    private static readonly ignoredNodeNames: ReadonlySet<string> = new Set(['script', 'style', 'link']);

    // Numeric values of the DOM-standard NodeFilter constants. They are spelled
    // out here so the filter does not depend on a global `NodeFilter`, which is
    // a browser global and may not exist when the document comes from a
    // server-side DOM implementation.
    /** `NodeFilter.SHOW_ELEMENT | NodeFilter.SHOW_TEXT`. */
    private static readonly WHAT_TO_SHOW = 0x1 | 0x4;
    /** `NodeFilter.FILTER_ACCEPT`. */
    private static readonly FILTER_ACCEPT = 1;
    /** `NodeFilter.FILTER_REJECT`. */
    private static readonly FILTER_REJECT = 2;

    /** Root of the AST produced by this job. */
    root = new MarkdownASTRoot();
    /** Stack of parent nodes; starts with just the root. */
    stack: MarkdownASTParentNode[] = [this.root];
    /** Current node; starts at the root. */
    ptr: MarkdownASTNode = this.root;

    /** Arbitrary data collected during the job. */
    metadata: Record<string, any> = {};

    /**
     * @param dom Parsed document to convert.
     */
    constructor(public dom: Document) {
    }

    /**
     * Points `ptr` back at the root node.
     *
     * NOTE(review): the name looks like a typo for `resetFlow`; it is kept
     * unchanged so existing callers continue to work.
     */
    restFlow() {
        this.ptr = this.root;
    }

    /**
     * Creates a tree walker over `dom.documentElement` limited to element and
     * text nodes, with a filter that rejects `script`, `style` and `link`
     * nodes and accepts everything else.
     *
     * The walker is not consumed yet (see the class comment).
     */
    walk() {
        const tw = this.dom.createTreeWalker(
            this.dom.documentElement,
            HTMLToMarkdownJob.WHAT_TO_SHOW,
            {
                acceptNode: (node) => {
                    const tagName = node.nodeName.toLowerCase();

                    // FILTER_REJECT (as opposed to FILTER_SKIP) also drops the
                    // node's descendants.
                    return HTMLToMarkdownJob.ignoredNodeNames.has(tagName)
                        ? HTMLToMarkdownJob.FILTER_REJECT
                        : HTMLToMarkdownJob.FILTER_ACCEPT;
                }
            }
        );

        // TODO: traverse `tw` and build the AST under `root`.
    }

}
|
||||||
|
@ -1 +1 @@
|
|||||||
Subproject commit d287049d46781bff2032b02a2bd4322239145c95
|
Subproject commit 4532694d769f75aabffa465565d6427a544c0d6a
|
Loading…
x
Reference in New Issue
Block a user