
3 changed files with 149 additions and 482 deletions
-
446packages/compiler-core/src/parser/Parser.ts
-
107packages/compiler-core/src/parser/Tokenizer.ts
-
78packages/compiler-core/src/parser/index.ts
@ -1,446 +0,0 @@ |
|||
import Tokenizer, { Callbacks, QuoteType } from './Tokenizer.js' |
|||
import { fromCodePoint } from 'entities/lib/decode.js' |
|||
|
|||
const formTags = new Set([ |
|||
'input', |
|||
'option', |
|||
'optgroup', |
|||
'select', |
|||
'button', |
|||
'datalist', |
|||
'textarea' |
|||
]) |
|||
const pTag = new Set(['p']) |
|||
const tableSectionTags = new Set(['thead', 'tbody']) |
|||
const ddtTags = new Set(['dd', 'dt']) |
|||
const rtpTags = new Set(['rt', 'rp']) |
|||
|
|||
const openImpliesClose = new Map<string, Set<string>>([ |
|||
['tr', new Set(['tr', 'th', 'td'])], |
|||
['th', new Set(['th'])], |
|||
['td', new Set(['thead', 'th', 'td'])], |
|||
['body', new Set(['head', 'link', 'script'])], |
|||
['li', new Set(['li'])], |
|||
['p', pTag], |
|||
['h1', pTag], |
|||
['h2', pTag], |
|||
['h3', pTag], |
|||
['h4', pTag], |
|||
['h5', pTag], |
|||
['h6', pTag], |
|||
['select', formTags], |
|||
['input', formTags], |
|||
['output', formTags], |
|||
['button', formTags], |
|||
['datalist', formTags], |
|||
['textarea', formTags], |
|||
['option', new Set(['option'])], |
|||
['optgroup', new Set(['optgroup', 'option'])], |
|||
['dd', ddtTags], |
|||
['dt', ddtTags], |
|||
['address', pTag], |
|||
['article', pTag], |
|||
['aside', pTag], |
|||
['blockquote', pTag], |
|||
['details', pTag], |
|||
['div', pTag], |
|||
['dl', pTag], |
|||
['fieldset', pTag], |
|||
['figcaption', pTag], |
|||
['figure', pTag], |
|||
['footer', pTag], |
|||
['form', pTag], |
|||
['header', pTag], |
|||
['hr', pTag], |
|||
['main', pTag], |
|||
['nav', pTag], |
|||
['ol', pTag], |
|||
['pre', pTag], |
|||
['section', pTag], |
|||
['table', pTag], |
|||
['ul', pTag], |
|||
['rt', rtpTags], |
|||
['rp', rtpTags], |
|||
['tbody', tableSectionTags], |
|||
['tfoot', tableSectionTags] |
|||
]) |
|||
|
|||
const voidElements = new Set([ |
|||
'area', |
|||
'base', |
|||
'basefont', |
|||
'br', |
|||
'col', |
|||
'command', |
|||
'embed', |
|||
'frame', |
|||
'hr', |
|||
'img', |
|||
'input', |
|||
'isindex', |
|||
'keygen', |
|||
'link', |
|||
'meta', |
|||
'param', |
|||
'source', |
|||
'track', |
|||
'wbr' |
|||
]) |
|||
|
|||
const foreignContextElements = new Set(['math', 'svg']) |
|||
|
|||
const htmlIntegrationElements = new Set([ |
|||
'mi', |
|||
'mo', |
|||
'mn', |
|||
'ms', |
|||
'mtext', |
|||
'annotation-xml', |
|||
'foreignobject', |
|||
'desc', |
|||
'title' |
|||
]) |
|||
|
|||
export interface ParserOptions { |
|||
/** |
|||
* Decode entities within the document. |
|||
* |
|||
* @default true |
|||
*/ |
|||
decodeEntities?: boolean |
|||
} |
|||
|
|||
export interface Handler { |
|||
onparserinit(parser: Parser): void |
|||
|
|||
/** |
|||
* Resets the handler back to starting state |
|||
*/ |
|||
onreset(): void |
|||
|
|||
/** |
|||
* Signals the handler that parsing is done |
|||
*/ |
|||
onend(): void |
|||
onerror(error: Error): void |
|||
onclosetag(name: string, isImplied: boolean): void |
|||
onopentagname(name: string): void |
|||
/** |
|||
* |
|||
* @param name Name of the attribute |
|||
* @param value Value of the attribute. |
|||
* @param quote Quotes used around the attribute. `null` if the attribute has no quotes around the value, `undefined` if the attribute has no value. |
|||
*/ |
|||
onattribute( |
|||
name: string, |
|||
value: string, |
|||
quote?: string | undefined | null |
|||
): void |
|||
onopentag( |
|||
name: string, |
|||
attribs: { [s: string]: string }, |
|||
isImplied: boolean |
|||
): void |
|||
ontext(data: string): void |
|||
oncomment(data: string): void |
|||
oncdatastart(): void |
|||
oncdataend(): void |
|||
oncommentend(): void |
|||
onprocessinginstruction(name: string, data: string): void |
|||
} |
|||
|
|||
const reNameEnd = /\s|\// |
|||
|
|||
export class Parser implements Callbacks { |
|||
/** The start index of the last event. */ |
|||
public startIndex = 0 |
|||
/** The end index of the last event. */ |
|||
public endIndex = 0 |
|||
/** |
|||
* Store the start index of the current open tag, |
|||
* so we can update the start index for attributes. |
|||
*/ |
|||
private openTagStart = 0 |
|||
|
|||
private tagname = '' |
|||
private attribname = '' |
|||
private attribvalue = '' |
|||
private attribs: null | { [key: string]: string } = null |
|||
private readonly stack: string[] = [] |
|||
/** Determines whether self-closing tags are recognized. */ |
|||
private readonly foreignContext: boolean[] |
|||
private readonly cbs: Partial<Handler> |
|||
private readonly tokenizer: Tokenizer |
|||
|
|||
private buffer: string = '' |
|||
|
|||
constructor( |
|||
cbs?: Partial<Handler> | null, |
|||
private readonly options: ParserOptions = {} |
|||
) { |
|||
this.cbs = cbs ?? {} |
|||
this.tokenizer = new Tokenizer(this.options, this) |
|||
this.foreignContext = [false] |
|||
this.cbs.onparserinit?.(this) |
|||
} |
|||
|
|||
// Tokenizer event handlers
|
|||
|
|||
/** @internal */ |
|||
ontext(start: number, endIndex: number): void { |
|||
const data = this.getSlice(start, endIndex) |
|||
this.endIndex = endIndex - 1 |
|||
this.cbs.ontext?.(data) |
|||
this.startIndex = endIndex |
|||
} |
|||
|
|||
/** @internal */ |
|||
ontextentity(cp: number, endIndex: number): void { |
|||
this.endIndex = endIndex - 1 |
|||
this.cbs.ontext?.(fromCodePoint(cp)) |
|||
this.startIndex = endIndex |
|||
} |
|||
|
|||
/** @internal */ |
|||
onopentagname(start: number, endIndex: number): void { |
|||
this.emitOpenTag(this.getSlice(start, (this.endIndex = endIndex))) |
|||
} |
|||
|
|||
private emitOpenTag(name: string) { |
|||
this.openTagStart = this.startIndex |
|||
this.tagname = name |
|||
|
|||
const impliesClose = openImpliesClose.get(name) |
|||
|
|||
if (impliesClose) { |
|||
while (this.stack.length > 0 && impliesClose.has(this.stack[0])) { |
|||
const element = this.stack.shift()! |
|||
this.cbs.onclosetag?.(element, true) |
|||
} |
|||
} |
|||
if (!voidElements.has(name)) { |
|||
this.stack.unshift(name) |
|||
|
|||
if (foreignContextElements.has(name)) { |
|||
this.foreignContext.unshift(true) |
|||
} else if (htmlIntegrationElements.has(name)) { |
|||
this.foreignContext.unshift(false) |
|||
} |
|||
} |
|||
this.cbs.onopentagname?.(name) |
|||
if (this.cbs.onopentag) this.attribs = {} |
|||
} |
|||
|
|||
private endOpenTag(isImplied: boolean) { |
|||
this.startIndex = this.openTagStart |
|||
|
|||
if (this.attribs) { |
|||
this.cbs.onopentag?.(this.tagname, this.attribs, isImplied) |
|||
this.attribs = null |
|||
} |
|||
if (this.cbs.onclosetag && voidElements.has(this.tagname)) { |
|||
this.cbs.onclosetag(this.tagname, true) |
|||
} |
|||
|
|||
this.tagname = '' |
|||
} |
|||
|
|||
/** @internal */ |
|||
onopentagend(endIndex: number): void { |
|||
this.endIndex = endIndex |
|||
this.endOpenTag(false) |
|||
|
|||
// Set `startIndex` for next node
|
|||
this.startIndex = endIndex + 1 |
|||
} |
|||
|
|||
/** @internal */ |
|||
onclosetag(start: number, endIndex: number): void { |
|||
const name = this.getSlice(start, (this.endIndex = endIndex)) |
|||
|
|||
if (foreignContextElements.has(name) || htmlIntegrationElements.has(name)) { |
|||
this.foreignContext.shift() |
|||
} |
|||
|
|||
if (!voidElements.has(name)) { |
|||
const pos = this.stack.indexOf(name) |
|||
if (pos !== -1) { |
|||
for (let index = 0; index <= pos; index++) { |
|||
const element = this.stack.shift()! |
|||
// We know the stack has sufficient elements.
|
|||
this.cbs.onclosetag?.(element, index !== pos) |
|||
} |
|||
} else if (name === 'p') { |
|||
// Implicit open before close
|
|||
this.emitOpenTag('p') |
|||
this.closeCurrentTag(true) |
|||
} |
|||
} else if (name === 'br') { |
|||
// We can't use `emitOpenTag` for implicit open, as `br` would be implicitly closed.
|
|||
this.cbs.onopentagname?.('br') |
|||
this.cbs.onopentag?.('br', {}, true) |
|||
this.cbs.onclosetag?.('br', false) |
|||
} |
|||
|
|||
// Set `startIndex` for next node
|
|||
this.startIndex = endIndex + 1 |
|||
} |
|||
|
|||
/** @internal */ |
|||
onselfclosingtag(endIndex: number): void { |
|||
this.endIndex = endIndex |
|||
this.closeCurrentTag(false) |
|||
// Set `startIndex` for next node
|
|||
this.startIndex = endIndex + 1 |
|||
} |
|||
|
|||
private closeCurrentTag(isOpenImplied: boolean) { |
|||
const name = this.tagname |
|||
this.endOpenTag(isOpenImplied) |
|||
|
|||
// Self-closing tags will be on the top of the stack
|
|||
if (this.stack[0] === name) { |
|||
// If the opening tag isn't implied, the closing tag has to be implied.
|
|||
this.cbs.onclosetag?.(name, !isOpenImplied) |
|||
this.stack.shift() |
|||
} |
|||
} |
|||
|
|||
/** @internal */ |
|||
onattribname(start: number, endIndex: number): void { |
|||
this.attribname = this.getSlice((this.startIndex = start), endIndex) |
|||
} |
|||
|
|||
/** @internal */ |
|||
onattribdata(start: number, endIndex: number): void { |
|||
this.attribvalue += this.getSlice(start, endIndex) |
|||
} |
|||
|
|||
/** @internal */ |
|||
onattribentity(cp: number): void { |
|||
this.attribvalue += fromCodePoint(cp) |
|||
} |
|||
|
|||
/** @internal */ |
|||
onattribend(quote: QuoteType, endIndex: number): void { |
|||
this.endIndex = endIndex |
|||
|
|||
this.cbs.onattribute?.( |
|||
this.attribname, |
|||
this.attribvalue, |
|||
quote === QuoteType.Double |
|||
? '"' |
|||
: quote === QuoteType.Single |
|||
? "'" |
|||
: quote === QuoteType.NoValue |
|||
? undefined |
|||
: null |
|||
) |
|||
|
|||
if ( |
|||
this.attribs && |
|||
!Object.prototype.hasOwnProperty.call(this.attribs, this.attribname) |
|||
) { |
|||
this.attribs[this.attribname] = this.attribvalue |
|||
} |
|||
this.attribvalue = '' |
|||
} |
|||
|
|||
private getInstructionName(value: string) { |
|||
const index = value.search(reNameEnd) |
|||
return index < 0 ? value : value.slice(0, index) |
|||
} |
|||
|
|||
/** @internal */ |
|||
ondeclaration(start: number, endIndex: number): void { |
|||
this.endIndex = endIndex |
|||
const value = this.getSlice(start, endIndex) |
|||
|
|||
if (this.cbs.onprocessinginstruction) { |
|||
const name = this.getInstructionName(value) |
|||
this.cbs.onprocessinginstruction(`!${name}`, `!${value}`) |
|||
} |
|||
|
|||
// Set `startIndex` for next node
|
|||
this.startIndex = endIndex + 1 |
|||
} |
|||
|
|||
/** @internal */ |
|||
onprocessinginstruction(start: number, endIndex: number): void { |
|||
this.endIndex = endIndex |
|||
const value = this.getSlice(start, endIndex) |
|||
|
|||
if (this.cbs.onprocessinginstruction) { |
|||
const name = this.getInstructionName(value) |
|||
this.cbs.onprocessinginstruction(`?${name}`, `?${value}`) |
|||
} |
|||
|
|||
// Set `startIndex` for next node
|
|||
this.startIndex = endIndex + 1 |
|||
} |
|||
|
|||
/** @internal */ |
|||
oncomment(start: number, endIndex: number, offset: number): void { |
|||
this.endIndex = endIndex |
|||
|
|||
this.cbs.oncomment?.(this.getSlice(start, endIndex - offset)) |
|||
this.cbs.oncommentend?.() |
|||
|
|||
// Set `startIndex` for next node
|
|||
this.startIndex = endIndex + 1 |
|||
} |
|||
|
|||
/** @internal */ |
|||
oncdata(start: number, endIndex: number, offset: number): void { |
|||
this.endIndex = endIndex |
|||
this.cbs.oncdatastart?.() |
|||
this.cbs.ontext?.(this.getSlice(start, endIndex - offset)) |
|||
this.cbs.oncdataend?.() |
|||
// Set `startIndex` for next node
|
|||
this.startIndex = endIndex + 1 |
|||
} |
|||
|
|||
/** @internal */ |
|||
onend(): void { |
|||
if (this.cbs.onclosetag) { |
|||
// Set the end index for all remaining tags
|
|||
this.endIndex = this.startIndex |
|||
for (let index = 0; index < this.stack.length; index++) { |
|||
this.cbs.onclosetag(this.stack[index], true) |
|||
} |
|||
} |
|||
this.cbs.onend?.() |
|||
} |
|||
|
|||
private getSlice(start: number, end: number) { |
|||
return this.buffer.slice(start, end) |
|||
} |
|||
|
|||
/** |
|||
* Parses a chunk of data and calls the corresponding callbacks. |
|||
* |
|||
* @param input string to parse. |
|||
*/ |
|||
public parse(input: string): void { |
|||
this.reset() |
|||
this.buffer = input |
|||
this.tokenizer.parse(input) |
|||
} |
|||
|
|||
/** |
|||
* Resets the parser to a blank state, ready to parse a new HTML document |
|||
*/ |
|||
public reset(): void { |
|||
this.cbs.onreset?.() |
|||
this.tokenizer.reset() |
|||
this.tagname = '' |
|||
this.attribname = '' |
|||
this.attribs = null |
|||
this.stack.length = 0 |
|||
this.startIndex = 0 |
|||
this.endIndex = 0 |
|||
this.cbs.onparserinit?.(this) |
|||
this.foreignContext.length = 0 |
|||
this.foreignContext.unshift(false) |
|||
} |
|||
} |
Write
Preview
Loading…
Cancel
Save
Reference in new issue