From 5d309ff52cd399a6b71968a6b9a70c8ac0b98981 Mon Sep 17 00:00:00 2001 From: Joel Kronqvist Date: Sat, 5 Mar 2022 19:02:27 +0200 Subject: Added node_modules for the updating to work properly. --- node_modules/parse5/lib/tokenizer/index.js | 2196 ++++++++++++++++++++++++++++ 1 file changed, 2196 insertions(+) create mode 100644 node_modules/parse5/lib/tokenizer/index.js (limited to 'node_modules/parse5/lib/tokenizer/index.js') diff --git a/node_modules/parse5/lib/tokenizer/index.js b/node_modules/parse5/lib/tokenizer/index.js new file mode 100644 index 0000000..c18d55c --- /dev/null +++ b/node_modules/parse5/lib/tokenizer/index.js @@ -0,0 +1,2196 @@ +'use strict'; + +const Preprocessor = require('./preprocessor'); +const unicode = require('../common/unicode'); +const neTree = require('./named-entity-data'); +const ERR = require('../common/error-codes'); + +//Aliases +const $ = unicode.CODE_POINTS; +const $$ = unicode.CODE_POINT_SEQUENCES; + +//C1 Unicode control character reference replacements +const C1_CONTROLS_REFERENCE_REPLACEMENTS = { + 0x80: 0x20ac, + 0x82: 0x201a, + 0x83: 0x0192, + 0x84: 0x201e, + 0x85: 0x2026, + 0x86: 0x2020, + 0x87: 0x2021, + 0x88: 0x02c6, + 0x89: 0x2030, + 0x8a: 0x0160, + 0x8b: 0x2039, + 0x8c: 0x0152, + 0x8e: 0x017d, + 0x91: 0x2018, + 0x92: 0x2019, + 0x93: 0x201c, + 0x94: 0x201d, + 0x95: 0x2022, + 0x96: 0x2013, + 0x97: 0x2014, + 0x98: 0x02dc, + 0x99: 0x2122, + 0x9a: 0x0161, + 0x9b: 0x203a, + 0x9c: 0x0153, + 0x9e: 0x017e, + 0x9f: 0x0178 +}; + +// Named entity tree flags +const HAS_DATA_FLAG = 1 << 0; +const DATA_DUPLET_FLAG = 1 << 1; +const HAS_BRANCHES_FLAG = 1 << 2; +const MAX_BRANCH_MARKER_VALUE = HAS_DATA_FLAG | DATA_DUPLET_FLAG | HAS_BRANCHES_FLAG; + +//States +const DATA_STATE = 'DATA_STATE'; +const RCDATA_STATE = 'RCDATA_STATE'; +const RAWTEXT_STATE = 'RAWTEXT_STATE'; +const SCRIPT_DATA_STATE = 'SCRIPT_DATA_STATE'; +const PLAINTEXT_STATE = 'PLAINTEXT_STATE'; +const TAG_OPEN_STATE = 'TAG_OPEN_STATE'; +const END_TAG_OPEN_STATE = 'END_TAG_OPEN_STATE'; +const TAG_NAME_STATE = 'TAG_NAME_STATE'; +const RCDATA_LESS_THAN_SIGN_STATE = 'RCDATA_LESS_THAN_SIGN_STATE'; +const RCDATA_END_TAG_OPEN_STATE = 'RCDATA_END_TAG_OPEN_STATE'; +const RCDATA_END_TAG_NAME_STATE = 'RCDATA_END_TAG_NAME_STATE'; +const RAWTEXT_LESS_THAN_SIGN_STATE = 'RAWTEXT_LESS_THAN_SIGN_STATE'; +const RAWTEXT_END_TAG_OPEN_STATE = 'RAWTEXT_END_TAG_OPEN_STATE'; +const RAWTEXT_END_TAG_NAME_STATE = 'RAWTEXT_END_TAG_NAME_STATE'; +const SCRIPT_DATA_LESS_THAN_SIGN_STATE = 'SCRIPT_DATA_LESS_THAN_SIGN_STATE'; +const SCRIPT_DATA_END_TAG_OPEN_STATE = 'SCRIPT_DATA_END_TAG_OPEN_STATE'; +const SCRIPT_DATA_END_TAG_NAME_STATE = 'SCRIPT_DATA_END_TAG_NAME_STATE'; +const SCRIPT_DATA_ESCAPE_START_STATE = 'SCRIPT_DATA_ESCAPE_START_STATE'; +const SCRIPT_DATA_ESCAPE_START_DASH_STATE = 'SCRIPT_DATA_ESCAPE_START_DASH_STATE'; +const SCRIPT_DATA_ESCAPED_STATE = 'SCRIPT_DATA_ESCAPED_STATE'; +const SCRIPT_DATA_ESCAPED_DASH_STATE = 'SCRIPT_DATA_ESCAPED_DASH_STATE'; +const SCRIPT_DATA_ESCAPED_DASH_DASH_STATE = 'SCRIPT_DATA_ESCAPED_DASH_DASH_STATE'; +const SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN_STATE = 'SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN_STATE'; +const SCRIPT_DATA_ESCAPED_END_TAG_OPEN_STATE = 'SCRIPT_DATA_ESCAPED_END_TAG_OPEN_STATE'; +const SCRIPT_DATA_ESCAPED_END_TAG_NAME_STATE = 'SCRIPT_DATA_ESCAPED_END_TAG_NAME_STATE'; +const SCRIPT_DATA_DOUBLE_ESCAPE_START_STATE = 'SCRIPT_DATA_DOUBLE_ESCAPE_START_STATE'; +const SCRIPT_DATA_DOUBLE_ESCAPED_STATE = 'SCRIPT_DATA_DOUBLE_ESCAPED_STATE'; +const SCRIPT_DATA_DOUBLE_ESCAPED_DASH_STATE = 'SCRIPT_DATA_DOUBLE_ESCAPED_DASH_STATE'; +const SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH_STATE = 'SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH_STATE'; +const SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN_STATE = 'SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN_STATE'; +const SCRIPT_DATA_DOUBLE_ESCAPE_END_STATE = 'SCRIPT_DATA_DOUBLE_ESCAPE_END_STATE'; +const BEFORE_ATTRIBUTE_NAME_STATE = 'BEFORE_ATTRIBUTE_NAME_STATE'; +const ATTRIBUTE_NAME_STATE = 'ATTRIBUTE_NAME_STATE'; +const AFTER_ATTRIBUTE_NAME_STATE = 'AFTER_ATTRIBUTE_NAME_STATE'; +const BEFORE_ATTRIBUTE_VALUE_STATE = 'BEFORE_ATTRIBUTE_VALUE_STATE'; +const ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE = 'ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE'; +const ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE = 'ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE'; +const ATTRIBUTE_VALUE_UNQUOTED_STATE = 'ATTRIBUTE_VALUE_UNQUOTED_STATE'; +const AFTER_ATTRIBUTE_VALUE_QUOTED_STATE = 'AFTER_ATTRIBUTE_VALUE_QUOTED_STATE'; +const SELF_CLOSING_START_TAG_STATE = 'SELF_CLOSING_START_TAG_STATE'; +const BOGUS_COMMENT_STATE = 'BOGUS_COMMENT_STATE'; +const MARKUP_DECLARATION_OPEN_STATE = 'MARKUP_DECLARATION_OPEN_STATE'; +const COMMENT_START_STATE = 'COMMENT_START_STATE'; +const COMMENT_START_DASH_STATE = 'COMMENT_START_DASH_STATE'; +const COMMENT_STATE = 'COMMENT_STATE'; +const COMMENT_LESS_THAN_SIGN_STATE = 'COMMENT_LESS_THAN_SIGN_STATE'; +const COMMENT_LESS_THAN_SIGN_BANG_STATE = 'COMMENT_LESS_THAN_SIGN_BANG_STATE'; +const COMMENT_LESS_THAN_SIGN_BANG_DASH_STATE = 'COMMENT_LESS_THAN_SIGN_BANG_DASH_STATE'; +const COMMENT_LESS_THAN_SIGN_BANG_DASH_DASH_STATE = 'COMMENT_LESS_THAN_SIGN_BANG_DASH_DASH_STATE'; +const COMMENT_END_DASH_STATE = 'COMMENT_END_DASH_STATE'; +const COMMENT_END_STATE = 'COMMENT_END_STATE'; +const COMMENT_END_BANG_STATE = 'COMMENT_END_BANG_STATE'; +const DOCTYPE_STATE = 'DOCTYPE_STATE'; +const BEFORE_DOCTYPE_NAME_STATE = 'BEFORE_DOCTYPE_NAME_STATE'; +const DOCTYPE_NAME_STATE = 'DOCTYPE_NAME_STATE'; +const AFTER_DOCTYPE_NAME_STATE = 'AFTER_DOCTYPE_NAME_STATE'; +const AFTER_DOCTYPE_PUBLIC_KEYWORD_STATE = 'AFTER_DOCTYPE_PUBLIC_KEYWORD_STATE'; +const BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE = 'BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE'; +const DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE = 'DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE'; +const DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE = 'DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE'; +const AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE = 'AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE'; +const BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS_STATE = 'BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS_STATE'; +const AFTER_DOCTYPE_SYSTEM_KEYWORD_STATE = 'AFTER_DOCTYPE_SYSTEM_KEYWORD_STATE'; +const BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE = 'BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE'; +const DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE = 'DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE'; +const DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE = 'DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE'; +const AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE = 'AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE'; +const BOGUS_DOCTYPE_STATE = 'BOGUS_DOCTYPE_STATE'; +const CDATA_SECTION_STATE = 'CDATA_SECTION_STATE'; +const CDATA_SECTION_BRACKET_STATE = 'CDATA_SECTION_BRACKET_STATE'; +const CDATA_SECTION_END_STATE = 'CDATA_SECTION_END_STATE'; +const CHARACTER_REFERENCE_STATE = 'CHARACTER_REFERENCE_STATE'; +const NAMED_CHARACTER_REFERENCE_STATE = 'NAMED_CHARACTER_REFERENCE_STATE'; +const AMBIGUOUS_AMPERSAND_STATE = 'AMBIGUOS_AMPERSAND_STATE'; +const NUMERIC_CHARACTER_REFERENCE_STATE = 'NUMERIC_CHARACTER_REFERENCE_STATE'; +const HEXADEMICAL_CHARACTER_REFERENCE_START_STATE = 'HEXADEMICAL_CHARACTER_REFERENCE_START_STATE'; +const DECIMAL_CHARACTER_REFERENCE_START_STATE = 'DECIMAL_CHARACTER_REFERENCE_START_STATE'; +const HEXADEMICAL_CHARACTER_REFERENCE_STATE = 'HEXADEMICAL_CHARACTER_REFERENCE_STATE'; +const DECIMAL_CHARACTER_REFERENCE_STATE = 'DECIMAL_CHARACTER_REFERENCE_STATE'; +const NUMERIC_CHARACTER_REFERENCE_END_STATE = 'NUMERIC_CHARACTER_REFERENCE_END_STATE'; + +//Utils + +//OPTIMIZATION: these utility functions should not be moved out of this module. V8 Crankshaft will not inline +//this functions if they will be situated in another module due to context switch. +//Always perform inlining check before modifying this functions ('node --trace-inlining'). +function isWhitespace(cp) { + return cp === $.SPACE || cp === $.LINE_FEED || cp === $.TABULATION || cp === $.FORM_FEED; +} + +function isAsciiDigit(cp) { + return cp >= $.DIGIT_0 && cp <= $.DIGIT_9; +} + +function isAsciiUpper(cp) { + return cp >= $.LATIN_CAPITAL_A && cp <= $.LATIN_CAPITAL_Z; +} + +function isAsciiLower(cp) { + return cp >= $.LATIN_SMALL_A && cp <= $.LATIN_SMALL_Z; +} + +function isAsciiLetter(cp) { + return isAsciiLower(cp) || isAsciiUpper(cp); +} + +function isAsciiAlphaNumeric(cp) { + return isAsciiLetter(cp) || isAsciiDigit(cp); +} + +function isAsciiUpperHexDigit(cp) { + return cp >= $.LATIN_CAPITAL_A && cp <= $.LATIN_CAPITAL_F; +} + +function isAsciiLowerHexDigit(cp) { + return cp >= $.LATIN_SMALL_A && cp <= $.LATIN_SMALL_F; +} + +function isAsciiHexDigit(cp) { + return isAsciiDigit(cp) || isAsciiUpperHexDigit(cp) || isAsciiLowerHexDigit(cp); +} + +function toAsciiLowerCodePoint(cp) { + return cp + 0x0020; +} + +//NOTE: String.fromCharCode() function can handle only characters from BMP subset. +//So, we need to workaround this manually. +//(see: https://developer.mozilla.org/en-US/docs/JavaScript/Reference/Global_Objects/String/fromCharCode#Getting_it_to_work_with_higher_values) +function toChar(cp) { + if (cp <= 0xffff) { + return String.fromCharCode(cp); + } + + cp -= 0x10000; + return String.fromCharCode(((cp >>> 10) & 0x3ff) | 0xd800) + String.fromCharCode(0xdc00 | (cp & 0x3ff)); +} + +function toAsciiLowerChar(cp) { + return String.fromCharCode(toAsciiLowerCodePoint(cp)); +} + +function findNamedEntityTreeBranch(nodeIx, cp) { + const branchCount = neTree[++nodeIx]; + let lo = ++nodeIx; + let hi = lo + branchCount - 1; + + while (lo <= hi) { + const mid = (lo + hi) >>> 1; + const midCp = neTree[mid]; + + if (midCp < cp) { + lo = mid + 1; + } else if (midCp > cp) { + hi = mid - 1; + } else { + return neTree[mid + branchCount]; + } + } + + return -1; +} + +//Tokenizer +class Tokenizer { + constructor() { + this.preprocessor = new Preprocessor(); + + this.tokenQueue = []; + + this.allowCDATA = false; + + this.state = DATA_STATE; + this.returnState = ''; + + this.charRefCode = -1; + this.tempBuff = []; + this.lastStartTagName = ''; + + this.consumedAfterSnapshot = -1; + this.active = false; + + this.currentCharacterToken = null; + this.currentToken = null; + this.currentAttr = null; + } + + //Errors + _err() { + // NOTE: err reporting is noop by default. Enabled by mixin. + } + + _errOnNextCodePoint(err) { + this._consume(); + this._err(err); + this._unconsume(); + } + + //API + getNextToken() { + while (!this.tokenQueue.length && this.active) { + this.consumedAfterSnapshot = 0; + + const cp = this._consume(); + + if (!this._ensureHibernation()) { + this[this.state](cp); + } + } + + return this.tokenQueue.shift(); + } + + write(chunk, isLastChunk) { + this.active = true; + this.preprocessor.write(chunk, isLastChunk); + } + + insertHtmlAtCurrentPos(chunk) { + this.active = true; + this.preprocessor.insertHtmlAtCurrentPos(chunk); + } + + //Hibernation + _ensureHibernation() { + if (this.preprocessor.endOfChunkHit) { + for (; this.consumedAfterSnapshot > 0; this.consumedAfterSnapshot--) { + this.preprocessor.retreat(); + } + + this.active = false; + this.tokenQueue.push({ type: Tokenizer.HIBERNATION_TOKEN }); + + return true; + } + + return false; + } + + //Consumption + _consume() { + this.consumedAfterSnapshot++; + return this.preprocessor.advance(); + } + + _unconsume() { + this.consumedAfterSnapshot--; + this.preprocessor.retreat(); + } + + _reconsumeInState(state) { + this.state = state; + this._unconsume(); + } + + _consumeSequenceIfMatch(pattern, startCp, caseSensitive) { + let consumedCount = 0; + let isMatch = true; + const patternLength = pattern.length; + let patternPos = 0; + let cp = startCp; + let patternCp = void 0; + + for (; patternPos < patternLength; patternPos++) { + if (patternPos > 0) { + cp = this._consume(); + consumedCount++; + } + + if (cp === $.EOF) { + isMatch = false; + break; + } + + patternCp = pattern[patternPos]; + + if (cp !== patternCp && (caseSensitive || cp !== toAsciiLowerCodePoint(patternCp))) { + isMatch = false; + break; + } + } + + if (!isMatch) { + while (consumedCount--) { + this._unconsume(); + } + } + + return isMatch; + } + + //Temp buffer + _isTempBufferEqualToScriptString() { + if (this.tempBuff.length !== $$.SCRIPT_STRING.length) { + return false; + } + + for (let i = 0; i < this.tempBuff.length; i++) { + if (this.tempBuff[i] !== $$.SCRIPT_STRING[i]) { + return false; + } + } + + return true; + } + + //Token creation + _createStartTagToken() { + this.currentToken = { + type: Tokenizer.START_TAG_TOKEN, + tagName: '', + selfClosing: false, + ackSelfClosing: false, + attrs: [] + }; + } + + _createEndTagToken() { + this.currentToken = { + type: Tokenizer.END_TAG_TOKEN, + tagName: '', + selfClosing: false, + attrs: [] + }; + } + + _createCommentToken() { + this.currentToken = { + type: Tokenizer.COMMENT_TOKEN, + data: '' + }; + } + + _createDoctypeToken(initialName) { + this.currentToken = { + type: Tokenizer.DOCTYPE_TOKEN, + name: initialName, + forceQuirks: false, + publicId: null, + systemId: null + }; + } + + _createCharacterToken(type, ch) { + this.currentCharacterToken = { + type: type, + chars: ch + }; + } + + _createEOFToken() { + this.currentToken = { type: Tokenizer.EOF_TOKEN }; + } + + //Tag attributes + _createAttr(attrNameFirstCh) { + this.currentAttr = { + name: attrNameFirstCh, + value: '' + }; + } + + _leaveAttrName(toState) { + if (Tokenizer.getTokenAttr(this.currentToken, this.currentAttr.name) === null) { + this.currentToken.attrs.push(this.currentAttr); + } else { + this._err(ERR.duplicateAttribute); + } + + this.state = toState; + } + + _leaveAttrValue(toState) { + this.state = toState; + } + + //Token emission + _emitCurrentToken() { + this._emitCurrentCharacterToken(); + + const ct = this.currentToken; + + this.currentToken = null; + + //NOTE: store emited start tag's tagName to determine is the following end tag token is appropriate. + if (ct.type === Tokenizer.START_TAG_TOKEN) { + this.lastStartTagName = ct.tagName; + } else if (ct.type === Tokenizer.END_TAG_TOKEN) { + if (ct.attrs.length > 0) { + this._err(ERR.endTagWithAttributes); + } + + if (ct.selfClosing) { + this._err(ERR.endTagWithTrailingSolidus); + } + } + + this.tokenQueue.push(ct); + } + + _emitCurrentCharacterToken() { + if (this.currentCharacterToken) { + this.tokenQueue.push(this.currentCharacterToken); + this.currentCharacterToken = null; + } + } + + _emitEOFToken() { + this._createEOFToken(); + this._emitCurrentToken(); + } + + //Characters emission + + //OPTIMIZATION: specification uses only one type of character tokens (one token per character). + //This causes a huge memory overhead and a lot of unnecessary parser loops. parse5 uses 3 groups of characters. + //If we have a sequence of characters that belong to the same group, parser can process it + //as a single solid character token. + //So, there are 3 types of character tokens in parse5: + //1)NULL_CHARACTER_TOKEN - \u0000-character sequences (e.g. '\u0000\u0000\u0000') + //2)WHITESPACE_CHARACTER_TOKEN - any whitespace/new-line character sequences (e.g. '\n \r\t \f') + //3)CHARACTER_TOKEN - any character sequence which don't belong to groups 1 and 2 (e.g. 'abcdef1234@@#$%^') + _appendCharToCurrentCharacterToken(type, ch) { + if (this.currentCharacterToken && this.currentCharacterToken.type !== type) { + this._emitCurrentCharacterToken(); + } + + if (this.currentCharacterToken) { + this.currentCharacterToken.chars += ch; + } else { + this._createCharacterToken(type, ch); + } + } + + _emitCodePoint(cp) { + let type = Tokenizer.CHARACTER_TOKEN; + + if (isWhitespace(cp)) { + type = Tokenizer.WHITESPACE_CHARACTER_TOKEN; + } else if (cp === $.NULL) { + type = Tokenizer.NULL_CHARACTER_TOKEN; + } + + this._appendCharToCurrentCharacterToken(type, toChar(cp)); + } + + _emitSeveralCodePoints(codePoints) { + for (let i = 0; i < codePoints.length; i++) { + this._emitCodePoint(codePoints[i]); + } + } + + //NOTE: used then we emit character explicitly. This is always a non-whitespace and a non-null character. + //So we can avoid additional checks here. + _emitChars(ch) { + this._appendCharToCurrentCharacterToken(Tokenizer.CHARACTER_TOKEN, ch); + } + + // Character reference helpers + _matchNamedCharacterReference(startCp) { + let result = null; + let excess = 1; + let i = findNamedEntityTreeBranch(0, startCp); + + this.tempBuff.push(startCp); + + while (i > -1) { + const current = neTree[i]; + const inNode = current < MAX_BRANCH_MARKER_VALUE; + const nodeWithData = inNode && current & HAS_DATA_FLAG; + + if (nodeWithData) { + //NOTE: we use greedy search, so we continue lookup at this point + result = current & DATA_DUPLET_FLAG ? [neTree[++i], neTree[++i]] : [neTree[++i]]; + excess = 0; + } + + const cp = this._consume(); + + this.tempBuff.push(cp); + excess++; + + if (cp === $.EOF) { + break; + } + + if (inNode) { + i = current & HAS_BRANCHES_FLAG ? findNamedEntityTreeBranch(i, cp) : -1; + } else { + i = cp === current ? ++i : -1; + } + } + + while (excess--) { + this.tempBuff.pop(); + this._unconsume(); + } + + return result; + } + + _isCharacterReferenceInAttribute() { + return ( + this.returnState === ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE || + this.returnState === ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE || + this.returnState === ATTRIBUTE_VALUE_UNQUOTED_STATE + ); + } + + _isCharacterReferenceAttributeQuirk(withSemicolon) { + if (!withSemicolon && this._isCharacterReferenceInAttribute()) { + const nextCp = this._consume(); + + this._unconsume(); + + return nextCp === $.EQUALS_SIGN || isAsciiAlphaNumeric(nextCp); + } + + return false; + } + + _flushCodePointsConsumedAsCharacterReference() { + if (this._isCharacterReferenceInAttribute()) { + for (let i = 0; i < this.tempBuff.length; i++) { + this.currentAttr.value += toChar(this.tempBuff[i]); + } + } else { + this._emitSeveralCodePoints(this.tempBuff); + } + + this.tempBuff = []; + } + + // State machine + + // Data state + //------------------------------------------------------------------ + [DATA_STATE](cp) { + this.preprocessor.dropParsedChunk(); + + if (cp === $.LESS_THAN_SIGN) { + this.state = TAG_OPEN_STATE; + } else if (cp === $.AMPERSAND) { + this.returnState = DATA_STATE; + this.state = CHARACTER_REFERENCE_STATE; + } else if (cp === $.NULL) { + this._err(ERR.unexpectedNullCharacter); + this._emitCodePoint(cp); + } else if (cp === $.EOF) { + this._emitEOFToken(); + } else { + this._emitCodePoint(cp); + } + } + + // RCDATA state + //------------------------------------------------------------------ + [RCDATA_STATE](cp) { + this.preprocessor.dropParsedChunk(); + + if (cp === $.AMPERSAND) { + this.returnState = RCDATA_STATE; + this.state = CHARACTER_REFERENCE_STATE; + } else if (cp === $.LESS_THAN_SIGN) { + this.state = RCDATA_LESS_THAN_SIGN_STATE; + } else if (cp === $.NULL) { + this._err(ERR.unexpectedNullCharacter); + this._emitChars(unicode.REPLACEMENT_CHARACTER); + } else if (cp === $.EOF) { + this._emitEOFToken(); + } else { + this._emitCodePoint(cp); + } + } + + // RAWTEXT state + //------------------------------------------------------------------ + [RAWTEXT_STATE](cp) { + this.preprocessor.dropParsedChunk(); + + if (cp === $.LESS_THAN_SIGN) { + this.state = RAWTEXT_LESS_THAN_SIGN_STATE; + } else if (cp === $.NULL) { + this._err(ERR.unexpectedNullCharacter); + this._emitChars(unicode.REPLACEMENT_CHARACTER); + } else if (cp === $.EOF) { + this._emitEOFToken(); + } else { + this._emitCodePoint(cp); + } + } + + // Script data state + //------------------------------------------------------------------ + [SCRIPT_DATA_STATE](cp) { + this.preprocessor.dropParsedChunk(); + + if (cp === $.LESS_THAN_SIGN) { + this.state = SCRIPT_DATA_LESS_THAN_SIGN_STATE; + } else if (cp === $.NULL) { + this._err(ERR.unexpectedNullCharacter); + this._emitChars(unicode.REPLACEMENT_CHARACTER); + } else if (cp === $.EOF) { + this._emitEOFToken(); + } else { + this._emitCodePoint(cp); + } + } + + // PLAINTEXT state + //------------------------------------------------------------------ + [PLAINTEXT_STATE](cp) { + this.preprocessor.dropParsedChunk(); + + if (cp === $.NULL) { + this._err(ERR.unexpectedNullCharacter); + this._emitChars(unicode.REPLACEMENT_CHARACTER); + } else if (cp === $.EOF) { + this._emitEOFToken(); + } else { + this._emitCodePoint(cp); + } + } + + // Tag open state + //------------------------------------------------------------------ + [TAG_OPEN_STATE](cp) { + if (cp === $.EXCLAMATION_MARK) { + this.state = MARKUP_DECLARATION_OPEN_STATE; + } else if (cp === $.SOLIDUS) { + this.state = END_TAG_OPEN_STATE; + } else if (isAsciiLetter(cp)) { + this._createStartTagToken(); + this._reconsumeInState(TAG_NAME_STATE); + } else if (cp === $.QUESTION_MARK) { + this._err(ERR.unexpectedQuestionMarkInsteadOfTagName); + this._createCommentToken(); + this._reconsumeInState(BOGUS_COMMENT_STATE); + } else if (cp === $.EOF) { + this._err(ERR.eofBeforeTagName); + this._emitChars('<'); + this._emitEOFToken(); + } else { + this._err(ERR.invalidFirstCharacterOfTagName); + this._emitChars('<'); + this._reconsumeInState(DATA_STATE); + } + } + + // End tag open state + //------------------------------------------------------------------ + [END_TAG_OPEN_STATE](cp) { + if (isAsciiLetter(cp)) { + this._createEndTagToken(); + this._reconsumeInState(TAG_NAME_STATE); + } else if (cp === $.GREATER_THAN_SIGN) { + this._err(ERR.missingEndTagName); + this.state = DATA_STATE; + } else if (cp === $.EOF) { + this._err(ERR.eofBeforeTagName); + this._emitChars(''); + } else if (cp === $.NULL) { + this._err(ERR.unexpectedNullCharacter); + this.state = SCRIPT_DATA_ESCAPED_STATE; + this._emitChars(unicode.REPLACEMENT_CHARACTER); + } else if (cp === $.EOF) { + this._err(ERR.eofInScriptHtmlCommentLikeText); + this._emitEOFToken(); + } else { + this.state = SCRIPT_DATA_ESCAPED_STATE; + this._emitCodePoint(cp); + } + } + + // Script data escaped less-than sign state + //------------------------------------------------------------------ + [SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN_STATE](cp) { + if (cp === $.SOLIDUS) { + this.tempBuff = []; + this.state = SCRIPT_DATA_ESCAPED_END_TAG_OPEN_STATE; + } else if (isAsciiLetter(cp)) { + this.tempBuff = []; + this._emitChars('<'); + this._reconsumeInState(SCRIPT_DATA_DOUBLE_ESCAPE_START_STATE); + } else { + this._emitChars('<'); + this._reconsumeInState(SCRIPT_DATA_ESCAPED_STATE); + } + } + + // Script data escaped end tag open state + //------------------------------------------------------------------ + [SCRIPT_DATA_ESCAPED_END_TAG_OPEN_STATE](cp) { + if (isAsciiLetter(cp)) { + this._createEndTagToken(); + this._reconsumeInState(SCRIPT_DATA_ESCAPED_END_TAG_NAME_STATE); + } else { + this._emitChars(''); + } else if (cp === $.NULL) { + this._err(ERR.unexpectedNullCharacter); + this.state = SCRIPT_DATA_DOUBLE_ESCAPED_STATE; + this._emitChars(unicode.REPLACEMENT_CHARACTER); + } else if (cp === $.EOF) { + this._err(ERR.eofInScriptHtmlCommentLikeText); + this._emitEOFToken(); + } else { + this.state = SCRIPT_DATA_DOUBLE_ESCAPED_STATE; + this._emitCodePoint(cp); + } + } + + // Script data double escaped less-than sign state + //------------------------------------------------------------------ + [SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN_STATE](cp) { + if (cp === $.SOLIDUS) { + this.tempBuff = []; + this.state = SCRIPT_DATA_DOUBLE_ESCAPE_END_STATE; + this._emitChars('/'); + } else { + this._reconsumeInState(SCRIPT_DATA_DOUBLE_ESCAPED_STATE); + } + } + + // Script data double escape end state + //------------------------------------------------------------------ + [SCRIPT_DATA_DOUBLE_ESCAPE_END_STATE](cp) { + if (isWhitespace(cp) || cp === $.SOLIDUS || cp === $.GREATER_THAN_SIGN) { + this.state = this._isTempBufferEqualToScriptString() + ? SCRIPT_DATA_ESCAPED_STATE + : SCRIPT_DATA_DOUBLE_ESCAPED_STATE; + + this._emitCodePoint(cp); + } else if (isAsciiUpper(cp)) { + this.tempBuff.push(toAsciiLowerCodePoint(cp)); + this._emitCodePoint(cp); + } else if (isAsciiLower(cp)) { + this.tempBuff.push(cp); + this._emitCodePoint(cp); + } else { + this._reconsumeInState(SCRIPT_DATA_DOUBLE_ESCAPED_STATE); + } + } + + // Before attribute name state + //------------------------------------------------------------------ + [BEFORE_ATTRIBUTE_NAME_STATE](cp) { + if (isWhitespace(cp)) { + return; + } + + if (cp === $.SOLIDUS || cp === $.GREATER_THAN_SIGN || cp === $.EOF) { + this._reconsumeInState(AFTER_ATTRIBUTE_NAME_STATE); + } else if (cp === $.EQUALS_SIGN) { + this._err(ERR.unexpectedEqualsSignBeforeAttributeName); + this._createAttr('='); + this.state = ATTRIBUTE_NAME_STATE; + } else { + this._createAttr(''); + this._reconsumeInState(ATTRIBUTE_NAME_STATE); + } + } + + // Attribute name state + //------------------------------------------------------------------ + [ATTRIBUTE_NAME_STATE](cp) { + if (isWhitespace(cp) || cp === $.SOLIDUS || cp === $.GREATER_THAN_SIGN || cp === $.EOF) { + this._leaveAttrName(AFTER_ATTRIBUTE_NAME_STATE); + this._unconsume(); + } else if (cp === $.EQUALS_SIGN) { + this._leaveAttrName(BEFORE_ATTRIBUTE_VALUE_STATE); + } else if (isAsciiUpper(cp)) { + this.currentAttr.name += toAsciiLowerChar(cp); + } else if (cp === $.QUOTATION_MARK || cp === $.APOSTROPHE || cp === $.LESS_THAN_SIGN) { + this._err(ERR.unexpectedCharacterInAttributeName); + this.currentAttr.name += toChar(cp); + } else if (cp === $.NULL) { + this._err(ERR.unexpectedNullCharacter); + this.currentAttr.name += unicode.REPLACEMENT_CHARACTER; + } else { + this.currentAttr.name += toChar(cp); + } + } + + // After attribute name state + //------------------------------------------------------------------ + [AFTER_ATTRIBUTE_NAME_STATE](cp) { + if (isWhitespace(cp)) { + return; + } + + if (cp === $.SOLIDUS) { + this.state = SELF_CLOSING_START_TAG_STATE; + } else if (cp === $.EQUALS_SIGN) { + this.state = BEFORE_ATTRIBUTE_VALUE_STATE; + } else if (cp === $.GREATER_THAN_SIGN) { + this.state = DATA_STATE; + this._emitCurrentToken(); + } else if (cp === $.EOF) { + this._err(ERR.eofInTag); + this._emitEOFToken(); + } else { + this._createAttr(''); + this._reconsumeInState(ATTRIBUTE_NAME_STATE); + } + } + + // Before attribute value state + //------------------------------------------------------------------ + [BEFORE_ATTRIBUTE_VALUE_STATE](cp) { + if (isWhitespace(cp)) { + return; + } + + if (cp === $.QUOTATION_MARK) { + this.state = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE; + } else if (cp === $.APOSTROPHE) { + this.state = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE; + } else if (cp === $.GREATER_THAN_SIGN) { + this._err(ERR.missingAttributeValue); + this.state = DATA_STATE; + this._emitCurrentToken(); + } else { + this._reconsumeInState(ATTRIBUTE_VALUE_UNQUOTED_STATE); + } + } + + // Attribute value (double-quoted) state + //------------------------------------------------------------------ + [ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE](cp) { + if (cp === $.QUOTATION_MARK) { + this.state = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE; + } else if (cp === $.AMPERSAND) { + this.returnState = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE; + this.state = CHARACTER_REFERENCE_STATE; + } else if (cp === $.NULL) { + this._err(ERR.unexpectedNullCharacter); + this.currentAttr.value += unicode.REPLACEMENT_CHARACTER; + } else if (cp === $.EOF) { + this._err(ERR.eofInTag); + this._emitEOFToken(); + } else { + this.currentAttr.value += toChar(cp); + } + } + + // Attribute value (single-quoted) state + //------------------------------------------------------------------ + [ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE](cp) { + if (cp === $.APOSTROPHE) { + this.state = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE; + } else if (cp === $.AMPERSAND) { + this.returnState = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE; + this.state = CHARACTER_REFERENCE_STATE; + } else if (cp === $.NULL) { + this._err(ERR.unexpectedNullCharacter); + this.currentAttr.value += unicode.REPLACEMENT_CHARACTER; + } else if (cp === $.EOF) { + this._err(ERR.eofInTag); + this._emitEOFToken(); + } else { + this.currentAttr.value += toChar(cp); + } + } + + // Attribute value (unquoted) state + //------------------------------------------------------------------ + [ATTRIBUTE_VALUE_UNQUOTED_STATE](cp) { + if (isWhitespace(cp)) { + this._leaveAttrValue(BEFORE_ATTRIBUTE_NAME_STATE); + } else if (cp === $.AMPERSAND) { + this.returnState = ATTRIBUTE_VALUE_UNQUOTED_STATE; + this.state = CHARACTER_REFERENCE_STATE; + } else if (cp === $.GREATER_THAN_SIGN) { + this._leaveAttrValue(DATA_STATE); + this._emitCurrentToken(); + } else if (cp === $.NULL) { + this._err(ERR.unexpectedNullCharacter); + this.currentAttr.value += unicode.REPLACEMENT_CHARACTER; + } else if ( + cp === $.QUOTATION_MARK || + cp === $.APOSTROPHE || + cp === $.LESS_THAN_SIGN || + cp === $.EQUALS_SIGN || + cp === $.GRAVE_ACCENT + ) { + this._err(ERR.unexpectedCharacterInUnquotedAttributeValue); + this.currentAttr.value += toChar(cp); + } else if (cp === $.EOF) { + this._err(ERR.eofInTag); + this._emitEOFToken(); + } else { + this.currentAttr.value += toChar(cp); + } + } + + // After attribute value (quoted) state + //------------------------------------------------------------------ + [AFTER_ATTRIBUTE_VALUE_QUOTED_STATE](cp) { + if (isWhitespace(cp)) { + this._leaveAttrValue(BEFORE_ATTRIBUTE_NAME_STATE); + } else if (cp === $.SOLIDUS) { + this._leaveAttrValue(SELF_CLOSING_START_TAG_STATE); + } else if (cp === $.GREATER_THAN_SIGN) { + this._leaveAttrValue(DATA_STATE); + this._emitCurrentToken(); + } else if (cp === $.EOF) { + this._err(ERR.eofInTag); + this._emitEOFToken(); + } else { + this._err(ERR.missingWhitespaceBetweenAttributes); + this._reconsumeInState(BEFORE_ATTRIBUTE_NAME_STATE); + } + } + + // Self-closing start tag state + //------------------------------------------------------------------ + [SELF_CLOSING_START_TAG_STATE](cp) { + if (cp === $.GREATER_THAN_SIGN) { + this.currentToken.selfClosing = true; + this.state = DATA_STATE; + this._emitCurrentToken(); + } else if (cp === $.EOF) { + this._err(ERR.eofInTag); + this._emitEOFToken(); + } else { + this._err(ERR.unexpectedSolidusInTag); + this._reconsumeInState(BEFORE_ATTRIBUTE_NAME_STATE); + } + } + + // Bogus comment state + //------------------------------------------------------------------ + [BOGUS_COMMENT_STATE](cp) { + if (cp === $.GREATER_THAN_SIGN) { + this.state = DATA_STATE; + this._emitCurrentToken(); + } else if (cp === $.EOF) { + this._emitCurrentToken(); + this._emitEOFToken(); + } else if (cp === $.NULL) { + this._err(ERR.unexpectedNullCharacter); + this.currentToken.data += unicode.REPLACEMENT_CHARACTER; + } else { + this.currentToken.data += toChar(cp); + } + } + + // Markup declaration open state + //------------------------------------------------------------------ + [MARKUP_DECLARATION_OPEN_STATE](cp) { + if (this._consumeSequenceIfMatch($$.DASH_DASH_STRING, cp, true)) { + this._createCommentToken(); + this.state = COMMENT_START_STATE; + } else if (this._consumeSequenceIfMatch($$.DOCTYPE_STRING, cp, false)) { + this.state = DOCTYPE_STATE; + } else if (this._consumeSequenceIfMatch($$.CDATA_START_STRING, cp, true)) { + if (this.allowCDATA) { + this.state = CDATA_SECTION_STATE; + } else { + this._err(ERR.cdataInHtmlContent); + this._createCommentToken(); + this.currentToken.data = '[CDATA['; + this.state = BOGUS_COMMENT_STATE; + } + } + + //NOTE: sequence lookup can be abrupted by hibernation. In that case lookup + //results are no longer valid and we will need to start over. + else if (!this._ensureHibernation()) { + this._err(ERR.incorrectlyOpenedComment); + this._createCommentToken(); + this._reconsumeInState(BOGUS_COMMENT_STATE); + } + } + + // Comment start state + //------------------------------------------------------------------ + [COMMENT_START_STATE](cp) { + if (cp === $.HYPHEN_MINUS) { + this.state = COMMENT_START_DASH_STATE; + } else if (cp === $.GREATER_THAN_SIGN) { + this._err(ERR.abruptClosingOfEmptyComment); + this.state = DATA_STATE; + this._emitCurrentToken(); + } else { + this._reconsumeInState(COMMENT_STATE); + } + } + + // Comment start dash state + //------------------------------------------------------------------ + [COMMENT_START_DASH_STATE](cp) { + if (cp === $.HYPHEN_MINUS) { + this.state = COMMENT_END_STATE; + } else if (cp === $.GREATER_THAN_SIGN) { + this._err(ERR.abruptClosingOfEmptyComment); + this.state = DATA_STATE; + this._emitCurrentToken(); + } else if (cp === $.EOF) { + this._err(ERR.eofInComment); + this._emitCurrentToken(); + this._emitEOFToken(); + } else { + this.currentToken.data += '-'; + this._reconsumeInState(COMMENT_STATE); + } + } + + // Comment state + //------------------------------------------------------------------ + [COMMENT_STATE](cp) { + if (cp === $.HYPHEN_MINUS) { + this.state = COMMENT_END_DASH_STATE; + } else if (cp === $.LESS_THAN_SIGN) { + this.currentToken.data += '<'; + this.state = COMMENT_LESS_THAN_SIGN_STATE; + } else if (cp === $.NULL) { + this._err(ERR.unexpectedNullCharacter); + this.currentToken.data += unicode.REPLACEMENT_CHARACTER; + } else if (cp === $.EOF) { + this._err(ERR.eofInComment); + this._emitCurrentToken(); + this._emitEOFToken(); + } else { + this.currentToken.data += toChar(cp); + } + } + + // Comment less-than sign state + //------------------------------------------------------------------ + [COMMENT_LESS_THAN_SIGN_STATE](cp) { + if (cp === $.EXCLAMATION_MARK) { + this.currentToken.data += '!'; + this.state = COMMENT_LESS_THAN_SIGN_BANG_STATE; + } else if (cp === $.LESS_THAN_SIGN) { + this.currentToken.data += '!'; + } else { + this._reconsumeInState(COMMENT_STATE); + } + } + + // Comment less-than sign bang state + //------------------------------------------------------------------ + [COMMENT_LESS_THAN_SIGN_BANG_STATE](cp) { + if (cp === $.HYPHEN_MINUS) { + this.state = COMMENT_LESS_THAN_SIGN_BANG_DASH_STATE; + } else { + this._reconsumeInState(COMMENT_STATE); + } + } + + // Comment less-than sign bang dash state + //------------------------------------------------------------------ + [COMMENT_LESS_THAN_SIGN_BANG_DASH_STATE](cp) { + if (cp === $.HYPHEN_MINUS) { + this.state = COMMENT_LESS_THAN_SIGN_BANG_DASH_DASH_STATE; + } else { + this._reconsumeInState(COMMENT_END_DASH_STATE); + } + } + + // Comment less-than sign bang dash dash state + //------------------------------------------------------------------ + [COMMENT_LESS_THAN_SIGN_BANG_DASH_DASH_STATE](cp) { + if (cp !== $.GREATER_THAN_SIGN && cp !== $.EOF) { + this._err(ERR.nestedComment); + } + + this._reconsumeInState(COMMENT_END_STATE); + } + + // Comment end dash state + //------------------------------------------------------------------ + [COMMENT_END_DASH_STATE](cp) { + if (cp === $.HYPHEN_MINUS) { + this.state = COMMENT_END_STATE; + } else if (cp === $.EOF) { + this._err(ERR.eofInComment); + this._emitCurrentToken(); + this._emitEOFToken(); + } else { + this.currentToken.data += '-'; + this._reconsumeInState(COMMENT_STATE); + } + } + + // Comment end state + //------------------------------------------------------------------ + [COMMENT_END_STATE](cp) { + if (cp === $.GREATER_THAN_SIGN) { + this.state = DATA_STATE; + this._emitCurrentToken(); + } else if (cp === $.EXCLAMATION_MARK) { + this.state = COMMENT_END_BANG_STATE; + } else if (cp === $.HYPHEN_MINUS) { + this.currentToken.data += '-'; + } else if (cp === $.EOF) { + this._err(ERR.eofInComment); + this._emitCurrentToken(); + this._emitEOFToken(); + } else { + this.currentToken.data += '--'; + this._reconsumeInState(COMMENT_STATE); + } + } + + // Comment end bang state + //------------------------------------------------------------------ + [COMMENT_END_BANG_STATE](cp) { + if (cp === $.HYPHEN_MINUS) { + this.currentToken.data += '--!'; + this.state = COMMENT_END_DASH_STATE; + } else if (cp === $.GREATER_THAN_SIGN) { + this._err(ERR.incorrectlyClosedComment); + this.state = DATA_STATE; + this._emitCurrentToken(); + } else if (cp === $.EOF) { + this._err(ERR.eofInComment); + this._emitCurrentToken(); + this._emitEOFToken(); + } else { + this.currentToken.data += '--!'; + this._reconsumeInState(COMMENT_STATE); + } + } + + // DOCTYPE state + //------------------------------------------------------------------ + [DOCTYPE_STATE](cp) { + if (isWhitespace(cp)) { + this.state = BEFORE_DOCTYPE_NAME_STATE; + } else if (cp === $.GREATER_THAN_SIGN) { + this._reconsumeInState(BEFORE_DOCTYPE_NAME_STATE); + } else if (cp === $.EOF) { + this._err(ERR.eofInDoctype); + this._createDoctypeToken(null); + this.currentToken.forceQuirks = true; + this._emitCurrentToken(); + this._emitEOFToken(); + } else { + this._err(ERR.missingWhitespaceBeforeDoctypeName); + this._reconsumeInState(BEFORE_DOCTYPE_NAME_STATE); + } + } + + // Before DOCTYPE name state + //------------------------------------------------------------------ + [BEFORE_DOCTYPE_NAME_STATE](cp) { + if (isWhitespace(cp)) { + return; + } + + if (isAsciiUpper(cp)) { + this._createDoctypeToken(toAsciiLowerChar(cp)); + this.state = DOCTYPE_NAME_STATE; + } else if (cp === $.NULL) { + this._err(ERR.unexpectedNullCharacter); + this._createDoctypeToken(unicode.REPLACEMENT_CHARACTER); + this.state = DOCTYPE_NAME_STATE; + } else if (cp === $.GREATER_THAN_SIGN) { + this._err(ERR.missingDoctypeName); + this._createDoctypeToken(null); + this.currentToken.forceQuirks = true; + this._emitCurrentToken(); + this.state = DATA_STATE; + } else if (cp === $.EOF) { + this._err(ERR.eofInDoctype); + this._createDoctypeToken(null); + this.currentToken.forceQuirks = true; + this._emitCurrentToken(); + this._emitEOFToken(); + } else { + this._createDoctypeToken(toChar(cp)); + this.state = DOCTYPE_NAME_STATE; + } + } + + // DOCTYPE name state + //------------------------------------------------------------------ + [DOCTYPE_NAME_STATE](cp) { + if (isWhitespace(cp)) { + this.state = AFTER_DOCTYPE_NAME_STATE; + } else if (cp === $.GREATER_THAN_SIGN) { + this.state = DATA_STATE; + this._emitCurrentToken(); + } else if (isAsciiUpper(cp)) { + this.currentToken.name += toAsciiLowerChar(cp); + } else if (cp === $.NULL) { + this._err(ERR.unexpectedNullCharacter); + this.currentToken.name += unicode.REPLACEMENT_CHARACTER; + } else if (cp === $.EOF) { + this._err(ERR.eofInDoctype); + this.currentToken.forceQuirks = true; + this._emitCurrentToken(); + this._emitEOFToken(); + } else { + this.currentToken.name += toChar(cp); + } + } + + // After DOCTYPE name state + //------------------------------------------------------------------ + [AFTER_DOCTYPE_NAME_STATE](cp) { + if (isWhitespace(cp)) { + return; + } + + if (cp === $.GREATER_THAN_SIGN) { + this.state = DATA_STATE; + this._emitCurrentToken(); + } else if (cp === $.EOF) { + this._err(ERR.eofInDoctype); + this.currentToken.forceQuirks = true; + this._emitCurrentToken(); + this._emitEOFToken(); + } else if (this._consumeSequenceIfMatch($$.PUBLIC_STRING, cp, false)) { + this.state = AFTER_DOCTYPE_PUBLIC_KEYWORD_STATE; + } else if (this._consumeSequenceIfMatch($$.SYSTEM_STRING, cp, false)) { + this.state = AFTER_DOCTYPE_SYSTEM_KEYWORD_STATE; + } + //NOTE: sequence lookup can be abrupted by hibernation. In that case lookup + //results are no longer valid and we will need to start over. + else if (!this._ensureHibernation()) { + this._err(ERR.invalidCharacterSequenceAfterDoctypeName); + this.currentToken.forceQuirks = true; + this._reconsumeInState(BOGUS_DOCTYPE_STATE); + } + } + + // After DOCTYPE public keyword state + //------------------------------------------------------------------ + [AFTER_DOCTYPE_PUBLIC_KEYWORD_STATE](cp) { + if (isWhitespace(cp)) { + this.state = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE; + } else if (cp === $.QUOTATION_MARK) { + this._err(ERR.missingWhitespaceAfterDoctypePublicKeyword); + this.currentToken.publicId = ''; + this.state = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE; + } else if (cp === $.APOSTROPHE) { + this._err(ERR.missingWhitespaceAfterDoctypePublicKeyword); + this.currentToken.publicId = ''; + this.state = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE; + } else if (cp === $.GREATER_THAN_SIGN) { + this._err(ERR.missingDoctypePublicIdentifier); + this.currentToken.forceQuirks = true; + this.state = DATA_STATE; + this._emitCurrentToken(); + } else if (cp === $.EOF) { + this._err(ERR.eofInDoctype); + this.currentToken.forceQuirks = true; + this._emitCurrentToken(); + this._emitEOFToken(); + } else { + this._err(ERR.missingQuoteBeforeDoctypePublicIdentifier); + this.currentToken.forceQuirks = true; + this._reconsumeInState(BOGUS_DOCTYPE_STATE); + } + } + + // Before DOCTYPE public identifier state + //------------------------------------------------------------------ + [BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE](cp) { + if (isWhitespace(cp)) { + return; + } + + if (cp === $.QUOTATION_MARK) { + this.currentToken.publicId = ''; + this.state = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE; + } else if (cp === $.APOSTROPHE) { + this.currentToken.publicId = ''; + this.state = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE; + } else if (cp === $.GREATER_THAN_SIGN) { + this._err(ERR.missingDoctypePublicIdentifier); + this.currentToken.forceQuirks = true; + this.state = DATA_STATE; + this._emitCurrentToken(); + } else if (cp === $.EOF) { + this._err(ERR.eofInDoctype); + this.currentToken.forceQuirks = true; + this._emitCurrentToken(); + this._emitEOFToken(); + } else { + this._err(ERR.missingQuoteBeforeDoctypePublicIdentifier); + this.currentToken.forceQuirks = true; + this._reconsumeInState(BOGUS_DOCTYPE_STATE); + } + } + + // DOCTYPE public identifier (double-quoted) state + //------------------------------------------------------------------ + [DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE](cp) { + if (cp === $.QUOTATION_MARK) { + this.state = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE; + } else if (cp === $.NULL) { + this._err(ERR.unexpectedNullCharacter); + this.currentToken.publicId += unicode.REPLACEMENT_CHARACTER; + } else if (cp === $.GREATER_THAN_SIGN) { + this._err(ERR.abruptDoctypePublicIdentifier); + this.currentToken.forceQuirks = true; + this._emitCurrentToken(); + this.state = DATA_STATE; + } else if (cp === $.EOF) { + this._err(ERR.eofInDoctype); + this.currentToken.forceQuirks = true; + this._emitCurrentToken(); + this._emitEOFToken(); + } else { + this.currentToken.publicId += toChar(cp); + } + } + + // DOCTYPE public identifier (single-quoted) state + //------------------------------------------------------------------ + [DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE](cp) { + if (cp === $.APOSTROPHE) { + this.state = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE; + } else if (cp === $.NULL) { + this._err(ERR.unexpectedNullCharacter); + this.currentToken.publicId += unicode.REPLACEMENT_CHARACTER; + } else if (cp === $.GREATER_THAN_SIGN) { + this._err(ERR.abruptDoctypePublicIdentifier); + this.currentToken.forceQuirks = true; + this._emitCurrentToken(); + this.state = DATA_STATE; + } else if (cp === $.EOF) { + this._err(ERR.eofInDoctype); + this.currentToken.forceQuirks = true; + this._emitCurrentToken(); + this._emitEOFToken(); + } else { + this.currentToken.publicId += toChar(cp); + } + } + + // After DOCTYPE public identifier state + //------------------------------------------------------------------ + [AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE](cp) { + if (isWhitespace(cp)) { + this.state = BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS_STATE; + } else if (cp === $.GREATER_THAN_SIGN) { + this.state = DATA_STATE; + this._emitCurrentToken(); + } else if (cp === $.QUOTATION_MARK) { + this._err(ERR.missingWhitespaceBetweenDoctypePublicAndSystemIdentifiers); + this.currentToken.systemId = ''; + this.state = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE; + } else if (cp === $.APOSTROPHE) { + this._err(ERR.missingWhitespaceBetweenDoctypePublicAndSystemIdentifiers); + this.currentToken.systemId = ''; + this.state = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE; + } else if (cp === $.EOF) { + this._err(ERR.eofInDoctype); + this.currentToken.forceQuirks = true; + this._emitCurrentToken(); + this._emitEOFToken(); + } else { + this._err(ERR.missingQuoteBeforeDoctypeSystemIdentifier); + this.currentToken.forceQuirks = true; + this._reconsumeInState(BOGUS_DOCTYPE_STATE); + } + } + + // Between DOCTYPE public and system identifiers state + //------------------------------------------------------------------ + [BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS_STATE](cp) { + if (isWhitespace(cp)) { + return; + } + + if (cp === $.GREATER_THAN_SIGN) { + this._emitCurrentToken(); + this.state = DATA_STATE; + } else if (cp === $.QUOTATION_MARK) { + this.currentToken.systemId = ''; + this.state = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE; + } else if (cp === $.APOSTROPHE) { + this.currentToken.systemId = ''; + this.state = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE; + } else if (cp === $.EOF) { + this._err(ERR.eofInDoctype); + this.currentToken.forceQuirks = true; + this._emitCurrentToken(); + this._emitEOFToken(); + } else { + this._err(ERR.missingQuoteBeforeDoctypeSystemIdentifier); + this.currentToken.forceQuirks = true; + this._reconsumeInState(BOGUS_DOCTYPE_STATE); + } + } + + // After DOCTYPE system keyword state + //------------------------------------------------------------------ + [AFTER_DOCTYPE_SYSTEM_KEYWORD_STATE](cp) { + if (isWhitespace(cp)) { + this.state = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE; + } else if (cp === $.QUOTATION_MARK) { + this._err(ERR.missingWhitespaceAfterDoctypeSystemKeyword); + this.currentToken.systemId = ''; + this.state = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE; + } else if (cp === $.APOSTROPHE) { + this._err(ERR.missingWhitespaceAfterDoctypeSystemKeyword); + this.currentToken.systemId = ''; + this.state = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE; + } else if (cp === $.GREATER_THAN_SIGN) { + this._err(ERR.missingDoctypeSystemIdentifier); + this.currentToken.forceQuirks = true; + this.state = DATA_STATE; + this._emitCurrentToken(); + } else if (cp === $.EOF) { + this._err(ERR.eofInDoctype); + this.currentToken.forceQuirks = true; + this._emitCurrentToken(); + this._emitEOFToken(); + } else { + this._err(ERR.missingQuoteBeforeDoctypeSystemIdentifier); + this.currentToken.forceQuirks = true; + this._reconsumeInState(BOGUS_DOCTYPE_STATE); + } + } + + // Before DOCTYPE system identifier state + //------------------------------------------------------------------ + [BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE](cp) { + if (isWhitespace(cp)) { + return; + } + + if (cp === $.QUOTATION_MARK) { + this.currentToken.systemId = ''; + this.state = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE; + } else if (cp === $.APOSTROPHE) { + this.currentToken.systemId = ''; + this.state = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE; + } else if (cp === $.GREATER_THAN_SIGN) { + this._err(ERR.missingDoctypeSystemIdentifier); + this.currentToken.forceQuirks = true; + this.state = DATA_STATE; + this._emitCurrentToken(); + } else if (cp === $.EOF) { + this._err(ERR.eofInDoctype); + this.currentToken.forceQuirks = true; + this._emitCurrentToken(); + this._emitEOFToken(); + } else { + this._err(ERR.missingQuoteBeforeDoctypeSystemIdentifier); + this.currentToken.forceQuirks = true; + this._reconsumeInState(BOGUS_DOCTYPE_STATE); + } + } + + // DOCTYPE system identifier (double-quoted) state + //------------------------------------------------------------------ + [DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE](cp) { + if (cp === $.QUOTATION_MARK) { + this.state = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE; + } else if (cp === $.NULL) { + this._err(ERR.unexpectedNullCharacter); + this.currentToken.systemId += unicode.REPLACEMENT_CHARACTER; + } else if (cp === $.GREATER_THAN_SIGN) { + this._err(ERR.abruptDoctypeSystemIdentifier); + this.currentToken.forceQuirks = true; + this._emitCurrentToken(); + this.state = DATA_STATE; + } else if (cp === $.EOF) { + this._err(ERR.eofInDoctype); + this.currentToken.forceQuirks = true; + this._emitCurrentToken(); + this._emitEOFToken(); + } else { + this.currentToken.systemId += toChar(cp); + } + } + + // DOCTYPE system identifier (single-quoted) state + //------------------------------------------------------------------ + [DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE](cp) { + if (cp === $.APOSTROPHE) { + this.state = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE; + } else if (cp === $.NULL) { + this._err(ERR.unexpectedNullCharacter); + this.currentToken.systemId += unicode.REPLACEMENT_CHARACTER; + } else if (cp === $.GREATER_THAN_SIGN) { + this._err(ERR.abruptDoctypeSystemIdentifier); + this.currentToken.forceQuirks = true; + this._emitCurrentToken(); + this.state = DATA_STATE; + } else if (cp === $.EOF) { + this._err(ERR.eofInDoctype); + this.currentToken.forceQuirks = true; + this._emitCurrentToken(); + this._emitEOFToken(); + } else { + this.currentToken.systemId += toChar(cp); + } + } + + // After DOCTYPE system identifier state + //------------------------------------------------------------------ + [AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE](cp) { + if (isWhitespace(cp)) { + return; + } + + if (cp === $.GREATER_THAN_SIGN) { + this._emitCurrentToken(); + this.state = DATA_STATE; + } else if (cp === $.EOF) { + this._err(ERR.eofInDoctype); + this.currentToken.forceQuirks = true; + this._emitCurrentToken(); + this._emitEOFToken(); + } else { + this._err(ERR.unexpectedCharacterAfterDoctypeSystemIdentifier); + this._reconsumeInState(BOGUS_DOCTYPE_STATE); + } + } + + // Bogus DOCTYPE state + //------------------------------------------------------------------ + [BOGUS_DOCTYPE_STATE](cp) { + if (cp === $.GREATER_THAN_SIGN) { + this._emitCurrentToken(); + this.state = DATA_STATE; + } else if (cp === $.NULL) { + this._err(ERR.unexpectedNullCharacter); + } else if (cp === $.EOF) { + this._emitCurrentToken(); + this._emitEOFToken(); + } + } + + // CDATA section state + //------------------------------------------------------------------ + [CDATA_SECTION_STATE](cp) { + if (cp === $.RIGHT_SQUARE_BRACKET) { + this.state = CDATA_SECTION_BRACKET_STATE; + } else if (cp === $.EOF) { + this._err(ERR.eofInCdata); + this._emitEOFToken(); + } else { + this._emitCodePoint(cp); + } + } + + // CDATA section bracket state + //------------------------------------------------------------------ + [CDATA_SECTION_BRACKET_STATE](cp) { + if (cp === $.RIGHT_SQUARE_BRACKET) { + this.state = CDATA_SECTION_END_STATE; + } else { + this._emitChars(']'); + this._reconsumeInState(CDATA_SECTION_STATE); + } + } + + // CDATA section end state + //------------------------------------------------------------------ + [CDATA_SECTION_END_STATE](cp) { + if (cp === $.GREATER_THAN_SIGN) { + this.state = DATA_STATE; + } else if (cp === $.RIGHT_SQUARE_BRACKET) { + this._emitChars(']'); + } else { + this._emitChars(']]'); + this._reconsumeInState(CDATA_SECTION_STATE); + } + } + + // Character reference state + //------------------------------------------------------------------ + [CHARACTER_REFERENCE_STATE](cp) { + this.tempBuff = [$.AMPERSAND]; + + if (cp === $.NUMBER_SIGN) { + this.tempBuff.push(cp); + this.state = NUMERIC_CHARACTER_REFERENCE_STATE; + } else if (isAsciiAlphaNumeric(cp)) { + this._reconsumeInState(NAMED_CHARACTER_REFERENCE_STATE); + } else { + this._flushCodePointsConsumedAsCharacterReference(); + this._reconsumeInState(this.returnState); + } + } + + // Named character reference state + //------------------------------------------------------------------ + [NAMED_CHARACTER_REFERENCE_STATE](cp) { + const matchResult = this._matchNamedCharacterReference(cp); + + //NOTE: matching can be abrupted by hibernation. In that case match + //results are no longer valid and we will need to start over. + if (this._ensureHibernation()) { + this.tempBuff = [$.AMPERSAND]; + } else if (matchResult) { + const withSemicolon = this.tempBuff[this.tempBuff.length - 1] === $.SEMICOLON; + + if (!this._isCharacterReferenceAttributeQuirk(withSemicolon)) { + if (!withSemicolon) { + this._errOnNextCodePoint(ERR.missingSemicolonAfterCharacterReference); + } + + this.tempBuff = matchResult; + } + + this._flushCodePointsConsumedAsCharacterReference(); + this.state = this.returnState; + } else { + this._flushCodePointsConsumedAsCharacterReference(); + this.state = AMBIGUOUS_AMPERSAND_STATE; + } + } + + // Ambiguos ampersand state + //------------------------------------------------------------------ + [AMBIGUOUS_AMPERSAND_STATE](cp) { + if (isAsciiAlphaNumeric(cp)) { + if (this._isCharacterReferenceInAttribute()) { + this.currentAttr.value += toChar(cp); + } else { + this._emitCodePoint(cp); + } + } else { + if (cp === $.SEMICOLON) { + this._err(ERR.unknownNamedCharacterReference); + } + + this._reconsumeInState(this.returnState); + } + } + + // Numeric character reference state + //------------------------------------------------------------------ + [NUMERIC_CHARACTER_REFERENCE_STATE](cp) { + this.charRefCode = 0; + + if (cp === $.LATIN_SMALL_X || cp === $.LATIN_CAPITAL_X) { + this.tempBuff.push(cp); + this.state = HEXADEMICAL_CHARACTER_REFERENCE_START_STATE; + } else { + this._reconsumeInState(DECIMAL_CHARACTER_REFERENCE_START_STATE); + } + } + + // Hexademical character reference start state + //------------------------------------------------------------------ + [HEXADEMICAL_CHARACTER_REFERENCE_START_STATE](cp) { + if (isAsciiHexDigit(cp)) { + this._reconsumeInState(HEXADEMICAL_CHARACTER_REFERENCE_STATE); + } else { + this._err(ERR.absenceOfDigitsInNumericCharacterReference); + this._flushCodePointsConsumedAsCharacterReference(); + this._reconsumeInState(this.returnState); + } + } + + // Decimal character reference start state + //------------------------------------------------------------------ + [DECIMAL_CHARACTER_REFERENCE_START_STATE](cp) { + if (isAsciiDigit(cp)) { + this._reconsumeInState(DECIMAL_CHARACTER_REFERENCE_STATE); + } else { + this._err(ERR.absenceOfDigitsInNumericCharacterReference); + this._flushCodePointsConsumedAsCharacterReference(); + this._reconsumeInState(this.returnState); + } + } + + // Hexademical character reference state + //------------------------------------------------------------------ + [HEXADEMICAL_CHARACTER_REFERENCE_STATE](cp) { + if (isAsciiUpperHexDigit(cp)) { + this.charRefCode = this.charRefCode * 16 + cp - 0x37; + } else if (isAsciiLowerHexDigit(cp)) { + this.charRefCode = this.charRefCode * 16 + cp - 0x57; + } else if (isAsciiDigit(cp)) { + this.charRefCode = this.charRefCode * 16 + cp - 0x30; + } else if (cp === $.SEMICOLON) { + this.state = NUMERIC_CHARACTER_REFERENCE_END_STATE; + } else { + this._err(ERR.missingSemicolonAfterCharacterReference); + this._reconsumeInState(NUMERIC_CHARACTER_REFERENCE_END_STATE); + } + } + + // Decimal character reference state + //------------------------------------------------------------------ + [DECIMAL_CHARACTER_REFERENCE_STATE](cp) { + if (isAsciiDigit(cp)) { + this.charRefCode = this.charRefCode * 10 + cp - 0x30; + } else if (cp === $.SEMICOLON) { + this.state = NUMERIC_CHARACTER_REFERENCE_END_STATE; + } else { + this._err(ERR.missingSemicolonAfterCharacterReference); + this._reconsumeInState(NUMERIC_CHARACTER_REFERENCE_END_STATE); + } + } + + // Numeric character reference end state + //------------------------------------------------------------------ + [NUMERIC_CHARACTER_REFERENCE_END_STATE]() { + if (this.charRefCode === $.NULL) { + this._err(ERR.nullCharacterReference); + this.charRefCode = $.REPLACEMENT_CHARACTER; + } else if (this.charRefCode > 0x10ffff) { + this._err(ERR.characterReferenceOutsideUnicodeRange); + this.charRefCode = $.REPLACEMENT_CHARACTER; + } else if (unicode.isSurrogate(this.charRefCode)) { + this._err(ERR.surrogateCharacterReference); + this.charRefCode = $.REPLACEMENT_CHARACTER; + } else if (unicode.isUndefinedCodePoint(this.charRefCode)) { + this._err(ERR.noncharacterCharacterReference); + } else if (unicode.isControlCodePoint(this.charRefCode) || this.charRefCode === $.CARRIAGE_RETURN) { + this._err(ERR.controlCharacterReference); + + const replacement = C1_CONTROLS_REFERENCE_REPLACEMENTS[this.charRefCode]; + + if (replacement) { + this.charRefCode = replacement; + } + } + + this.tempBuff = [this.charRefCode]; + + this._flushCodePointsConsumedAsCharacterReference(); + this._reconsumeInState(this.returnState); + } +} + +//Token types +Tokenizer.CHARACTER_TOKEN = 'CHARACTER_TOKEN'; +Tokenizer.NULL_CHARACTER_TOKEN = 'NULL_CHARACTER_TOKEN'; +Tokenizer.WHITESPACE_CHARACTER_TOKEN = 'WHITESPACE_CHARACTER_TOKEN'; +Tokenizer.START_TAG_TOKEN = 'START_TAG_TOKEN'; +Tokenizer.END_TAG_TOKEN = 'END_TAG_TOKEN'; +Tokenizer.COMMENT_TOKEN = 'COMMENT_TOKEN'; +Tokenizer.DOCTYPE_TOKEN = 'DOCTYPE_TOKEN'; +Tokenizer.EOF_TOKEN = 'EOF_TOKEN'; +Tokenizer.HIBERNATION_TOKEN = 'HIBERNATION_TOKEN'; + +//Tokenizer initial states for different modes +Tokenizer.MODE = { + DATA: DATA_STATE, + RCDATA: RCDATA_STATE, + RAWTEXT: RAWTEXT_STATE, + SCRIPT_DATA: SCRIPT_DATA_STATE, + PLAINTEXT: PLAINTEXT_STATE +}; + +//Static +Tokenizer.getTokenAttr = function(token, attrName) { + for (let i = token.attrs.length - 1; i >= 0; i--) { + if (token.attrs[i].name === attrName) { + return token.attrs[i].value; + } + } + + return null; +}; + +module.exports = Tokenizer; -- cgit v1.2.3