diff options
Diffstat (limited to 'node_modules/html-encoding-sniffer/lib/html-encoding-sniffer.js')
-rw-r--r-- | node_modules/html-encoding-sniffer/lib/html-encoding-sniffer.js | 295 |
1 files changed, 295 insertions, 0 deletions
diff --git a/node_modules/html-encoding-sniffer/lib/html-encoding-sniffer.js b/node_modules/html-encoding-sniffer/lib/html-encoding-sniffer.js new file mode 100644 index 0000000..25b7537 --- /dev/null +++ b/node_modules/html-encoding-sniffer/lib/html-encoding-sniffer.js @@ -0,0 +1,295 @@ +"use strict"; +const whatwgEncoding = require("whatwg-encoding"); + +// https://html.spec.whatwg.org/#encoding-sniffing-algorithm +module.exports = (buffer, { transportLayerEncodingLabel, defaultEncoding = "windows-1252" } = {}) => { + let encoding = whatwgEncoding.getBOMEncoding(buffer); // see https://github.com/whatwg/html/issues/1910 + + if (encoding === null && transportLayerEncodingLabel !== undefined) { + encoding = whatwgEncoding.labelToName(transportLayerEncodingLabel); + } + + if (encoding === null) { + encoding = prescanMetaCharset(buffer); + } + + if (encoding === null) { + encoding = defaultEncoding; + } + + return encoding; +}; + +// https://html.spec.whatwg.org/multipage/syntax.html#prescan-a-byte-stream-to-determine-its-encoding +function prescanMetaCharset(buffer) { + const l = Math.min(buffer.length, 1024); + for (let i = 0; i < l; i++) { + let c = buffer[i]; + if (c === 0x3C) { + // "<" + const c1 = buffer[i + 1]; + const c2 = buffer[i + 2]; + const c3 = buffer[i + 3]; + const c4 = buffer[i + 4]; + const c5 = buffer[i + 5]; + // !-- (comment start) + if (c1 === 0x21 && c2 === 0x2D && c3 === 0x2D) { + i += 4; + for (; i < l; i++) { + c = buffer[i]; + const cMinus1 = buffer[i - 1]; + const cMinus2 = buffer[i - 2]; + // --> (comment end) + if (c === 0x3E && cMinus1 === 0x2D && cMinus2 === 0x2D) { + break; + } + } + } else if ((c1 === 0x4D || c1 === 0x6D) && + (c2 === 0x45 || c2 === 0x65) && + (c3 === 0x54 || c3 === 0x74) && + (c4 === 0x41 || c4 === 0x61) && + (isSpaceCharacter(c5) || c5 === 0x2F)) { + // "meta" + space or / + i += 6; + const attributeList = new Set(); + let gotPragma = false; + let needPragma = null; + let charset = null; + + let attrRes; + do { + attrRes = getAttribute(buffer, i, l); + if (attrRes.attr && !attributeList.has(attrRes.attr.name)) { + attributeList.add(attrRes.attr.name); + if (attrRes.attr.name === "http-equiv") { + gotPragma = attrRes.attr.value === "content-type"; + } else if (attrRes.attr.name === "content" && !charset) { + charset = extractCharacterEncodingFromMeta(attrRes.attr.value); + if (charset !== null) { + needPragma = true; + } + } else if (attrRes.attr.name === "charset") { + charset = whatwgEncoding.labelToName(attrRes.attr.value); + needPragma = false; + } + } + i = attrRes.i; + } while (attrRes.attr); + + if (needPragma === null) { + continue; + } + if (needPragma === true && gotPragma === false) { + continue; + } + if (charset === null) { + continue; + } + + if (charset === "UTF-16LE" || charset === "UTF-16BE") { + charset = "UTF-8"; + } + if (charset === "x-user-defined") { + charset = "windows-1252"; + } + + return charset; + } else if ((c1 >= 0x41 && c1 <= 0x5A) || (c1 >= 0x61 && c1 <= 0x7A)) { + // a-z or A-Z + for (i += 2; i < l; i++) { + c = buffer[i]; + // space or > + if (isSpaceCharacter(c) || c === 0x3E) { + break; + } + } + let attrRes; + do { + attrRes = getAttribute(buffer, i, l); + i = attrRes.i; + } while (attrRes.attr); + } else if (c1 === 0x21 || c1 === 0x2F || c1 === 0x3F) { + // ! or / or ? + for (i += 2; i < l; i++) { + c = buffer[i]; + // > + if (c === 0x3E) { + break; + } + } + } + } + } + return null; +} + +// https://html.spec.whatwg.org/multipage/syntax.html#concept-get-attributes-when-sniffing +function getAttribute(buffer, i, l) { + for (; i < l; i++) { + let c = buffer[i]; + // space or / + if (isSpaceCharacter(c) || c === 0x2F) { + continue; + } + // ">" + if (c === 0x3E) { + break; + } + let name = ""; + let value = ""; + nameLoop:for (; i < l; i++) { + c = buffer[i]; + // "=" + if (c === 0x3D && name !== "") { + i++; + break; + } + // space + if (isSpaceCharacter(c)) { + for (i++; i < l; i++) { + c = buffer[i]; + // space + if (isSpaceCharacter(c)) { + continue; + } + // not "=" + if (c !== 0x3D) { + return { attr: { name, value }, i }; + } + + i++; + break nameLoop; + } + break; + } + // / or > + if (c === 0x2F || c === 0x3E) { + return { attr: { name, value }, i }; + } + // A-Z + if (c >= 0x41 && c <= 0x5A) { + name += String.fromCharCode(c + 0x20); // lowercase + } else { + name += String.fromCharCode(c); + } + } + c = buffer[i]; + // space + if (isSpaceCharacter(c)) { + for (i++; i < l; i++) { + c = buffer[i]; + // space + if (isSpaceCharacter(c)) { + continue; + } else { + break; + } + } + } + // " or ' + if (c === 0x22 || c === 0x27) { + const quote = c; + for (i++; i < l; i++) { + c = buffer[i]; + + if (c === quote) { + i++; + return { attr: { name, value }, i }; + } + + // A-Z + if (c >= 0x41 && c <= 0x5A) { + value += String.fromCharCode(c + 0x20); // lowercase + } else { + value += String.fromCharCode(c); + } + } + } + + // > + if (c === 0x3E) { + return { attr: { name, value }, i }; + } + + // A-Z + if (c >= 0x41 && c <= 0x5A) { + value += String.fromCharCode(c + 0x20); // lowercase + } else { + value += String.fromCharCode(c); + } + + for (i++; i < l; i++) { + c = buffer[i]; + + // space or > + if (isSpaceCharacter(c) || c === 0x3E) { + return { attr: { name, value }, i }; + } + + // A-Z + if (c >= 0x41 && c <= 0x5A) { + value += String.fromCharCode(c + 0x20); // lowercase + } else { + value += String.fromCharCode(c); + } + } + } + return { i }; +} + +function extractCharacterEncodingFromMeta(string) { + let position = 0; + + while (true) { + const indexOfCharset = string.substring(position).search(/charset/i); + + if (indexOfCharset === -1) { + return null; + } + let subPosition = position + indexOfCharset + "charset".length; + + while (isSpaceCharacter(string[subPosition].charCodeAt(0))) { + ++subPosition; + } + + if (string[subPosition] !== "=") { + position = subPosition - 1; + continue; + } + + ++subPosition; + + while (isSpaceCharacter(string[subPosition].charCodeAt(0))) { + ++subPosition; + } + + position = subPosition; + break; + } + + if (string[position] === "\"" || string[position] === "'") { + const nextIndex = string.indexOf(string[position], position + 1); + + if (nextIndex !== -1) { + return whatwgEncoding.labelToName(string.substring(position + 1, nextIndex)); + } + + // It is an unmatched quotation mark + return null; + } + + if (string.length === position + 1) { + return null; + } + + const indexOfASCIIWhitespaceOrSemicolon = string.substring(position + 1).search(/\x09|\x0A|\x0C|\x0D|\x20|;/); + const end = indexOfASCIIWhitespaceOrSemicolon === -1 ? + string.length : + position + indexOfASCIIWhitespaceOrSemicolon + 1; + + return whatwgEncoding.labelToName(string.substring(position, end)); +} + +function isSpaceCharacter(c) { + return c === 0x09 || c === 0x0A || c === 0x0C || c === 0x0D || c === 0x20; +} |