aboutsummaryrefslogtreecommitdiff
path: root/node_modules/html-encoding-sniffer/lib/html-encoding-sniffer.js
diff options
context:
space:
mode:
Diffstat (limited to 'node_modules/html-encoding-sniffer/lib/html-encoding-sniffer.js')
-rw-r--r--node_modules/html-encoding-sniffer/lib/html-encoding-sniffer.js295
1 files changed, 295 insertions, 0 deletions
diff --git a/node_modules/html-encoding-sniffer/lib/html-encoding-sniffer.js b/node_modules/html-encoding-sniffer/lib/html-encoding-sniffer.js
new file mode 100644
index 0000000..25b7537
--- /dev/null
+++ b/node_modules/html-encoding-sniffer/lib/html-encoding-sniffer.js
@@ -0,0 +1,295 @@
+"use strict";
+const whatwgEncoding = require("whatwg-encoding");
+
+// https://html.spec.whatwg.org/#encoding-sniffing-algorithm
+module.exports = (buffer, { transportLayerEncodingLabel, defaultEncoding = "windows-1252" } = {}) => {
+ let encoding = whatwgEncoding.getBOMEncoding(buffer); // see https://github.com/whatwg/html/issues/1910
+
+ if (encoding === null && transportLayerEncodingLabel !== undefined) {
+ encoding = whatwgEncoding.labelToName(transportLayerEncodingLabel);
+ }
+
+ if (encoding === null) {
+ encoding = prescanMetaCharset(buffer);
+ }
+
+ if (encoding === null) {
+ encoding = defaultEncoding;
+ }
+
+ return encoding;
+};
+
/**
 * Tells whether a byte is an ASCII letter (A-Z or a-z).
 * @param {number|undefined} byte
 * @returns {boolean}
 */
function isAsciiLetter(byte) {
  return (byte >= 0x41 && byte <= 0x5A) || (byte >= 0x61 && byte <= 0x7A);
}

/**
 * Reads the attributes of a `<meta` element and works out the encoding it
 * declares, if any.
 *
 * @param {ArrayLike<number>} buffer - The bytes being prescanned.
 * @param {number} start - Index of the first byte after `<meta` and its
 *   delimiter.
 * @param {number} l - Exclusive upper bound of the prescan window.
 * @returns {{encoding: (string|null), i: number}} `encoding` is the declared
 *   encoding or `null` when this element does not declare a usable one; `i`
 *   is the position reported by the last `getAttribute` call.
 */
function prescanMetaElement(buffer, start, l) {
  const seenNames = new Set();
  let gotPragma = false;
  let needPragma = null;
  let charset = null;
  let position = start;

  for (;;) {
    const result = getAttribute(buffer, position, l);
    position = result.i;
    if (!result.attr) {
      break;
    }

    const { name, value } = result.attr;
    // Only the first occurrence of each attribute name counts.
    if (seenNames.has(name)) {
      continue;
    }
    seenNames.add(name);

    if (name === "http-equiv") {
      gotPragma = value === "content-type";
    } else if (name === "content" && charset === null) {
      charset = extractCharacterEncodingFromMeta(value);
      if (charset !== null) {
        needPragma = true;
      }
    } else if (name === "charset") {
      charset = whatwgEncoding.labelToName(value);
      needPragma = false;
    }
  }

  // needPragma === null: neither `charset` nor a usable `content` was seen.
  // needPragma === true: the encoding came from `content`, which only counts
  // together with http-equiv="content-type".
  const declaresEncoding =
    needPragma !== null &&
    !(needPragma === true && gotPragma === false) &&
    charset !== null;
  if (!declaresEncoding) {
    return { encoding: null, i: position };
  }

  // A document being read byte-wise as ASCII cannot really be UTF-16.
  if (charset === "UTF-16LE" || charset === "UTF-16BE") {
    charset = "UTF-8";
  }
  if (charset === "x-user-defined") {
    charset = "windows-1252";
  }
  return { encoding: charset, i: position };
}

/**
 * Prescans the first 1024 bytes of a byte stream for a `<meta>` element that
 * declares a character encoding.
 *
 * Comments, other tags (start and end tags, including their attributes) and
 * `<!`, `</`, `<?` constructs are skipped so that text inside them is not
 * mistaken for a `<meta>` element.
 *
 * @see https://html.spec.whatwg.org/multipage/syntax.html#prescan-a-byte-stream-to-determine-its-encoding
 * @param {ArrayLike<number>} buffer - The bytes of the document.
 * @returns {string|null} The declared encoding, or `null` if none was found.
 */
function prescanMetaCharset(buffer) {
  const l = Math.min(buffer.length, 1024);

  for (let i = 0; i < l; i++) {
    // Only "<" can start anything of interest.
    if (buffer[i] !== 0x3C) {
      continue;
    }

    const c1 = buffer[i + 1];
    const c2 = buffer[i + 2];
    const c3 = buffer[i + 3];
    const c4 = buffer[i + 4];
    const c5 = buffer[i + 5];

    if (c1 === 0x21 && c2 === 0x2D && c3 === 0x2D) {
      // "<!--": skip to the ">" of the first "-->". Starting right after
      // "<!--" lets "<!-->" and "<!--->" close the comment too.
      for (i += 4; i < l; i++) {
        if (buffer[i] === 0x3E && buffer[i - 1] === 0x2D && buffer[i - 2] === 0x2D) {
          break;
        }
      }
    } else if ((c1 === 0x4D || c1 === 0x6D) &&
               (c2 === 0x45 || c2 === 0x65) &&
               (c3 === 0x54 || c3 === 0x74) &&
               (c4 === 0x41 || c4 === 0x61) &&
               (isSpaceCharacter(c5) || c5 === 0x2F)) {
      // "<meta" (any case) followed by whitespace or "/".
      const meta = prescanMetaElement(buffer, i + 6, l);
      if (meta.encoding !== null) {
        return meta.encoding;
      }
      i = meta.i;
    } else if (isAsciiLetter(c1) || (c1 === 0x2F && isAsciiLetter(c2))) {
      // A start tag "<x" or an end tag "</x": skip the tag name, then consume
      // its attributes so a ">" or "<meta" inside a quoted value is ignored.
      i += c1 === 0x2F ? 3 : 2;
      while (i < l && !(isSpaceCharacter(buffer[i]) || buffer[i] === 0x3E)) {
        i++;
      }
      let attrRes;
      do {
        attrRes = getAttribute(buffer, i, l);
        i = attrRes.i;
      } while (attrRes.attr);
    } else if (c1 === 0x21 || c1 === 0x2F || c1 === 0x3F) {
      // "<!", "</" (not followed by a letter) or "<?": skip to the next ">".
      i += 2;
      while (i < l && buffer[i] !== 0x3E) {
        i++;
      }
    }
  }

  return null;
}
+
/**
 * Converts a byte to a one-character string, mapping A-Z to a-z.
 * @param {number} byte
 * @returns {string}
 */
function byteToLowerAsciiChar(byte) {
  return String.fromCharCode(byte >= 0x41 && byte <= 0x5A ? byte + 0x20 : byte);
}

/**
 * Reads one attribute from inside a tag while prescanning for an encoding.
 *
 * Names and values are returned with ASCII upper-case letters lower-cased.
 * Bytes at index `l` and beyond are never examined: when the window ends
 * before an attribute is complete (in the name, right after "=", or inside
 * an unterminated quoted or unquoted value), no attribute is reported.
 *
 * @see https://html.spec.whatwg.org/multipage/syntax.html#concept-get-attributes-when-sniffing
 * @param {ArrayLike<number>} buffer - The bytes being prescanned.
 * @param {number} i - Index at which to start looking.
 * @param {number} l - Exclusive upper bound of the prescan window.
 * @returns {{attr: {name: string, value: string}, i: number}|{i: number}}
 *   With `attr`: the attribute found and the index at which to resume.
 *   Without `attr`: no (further) attribute; `i` is the index of the closing
 *   ">" or a value of at least `l` when the window was exhausted.
 */
function getAttribute(buffer, i, l) {
  // Leading whitespace and "/" never start an attribute.
  while (i < l && (isSpaceCharacter(buffer[i]) || buffer[i] === 0x2F)) {
    i++;
  }
  // End of the window, or ">" closing the tag: nothing to read.
  if (i >= l || buffer[i] === 0x3E) {
    return { i };
  }

  let name = "";
  let value = "";

  // --- attribute name ---
  while (i < l) {
    const c = buffer[i];

    // "=" ends the name, except as its very first byte.
    if (c === 0x3D && name !== "") {
      i++;
      break;
    }

    if (isSpaceCharacter(c)) {
      // Whitespace after the name: a value follows only if "=" comes next.
      do {
        i++;
      } while (i < l && isSpaceCharacter(buffer[i]));
      if (i >= l) {
        return { i };
      }
      if (buffer[i] !== 0x3D) {
        return { attr: { name, value }, i };
      }
      i++;
      break;
    }

    // "/" or ">" ends an attribute that has no value.
    if (c === 0x2F || c === 0x3E) {
      return { attr: { name, value }, i };
    }

    name += byteToLowerAsciiChar(c);
    i++;
  }

  // --- attribute value ---
  while (i < l && isSpaceCharacter(buffer[i])) {
    i++;
  }
  if (i >= l) {
    return { i };
  }

  let c = buffer[i];

  // Quoted value: everything up to the matching quote.
  if (c === 0x22 || c === 0x27) {
    const quote = c;
    for (i++; i < l; i++) {
      c = buffer[i];
      if (c === quote) {
        return { attr: { name, value }, i: i + 1 };
      }
      value += byteToLowerAsciiChar(c);
    }
    // The closing quote is not inside the window.
    return { i };
  }

  // ">" directly after "=": empty value.
  if (c === 0x3E) {
    return { attr: { name, value }, i };
  }

  // Unquoted value: runs until whitespace or ">".
  value += byteToLowerAsciiChar(c);
  for (i++; i < l; i++) {
    c = buffer[i];
    if (isSpaceCharacter(c) || c === 0x3E) {
      return { attr: { name, value }, i };
    }
    value += byteToLowerAsciiChar(c);
  }

  return { i };
}
+
/**
 * Extracts the character encoding named in the value of a
 * `<meta content="...">` attribute, e.g. `text/html; charset=utf-8`.
 *
 * The first "charset" (any case) that is followed by optional whitespace and
 * "=" is used; occurrences not followed by "=" are skipped. The label after
 * "=" may be wrapped in matching single or double quotes; otherwise it runs
 * up to the next ASCII whitespace or ";".
 *
 * @param {string} string - The attribute value to parse.
 * @returns {string|null} The result of `whatwgEncoding.labelToName` for the
 *   label found, or `null` when there is no "charset=", the label is missing
 *   or only one character long, or an opening quote is never closed.
 */
function extractCharacterEncodingFromMeta(string) {
  const keyword = "charset";
  let position = 0;

  for (;;) {
    const indexOfCharset = string.substring(position).search(/charset/i);
    if (indexOfCharset === -1) {
      return null;
    }

    let subPosition = position + indexOfCharset + keyword.length;

    // charCodeAt() yields NaN past the end of the string, so these loops stop
    // there instead of throwing on `undefined`.
    while (isSpaceCharacter(string.charCodeAt(subPosition))) {
      ++subPosition;
    }

    if (string[subPosition] !== "=") {
      // Not "charset=": resume searching just before the current position.
      position = subPosition - 1;
      continue;
    }

    ++subPosition;
    while (isSpaceCharacter(string.charCodeAt(subPosition))) {
      ++subPosition;
    }

    position = subPosition;
    break;
  }

  const first = string[position];
  if (first === "\"" || first === "'") {
    const closingQuote = string.indexOf(first, position + 1);
    // An unmatched quotation mark means there is no encoding.
    return closingQuote === -1 ?
      null :
      whatwgEncoding.labelToName(string.substring(position + 1, closingQuote));
  }

  // Nothing, or only a single character, is left after "=".
  if (position + 1 >= string.length) {
    return null;
  }

  // The byte at `position` is always part of the label, so the terminator is
  // searched for from the following one.
  const terminatorOffset = string.substring(position + 1).search(/[\t\n\f\r ;]/);
  const end = terminatorOffset === -1 ? string.length : position + 1 + terminatorOffset;

  return whatwgEncoding.labelToName(string.substring(position, end));
}
+
/**
 * Tells whether a value is the code of one of the five whitespace characters
 * recognised here: tab, line feed, form feed, carriage return or space.
 *
 * @param {number} c - Byte or character code to test.
 * @returns {boolean} `true` for 0x09, 0x0A, 0x0C, 0x0D and 0x20, otherwise
 *   `false`.
 */
function isSpaceCharacter(c) {
  switch (c) {
    case 0x09: // tab
    case 0x0A: // line feed
    case 0x0C: // form feed
    case 0x0D: // carriage return
    case 0x20: // space
      return true;
    default:
      return false;
  }
}