From 268186c27d436dd4fe6a330af8790ceeaeb6492c Mon Sep 17 00:00:00 2001 From: Carter Snook Date: Tue, 20 Jul 2021 14:18:42 -0500 Subject: [PATCH] fix(parser): html entities evaluated (#738) --- packages/astro-parser/src/parse/state/tag.ts | 4 +- packages/astro-parser/src/parse/state/text.ts | 3 +- packages/astro-parser/src/parse/utils/html.ts | 85 +------------------ .../snowpack.config.json | 3 + .../src/pages/index.astro | 11 +++ .../test/html-encoded-characters.test.js | 23 +++++ 6 files changed, 41 insertions(+), 88 deletions(-) create mode 100644 packages/astro/test/fixtures/html-encoded-characters/snowpack.config.json create mode 100644 packages/astro/test/fixtures/html-encoded-characters/src/pages/index.astro create mode 100644 packages/astro/test/html-encoded-characters.test.js diff --git a/packages/astro-parser/src/parse/state/tag.ts b/packages/astro-parser/src/parse/state/tag.ts index 70fa9e3619..f3d30b06d4 100644 --- a/packages/astro-parser/src/parse/state/tag.ts +++ b/packages/astro-parser/src/parse/state/tag.ts @@ -2,7 +2,7 @@ import read_expression from '../read/expression.js'; import read_style from '../read/style.js'; -import { decode_character_references, closing_tag_omitted } from '../utils/html.js'; +import { closing_tag_omitted } from '../utils/html.js'; import { is_void } from '../../utils/names.js'; import { Parser } from '../index.js'; import { Directive, DirectiveType, TemplateNode, Text } from '../../interfaces.js'; @@ -533,7 +533,7 @@ export function read_sequence(parser: Parser, done: () => boolean): TemplateNode function flush() { if (current_chunk.raw) { - current_chunk.data = decode_character_references(current_chunk.raw); + current_chunk.data = current_chunk.raw; current_chunk.end = parser.index; chunks.push(current_chunk); } diff --git a/packages/astro-parser/src/parse/state/text.ts b/packages/astro-parser/src/parse/state/text.ts index 020d066fd4..dec284ae49 100644 --- a/packages/astro-parser/src/parse/state/text.ts +++ 
b/packages/astro-parser/src/parse/state/text.ts @@ -1,6 +1,5 @@ // @ts-nocheck -import { decode_character_references } from '../utils/html.js'; import { Parser } from '../index.js'; export default function text(parser: Parser) { @@ -25,7 +24,7 @@ export default function text(parser: Parser) { end: parser.index, type: 'Text', raw: data, - data: decode_character_references(data), + data, }; parser.current().children.push(node); diff --git a/packages/astro-parser/src/parse/utils/html.ts b/packages/astro-parser/src/parse/utils/html.ts index e4669a2dbf..9988174f30 100644 --- a/packages/astro-parser/src/parse/utils/html.ts +++ b/packages/astro-parser/src/parse/utils/html.ts @@ -1,86 +1,3 @@ -// @ts-nocheck - -import entities from './entities.js'; - -const windows_1252 = [ - 8364, 129, 8218, 402, 8222, 8230, 8224, 8225, 710, 8240, 352, 8249, 338, 141, 381, 143, 144, 8216, 8217, 8220, 8221, 8226, 8211, 8212, 732, 8482, 353, 8250, 339, 157, 382, 376, -]; - -const entity_pattern = new RegExp(`&(#?(?:x[\\w\\d]+|\\d+|${Object.keys(entities).join('|')}))(?:;|\\b)`, 'g'); - -export function decode_character_references(html: string) { - return html.replace(entity_pattern, (match, entity) => { - let code; - - // Handle named entities - if (entity[0] !== '#') { - code = entities[entity]; - } else if (entity[1] === 'x') { - code = parseInt(entity.substring(2), 16); - } else { - code = parseInt(entity.substring(1), 10); - } - - if (!code) { - return match; - } - - return String.fromCodePoint(validate_code(code)); - }); -} - -const NUL = 0; - -// some code points are verboten. 
If we were inserting HTML, the browser would replace the illegal -// code points with alternatives in some cases - since we're bypassing that mechanism, we need -// to replace them ourselves -// -// Source: http://en.wikipedia.org/wiki/Character_encodings_in_HTML#Illegal_characters -function validate_code(code: number) { - // line feed becomes generic whitespace - if (code === 10) { - return 32; - } - - // ASCII range. (Why someone would use HTML entities for ASCII characters I don't know, but...) - if (code < 128) { - return code; - } - - // code points 128-159 are dealt with leniently by browsers, but they're incorrect. We need - // to correct the mistake or we'll end up with missing € signs and so on - if (code <= 159) { - return windows_1252[code - 128]; - } - - // basic multilingual plane - if (code < 55296) { - return code; - } - - // UTF-16 surrogate halves - if (code <= 57343) { - return NUL; - } - - // rest of the basic multilingual plane - if (code <= 65535) { - return code; - } - - // supplementary multilingual plane 0x10000 - 0x1ffff - if (code >= 65536 && code <= 131071) { - return code; - } - - // supplementary ideographic plane 0x20000 - 0x2ffff - if (code >= 131072 && code <= 196607) { - return code; - } - - return NUL; -} - // based on http://developers.whatwg.org/syntax.html#syntax-tag-omission const disallowed_contents = new Map([ ['li', new Set(['li'])], @@ -103,7 +20,7 @@ const disallowed_contents = new Map([ // close it, like `
<li>one<li>two`? export function closing_tag_omitted(current: string, next?: string) { if (disallowed_contents.has(current)) { - if (!next || disallowed_contents.get(current).has(next)) { + if (!next || disallowed_contents.get(current)!.has(next)) { return true; } } diff --git a/packages/astro/test/fixtures/html-encoded-characters/snowpack.config.json b/packages/astro/test/fixtures/html-encoded-characters/snowpack.config.json new file mode 100644 index 0000000000..8f034781d8 --- /dev/null +++ b/packages/astro/test/fixtures/html-encoded-characters/snowpack.config.json @@ -0,0 +1,3 @@ +{ + "workspaceRoot": "../../../../../" +} diff --git a/packages/astro/test/fixtures/html-encoded-characters/src/pages/index.astro b/packages/astro/test/fixtures/html-encoded-characters/src/pages/index.astro new file mode 100644 index 0000000000..a174c3491b --- /dev/null +++ b/packages/astro/test/fixtures/html-encoded-characters/src/pages/index.astro @@ -0,0 +1,11 @@ +--- +--- + +HTML Encoded Characters + +

       Hello, world;

    +
    +

    Nested elements? No problem. 

    +
    + + diff --git a/packages/astro/test/html-encoded-characters.test.js b/packages/astro/test/html-encoded-characters.test.js new file mode 100644 index 0000000000..e12656a3c1 --- /dev/null +++ b/packages/astro/test/html-encoded-characters.test.js @@ -0,0 +1,23 @@ +import { suite } from 'uvu'; +import * as assert from 'uvu/assert'; +import { doc } from './test-utils.js'; +import { setup } from './helpers.js'; + +const HtmlEncodedChars = suite('HTML Encoded Characters'); + +setup(HtmlEncodedChars, './fixtures/html-encoded-characters'); + +HtmlEncodedChars("doesn't decode html entities", async ({ runtime }) => { + const result = await runtime.load('/'); + if (result.error) throw new Error(result.error); + + const $ = doc(result.contents); + // Note: although this may look like it's incorrectly decoding the chars, + // Cheerio is showing how the browsers _should_ interpret the HTML. If it + // wasn't working correctly, then the spaces would have been trimmed to a + // single space. + assert.equal($('h1').html(), '   Hello, world;'); + assert.equal($('div p').html(), 'Nested elements? No problem. '); +}); + +HtmlEncodedChars.run();