feat(compiler): Parse and recover on incomplete opening HTML tags (#38681)

Let's say we have code like

```html
<div<span>123</span>
```

Currently this gets parsed into a tree with the element tag `div<span`.
This has at least two downsides:

- An incorrect diagnostic that `</span>` doesn't close an element is
  emitted.
- A consumer of the parse tree using it for editor services is unable to
  provide correct completions for the opening `<span>` tag.

This patch attempts to fix both issues by instead parsing the code into
the same tree that would be parsed for `<div></div><span>123</span>`.

In particular, we do this by optimistically scanning an open tag as
usual, but if we do not notice a terminating '>', we mark the tag as
"incomplete". A parser then emits an error for the incomplete tag and
adds a synthetic (recovered) element node to the tree with the
incomplete open tag's name.

What's the downside of this? For one, a breaking change.

<ol>
<li>

The first breaking change is that `<` symbols that are ambiguously text
or opening tags will be parsed as opening tags instead of text in
element bodies. Take the code

```html
<p>a<b</p>
```

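Previously, `a<b` was lexed as a single text token. With this patch, `a`
is lexed as text and `<b` as an incomplete opening tag for `b`.

Clearly we cannot have the best of both worlds, and this patch chooses
to swap the parsing strategy to support the new feature. Of course, `<`
can still be inserted as text via the `&lt;` entity.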

</li>
</ol>

Part of #38596

PR Close #38681
This commit is contained in:
Ayaz Hafiz
2020-09-02 10:17:01 -05:00
committed by Misko Hevery
parent 49f27e31ed
commit 6ae3b68acf
4 changed files with 153 additions and 32 deletions

View File

@ -11,7 +11,7 @@ import {HtmlParser, ParseTreeResult, TreeError} from '../../src/ml_parser/html_p
import {TokenType} from '../../src/ml_parser/lexer';
import {ParseError} from '../../src/parse_util';
import {humanizeDom, humanizeDomSourceSpans, humanizeLineColumn} from './ast_spec_utils';
import {humanizeDom, humanizeDomSourceSpans, humanizeLineColumn, humanizeNodes} from './ast_spec_utils';
{
describe('HtmlParser', () => {
@ -622,7 +622,7 @@ import {humanizeDom, humanizeDomSourceSpans, humanizeLineColumn} from './ast_spe
`{a, select, b {foo} % { bar {% bar}}`, 'TestComp', {tokenizeExpansionForms: true});
expect(humanizeErrors(p.errors)).toEqual([
[
6,
TokenType.RAW_TEXT,
'Unexpected character "EOF" (Do you have an unescaped "{" in your template? Use "{{ \'{\' }}") to escape it.)',
'0:36'
],
@ -840,14 +840,66 @@ import {humanizeDom, humanizeDomSourceSpans, humanizeLineColumn} from './ast_spe
]]);
});
it('should report subsequent open tags without proper close tag', () => {
const errors = parser.parse('<div</div>', 'TestComp').errors;
expect(errors.length).toEqual(1);
expect(humanizeErrors(errors)).toEqual([[
'div',
'Unexpected closing tag "div". It may happen when the tag has already been closed by another tag. For more info see https://www.w3.org/TR/html5/syntax.html#closing-elements-that-have-implied-end-tags',
'0:4'
]]);
describe('incomplete element tag', () => {
// The input contains two opening tags that never reach a '>': the leading `<div` and the
// `<div ` that follows `<span>`. Both are expected to surface as `div` elements whose last
// humanized field is `null`, and each is expected to produce a "not terminated" error.
it('should parse and report incomplete tags after the tag name', () => {
const {errors, rootNodes} = parser.parse('<div<span><div </span>', 'TestComp');
// NOTE(review): the humanized tuple looks like [type, name, depth, full span, start-tag span,
// end-tag span] -- confirm against humanizeNodes in ast_spec_utils.
expect(humanizeNodes(rootNodes, true)).toEqual([
[html.Element, 'div', 0, '<div', '<div', null],
[html.Element, 'span', 0, '<span><div </span>', '<span>', '</span>'],
[html.Element, 'div', 1, '<div ', '<div ', null],
]);
// One error per incomplete tag; the second `<div ` begins at character offset 10 of the input.
expect(humanizeErrors(errors)).toEqual([
['div', 'Opening tag "div" not terminated.', '0:0'],
['div', 'Opening tag "div" not terminated.', '0:10'],
]);
});
// The `div` opening tag is interrupted by `<span` right after the value-less attribute `sty`.
// The expectation is that the incomplete `div` element still carries both attributes and that
// the `span` element is parsed normally after it.
it('should parse and report incomplete tags after attribute', () => {
const {errors, rootNodes} =
parser.parse('<div class="hi" sty<span></span>', 'TestComp');
expect(humanizeNodes(rootNodes, true)).toEqual([
[html.Element, 'div', 0, '<div class="hi" sty', '<div class="hi" sty', null],
[html.Attribute, 'class', 'hi', 'class="hi"'],
// `sty` has no value in the input, so its humanized value is the empty string.
[html.Attribute, 'sty', '', 'sty'],
[html.Element, 'span', 0, '<span></span>', '<span>', '</span>'],
]);
// Only the `div` tag is reported; the `span` tag is well formed.
expect(humanizeErrors(errors)).toEqual([
['div', 'Opening tag "div" not terminated.', '0:0'],
]);
});
// A double quote appears where an attribute name would be expected in the `div` opening tag.
// The expectation is an incomplete `div` element, the stray quote as a text node, and then a
// normally parsed `span` element.
it('should parse and report incomplete tags after quote', () => {
const {errors, rootNodes} = parser.parse('<div "<span></span>', 'TestComp');
expect(humanizeNodes(rootNodes, true)).toEqual([
[html.Element, 'div', 0, '<div ', '<div ', null],
[html.Text, '"', 0, '"'],
[html.Element, 'span', 0, '<span></span>', '<span>', '</span>'],
]);
// Only the `div` tag is reported as not terminated.
expect(humanizeErrors(errors)).toEqual([
['div', 'Opening tag "div" not terminated.', '0:0'],
]);
});
// `<div` is never terminated before `</div>` begins, so two errors are expected: one for the
// incomplete opening tag at 0:0 and one for the closing tag at 0:4.
it('should report subsequent open tags without proper close tag', () => {
const errors = parser.parse('<div</div>', 'TestComp').errors;
expect(errors.length).toEqual(2);
expect(humanizeErrors(errors)).toEqual([
['div', 'Opening tag "div" not terminated.', '0:0'],
// TODO(ayazhafiz): the following error is unnecessary and can be pruned if we keep
// track of the incomplete tag names.
[
'div',
'Unexpected closing tag "div". It may happen when the tag has already been closed by another tag. For more info see https://www.w3.org/TR/html5/syntax.html#closing-elements-that-have-implied-end-tags',
'0:4'
]
]);
});
});
it('should report closing tag for void elements', () => {

View File

@ -232,6 +232,45 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u
[lex.TokenType.EOF, ''],
]);
});
describe('tags', () => {
// Lexer-level expectation: each `<div` that is followed by another tag instead of '>' is
// emitted as a single INCOMPLETE_TAG_OPEN token, while the well-formed `<span>` still yields
// TAG_OPEN_START followed by TAG_OPEN_END.
it('after tag name', () => {
expect(tokenizeAndHumanizeSourceSpans('<div<span><div</span>')).toEqual([
[lex.TokenType.INCOMPLETE_TAG_OPEN, '<div'],
[lex.TokenType.TAG_OPEN_START, '<span'],
[lex.TokenType.TAG_OPEN_END, '>'],
[lex.TokenType.INCOMPLETE_TAG_OPEN, '<div'],
[lex.TokenType.TAG_CLOSE, '</span>'],
[lex.TokenType.EOF, ''],
]);
});
// Lexer-level expectation: when the `div` tag is cut off after the attribute name `sty`, the
// attribute tokens scanned so far are still emitted after the INCOMPLETE_TAG_OPEN token, and
// the following `<span></span>` is tokenized as usual.
it('in attribute', () => {
expect(tokenizeAndHumanizeSourceSpans('<div class="hi" sty<span></span>')).toEqual([
[lex.TokenType.INCOMPLETE_TAG_OPEN, '<div'],
[lex.TokenType.ATTR_NAME, 'class'],
[lex.TokenType.ATTR_QUOTE, '"'],
[lex.TokenType.ATTR_VALUE, 'hi'],
[lex.TokenType.ATTR_QUOTE, '"'],
[lex.TokenType.ATTR_NAME, 'sty'],
[lex.TokenType.TAG_OPEN_START, '<span'],
[lex.TokenType.TAG_OPEN_END, '>'],
[lex.TokenType.TAG_CLOSE, '</span>'],
[lex.TokenType.EOF, ''],
]);
});
// Lexer-level expectation: a quote in place of an attribute name ends the `div` tag as
// INCOMPLETE_TAG_OPEN, and the quote itself is emitted as a TEXT token before the `span` tokens.
it('after quote', () => {
expect(tokenizeAndHumanizeSourceSpans('<div "<span></span>')).toEqual([
[lex.TokenType.INCOMPLETE_TAG_OPEN, '<div'],
[lex.TokenType.TEXT, '"'],
[lex.TokenType.TAG_OPEN_START, '<span'],
[lex.TokenType.TAG_OPEN_END, '>'],
[lex.TokenType.TAG_CLOSE, '</span>'],
[lex.TokenType.EOF, ''],
]);
});
});
});
describe('attributes', () => {
@ -554,7 +593,8 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u
expect(tokenizeAndHumanizeSourceSpans('<p>a<b</p>')).toEqual([
[lex.TokenType.TAG_OPEN_START, '<p'],
[lex.TokenType.TAG_OPEN_END, '>'],
[lex.TokenType.TEXT, 'a<b'],
[lex.TokenType.TEXT, 'a'],
[lex.TokenType.INCOMPLETE_TAG_OPEN, '<b'],
[lex.TokenType.TAG_CLOSE, '</p>'],
[lex.TokenType.EOF, ''],
]);
@ -579,25 +619,41 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u
it('should parse start tags quotes in place of an attribute name as text', () => {
expect(tokenizeAndHumanizeParts('<t ">')).toEqual([
[lex.TokenType.TEXT, '<t ">'],
[lex.TokenType.INCOMPLETE_TAG_OPEN, '', 't'],
[lex.TokenType.TEXT, '">'],
[lex.TokenType.EOF],
]);
expect(tokenizeAndHumanizeParts('<t \'>')).toEqual([
[lex.TokenType.TEXT, '<t \'>'],
[lex.TokenType.INCOMPLETE_TAG_OPEN, '', 't'],
[lex.TokenType.TEXT, '\'>'],
[lex.TokenType.EOF],
]);
});
it('should parse start tags quotes in place of an attribute name (after a valid attribute) as text',
it('should parse start tags quotes in place of an attribute name (after a valid attribute)',
() => {
expect(tokenizeAndHumanizeParts('<t a="b" ">')).toEqual([
[lex.TokenType.TEXT, '<t a="b" ">'],
[lex.TokenType.INCOMPLETE_TAG_OPEN, '', 't'],
[lex.TokenType.ATTR_NAME, '', 'a'],
[lex.TokenType.ATTR_QUOTE, '"'],
[lex.TokenType.ATTR_VALUE, 'b'],
[lex.TokenType.ATTR_QUOTE, '"'],
// TODO(ayazhafiz): the " symbol should be a synthetic attribute,
// allowing us to complete the opening tag correctly.
[lex.TokenType.TEXT, '">'],
[lex.TokenType.EOF],
]);
expect(tokenizeAndHumanizeParts('<t a=\'b\' \'>')).toEqual([
[lex.TokenType.TEXT, '<t a=\'b\' \'>'],
[lex.TokenType.INCOMPLETE_TAG_OPEN, '', 't'],
[lex.TokenType.ATTR_NAME, '', 'a'],
[lex.TokenType.ATTR_QUOTE, '\''],
[lex.TokenType.ATTR_VALUE, 'b'],
[lex.TokenType.ATTR_QUOTE, '\''],
// TODO(ayazhafiz): the ' symbol should be a synthetic attribute,
// allowing us to complete the opening tag correctly.
[lex.TokenType.TEXT, '\'>'],
[lex.TokenType.EOF],
]);
});