feat(compiler): Parse and recover on incomplete opening HTML tags (#38681)

Let's say we have code like ```html <div<span>123</span> ``` Currently this gets parsed into a tree with the element tag `div<span`. This has at least two downsides: - An incorrect diagnostic that `</span>` doesn't close an element is emitted. - A consumer of the parse tree using it for editor services is unable to provide correct completions for the opening `<span>` tag. This patch attempts to fix both issues by instead parsing the code into the same tree that would be parsed for `<div></div><span>123</span>`. In particular, we do this by optimistically scanning an open tag as usual, but if we do not notice a terminating '>', we mark the tag as "incomplete". A parser then emits an error for the incomplete tag and adds a synthetic (recovered) element node to the tree with the incomplete open tag's name. What's the downside of this? For one, a breaking change. <ol> <li> The first breaking change is that `<` symbols that are ambiguously text or opening tags will be parsed as opening tags instead of text in element bodies. Take the code ```html <p>a<b</p> ``` Previously the `<` before `b` was treated as text; with this patch `<b` is parsed as an incomplete opening tag. Clearly we cannot have the best of both worlds, and this patch chooses to swap the parsing strategy to support the new feature. Of course, `<` can still be inserted as text via the `&lt;` entity. </li> </ol> Part of #38596 PR Close #38681
2020-09-02 10:17:01 -05:00
parent 49f27e31ed
commit 6ae3b68acf
4 changed files with 153 additions and 32 deletions
--- a/packages/compiler/src/ml_parser/lexer.ts
+++ b/packages/compiler/src/ml_parser/lexer.ts
@@ -17,6 +17,7 @@ export enum TokenType {
  TAG_OPEN_END,
  TAG_OPEN_END_VOID,
  TAG_CLOSE,
+  INCOMPLETE_TAG_OPEN,
  TEXT,
  ESCAPABLE_RAW_TEXT,
  RAW_TEXT,
@@ -511,8 +512,6 @@ class _Tokenizer {
    let tagName: string;
    let prefix: string;
    let openTagToken: Token|undefined;
-    let tokensBeforeTagOpen = this.tokens.length;
-    const innerStart = this._cursor.clone();
    try {
      if (!chars.isAsciiLetter(this._cursor.peek())) {
        throw this._createError(
@@ -523,7 +522,8 @@ class _Tokenizer {
      prefix = openTagToken.parts[0];
      tagName = openTagToken.parts[1];
      this._attemptCharCodeUntilFn(isNotWhitespace);
-      while (this._cursor.peek() !== chars.$SLASH && this._cursor.peek() !== chars.$GT) {
+      while (this._cursor.peek() !== chars.$SLASH && this._cursor.peek() !== chars.$GT &&
+             this._cursor.peek() !== chars.$LT) {
        this._consumeAttributeName();
        this._attemptCharCodeUntilFn(isNotWhitespace);
        if (this._attemptCharCode(chars.$EQ)) {
@@ -535,14 +535,15 @@ class _Tokenizer {
      this._consumeTagOpenEnd();
    } catch (e) {
      if (e instanceof _ControlFlowError) {
-        // When the start tag is invalid (including invalid "attributes"), assume we want a "<"
-        this._cursor = innerStart;
        if (openTagToken) {
-          this.tokens.length = tokensBeforeTagOpen;
+          // We errored before we could close the opening tag, so it is incomplete.
+          openTagToken.type = TokenType.INCOMPLETE_TAG_OPEN;
+        } else {
+          // When the start tag is invalid, assume we want a "<" as text.
+          // Back to back text tokens are merged at the end.
+          this._beginToken(TokenType.TEXT, start);
+          this._endToken(['<']);
        }
-        // Back to back text tokens are merged at the end
-        this._beginToken(TokenType.TEXT, start);
-        this._endToken(['<']);
        return;
      }

@@ -772,8 +773,8 @@ function isNotWhitespace(code: number): boolean {
 }

 function isNameEnd(code: number): boolean {
-  return chars.isWhitespace(code) || code === chars.$GT || code === chars.$SLASH ||
-      code === chars.$SQ || code === chars.$DQ || code === chars.$EQ;
+  return chars.isWhitespace(code) || code === chars.$GT || code === chars.$LT ||
+      code === chars.$SLASH || code === chars.$SQ || code === chars.$DQ || code === chars.$EQ;
 }

 function isPrefixEnd(code: number): boolean {
--- a/packages/compiler/src/ml_parser/parser.ts
+++ b/packages/compiler/src/ml_parser/parser.ts
@@ -56,7 +56,8 @@ class _TreeBuilder {

  build(): void {
    while (this._peek.type !== lex.TokenType.EOF) {
-      if (this._peek.type === lex.TokenType.TAG_OPEN_START) {
+      if (this._peek.type === lex.TokenType.TAG_OPEN_START ||
+          this._peek.type === lex.TokenType.INCOMPLETE_TAG_OPEN) {
        this._consumeStartTag(this._advance());
      } else if (this._peek.type === lex.TokenType.TAG_CLOSE) {
        this._consumeEndTag(this._advance());
@@ -233,8 +234,7 @@ class _TreeBuilder {
  }

  private _consumeStartTag(startTagToken: lex.Token) {
-    const prefix = startTagToken.parts[0];
-    const name = startTagToken.parts[1];
+    const [prefix, name] = startTagToken.parts;
    const attrs: html.Attribute[] = [];
    while (this._peek.type === lex.TokenType.ATTR_NAME) {
      attrs.push(this._consumeAttr(this._advance()));
@@ -266,6 +266,12 @@ class _TreeBuilder {
      // Elements that are self-closed have their `endSourceSpan` set to the full span, as the
      // element start tag also represents the end tag.
      this._popElement(fullName, span);
+    } else if (startTagToken.type === lex.TokenType.INCOMPLETE_TAG_OPEN) {
+      // We already know the opening tag is not complete, so it is unlikely it has a corresponding
+      // close tag. Let's optimistically parse it as a full element and emit an error.
+      this._popElement(fullName, null);
+      this.errors.push(
+          TreeError.create(fullName, span, `Opening tag "${fullName}" not terminated.`));
    }
  }

@@ -295,7 +301,13 @@ class _TreeBuilder {
    }
  }

-  private _popElement(fullName: string, endSourceSpan: ParseSourceSpan): boolean {
+  /**
+   * Closes the nearest element with the tag name `fullName` in the parse tree.
+   * `endSourceSpan` is the span of the closing tag, or null if the element does
+   * not have a closing tag (for example, this happens when an incomplete
+   * opening tag is recovered).
+   */
+  private _popElement(fullName: string, endSourceSpan: ParseSourceSpan|null): boolean {
    for (let stackIndex = this._elementStack.length - 1; stackIndex >= 0; stackIndex--) {
      const el = this._elementStack[stackIndex];
      if (el.name == fullName) {
@@ -303,7 +315,7 @@ class _TreeBuilder {
        // removed from the element stack at this point are closed implicitly, so they won't get
        // an end source span (as there is no explicit closing element).
        el.endSourceSpan = endSourceSpan;
-        el.sourceSpan.end = endSourceSpan.end || el.sourceSpan.end;
+        el.sourceSpan.end = endSourceSpan !== null ? endSourceSpan.end : el.sourceSpan.end;

        this._elementStack.splice(stackIndex, this._elementStack.length - stackIndex);
        return true;