feat(compiler): support tokenizing escaped strings (#28055)

In order to support source mapping of templates, we need to be able to tokenize the template in its original context. When the template is defined inline as a JavaScript string in a TS/JS source file, the tokenizer must be able to handle string escape sequences, such as `\n` and `\"` as they appear in the original source file. This commit teaches the lexer how to unescape these sequences, but only when the `escapedString` option is set to true. Otherwise there is no change to the tokenizing behaviour. PR Close #28055
2019-02-08 22:10:19 +00:00
parent eeb560ac88
commit 2424184d42
4 changed files with 463 additions and 4 deletions
--- a/packages/compiler/src/chars.ts
+++ b/packages/compiler/src/chars.ts
@ -7,6 +7,7 @@
 */

 export const $EOF = 0;
+export const $BSPACE = 8;
 export const $TAB = 9;
 export const $LF = 10;
 export const $VTAB = 11;
@ -36,6 +37,7 @@ export const $GT = 62;
 export const $QUESTION = 63;

 export const $0 = 48;
+export const $7 = 55;
 export const $9 = 57;

 export const $A = 65;
@ -51,6 +53,7 @@ export const $CARET = 94;
 export const $_ = 95;

 export const $a = 97;
+export const $b = 98;
 export const $e = 101;
 export const $f = 102;
 export const $n = 110;
@ -87,3 +90,11 @@ export function isAsciiLetter(code: number): boolean {
 export function isAsciiHexDigit(code: number): boolean {
  return code >= $a && code <= $f || code >= $A && code <= $F || isDigit(code);
 }
+
+/**
+ * Reports whether `code` is the character code of a line feed (`$LF`) or a
+ * carriage return (`$CR`).
+ */
+export function isNewLine(code: number): boolean {
+  switch (code) {
+    case $LF:
+    case $CR:
+      return true;
+    default:
+      return false;
+  }
+}
+
+/**
+ * Reports whether `code` lies in the inclusive range `$0`..`$7`, i.e. whether it is the
+ * character code of an octal digit.
+ */
+export function isOctalDigit(code: number): boolean {
+  return code >= $0 && code <= $7;
+}
--- a/packages/compiler/src/ml_parser/lexer.ts
+++ b/packages/compiler/src/ml_parser/lexer.ts
@ -70,6 +70,30 @@ export interface TokenizeOptions {
   * The entire `source` string is parsed if this is not provided.
   * */
  range?: LexerRange;
+  /**
+   * If this text is stored in a JavaScript string, then we have to deal with escape sequences.
+   *
+   * **Example 1:**
+   *
+   * ```
+   * "abc\"def\nghi"
+   * ```
+   *
+   * - The `\"` must be converted to `"`.
+   * - The `\n` must be converted to a new line character in a token,
+   *   but it should not increment the current line for source mapping.
+   *
+   * **Example 2:**
+   *
+   * ```
+   * "abc\
+   *  def"
+   * ```
+   *
+   * The line continuation (`\` followed by a newline) should be removed from a token
+   * but the new line should increment the current line for source mapping.
+   */
+  escapedString?: boolean;
 }

 export function tokenize(
@ -99,6 +123,7 @@ class _Tokenizer {
  private _end: number;
  private _tokenizeIcu: boolean;
  private _interpolationConfig: InterpolationConfig;
+  private _escapedString: boolean;
  private _peek: number = -1;
  private _nextPeek: number = -1;
  private _index: number;
@ -123,6 +148,7 @@ class _Tokenizer {
      options: TokenizeOptions) {
    this._tokenizeIcu = options.tokenizeExpansionForms || false;
    this._interpolationConfig = options.interpolationConfig || DEFAULT_INTERPOLATION_CONFIG;
+    this._escapedString = options.escapedString || false;
    this._input = _file.content;
    if (options.range) {
      this._end = options.range.endPos;
@ -266,10 +292,13 @@ class _Tokenizer {
    if (this._index >= this._end) {
      throw this._createError(_unexpectedCharacterErrorMsg(chars.$EOF), this._getSpan());
    }
-    if (this._peek === chars.$LF) {
+    // The actual character in the input might be different to the _peek if we are processing
+    // escape characters. We only want to track "real" new lines.
+    const actualChar = this._input.charCodeAt(this._index);
+    if (actualChar === chars.$LF) {
      this._line++;
      this._column = 0;
-    } else if (this._peek !== chars.$LF && this._peek !== chars.$CR) {
+    } else if (!chars.isNewLine(actualChar)) {
      this._column++;
    }
    this._index++;
@ -284,6 +313,22 @@ class _Tokenizer {
    this._peek = this._index >= this._end ? chars.$EOF : this._input.charCodeAt(this._index);
    this._nextPeek =
        this._index + 1 >= this._end ? chars.$EOF : this._input.charCodeAt(this._index + 1);
+    if (this._peek === chars.$BACKSLASH && processingEscapeSequence !== true &&
+        this._escapedString) {
+      this._processEscapeSequence();
+    }
+  }
+
+  /**
+   * Advance the specific number of characters.
+   *
+   * Every step is delegated to `_advance()`. A `count` of zero or less advances nothing.
+   *
+   * @param count The number of characters to advance.
+   * @param processingEscapeSequence Pass `true` when the caller is already in the middle of
+   *     an escape sequence. The flag is forwarded to `_advance()`, which then does not start
+   *     processing another escape sequence.
+   */
+  private _advanceN(count: number, processingEscapeSequence?: boolean) {
+    // Compare against zero instead of relying on truthiness: a negative or fractional count
+    // never becomes exactly `0`, so a truthiness test would keep advancing until `_advance()`
+    // fails at the end of the input.
+    for (let remaining = count; remaining > 0; remaining--) {
+      this._advance(processingEscapeSequence);
+    }
+  }

  private _attemptCharCode(charCode: number): boolean {
@ -368,9 +413,11 @@ class _Tokenizer {
    if (decodeEntities && this._peek === chars.$AMPERSAND) {
      return this._decodeEntity();
    } else {
-      const index = this._index;
+      // Don't rely upon reading directly from `_input` as the actual char value
+      // may have been generated from an escape sequence.
+      const char = String.fromCodePoint(this._peek);
      this._advance();
-      return this._input[index];
+      return char;
    }
  }

@ -410,6 +457,122 @@ class _Tokenizer {
    }
  }

+  /**
+   * Process the escape sequence that starts at the current position in the text.
+   *
+   * This method is called from `_advance()` to ensure that escape sequences are
+   * always processed correctly however tokens are being consumed.
+   *
+   * But note that this method also calls `_advance()` (re-entering) to move through
+   * the characters within an escape sequence. In that case it tells `_advance()` not
+   * to attempt to process further escape sequences by passing `true` as its first
+   * argument.
+   *
+   * Sequences handled, in the order they are tested:
+   * - single-letter control escapes `\n`, `\r`, `\v`, `\t`, `\b`, `\f`;
+   * - Unicode escapes `\u{...}` (variable length) and `\uXXXX` (four hex digits);
+   * - hex escapes `\xXX` (two hex digits);
+   * - octal escapes of one to three octal digits;
+   * - line continuations (`\` followed by a new line character).
+   *
+   * For sequences that decode to a character, the decoded char code is written to `_peek`.
+   * For a line continuation or an escaped ordinary character, `_peek` is simply the
+   * character that the last `_advance(true)` call moved to.
+   *
+   * NOTE(review): for a `\` followed by CR LF only one character (the CR) is advanced over,
+   * so the LF is left as `_peek`. Confirm whether line endings are normalized before the
+   * input reaches this point.
+   */
+  private _processEscapeSequence(): void {
+    this._advance(true);  // advance past the backslash
+
+    // First check for standard control char sequences
+    if (this._peekChar() === chars.$n) {
+      this._peek = chars.$LF;
+    } else if (this._peekChar() === chars.$r) {
+      this._peek = chars.$CR;
+    } else if (this._peekChar() === chars.$v) {
+      this._peek = chars.$VTAB;
+    } else if (this._peekChar() === chars.$t) {
+      this._peek = chars.$TAB;
+    } else if (this._peekChar() === chars.$b) {
+      this._peek = chars.$BSPACE;
+    } else if (this._peekChar() === chars.$f) {
+      this._peek = chars.$FF;
+    }
+
+    // Now consider more complex sequences
+
+    else if (this._peekChar() === chars.$u) {
+      // Unicode code-point sequence
+      this._advance(true);  // advance past the `u` char
+      if (this._peekChar() === chars.$LBRACE) {
+        // Variable length Unicode, e.g. `\u{123}`
+        this._advance(true);  // advance past the `{` char
+        // Advance past the variable number of hex digits until we hit a `}` char
+        const start = this._getLocation();
+        while (this._peekChar() !== chars.$RBRACE) {
+          this._advance(true);
+        }
+        this._decodeHexDigits(start, this._index - start.offset);
+      } else {
+        // Fixed length Unicode, e.g. `\u1234`
+        this._parseFixedHexSequence(4);
+      }
+    }
+
+    else if (this._peekChar() === chars.$x) {
+      // Hex char code, e.g. `\x2F`
+      this._advance(true);  // advance past the `x` char
+      this._parseFixedHexSequence(2);
+    }
+
+    else if (chars.isOctalDigit(this._peekChar())) {
+      // Octal char code, e.g. `\012`
+      const start = this._index;
+      let length = 1;
+      // Note that we work with `_nextPeek` because, although we check the next character
+      // after the sequence to find the end of the sequence,
+      // we do not want to advance that far to check the character, otherwise we will
+      // have to back up.
+      while (chars.isOctalDigit(this._nextPeek) && length < 3) {
+        this._advance(true);
+        length++;
+      }
+      const octal = this._input.substr(start, length);
+      this._peek = parseInt(octal, 8);
+    }
+
+    else if (chars.isNewLine(this._peekChar())) {
+      // Line continuation `\` followed by a new line
+      this._advance(true);  // advance over the newline
+    }
+
+    // If none of the `if` blocks were executed then we just have an escaped normal character.
+    // In that case we just, effectively, skip the backslash from the character.
+  }
+
+  /**
+   * Decode an escape made of exactly `length` hex digits, the first of which is the
+   * current character, by handing those digits to `_decodeHexDigits()`.
+   *
+   * @param length The number of hex digits in the sequence.
+   */
+  private _parseFixedHexSequence(length: number) {
+    const firstDigit = this._getLocation();
+    // Stop on the final digit rather than stepping past it, so only `length - 1` moves
+    // are needed from the first digit.
+    const stepsToLastDigit = length - 1;
+    this._advanceN(stepsToLastDigit, true);
+    this._decodeHexDigits(firstDigit, length);
+  }
+
+  /**
+   * Decode `length` characters of the input, starting at `start`, as a hexadecimal
+   * code point and store the result in `_peek`.
+   *
+   * @param start Location of the first hex digit.
+   * @param length Number of characters that make up the hex number.
+   * @throws The error built by `_createError()` if the characters are not all hex digits,
+   *     if there are no digits at all, or if the value is larger than U+10FFFF.
+   */
+  private _decodeHexDigits(start: ParseLocation, length: number) {
+    const hex = this._input.slice(start.offset, start.offset + length);
+    // `parseInt()` silently stops at the first character it cannot parse (so `1g` would be
+    // read as `1`) and tolerates a `0x` prefix; validate every character up front instead.
+    const charCode = /^[0-9a-fA-F]+$/.test(hex) ? parseInt(hex, 16) : NaN;
+    // Values above U+10FFFF are not code points: `String.fromCodePoint()` throws a
+    // `RangeError` for them, so report them as a tokenizer error here.
+    if (isNaN(charCode) || charCode > 0x10ffff) {
+      throw this._createError(
+          'Invalid hexadecimal escape sequence', this._getSpan(start, this._getLocation()));
+    }
+    this._peek = charCode;
+  }
+
+  /**
+   * Return the current value of `_peek` via a method call.
+   *
+   * Reading `_peek` through this helper works around the TS compiler narrowing the
+   * type of `_peek` after an `if` statement, even when a method that might mutate
+   * `_peek` is called in between.
+   *
+   * For example:
+   *
+   * ```
+   * if (this._peek === 10) {
+   *   this._advance(); // mutates _peek
+   *   if (this._peek === 20) {
+   *     ...
+   * ```
+   *
+   * The second `if` statement fails TS compilation because the compiler has determined
+   * that `_peek` is `10` and so can never be equal to `20`.
+   */
+  private _peekChar(): number {
+    const current = this._peek;
+    return current;
+  }
+
  private _consumeRawText(
      decodeEntities: boolean, firstCharOfEnd: number, attemptEndRest: () => boolean): Token {
    let tagCloseStart: ParseLocation;
--- a/packages/compiler/src/render3/view/template.ts
+++ b/packages/compiler/src/render3/view/template.ts
@ -1580,6 +1580,30 @@ export interface ParseTemplateOptions {
   * The entire `source` string is parsed if this is not provided.
   * */
  range?: LexerRange;
+  /**
+   * If this text is stored in a JavaScript string, then we have to deal with escape sequences.
+   *
+   * **Example 1:**
+   *
+   * ```
+   * "abc\"def\nghi"
+   * ```
+   *
+   * - The `\"` must be converted to `"`.
+   * - The `\n` must be converted to a new line character in a token,
+   *   but it should not increment the current line for source mapping.
+   *
+   * **Example 2:**
+   *
+   * ```
+   * "abc\
+   *  def"
+   * ```
+   *
+   * The line continuation (`\` followed by a newline) should be removed from a token
+   * but the new line should increment the current line for source mapping.
+   */
+  escapedString?: boolean;
 }

 /**