feat(compiler): support tokenizing escaped strings (#28055)

In order to support source mapping of templates, we need to be able to tokenize the template in its original context. When the template is defined inline as a JavaScript string in a TS/JS source file, the tokenizer must be able to handle string escape sequences, such as `\n` and `\"` as they appear in the original source file. This commit teaches the lexer how to unescape these sequences, but only when the `escapedString` option is set to true. Otherwise there is no change to the tokenizing behaviour. PR Close #28055
2019-02-08 22:10:19 +00:00
parent eeb560ac88
commit 2424184d42
4 changed files with 463 additions and 4 deletions
--- a/packages/compiler/test/ml_parser/lexer_spec.ts
+++ b/packages/compiler/test/ml_parser/lexer_spec.ts
@ -824,6 +824,267 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u
      });
    });

+    describe('(processing escaped strings)', () => {
+      it('should unescape standard escape sequences', () => {
+        expect(tokenizeAndHumanizeParts('\\\' \\\' \\\'', {escapedString: true})).toEqual([
+          [lex.TokenType.TEXT, '\' \' \''],
+          [lex.TokenType.EOF],
+        ]);
+        expect(tokenizeAndHumanizeParts('\\" \\" \\"', {escapedString: true})).toEqual([
+          [lex.TokenType.TEXT, '\" \" \"'],
+          [lex.TokenType.EOF],
+        ]);
+        expect(tokenizeAndHumanizeParts('\\` \\` \\`', {escapedString: true})).toEqual([
+          [lex.TokenType.TEXT, '\` \` \`'],
+          [lex.TokenType.EOF],
+        ]);
+        expect(tokenizeAndHumanizeParts('\\\\ \\\\ \\\\', {escapedString: true})).toEqual([
+          [lex.TokenType.TEXT, '\\ \\ \\'],
+          [lex.TokenType.EOF],
+        ]);
+        expect(tokenizeAndHumanizeParts('\\n \\n \\n', {escapedString: true})).toEqual([
+          [lex.TokenType.TEXT, '\n \n \n'],
+          [lex.TokenType.EOF],
+        ]);
+        expect(tokenizeAndHumanizeParts('\\r \\r \\r', {escapedString: true})).toEqual([
+          [lex.TokenType.TEXT, '\n \n \n'],  // post processing converts `\r` to `\n`
+          [lex.TokenType.EOF],
+        ]);
+        expect(tokenizeAndHumanizeParts('\\v \\v \\v', {escapedString: true})).toEqual([
+          [lex.TokenType.TEXT, '\v \v \v'],
+          [lex.TokenType.EOF],
+        ]);
+        expect(tokenizeAndHumanizeParts('\\t \\t \\t', {escapedString: true})).toEqual([
+          [lex.TokenType.TEXT, '\t \t \t'],
+          [lex.TokenType.EOF],
+        ]);
+        expect(tokenizeAndHumanizeParts('\\b \\b \\b', {escapedString: true})).toEqual([
+          [lex.TokenType.TEXT, '\b \b \b'],
+          [lex.TokenType.EOF],
+        ]);
+        expect(tokenizeAndHumanizeParts('\\f \\f \\f', {escapedString: true})).toEqual([
+          [lex.TokenType.TEXT, '\f \f \f'],
+          [lex.TokenType.EOF],
+        ]);
+        expect(tokenizeAndHumanizeParts(
+                   '\\\' \\" \\` \\\\ \\n \\r \\v \\t \\b \\f', {escapedString: true}))
+            .toEqual([
+              [lex.TokenType.TEXT, '\' \" \` \\ \n \n \v \t \b \f'],
+              [lex.TokenType.EOF],
+
+            ]);
+      });
+
+      it('should unescape null sequences', () => {
+        expect(tokenizeAndHumanizeParts('\\0', {escapedString: true})).toEqual([
+          [lex.TokenType.EOF],
+        ]);
+        // \09 is not an octal number so the \0 is taken as EOF
+        expect(tokenizeAndHumanizeParts('\\09', {escapedString: true})).toEqual([
+          [lex.TokenType.EOF],
+        ]);
+      });
+
+      it('should unescape octal sequences', () => {
+        // \19 is read as an octal `\1` followed by a normal char `9`
+        // \1234 is read as an octal `\123` followed by a normal char `4`
+        // \999 is not an octal number so its backslash just gets removed.
+        expect(tokenizeAndHumanizeParts(
+                   '\\001 \\01 \\1 \\12 \\223 \\19 \\2234 \\999', {escapedString: true}))
+            .toEqual([
+              [lex.TokenType.TEXT, '\x01 \x01 \x01 \x0A \x93 \x019 \x934 999'],
+              [lex.TokenType.EOF],
+            ]);
+      });
+
+      it('should unescape hex sequences', () => {
+        expect(tokenizeAndHumanizeParts('\\x12 \\x4F \\xDC', {escapedString: true})).toEqual([
+          [lex.TokenType.TEXT, '\x12 \x4F \xDC'],
+          [lex.TokenType.EOF],
+        ]);
+      });
+
+      it('should report an error on an invalid hex sequence', () => {
+        expect(tokenizeAndHumanizeErrors('\\xGG', {escapedString: true})).toEqual([
+          [null, 'Invalid hexadecimal escape sequence', '0:2']
+        ]);
+
+        expect(tokenizeAndHumanizeErrors('abc \\x xyz', {escapedString: true})).toEqual([
+          [lex.TokenType.TEXT, 'Invalid hexadecimal escape sequence', '0:6']
+        ]);
+
+        expect(tokenizeAndHumanizeErrors('abc\\x', {escapedString: true})).toEqual([
+          [lex.TokenType.TEXT, 'Unexpected character "EOF"', '0:5']
+        ]);
+      });
+
+      it('should unescape fixed length Unicode sequences', () => {
+        expect(tokenizeAndHumanizeParts('\\u0123 \\uABCD', {escapedString: true})).toEqual([
+          [lex.TokenType.TEXT, '\u0123 \uABCD'],
+          [lex.TokenType.EOF],
+        ]);
+      });
+
+      it('should error on an invalid fixed length Unicode sequence', () => {
+        expect(tokenizeAndHumanizeErrors('\\uGGGG', {escapedString: true})).toEqual([
+          [null, 'Invalid hexadecimal escape sequence', '0:2']
+        ]);
+      });
+
+      it('should unescape variable length Unicode sequences', () => {
+        expect(tokenizeAndHumanizeParts(
+                   '\\u{01} \\u{ABC} \\u{1234} \\u{123AB}', {escapedString: true}))
+            .toEqual([
+              [lex.TokenType.TEXT, '\u{01} \u{ABC} \u{1234} \u{123AB}'],
+              [lex.TokenType.EOF],
+            ]);
+      });
+
+      it('should error on an invalid variable length Unicode sequence', () => {
+        expect(tokenizeAndHumanizeErrors('\\u{GG}', {escapedString: true})).toEqual([
+          [null, 'Invalid hexadecimal escape sequence', '0:3']
+        ]);
+      });
+
+      it('should unescape line continuations', () => {
+        expect(tokenizeAndHumanizeParts('abc\\\ndef', {escapedString: true})).toEqual([
+          [lex.TokenType.TEXT, 'abcdef'],
+          [lex.TokenType.EOF],
+        ]);
+        expect(tokenizeAndHumanizeParts('\\\nx\\\ny\\\n', {escapedString: true})).toEqual([
+          [lex.TokenType.TEXT, 'xy'],
+          [lex.TokenType.EOF],
+        ]);
+      });
+
+      it('should remove backslash from "non-escape" sequences', () => {
+        expect(tokenizeAndHumanizeParts('\a \g \~', {escapedString: true})).toEqual([
+          [lex.TokenType.TEXT, 'a g ~'],
+          [lex.TokenType.EOF],
+        ]);
+      });
+
+      it('should unescape sequences in plain text', () => {
+        expect(tokenizeAndHumanizeParts('abc\ndef\\nghi\\tjkl\\`\\\'\\"mno', {escapedString: true}))
+            .toEqual([
+              [lex.TokenType.TEXT, 'abc\ndef\nghi\tjkl`\'"mno'],
+              [lex.TokenType.EOF],
+            ]);
+      });
+
+      it('should unescape sequences in raw text', () => {
+        expect(tokenizeAndHumanizeParts(
+                   '<script>abc\ndef\\nghi\\tjkl\\`\\\'\\"mno</script>', {escapedString: true}))
+            .toEqual([
+              [lex.TokenType.TAG_OPEN_START, null, 'script'],
+              [lex.TokenType.TAG_OPEN_END],
+              [lex.TokenType.RAW_TEXT, 'abc\ndef\nghi\tjkl`\'"mno'],
+              [lex.TokenType.TAG_CLOSE, null, 'script'],
+              [lex.TokenType.EOF],
+            ]);
+      });
+
+      it('should unescape sequences in escapable raw text', () => {
+        expect(tokenizeAndHumanizeParts(
+                   '<title>abc\ndef\\nghi\\tjkl\\`\\\'\\"mno</title>', {escapedString: true}))
+            .toEqual([
+              [lex.TokenType.TAG_OPEN_START, null, 'title'],
+              [lex.TokenType.TAG_OPEN_END],
+              [lex.TokenType.ESCAPABLE_RAW_TEXT, 'abc\ndef\nghi\tjkl`\'"mno'],
+              [lex.TokenType.TAG_CLOSE, null, 'title'],
+              [lex.TokenType.EOF],
+            ]);
+      });
+
+      it('should parse over escape sequences in tag definitions', () => {
+        expect(tokenizeAndHumanizeParts('<t a=\\"b\\" \\n c=\\\'d\\\'>', {escapedString: true}))
+            .toEqual([
+              [lex.TokenType.TAG_OPEN_START, null, 't'],
+              [lex.TokenType.ATTR_NAME, null, 'a'],
+              [lex.TokenType.ATTR_VALUE, 'b'],
+              [lex.TokenType.ATTR_NAME, null, 'c'],
+              [lex.TokenType.ATTR_VALUE, 'd'],
+              [lex.TokenType.TAG_OPEN_END],
+              [lex.TokenType.EOF],
+            ]);
+      });
+
+      it('should tokenize the correct span when there are escape sequences', () => {
+        const text =
+            'selector: "app-root",\ntemplate: "line 1\\n\\"line 2\\"\\nline 3",\ninputs: []';
+        const range = {
+          startPos: 33,
+          startLine: 1,
+          startCol: 10,
+          endPos: 59,
+        };
+        expect(tokenizeAndHumanizeParts(text, {range, escapedString: true})).toEqual([
+          [lex.TokenType.TEXT, 'line 1\n"line 2"\nline 3'],
+          [lex.TokenType.EOF],
+        ]);
+        expect(tokenizeAndHumanizeSourceSpans(text, {range, escapedString: true})).toEqual([
+          [lex.TokenType.TEXT, 'line 1\\n\\"line 2\\"\\nline 3'],
+          [lex.TokenType.EOF, ''],
+        ]);
+      });
+
+      it('should account for escape sequences when computing source spans ', () => {
+        const text = '<t>line 1</t>\n' +  // <- unescaped line break
+            '<t>line 2</t>\\n' +          // <- escaped line break
+            '<t>line 3\\\n' +             // <- line continuation
+            '</t>';
+
+        expect(tokenizeAndHumanizeParts(text, {escapedString: true})).toEqual([
+          [lex.TokenType.TAG_OPEN_START, null, 't'], [lex.TokenType.TAG_OPEN_END],
+          [lex.TokenType.TEXT, 'line 1'], [lex.TokenType.TAG_CLOSE, null, 't'],
+          [lex.TokenType.TEXT, '\n'],
+
+          [lex.TokenType.TAG_OPEN_START, null, 't'], [lex.TokenType.TAG_OPEN_END],
+          [lex.TokenType.TEXT, 'line 2'], [lex.TokenType.TAG_CLOSE, null, 't'],
+          [lex.TokenType.TEXT, '\n'],
+
+          [lex.TokenType.TAG_OPEN_START, null, 't'], [lex.TokenType.TAG_OPEN_END],
+          [lex.TokenType.TEXT, 'line 3'],  // <- line continuation does not appear in token
+          [lex.TokenType.TAG_CLOSE, null, 't'],
+
+          [lex.TokenType.EOF]
+        ]);
+        expect(tokenizeAndHumanizeLineColumn(text, {escapedString: true})).toEqual([
+          [lex.TokenType.TAG_OPEN_START, '0:0'],
+          [lex.TokenType.TAG_OPEN_END, '0:2'],
+          [lex.TokenType.TEXT, '0:3'],
+          [lex.TokenType.TAG_CLOSE, '0:9'],
+          [lex.TokenType.TEXT, '0:13'],  // <- real newline increments the row
+
+          [lex.TokenType.TAG_OPEN_START, '1:0'],
+          [lex.TokenType.TAG_OPEN_END, '1:2'],
+          [lex.TokenType.TEXT, '1:3'],
+          [lex.TokenType.TAG_CLOSE, '1:9'],
+          [lex.TokenType.TEXT, '1:14'],  // <- escaped newline does not increment the row
+
+          [lex.TokenType.TAG_OPEN_START, '1:15'],
+          [lex.TokenType.TAG_OPEN_END, '1:17'],
+          [lex.TokenType.TEXT, '1:18'],  // <- the line continuation increments the row
+          [lex.TokenType.TAG_CLOSE, '2:0'],
+
+          [lex.TokenType.EOF, '2:4'],
+        ]);
+        expect(tokenizeAndHumanizeSourceSpans(text, {escapedString: true})).toEqual([
+          [lex.TokenType.TAG_OPEN_START, '<t'], [lex.TokenType.TAG_OPEN_END, '>'],
+          [lex.TokenType.TEXT, 'line 1'], [lex.TokenType.TAG_CLOSE, '</t>'],
+          [lex.TokenType.TEXT, '\n'],
+
+          [lex.TokenType.TAG_OPEN_START, '<t'], [lex.TokenType.TAG_OPEN_END, '>'],
+          [lex.TokenType.TEXT, 'line 2'], [lex.TokenType.TAG_CLOSE, '</t>\\'],
+          [lex.TokenType.TEXT, 'n'],
+
+          [lex.TokenType.TAG_OPEN_START, '<t'], [lex.TokenType.TAG_OPEN_END, '>'],
+          [lex.TokenType.TEXT, 'line 3\\\n'], [lex.TokenType.TAG_CLOSE, '</t>'],
+
+          [lex.TokenType.EOF, '']
+        ]);
+      });
+    });
  });
 }