diff --git a/packages/compiler/src/chars.ts b/packages/compiler/src/chars.ts index 4e510a2d58..28ce8b9c22 100644 --- a/packages/compiler/src/chars.ts +++ b/packages/compiler/src/chars.ts @@ -7,6 +7,7 @@ */ export const $EOF = 0; +export const $BSPACE = 8; export const $TAB = 9; export const $LF = 10; export const $VTAB = 11; @@ -36,6 +37,7 @@ export const $GT = 62; export const $QUESTION = 63; export const $0 = 48; +export const $7 = 55; export const $9 = 57; export const $A = 65; @@ -51,6 +53,7 @@ export const $CARET = 94; export const $_ = 95; export const $a = 97; +export const $b = 98; export const $e = 101; export const $f = 102; export const $n = 110; @@ -87,3 +90,11 @@ export function isAsciiLetter(code: number): boolean { export function isAsciiHexDigit(code: number): boolean { return code >= $a && code <= $f || code >= $A && code <= $F || isDigit(code); } + +export function isNewLine(code: number): boolean { + return code === $LF || code === $CR; +} + +export function isOctalDigit(code: number): boolean { + return $0 <= code && code <= $7; +} diff --git a/packages/compiler/src/ml_parser/lexer.ts b/packages/compiler/src/ml_parser/lexer.ts index 30bfdea9aa..c3c63487b5 100644 --- a/packages/compiler/src/ml_parser/lexer.ts +++ b/packages/compiler/src/ml_parser/lexer.ts @@ -70,6 +70,30 @@ export interface TokenizeOptions { * The entire `source` string is parsed if this is not provided. * */ range?: LexerRange; + /** + * If this text is stored in a JavaScript string, then we have to deal with escape sequences. + * + * **Example 1:** + * + * ``` + * "abc\"def\nghi" + * ``` + * + * - The `\"` must be converted to `"`. + * - The `\n` must be converted to a new line character in a token, + * but it should not increment the current line for source mapping. 
+ * + * **Example 2:** + * + * ``` + * "abc\ + * def" + * ``` + * + * The line continuation (`\` followed by a newline) should be removed from a token + * but the new line should increment the current line for source mapping. + */ + escapedString?: boolean; } export function tokenize( @@ -99,6 +123,7 @@ class _Tokenizer { private _end: number; private _tokenizeIcu: boolean; private _interpolationConfig: InterpolationConfig; + private _escapedString: boolean; private _peek: number = -1; private _nextPeek: number = -1; private _index: number; @@ -123,6 +148,7 @@ class _Tokenizer { options: TokenizeOptions) { this._tokenizeIcu = options.tokenizeExpansionForms || false; this._interpolationConfig = options.interpolationConfig || DEFAULT_INTERPOLATION_CONFIG; + this._escapedString = options.escapedString || false; this._input = _file.content; if (options.range) { this._end = options.range.endPos; @@ -266,10 +292,13 @@ class _Tokenizer { if (this._index >= this._end) { throw this._createError(_unexpectedCharacterErrorMsg(chars.$EOF), this._getSpan()); } - if (this._peek === chars.$LF) { + // The actual character in the input might be different to the _peek if we are processing + // escape characters. We only want to track "real" new lines. + const actualChar = this._input.charCodeAt(this._index); + if (actualChar === chars.$LF) { this._line++; this._column = 0; - } else if (this._peek !== chars.$LF && this._peek !== chars.$CR) { + } else if (!chars.isNewLine(actualChar)) { this._column++; } this._index++; @@ -284,6 +313,22 @@ class _Tokenizer { this._peek = this._index >= this._end ? chars.$EOF : this._input.charCodeAt(this._index); this._nextPeek = this._index + 1 >= this._end ? chars.$EOF : this._input.charCodeAt(this._index + 1); + if (this._peek === chars.$BACKSLASH && processingEscapeSequence !== true && + this._escapedString) { + this._processEscapeSequence(); + } + } + + /** + * Advance the specific number of characters. 
+ * @param count The number of characters to advance. + * @param processingEscapeSequence Whether we want `advance()` to process escape sequences. + */ + private _advanceN(count: number, processingEscapeSequence?: boolean) { + while (count) { + this._advance(processingEscapeSequence); + count--; + } } private _attemptCharCode(charCode: number): boolean { @@ -368,9 +413,11 @@ class _Tokenizer { if (decodeEntities && this._peek === chars.$AMPERSAND) { return this._decodeEntity(); } else { - const index = this._index; + // Don't rely upon reading directly from `_input` as the actual char value + // may have been generated from an escape sequence. + const char = String.fromCodePoint(this._peek); this._advance(); - return this._input[index]; + return char; } } @@ -410,6 +457,122 @@ class _Tokenizer { } } + /** + * Process the escape sequence that starts at the current position in the text. + * + * This method is called from `_advance()` to ensure that escape sequences are + * always processed correctly however tokens are being consumed. + * + * But note that this method also calls `_advance()` (re-entering) to move through + * the characters within an escape sequence. In that case it tells `_advance()` not + * to attempt to process further escape sequences by passing `true` as its first + * argument. 
+ */ + private _processEscapeSequence(): void { + this._advance(true); // advance past the backslash + + // First check for standard control char sequences + if (this._peekChar() === chars.$n) { + this._peek = chars.$LF; + } else if (this._peekChar() === chars.$r) { + this._peek = chars.$CR; + } else if (this._peekChar() === chars.$v) { + this._peek = chars.$VTAB; + } else if (this._peekChar() === chars.$t) { + this._peek = chars.$TAB; + } else if (this._peekChar() === chars.$b) { + this._peek = chars.$BSPACE; + } else if (this._peekChar() === chars.$f) { + this._peek = chars.$FF; + } + + // Now consider more complex sequences + + else if (this._peekChar() === chars.$u) { + // Unicode code-point sequence + this._advance(true); // advance past the `u` char + if (this._peekChar() === chars.$LBRACE) { + // Variable length Unicode, e.g. `\u{123}` + this._advance(true); // advance past the `{` char + // Advance past the variable number of hex digits until we hit a `}` char + const start = this._getLocation(); + while (this._peekChar() !== chars.$RBRACE) { + this._advance(true); + } + this._decodeHexDigits(start, this._index - start.offset); + } else { + // Fixed length Unicode, e.g. `\u1234` + this._parseFixedHexSequence(4); + } + } + + else if (this._peekChar() === chars.$x) { + // Hex char code, e.g. `\x2F` + this._advance(true); // advance past the `x` char + this._parseFixedHexSequence(2); + } + + else if (chars.isOctalDigit(this._peekChar())) { + // Octal char code, e.g. `\012` + const start = this._index; + let length = 1; + // Note that we work with `_nextPeek` because, although we check the next character + // after the sequence to find the end of the sequence, + // we do not want to advance that far to check the character, otherwise we will + // have to back up.
+ while (chars.isOctalDigit(this._nextPeek) && length < 3) { + this._advance(true); + length++; + } + const octal = this._input.substr(start, length); + this._peek = parseInt(octal, 8); + } + + else if (chars.isNewLine(this._peekChar())) { + // Line continuation `\` followed by a new line + this._advance(true); // advance over the newline + } + + // If none of the `if` blocks were executed then we just have an escaped normal character. + // In that case we just, effectively, skip the backslash from the character. + } + + private _parseFixedHexSequence(length: number) { + const start = this._getLocation(); + this._advanceN(length - 1, true); + this._decodeHexDigits(start, length); + } + + private _decodeHexDigits(start: ParseLocation, length: number) { + const hex = this._input.substr(start.offset, length); + // `parseInt` accepts any valid prefix (e.g. `1G` parses as 1), so check every digit first. + if (/^[0-9a-fA-F]+$/.test(hex)) { + this._peek = parseInt(hex, 16); + } else { + throw this._createError( + 'Invalid hexadecimal escape sequence', this._getSpan(start, this._getLocation())); + } + } + + /** + * This little helper is to solve a problem where the TS compiler will narrow + * the type of `_peek` after an `if` statement, even if there is a call to a + * method that might mutate the `_peek`. + * + * For example: + * + * ``` + * if (this._peek === 10) { + * this._advance(); // mutates _peek + * if (this._peek === 20) { + * ... + * ``` + * + * The second if statement fails TS compilation because the compiler has determined + * that `_peek` is `10` and so can never be equal to `20`.
+ */ + private _peekChar(): number { return this._peek; } + private _consumeRawText( decodeEntities: boolean, firstCharOfEnd: number, attemptEndRest: () => boolean): Token { let tagCloseStart: ParseLocation; diff --git a/packages/compiler/src/render3/view/template.ts b/packages/compiler/src/render3/view/template.ts index 6ffc1d3a75..b7adf7c211 100644 --- a/packages/compiler/src/render3/view/template.ts +++ b/packages/compiler/src/render3/view/template.ts @@ -1580,6 +1580,30 @@ export interface ParseTemplateOptions { * The entire `source` string is parsed if this is not provided. * */ range?: LexerRange; + /** + * If this text is stored in a JavaScript string, then we have to deal with escape sequences. + * + * **Example 1:** + * + * ``` + * "abc\"def\nghi" + * ``` + * + * - The `\"` must be converted to `"`. + * - The `\n` must be converted to a new line character in a token, + * but it should not increment the current line for source mapping. + * + * **Example 2:** + * + * ``` + * "abc\ + * def" + * ``` + * + * The line continuation (`\` followed by a newline) should be removed from a token + * but the new line should increment the current line for source mapping. 
+ */ + escapedString?: boolean; } /** diff --git a/packages/compiler/test/ml_parser/lexer_spec.ts b/packages/compiler/test/ml_parser/lexer_spec.ts index 1f9b5068ba..36379c5a31 100644 --- a/packages/compiler/test/ml_parser/lexer_spec.ts +++ b/packages/compiler/test/ml_parser/lexer_spec.ts @@ -824,6 +824,267 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u }); }); + describe('(processing escaped strings)', () => { + it('should unescape standard escape sequences', () => { + expect(tokenizeAndHumanizeParts('\\\' \\\' \\\'', {escapedString: true})).toEqual([ + [lex.TokenType.TEXT, '\' \' \''], + [lex.TokenType.EOF], + ]); + expect(tokenizeAndHumanizeParts('\\" \\" \\"', {escapedString: true})).toEqual([ + [lex.TokenType.TEXT, '\" \" \"'], + [lex.TokenType.EOF], + ]); + expect(tokenizeAndHumanizeParts('\\` \\` \\`', {escapedString: true})).toEqual([ + [lex.TokenType.TEXT, '\` \` \`'], + [lex.TokenType.EOF], + ]); + expect(tokenizeAndHumanizeParts('\\\\ \\\\ \\\\', {escapedString: true})).toEqual([ + [lex.TokenType.TEXT, '\\ \\ \\'], + [lex.TokenType.EOF], + ]); + expect(tokenizeAndHumanizeParts('\\n \\n \\n', {escapedString: true})).toEqual([ + [lex.TokenType.TEXT, '\n \n \n'], + [lex.TokenType.EOF], + ]); + expect(tokenizeAndHumanizeParts('\\r \\r \\r', {escapedString: true})).toEqual([ + [lex.TokenType.TEXT, '\n \n \n'], // post processing converts `\r` to `\n` + [lex.TokenType.EOF], + ]); + expect(tokenizeAndHumanizeParts('\\v \\v \\v', {escapedString: true})).toEqual([ + [lex.TokenType.TEXT, '\v \v \v'], + [lex.TokenType.EOF], + ]); + expect(tokenizeAndHumanizeParts('\\t \\t \\t', {escapedString: true})).toEqual([ + [lex.TokenType.TEXT, '\t \t \t'], + [lex.TokenType.EOF], + ]); + expect(tokenizeAndHumanizeParts('\\b \\b \\b', {escapedString: true})).toEqual([ + [lex.TokenType.TEXT, '\b \b \b'], + [lex.TokenType.EOF], + ]); + expect(tokenizeAndHumanizeParts('\\f \\f \\f', {escapedString: true})).toEqual([ + [lex.TokenType.TEXT, '\f 
\f \f'], + [lex.TokenType.EOF], + ]); + expect(tokenizeAndHumanizeParts( + '\\\' \\" \\` \\\\ \\n \\r \\v \\t \\b \\f', {escapedString: true})) + .toEqual([ + [lex.TokenType.TEXT, '\' \" \` \\ \n \n \v \t \b \f'], + [lex.TokenType.EOF], + + ]); + }); + + it('should unescape null sequences', () => { + expect(tokenizeAndHumanizeParts('\\0', {escapedString: true})).toEqual([ + [lex.TokenType.EOF], + ]); + // \09 is not an octal number so the \0 is taken as EOF + expect(tokenizeAndHumanizeParts('\\09', {escapedString: true})).toEqual([ + [lex.TokenType.EOF], + ]); + }); + + it('should unescape octal sequences', () => { + // \19 is read as an octal `\1` followed by a normal char `9` + // \1234 is read as an octal `\123` followed by a normal char `4` + // \999 is not an octal number so its backslash just gets removed. + expect(tokenizeAndHumanizeParts( + '\\001 \\01 \\1 \\12 \\223 \\19 \\2234 \\999', {escapedString: true})) + .toEqual([ + [lex.TokenType.TEXT, '\x01 \x01 \x01 \x0A \x93 \x019 \x934 999'], + [lex.TokenType.EOF], + ]); + }); + + it('should unescape hex sequences', () => { + expect(tokenizeAndHumanizeParts('\\x12 \\x4F \\xDC', {escapedString: true})).toEqual([ + [lex.TokenType.TEXT, '\x12 \x4F \xDC'], + [lex.TokenType.EOF], + ]); + }); + + it('should report an error on an invalid hex sequence', () => { + expect(tokenizeAndHumanizeErrors('\\xGG', {escapedString: true})).toEqual([ + [null, 'Invalid hexadecimal escape sequence', '0:2'] + ]); + + expect(tokenizeAndHumanizeErrors('abc \\x xyz', {escapedString: true})).toEqual([ + [lex.TokenType.TEXT, 'Invalid hexadecimal escape sequence', '0:6'] + ]); + + expect(tokenizeAndHumanizeErrors('abc\\x', {escapedString: true})).toEqual([ + [lex.TokenType.TEXT, 'Unexpected character "EOF"', '0:5'] + ]); + }); + + it('should unescape fixed length Unicode sequences', () => { + expect(tokenizeAndHumanizeParts('\\u0123 \\uABCD', {escapedString: true})).toEqual([ + [lex.TokenType.TEXT, '\u0123 \uABCD'], + [lex.TokenType.EOF], 
+ ]); + }); + + it('should error on an invalid fixed length Unicode sequence', () => { + expect(tokenizeAndHumanizeErrors('\\uGGGG', {escapedString: true})).toEqual([ + [null, 'Invalid hexadecimal escape sequence', '0:2'] + ]); + }); + + it('should unescape variable length Unicode sequences', () => { + expect(tokenizeAndHumanizeParts( + '\\u{01} \\u{ABC} \\u{1234} \\u{123AB}', {escapedString: true})) + .toEqual([ + [lex.TokenType.TEXT, '\u{01} \u{ABC} \u{1234} \u{123AB}'], + [lex.TokenType.EOF], + ]); + }); + + it('should error on an invalid variable length Unicode sequence', () => { + expect(tokenizeAndHumanizeErrors('\\u{GG}', {escapedString: true})).toEqual([ + [null, 'Invalid hexadecimal escape sequence', '0:3'] + ]); + }); + + it('should unescape line continuations', () => { + expect(tokenizeAndHumanizeParts('abc\\\ndef', {escapedString: true})).toEqual([ + [lex.TokenType.TEXT, 'abcdef'], + [lex.TokenType.EOF], + ]); + expect(tokenizeAndHumanizeParts('\\\nx\\\ny\\\n', {escapedString: true})).toEqual([ + [lex.TokenType.TEXT, 'xy'], + [lex.TokenType.EOF], + ]); + }); + + it('should remove backslash from "non-escape" sequences', () => { + expect(tokenizeAndHumanizeParts('\\a \\g \\~', {escapedString: true})).toEqual([ + [lex.TokenType.TEXT, 'a g ~'], + [lex.TokenType.EOF], + ]); + }); + + it('should unescape sequences in plain text', () => { + expect(tokenizeAndHumanizeParts('abc\ndef\\nghi\\tjkl\\`\\\'\\"mno', {escapedString: true})) + .toEqual([ + [lex.TokenType.TEXT, 'abc\ndef\nghi\tjkl`\'"mno'], + [lex.TokenType.EOF], + ]); + }); + + it('should unescape sequences in raw text', () => { + expect(tokenizeAndHumanizeParts( + '<script>abc\ndef\\nghi\\tjkl\\`\\\'\\"mno</script>', {escapedString: true})) + .toEqual([ + [lex.TokenType.TAG_OPEN_START, null, 'script'], + [lex.TokenType.TAG_OPEN_END], + [lex.TokenType.RAW_TEXT, 'abc\ndef\nghi\tjkl`\'"mno'], + [lex.TokenType.TAG_CLOSE, null, 'script'], + [lex.TokenType.EOF], + ]); + }); + + it('should unescape sequences in escapable raw text', () => { +
expect(tokenizeAndHumanizeParts( + '