From f7c867ebc28d6f1d0f9315c0d816ac0d2ea00269 Mon Sep 17 00:00:00 2001 From: Pete Bacon Darwin Date: Tue, 26 Feb 2019 13:06:26 +0000 Subject: [PATCH] fix(ivy): correctly tokenize escaped characters in templates (#28978) Previously the start of a character indicated by an escape sequence was being incorrectly computed by the lexer, which caused tokens to include the start of the escaped character sequence in the preceding token. In particular this affected the name extracted from opening tags if the name was terminated by an escape sequence. For example, `` would have the name `t\` rather than `t`. This fix refactors the lexer to use a "cursor" object to iterate over the characters in the template source. There are two cursor implementations, one expects a simple string, the other expects a string that contains JavaScript escape sequences that need to be unescaped. PR Close #28978 --- packages/compiler/src/ml_parser/lexer.ts | 751 ++++++++++-------- .../compiler/test/ml_parser/lexer_spec.ts | 72 +- 2 files changed, 476 insertions(+), 347 deletions(-) diff --git a/packages/compiler/src/ml_parser/lexer.ts b/packages/compiler/src/ml_parser/lexer.ts index 0e2f44f2b0..ef798bb3aa 100644 --- a/packages/compiler/src/ml_parser/lexer.ts +++ b/packages/compiler/src/ml_parser/lexer.ts @@ -120,17 +120,10 @@ class _ControlFlowError { // See http://www.w3.org/TR/html51/syntax.html#writing class _Tokenizer { - private _input: string; - private _end: number; + private _cursor: CharacterCursor; private _tokenizeIcu: boolean; private _interpolationConfig: InterpolationConfig; - private _escapedString: boolean; - private _peek: number = -1; - private _nextPeek: number = -1; - private _index: number; - private _line: number; - private _column: number; - private _currentTokenStart: ParseLocation|null = null; + private _currentTokenStart: CharacterCursor|null = null; private _currentTokenType: TokenType|null = null; private _expansionCaseStack: TokenType[] = []; private _inInterpolation: boolean = false; @@ -145,31 +138,18 @@ class _Tokenizer { * @param _interpolationConfig */ constructor( - private _file: ParseSourceFile, private _getTagDefinition: (tagName: string) => TagDefinition, + _file: ParseSourceFile, private _getTagDefinition: (tagName: string) => TagDefinition, options: TokenizeOptions) { this._tokenizeIcu = options.tokenizeExpansionForms || false; this._interpolationConfig = options.interpolationConfig || DEFAULT_INTERPOLATION_CONFIG; - this._escapedString = options.escapedString || false; - this._input = _file.content; - if (options.range) { - this._end = options.range.endPos; - this._index = options.range.startPos; - this._line = options.range.startLine; - this._column = options.range.startCol; - } else { - this._end = this._input.length; - this._index = 0; - this._line = 0; - this._column = 0; - } + const range = + options.range || {endPos: _file.content.length, startPos: 0, startLine: 0, startCol: 0}; + this._cursor = options.escapedString ? new EscapedCharacterCursor(_file, range) : + new PlainCharacterCursor(_file, range); try { - this._initPeek(); + this._cursor.init(); } catch (e) { - if (e instanceof _ControlFlowError) { - this.errors.push(e.error); - } else { - throw e; - } + this.handleError(e); } } @@ -182,8 +162,8 @@ class _Tokenizer { } tokenize(): TokenizeResult { - while (this._peek !== chars.$EOF) { - const start = this._getLocation(); + while (this._cursor.peek() !== chars.$EOF) { + const start = this._cursor.clone(); try { if (this._attemptCharCode(chars.$LT)) { if (this._attemptCharCode(chars.$BANG)) { @@ -203,11 +183,7 @@ class _Tokenizer { this._consumeText(); } } catch (e) { - if (e instanceof _ControlFlowError) { - this.errors.push(e.error); - } else { - throw e; - } + this.handleError(e); } } this._beginToken(TokenType.EOF); @@ -220,17 +196,17 @@ class _Tokenizer { * @internal */ private _tokenizeExpansionForm(): boolean { - if (isExpansionFormStart(this._input, this._index, this._interpolationConfig)) { + if (this.isExpansionFormStart()) { this._consumeExpansionFormStart(); return true; } - if (isExpansionCaseStart(this._peek) && this._isInExpansionForm()) { + if (isExpansionCaseStart(this._cursor.peek()) && this._isInExpansionForm()) { this._consumeExpansionCaseStart(); return true; } - if (this._peek === chars.$RBRACE) { + if (this._cursor.peek() === chars.$RBRACE) { if (this._isInExpansionCase()) { this._consumeExpansionCaseEnd(); return true; @@ -245,34 +221,24 @@ class _Tokenizer { return false; } - private _getLocation(): ParseLocation { - return new ParseLocation(this._file, this._index, this._line, this._column); - } - - private _getSpan( - start: ParseLocation = this._getLocation(), - end: ParseLocation = this._getLocation()): ParseSourceSpan { - return new ParseSourceSpan(start, end); - } - - private _beginToken(type: TokenType, start: ParseLocation = this._getLocation()) { + private _beginToken(type: TokenType, start = this._cursor.clone()) { this._currentTokenStart = start; this._currentTokenType = type; } - private _endToken(parts: string[], end: ParseLocation = this._getLocation()): Token { + private _endToken(parts: string[], end = this._cursor.clone()): Token { if (this._currentTokenStart === null) { throw new TokenError( 'Programming error - attempted to end a token when there was no start to the token', - this._currentTokenType, this._getSpan(end, end)); + this._currentTokenType, this._cursor.getSpan(end)); } if (this._currentTokenType === null) { throw new TokenError( 'Programming error - attempted to end a token which has no token type', null, - this._getSpan(this._currentTokenStart, end)); + this._cursor.getSpan(this._currentTokenStart)); } const token = - new Token(this._currentTokenType, parts, new ParseSourceSpan(this._currentTokenStart, end)); + new Token(this._currentTokenType, parts, this._cursor.getSpan(this._currentTokenStart)); this.tokens.push(token); this._currentTokenStart = null; this._currentTokenType = null; @@ -284,89 +250,57 @@ class _Tokenizer { msg += ` (Do you have an unescaped "{" in your template? Use "{{ '{' }}") to escape it.)`; } const error = new TokenError(msg, this._currentTokenType, span); - this._currentTokenStart = null !; - this._currentTokenType = null !; + this._currentTokenStart = null; + this._currentTokenType = null; return new _ControlFlowError(error); } - private _advance(processingEscapeSequence?: boolean) { - if (this._index >= this._end) { - throw this._createError(_unexpectedCharacterErrorMsg(chars.$EOF), this._getSpan()); + private handleError(e: any) { + if (e instanceof CursorError) { + e = this._createError(e.msg, this._cursor.getSpan(e.cursor)); } - // The actual character in the input might be different to the _peek if we are processing - // escape characters. We only want to track "real" new lines. - const actualChar = this._input.charCodeAt(this._index); - if (actualChar === chars.$LF) { - this._line++; - this._column = 0; - } else if (!chars.isNewLine(actualChar)) { - this._column++; - } - this._index++; - this._initPeek(processingEscapeSequence); - } - - /** - * Initialize the _peek and _nextPeek properties based on the current _index. - * @param processingEscapeSequence whether we are in the middle of processing an escape sequence. - */ - private _initPeek(processingEscapeSequence?: boolean) { - this._peek = this._index >= this._end ? chars.$EOF : this._input.charCodeAt(this._index); - this._nextPeek = - this._index + 1 >= this._end ? chars.$EOF : this._input.charCodeAt(this._index + 1); - if (this._peek === chars.$BACKSLASH && processingEscapeSequence !== true && - this._escapedString) { - this._processEscapeSequence(); - } - } - - /** - * Advance the specific number of characters. - * @param count The number of characters to advance. - * @param processingEscapeSequence Whether we want `advance()` to process escape sequences. - */ - private _advanceN(count: number, processingEscapeSequence?: boolean) { - while (count) { - this._advance(processingEscapeSequence); - count--; + if (e instanceof _ControlFlowError) { + this.errors.push(e.error); + } else { + throw e; } } private _attemptCharCode(charCode: number): boolean { - if (this._peek === charCode) { - this._advance(); + if (this._cursor.peek() === charCode) { + this._cursor.advance(); return true; } return false; } private _attemptCharCodeCaseInsensitive(charCode: number): boolean { - if (compareCharCodeCaseInsensitive(this._peek, charCode)) { - this._advance(); + if (compareCharCodeCaseInsensitive(this._cursor.peek(), charCode)) { + this._cursor.advance(); return true; } return false; } private _requireCharCode(charCode: number) { - const location = this._getLocation(); + const location = this._cursor.clone(); if (!this._attemptCharCode(charCode)) { throw this._createError( - _unexpectedCharacterErrorMsg(this._peek), this._getSpan(location, location)); + _unexpectedCharacterErrorMsg(this._cursor.peek()), this._cursor.getSpan(location)); } } private _attemptStr(chars: string): boolean { const len = chars.length; - if (this._index + len > this._end) { + if (this._cursor.charsLeft() < len) { return false; } - const initialPosition = this._savePosition(); + const initialPosition = this._cursor.clone(); for (let i = 0; i < len; i++) { if (!this._attemptCharCode(chars.charCodeAt(i))) { // If attempting to parse the string fails, we want to reset the parser // to where it was before the attempt - this._restorePosition(initialPosition); + this._cursor = initialPosition; return false; } } @@ -383,277 +317,162 @@ class _Tokenizer { } private _requireStr(chars: string) { - const location = this._getLocation(); + const location = this._cursor.clone(); if (!this._attemptStr(chars)) { - throw this._createError(_unexpectedCharacterErrorMsg(this._peek), this._getSpan(location)); + throw this._createError( + _unexpectedCharacterErrorMsg(this._cursor.peek()), this._cursor.getSpan(location)); } } private _attemptCharCodeUntilFn(predicate: (code: number) => boolean) { - while (!predicate(this._peek)) { - this._advance(); + while (!predicate(this._cursor.peek())) { + this._cursor.advance(); } } private _requireCharCodeUntilFn(predicate: (code: number) => boolean, len: number) { - const start = this._getLocation(); + const start = this._cursor.clone(); this._attemptCharCodeUntilFn(predicate); - if (this._index - start.offset < len) { + const end = this._cursor.clone(); + if (end.diff(start) < len) { throw this._createError( - _unexpectedCharacterErrorMsg(this._peek), this._getSpan(start, start)); + _unexpectedCharacterErrorMsg(this._cursor.peek()), this._cursor.getSpan(start)); } } private _attemptUntilChar(char: number) { - while (this._peek !== char) { - this._advance(); + while (this._cursor.peek() !== char) { + this._cursor.advance(); } } private _readChar(decodeEntities: boolean): string { - if (decodeEntities && this._peek === chars.$AMPERSAND) { + if (decodeEntities && this._cursor.peek() === chars.$AMPERSAND) { return this._decodeEntity(); } else { // Don't rely upon reading directly from `_input` as the actual char value // may have been generated from an escape sequence. - const char = String.fromCodePoint(this._peek); - this._advance(); + const char = String.fromCodePoint(this._cursor.peek()); + this._cursor.advance(); return char; } } private _decodeEntity(): string { - const start = this._getLocation(); - this._advance(); + const start = this._cursor.clone(); + this._cursor.advance(); if (this._attemptCharCode(chars.$HASH)) { const isHex = this._attemptCharCode(chars.$x) || this._attemptCharCode(chars.$X); - const numberStart = this._getLocation().offset; + const codeStart = this._cursor.clone(); this._attemptCharCodeUntilFn(isDigitEntityEnd); - if (this._peek != chars.$SEMICOLON) { - throw this._createError(_unexpectedCharacterErrorMsg(this._peek), this._getSpan()); + if (this._cursor.peek() != chars.$SEMICOLON) { + throw this._createError( + _unexpectedCharacterErrorMsg(this._cursor.peek()), this._cursor.getSpan()); } - this._advance(); - const strNum = this._input.substring(numberStart, this._index - 1); + const strNum = this._cursor.getChars(codeStart); + this._cursor.advance(); try { const charCode = parseInt(strNum, isHex ? 16 : 10); return String.fromCharCode(charCode); } catch { - const entity = this._input.substring(start.offset + 1, this._index - 1); - throw this._createError(_unknownEntityErrorMsg(entity), this._getSpan(start)); + throw this._createError( + _unknownEntityErrorMsg(this._cursor.getChars(start)), this._cursor.getSpan()); } } else { - const startPosition = this._savePosition(); + const nameStart = this._cursor.clone(); this._attemptCharCodeUntilFn(isNamedEntityEnd); - if (this._peek != chars.$SEMICOLON) { - this._restorePosition(startPosition); + if (this._cursor.peek() != chars.$SEMICOLON) { + this._cursor = nameStart; return '&'; } - this._advance(); - const name = this._input.substring(start.offset + 1, this._index - 1); + const name = this._cursor.getChars(nameStart); + this._cursor.advance(); const char = NAMED_ENTITIES[name]; if (!char) { - throw this._createError(_unknownEntityErrorMsg(name), this._getSpan(start)); + throw this._createError(_unknownEntityErrorMsg(name), this._cursor.getSpan(start)); } return char; } } - /** - * Process the escape sequence that starts at the current position in the text. - * - * This method is called from `_advance()` to ensure that escape sequences are - * always processed correctly however tokens are being consumed. - * - * But note that this method also calls `_advance()` (re-entering) to move through - * the characters within an escape sequence. In that case it tells `_advance()` not - * to attempt to process further escape sequences by passing `true` as its first - * argument. - */ - private _processEscapeSequence(): void { - this._advance(true); // advance past the backslash - - // First check for standard control char sequences - if (this._peekChar() === chars.$n) { - this._peek = chars.$LF; - } else if (this._peekChar() === chars.$r) { - this._peek = chars.$CR; - } else if (this._peekChar() === chars.$v) { - this._peek = chars.$VTAB; - } else if (this._peekChar() === chars.$t) { - this._peek = chars.$TAB; - } else if (this._peekChar() === chars.$b) { - this._peek = chars.$BSPACE; - } else if (this._peekChar() === chars.$f) { - this._peek = chars.$FF; - } - - // Now consider more complex sequences - - else if (this._peekChar() === chars.$u) { - // Unicode code-point sequence - this._advance(true); // advance past the `u` char - if (this._peekChar() === chars.$LBRACE) { - // Variable length Unicode, e.g. `\x{123}` - this._advance(true); // advance past the `{` char - // Advance past the variable number of hex digits until we hit a `}` char - const start = this._getLocation(); - while (this._peekChar() !== chars.$RBRACE) { - this._advance(true); - } - this._decodeHexDigits(start, this._index - start.offset); - } else { - // Fixed length Unicode, e.g. `\u1234` - this._parseFixedHexSequence(4); - } - } - - else if (this._peekChar() === chars.$x) { - // Hex char code, e.g. `\x2F` - this._advance(true); // advance past the `x` char - this._parseFixedHexSequence(2); - } - - else if (chars.isOctalDigit(this._peekChar())) { - // Octal char code, e.g. `\012`, - const start = this._index; - let length = 1; - // Note that we work with `_nextPeek` because, although we check the next character - // after the sequence to find the end of the sequence, - // we do not want to advance that far to check the character, otherwise we will - // have to back up. - while (chars.isOctalDigit(this._nextPeek) && length < 3) { - this._advance(true); - length++; - } - const octal = this._input.substr(start, length); - this._peek = parseInt(octal, 8); - } - - else if (chars.isNewLine(this._peekChar())) { - // Line continuation `\` followed by a new line - this._advance(true); // advance over the newline - } - - // If none of the `if` blocks were executed then we just have an escaped normal character. - // In that case we just, effectively, skip the backslash from the character. - } - - private _parseFixedHexSequence(length: number) { - const start = this._getLocation(); - this._advanceN(length - 1, true); - this._decodeHexDigits(start, length); - } - - private _decodeHexDigits(start: ParseLocation, length: number) { - const hex = this._input.substr(start.offset, length); - const charCode = parseInt(hex, 16); - if (!isNaN(charCode)) { - this._peek = charCode; - } else { - throw this._createError( - 'Invalid hexadecimal escape sequence', this._getSpan(start, this._getLocation())); - } - } - - /** - * This little helper is to solve a problem where the TS compiler will narrow - * the type of `_peek` after an `if` statment, even if there is a call to a - * method that might mutate the `_peek`. - * - * For example: - * - * ``` - * if (this._peek === 10) { - * this._advance(); // mutates _peek - * if (this._peek === 20) { - * ... - * ``` - * - * The second if statement fails TS compilation because the compiler has determined - * that `_peek` is `10` and so can never be equal to `20`. - */ - private _peekChar(): number { return this._peek; } - - private _consumeRawText( - decodeEntities: boolean, firstCharOfEnd: number, attemptEndRest: () => boolean): Token { - let tagCloseStart: ParseLocation; - const textStart = this._getLocation(); - this._beginToken(decodeEntities ? TokenType.ESCAPABLE_RAW_TEXT : TokenType.RAW_TEXT, textStart); + private _consumeRawText(decodeEntities: boolean, endMarkerPredicate: () => boolean): Token { + this._beginToken(decodeEntities ? TokenType.ESCAPABLE_RAW_TEXT : TokenType.RAW_TEXT); const parts: string[] = []; while (true) { - tagCloseStart = this._getLocation(); - if (this._attemptCharCode(firstCharOfEnd) && attemptEndRest()) { + const tagCloseStart = this._cursor.clone(); + const foundEndMarker = endMarkerPredicate(); + this._cursor = tagCloseStart; + if (foundEndMarker) { break; } - if (this._index > tagCloseStart.offset) { - // add the characters consumed by the previous if statement to the output - parts.push(this._input.substring(tagCloseStart.offset, this._index)); - } - while (this._peek !== firstCharOfEnd) { - parts.push(this._readChar(decodeEntities)); - } + parts.push(this._readChar(decodeEntities)); } - return this._endToken([this._processCarriageReturns(parts.join(''))], tagCloseStart); + return this._endToken([this._processCarriageReturns(parts.join(''))]); } - private _consumeComment(start: ParseLocation) { + private _consumeComment(start: CharacterCursor) { this._beginToken(TokenType.COMMENT_START, start); this._requireCharCode(chars.$MINUS); this._endToken([]); - const textToken = this._consumeRawText(false, chars.$MINUS, () => this._attemptStr('->')); - this._beginToken(TokenType.COMMENT_END, textToken.sourceSpan.end); + this._consumeRawText(false, () => this._attemptStr('-->')); + this._beginToken(TokenType.COMMENT_END); + this._requireStr('-->'); this._endToken([]); } - private _consumeCdata(start: ParseLocation) { + private _consumeCdata(start: CharacterCursor) { this._beginToken(TokenType.CDATA_START, start); this._requireStr('CDATA['); this._endToken([]); - const textToken = this._consumeRawText(false, chars.$RBRACKET, () => this._attemptStr(']>')); - this._beginToken(TokenType.CDATA_END, textToken.sourceSpan.end); + this._consumeRawText(false, () => this._attemptStr(']]>')); + this._beginToken(TokenType.CDATA_END); + this._requireStr(']]>'); this._endToken([]); } - private _consumeDocType(start: ParseLocation) { + private _consumeDocType(start: CharacterCursor) { this._beginToken(TokenType.DOC_TYPE, start); + const contentStart = this._cursor.clone(); this._attemptUntilChar(chars.$GT); - this._advance(); - this._endToken([this._input.substring(start.offset + 2, this._index - 1)]); + const content = this._cursor.getChars(contentStart); + this._cursor.advance(); + this._endToken([content]); } private _consumePrefixAndName(): string[] { - const nameOrPrefixStart = this._index; + const nameOrPrefixStart = this._cursor.clone(); let prefix: string = ''; - while (this._peek !== chars.$COLON && !isPrefixEnd(this._peek)) { - this._advance(); + while (this._cursor.peek() !== chars.$COLON && !isPrefixEnd(this._cursor.peek())) { + this._cursor.advance(); } - let nameStart: number; - if (this._peek === chars.$COLON) { - this._advance(); - prefix = this._input.substring(nameOrPrefixStart, this._index - 1); - nameStart = this._index; + let nameStart: CharacterCursor; + if (this._cursor.peek() === chars.$COLON) { + prefix = this._cursor.getChars(nameOrPrefixStart); + this._cursor.advance(); + nameStart = this._cursor.clone(); } else { nameStart = nameOrPrefixStart; } - this._requireCharCodeUntilFn(isNameEnd, this._index === nameStart ? 1 : 0); - const name = this._input.substring(nameStart, this._index); + this._requireCharCodeUntilFn(isNameEnd, prefix === '' ? 0 : 1); + const name = this._cursor.getChars(nameStart); return [prefix, name]; } - private _consumeTagOpen(start: ParseLocation) { - const savedPos = this._savePosition(); + private _consumeTagOpen(start: CharacterCursor) { let tagName: string; let prefix: string; let openTagToken: Token|undefined; + const innerStart = this._cursor.clone(); try { - if (!chars.isAsciiLetter(this._peek)) { - throw this._createError(_unexpectedCharacterErrorMsg(this._peek), this._getSpan()); + if (!chars.isAsciiLetter(this._cursor.peek())) { + throw this._createError( + _unexpectedCharacterErrorMsg(this._cursor.peek()), this._cursor.getSpan(start)); } openTagToken = this._consumeTagOpenStart(start); prefix = openTagToken.parts[0]; tagName = openTagToken.parts[1]; this._attemptCharCodeUntilFn(isNotWhitespace); - while (this._peek !== chars.$SLASH && this._peek !== chars.$GT) { + while (this._cursor.peek() !== chars.$SLASH && this._cursor.peek() !== chars.$GT) { this._consumeAttributeName(); this._attemptCharCodeUntilFn(isNotWhitespace); if (this._attemptCharCode(chars.$EQ)) { @@ -666,7 +485,10 @@ class _Tokenizer { } catch (e) { if (e instanceof _ControlFlowError) { // When the start tag is invalid, assume we want a "<" - this._restorePosition(savedPos); + this._cursor = innerStart; + if (openTagToken) { + this.tokens.pop(); + } // Back to back text tokens are merged at the end this._beginToken(TokenType.TEXT, start); this._endToken(['<']); @@ -686,18 +508,21 @@ class _Tokenizer { } private _consumeRawTextWithTagClose(prefix: string, tagName: string, decodeEntities: boolean) { - const textToken = this._consumeRawText(decodeEntities, chars.$LT, () => { + const textToken = this._consumeRawText(decodeEntities, () => { + if (!this._attemptCharCode(chars.$LT)) return false; if (!this._attemptCharCode(chars.$SLASH)) return false; this._attemptCharCodeUntilFn(isNotWhitespace); if (!this._attemptStrCaseInsensitive(tagName)) return false; this._attemptCharCodeUntilFn(isNotWhitespace); return this._attemptCharCode(chars.$GT); }); - this._beginToken(TokenType.TAG_CLOSE, textToken.sourceSpan.end); + this._beginToken(TokenType.TAG_CLOSE); + this._requireCharCodeUntilFn(code => code === chars.$GT, 3); + this._cursor.advance(); // Consume the `>` this._endToken([prefix, tagName]); } - private _consumeTagOpenStart(start: ParseLocation) { + private _consumeTagOpenStart(start: CharacterCursor) { this._beginToken(TokenType.TAG_OPEN_START, start); const parts = this._consumePrefixAndName(); return this._endToken(parts); @@ -711,26 +536,26 @@ class _Tokenizer { private _consumeAttributeValue() { let value: string; - if (this._peek === chars.$SQ || this._peek === chars.$DQ) { + if (this._cursor.peek() === chars.$SQ || this._cursor.peek() === chars.$DQ) { this._beginToken(TokenType.ATTR_QUOTE); - const quoteChar = this._peek; - this._advance(); + const quoteChar = this._cursor.peek(); + this._cursor.advance(); this._endToken([String.fromCodePoint(quoteChar)]); this._beginToken(TokenType.ATTR_VALUE); const parts: string[] = []; - while (this._peek !== quoteChar) { + while (this._cursor.peek() !== quoteChar) { parts.push(this._readChar(true)); } value = parts.join(''); this._endToken([this._processCarriageReturns(value)]); this._beginToken(TokenType.ATTR_QUOTE); - this._advance(); + this._cursor.advance(); this._endToken([String.fromCodePoint(quoteChar)]); } else { this._beginToken(TokenType.ATTR_VALUE); - const valueStart = this._index; + const valueStart = this._cursor.clone(); this._requireCharCodeUntilFn(isNameEnd, 1); - value = this._input.substring(valueStart, this._index); + value = this._cursor.getChars(valueStart); this._endToken([this._processCarriageReturns(value)]); } } @@ -743,7 +568,7 @@ class _Tokenizer { this._endToken([]); } - private _consumeTagClose(start: ParseLocation) { + private _consumeTagClose(start: CharacterCursor) { this._beginToken(TokenType.TAG_CLOSE, start); this._attemptCharCodeUntilFn(isNotWhitespace); const prefixAndName = this._consumePrefixAndName(); @@ -753,50 +578,50 @@ class _Tokenizer { } private _consumeExpansionFormStart() { - this._beginToken(TokenType.EXPANSION_FORM_START, this._getLocation()); + this._beginToken(TokenType.EXPANSION_FORM_START); this._requireCharCode(chars.$LBRACE); this._endToken([]); this._expansionCaseStack.push(TokenType.EXPANSION_FORM_START); - this._beginToken(TokenType.RAW_TEXT, this._getLocation()); + this._beginToken(TokenType.RAW_TEXT); const condition = this._readUntil(chars.$COMMA); - this._endToken([condition], this._getLocation()); + this._endToken([condition]); this._requireCharCode(chars.$COMMA); this._attemptCharCodeUntilFn(isNotWhitespace); - this._beginToken(TokenType.RAW_TEXT, this._getLocation()); + this._beginToken(TokenType.RAW_TEXT); const type = this._readUntil(chars.$COMMA); - this._endToken([type], this._getLocation()); + this._endToken([type]); this._requireCharCode(chars.$COMMA); this._attemptCharCodeUntilFn(isNotWhitespace); } private _consumeExpansionCaseStart() { - this._beginToken(TokenType.EXPANSION_CASE_VALUE, this._getLocation()); + this._beginToken(TokenType.EXPANSION_CASE_VALUE); const value = this._readUntil(chars.$LBRACE).trim(); - this._endToken([value], this._getLocation()); + this._endToken([value]); this._attemptCharCodeUntilFn(isNotWhitespace); - this._beginToken(TokenType.EXPANSION_CASE_EXP_START, this._getLocation()); + this._beginToken(TokenType.EXPANSION_CASE_EXP_START); this._requireCharCode(chars.$LBRACE); - this._endToken([], this._getLocation()); + this._endToken([]); this._attemptCharCodeUntilFn(isNotWhitespace); this._expansionCaseStack.push(TokenType.EXPANSION_CASE_EXP_START); } private _consumeExpansionCaseEnd() { - this._beginToken(TokenType.EXPANSION_CASE_EXP_END, this._getLocation()); + this._beginToken(TokenType.EXPANSION_CASE_EXP_END); this._requireCharCode(chars.$RBRACE); - this._endToken([], this._getLocation()); + this._endToken([]); this._attemptCharCodeUntilFn(isNotWhitespace); this._expansionCaseStack.pop(); } private _consumeExpansionFormEnd() { - this._beginToken(TokenType.EXPANSION_FORM_END, this._getLocation()); + this._beginToken(TokenType.EXPANSION_FORM_END); this._requireCharCode(chars.$RBRACE); this._endToken([]); @@ -804,7 +629,7 @@ class _Tokenizer { } private _consumeText() { - const start = this._getLocation(); + const start = this._cursor.clone(); this._beginToken(TokenType.TEXT, start); const parts: string[] = []; @@ -826,17 +651,17 @@ class _Tokenizer { } private _isTextEnd(): boolean { - if (this._peek === chars.$LT || this._peek === chars.$EOF) { + if (this._cursor.peek() === chars.$LT || this._cursor.peek() === chars.$EOF) { return true; } if (this._tokenizeIcu && !this._inInterpolation) { - if (isExpansionFormStart(this._input, this._index, this._interpolationConfig)) { + if (this.isExpansionFormStart()) { // start of an expansion form return true; } - if (this._peek === chars.$RBRACE && this._isInExpansionCase()) { + if (this._cursor.peek() === chars.$RBRACE && this._isInExpansionCase()) { // end of and expansion case return true; } @@ -845,26 +670,10 @@ class _Tokenizer { return false; } - private _savePosition(): [number, number, number, number, number] { - return [this._peek, this._index, this._column, this._line, this.tokens.length]; - } - private _readUntil(char: number): string { - const start = this._index; + const start = this._cursor.clone(); this._attemptUntilChar(char); - return this._input.substring(start, this._index); - } - - private _restorePosition(position: [number, number, number, number, number]): void { - this._peek = position[0]; - this._index = position[1]; - this._column = position[2]; - this._line = position[3]; - const nbTokens = position[4]; - if (nbTokens < this.tokens.length) { - // remove any extra tokens - this.tokens = this.tokens.slice(0, nbTokens); - } + return this._cursor.getChars(start); } private _isInExpansionCase(): boolean { @@ -878,6 +687,19 @@ class _Tokenizer { this._expansionCaseStack[this._expansionCaseStack.length - 1] === TokenType.EXPANSION_FORM_START; } + + private isExpansionFormStart(): boolean { + if (this._cursor.peek() !== chars.$LBRACE) { + return false; + } + if (this._interpolationConfig) { + const start = this._cursor.clone(); + const isInterpolation = this._attemptStr(this._interpolationConfig.start); + this._cursor = start; + return !isInterpolation; + } + return true; + } } function isNotWhitespace(code: number): boolean { @@ -902,14 +724,6 @@ function isNamedEntityEnd(code: number): boolean { return code == chars.$SEMICOLON || code == chars.$EOF || !chars.isAsciiLetter(code); } -function isExpansionFormStart( - input: string, offset: number, interpolationConfig: InterpolationConfig): boolean { - const isInterpolationStart = - interpolationConfig ? input.indexOf(interpolationConfig.start, offset) == offset : false; - - return input.charCodeAt(offset) == chars.$LBRACE && !isInterpolationStart; -} - function isExpansionCaseStart(peek: number): boolean { return peek === chars.$EQ || chars.isAsciiLetter(peek) || chars.isDigit(peek); } @@ -928,7 +742,7 @@ function mergeTextTokens(srcTokens: Token[]): Token[] { for (let i = 0; i < srcTokens.length; i++) { const token = srcTokens[i]; if (lastDstToken && lastDstToken.type == TokenType.TEXT && token.type == TokenType.TEXT) { - lastDstToken.parts[0] += token.parts[0]; + lastDstToken.parts[0] ! += token.parts[0]; lastDstToken.sourceSpan.end = token.sourceSpan.end; } else { lastDstToken = token; @@ -938,3 +752,258 @@ function mergeTextTokens(srcTokens: Token[]): Token[] { return dstTokens; } + + +/** + * The _Tokenizer uses objects of this type to move through the input text, + * extracting "parsed characters". These could be more than one actual character + * if the text contains escape sequences. + */ +interface CharacterCursor { + /** Initialize the cursor. */ + init(): void; + /** The parsed character at the current cursor position. */ + peek(): number; + /** Advance the cursor by one parsed character. */ + advance(): void; + /** Get a span from the marked start point to the current point. */ + getSpan(start?: this): ParseSourceSpan; + /** Get the parsed characters from the marked start point to the current point. */ + getChars(start: this): string; + /** The number of characters left before the end of the cursor. */ + charsLeft(): number; + /** The number of characters between `this` cursor and `other` cursor. */ + diff(other: this): number; + /** Make a copy of this cursor */ + clone(): CharacterCursor; +} + +interface CursorState { + peek: number; + offset: number; + line: number; + column: number; +} + +class PlainCharacterCursor implements CharacterCursor { + protected state: CursorState; + protected file: ParseSourceFile; + protected input: string; + protected end: number; + + constructor(fileOrCursor: PlainCharacterCursor); + constructor(fileOrCursor: ParseSourceFile, range: LexerRange); + constructor(fileOrCursor: ParseSourceFile|PlainCharacterCursor, range?: LexerRange) { + if (fileOrCursor instanceof PlainCharacterCursor) { + this.file = fileOrCursor.file; + this.input = fileOrCursor.input; + this.end = fileOrCursor.end; + this.state = {...fileOrCursor.state}; + } else { + if (!range) { + throw new Error( + 'Programming error: the range argument must be provided with a file argument.'); + } + this.file = fileOrCursor; + this.input = fileOrCursor.content; + this.end = range.endPos; + this.state = { + peek: -1, + offset: range.startPos, + line: range.startLine, + column: range.startCol, + }; + } + } + + clone(): PlainCharacterCursor { return new PlainCharacterCursor(this); } + + peek() { return this.state.peek; } + charsLeft() { return this.end - this.state.offset; } + diff(other: this) { return this.state.offset - other.state.offset; } + + advance(): void { this.advanceState(this.state); } + + init(): void { this.updatePeek(this.state); } + + getSpan(start?: this): ParseSourceSpan { + start = start || this; + return new ParseSourceSpan( + new ParseLocation(start.file, start.state.offset, start.state.line, start.state.column), + new ParseLocation(this.file, this.state.offset, this.state.line, this.state.column)); + } + + getChars(start: this): string { + return this.input.substring(start.state.offset, this.state.offset); + } + + charAt(pos: number): number { return this.input.charCodeAt(pos); } + + protected advanceState(state: CursorState) { + if (state.offset >= this.end) { + this.state = state; + throw new CursorError('Unexpected character "EOF"', this); + } + const currentChar = this.charAt(state.offset); + if (currentChar === chars.$LF) { + state.line++; + state.column = 0; + } else if (!chars.isNewLine(currentChar)) { + state.column++; + } + state.offset++; + this.updatePeek(state); + } + + protected updatePeek(state: CursorState): void { + state.peek = state.offset >= this.end ? chars.$EOF : this.charAt(state.offset); + } +} + +class EscapedCharacterCursor extends PlainCharacterCursor { + protected internalState: CursorState; + + constructor(fileOrCursor: EscapedCharacterCursor); + constructor(fileOrCursor: ParseSourceFile, range: LexerRange); + constructor(fileOrCursor: ParseSourceFile|EscapedCharacterCursor, range?: LexerRange) { + if (fileOrCursor instanceof EscapedCharacterCursor) { + super(fileOrCursor); + this.internalState = {...fileOrCursor.internalState}; + } else { + super(fileOrCursor, range !); + this.internalState = this.state; + } + } + + advance(): void { + this.state = this.internalState; + super.advance(); + this.processEscapeSequence(); + } + + init(): void { + super.init(); + this.processEscapeSequence(); + } + + clone(): EscapedCharacterCursor { return new EscapedCharacterCursor(this); } + + getChars(start: this): string { + const cursor = start.clone(); + let chars = ''; + while (cursor.internalState.offset < this.internalState.offset) { + chars += String.fromCodePoint(cursor.peek()); + cursor.advance(); + } + return chars; + } + + /** + * Process the escape sequence that starts at the current position in the text. + * + * This method is called to ensure that `peek` has the unescaped value of escape sequences. + */ + protected processEscapeSequence(): void { + const peek = () => this.internalState.peek; + + if (peek() === chars.$BACKSLASH) { + // We have hit an escape sequence so we need the internal state to become independent + // of the external state. + this.internalState = {...this.state}; + + // Move past the backslash + this.advanceState(this.internalState); + + // First check for standard control char sequences + if (peek() === chars.$n) { + this.state.peek = chars.$LF; + } else if (peek() === chars.$r) { + this.state.peek = chars.$CR; + } else if (peek() === chars.$v) { + this.state.peek = chars.$VTAB; + } else if (peek() === chars.$t) { + this.state.peek = chars.$TAB; + } else if (peek() === chars.$b) { + this.state.peek = chars.$BSPACE; + } else if (peek() === chars.$f) { + this.state.peek = chars.$FF; + } + + // Now consider more complex sequences + else if (peek() === chars.$u) { + // Unicode code-point sequence + this.advanceState(this.internalState); // advance past the `u` char + if (peek() === chars.$LBRACE) { + // Variable length Unicode, e.g. `\x{123}` + this.advanceState(this.internalState); // advance past the `{` char + // Advance past the variable number of hex digits until we hit a `}` char + const digitStart = this.clone(); + let length = 0; + while (peek() !== chars.$RBRACE) { + this.advanceState(this.internalState); + length++; + } + this.state.peek = this.decodeHexDigits(digitStart, length); + } else { + // Fixed length Unicode, e.g. `\u1234` + const digitStart = this.clone(); + this.advanceState(this.internalState); + this.advanceState(this.internalState); + this.advanceState(this.internalState); + this.state.peek = this.decodeHexDigits(digitStart, 4); + } + } + + else if (peek() === chars.$x) { + // Hex char code, e.g. `\x2F` + this.advanceState(this.internalState); // advance past the `x` char + const digitStart = this.clone(); + this.advanceState(this.internalState); + this.state.peek = this.decodeHexDigits(digitStart, 2); + } + + else if (chars.isOctalDigit(peek())) { + // Octal char code, e.g. `\012`, + let octal = ''; + let length = 0; + let previous = this.clone(); + while (chars.isOctalDigit(peek()) && length < 3) { + previous = this.clone(); + octal += String.fromCodePoint(peek()); + this.advanceState(this.internalState); + length++; + } + this.state.peek = parseInt(octal, 8); + // Backup one char + this.internalState = previous.internalState; + } + + else if (chars.isNewLine(this.internalState.peek)) { + // Line continuation `\` followed by a new line + this.advanceState(this.internalState); // advance over the newline + this.state = this.internalState; + } + + else { + // If none of the `if` blocks were executed then we just have an escaped normal character. + // In that case we just, effectively, skip the backslash from the character. + this.state.peek = this.internalState.peek; + } + } + } + + protected decodeHexDigits(start: EscapedCharacterCursor, length: number): number { + const hex = this.input.substr(start.internalState.offset, length); + const charCode = parseInt(hex, 16); + if (!isNaN(charCode)) { + return charCode; + } else { + start.state = start.internalState; + throw new CursorError('Invalid hexadecimal escape sequence', start); + } + } +} + +export class CursorError { + constructor(public msg: string, public cursor: CharacterCursor) {} +} diff --git a/packages/compiler/test/ml_parser/lexer_spec.ts b/packages/compiler/test/ml_parser/lexer_spec.ts index 7f527d4e65..897ff19654 100644 --- a/packages/compiler/test/ml_parser/lexer_spec.ts +++ b/packages/compiler/test/ml_parser/lexer_spec.ts @@ -456,7 +456,7 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u lex.TokenType.TEXT, 'Unknown entity "tbo" - use the "&#;" or "&#x;" syntax', '0:0' ]]); - expect(tokenizeAndHumanizeErrors('&#asdf;')).toEqual([ + expect(tokenizeAndHumanizeErrors('sdf;')).toEqual([ [lex.TokenType.TEXT, 'Unexpected character "s"', '0:3'] ]); expect(tokenizeAndHumanizeErrors(' sdf;')).toEqual([ @@ -891,7 +891,6 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u .toEqual([ [lex.TokenType.TEXT, '\' \" \` \\ \n \n \v \t \b \f'], [lex.TokenType.EOF], - ]); }); @@ -1033,6 +1032,68 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u ]); }); + it('should parse over escaped new line in tag definitions', () => { + const text = ''; + expect(tokenizeAndHumanizeParts(text, {escapedString: true})).toEqual([ + [lex.TokenType.TAG_OPEN_START, '', 't'], + [lex.TokenType.TAG_OPEN_END], + [lex.TokenType.TAG_CLOSE, '', 't'], + [lex.TokenType.EOF], + ]); + }); + + it('should parse over escaped characters in tag definitions', () => { + const text = ''; + expect(tokenizeAndHumanizeParts(text, {escapedString: true})).toEqual([ + [lex.TokenType.TAG_OPEN_START, '', 't'], + [lex.TokenType.TAG_OPEN_END], + [lex.TokenType.TAG_CLOSE, '', 't'], + [lex.TokenType.EOF], + ]); + }); + + it('should unescape characters in tag names', () => { + const text = ''; + expect(tokenizeAndHumanizeParts(text, {escapedString: true})).toEqual([ + [lex.TokenType.TAG_OPEN_START, '', 'td'], + [lex.TokenType.TAG_OPEN_END], + [lex.TokenType.TAG_CLOSE, '', 'td'], + [lex.TokenType.EOF], + ]); + expect(tokenizeAndHumanizeSourceSpans(text, {escapedString: true})).toEqual([ + [lex.TokenType.TAG_OPEN_START, ''], + [lex.TokenType.TAG_CLOSE, ''], + [lex.TokenType.EOF, ''], + ]); + }); + + it('should unescape characters in attributes', () => { + const text = ''; + expect(tokenizeAndHumanizeParts(text, {escapedString: true})).toEqual([ + [lex.TokenType.TAG_OPEN_START, '', 't'], + [lex.TokenType.ATTR_NAME, '', 'd'], + [lex.TokenType.ATTR_QUOTE, '"'], + [lex.TokenType.ATTR_VALUE, 'e'], + [lex.TokenType.ATTR_QUOTE, '"'], + [lex.TokenType.TAG_OPEN_END], + [lex.TokenType.TAG_CLOSE, '', 't'], + [lex.TokenType.EOF], + ]); + }); + + it('should parse over escaped new line in attribute values', () => { + const text = ''; + expect(tokenizeAndHumanizeParts(text, {escapedString: true})).toEqual([ + [lex.TokenType.TAG_OPEN_START, '', 't'], + [lex.TokenType.ATTR_NAME, '', 'a'], + [lex.TokenType.ATTR_VALUE, 'b'], + [lex.TokenType.TAG_OPEN_END], + [lex.TokenType.TAG_CLOSE, '', 't'], + [lex.TokenType.EOF], + ]); + }); + it('should tokenize the correct span when there are escape sequences', () => { const text = 'selector: "app-root",\ntemplate: "line 1\\n\\"line 2\\"\\nline 3",\ninputs: []'; @@ -1057,7 +1118,6 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u 'line 2\\n' + // <- escaped line break 'line 3\\\n' + // <- line continuation ''; - expect(tokenizeAndHumanizeParts(text, {escapedString: true})).toEqual([ [lex.TokenType.TAG_OPEN_START, '', 't'], [lex.TokenType.TAG_OPEN_END], [lex.TokenType.TEXT, 'line 1'], [lex.TokenType.TAG_CLOSE, '', 't'], @@ -1084,7 +1144,7 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u [lex.TokenType.TAG_OPEN_END, '1:2'], [lex.TokenType.TEXT, '1:3'], [lex.TokenType.TAG_CLOSE, '1:9'], - [lex.TokenType.TEXT, '1:14'], // <- escaped newline does not increment the row + [lex.TokenType.TEXT, '1:13'], // <- escaped newline does not increment the row [lex.TokenType.TAG_OPEN_START, '1:15'], [lex.TokenType.TAG_OPEN_END, '1:17'], @@ -1099,8 +1159,8 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u [lex.TokenType.TEXT, '\n'], [lex.TokenType.TAG_OPEN_START, ''], - [lex.TokenType.TEXT, 'line 2'], [lex.TokenType.TAG_CLOSE, '\\'], - [lex.TokenType.TEXT, 'n'], + [lex.TokenType.TEXT, 'line 2'], [lex.TokenType.TAG_CLOSE, ''], + [lex.TokenType.TEXT, '\\n'], [lex.TokenType.TAG_OPEN_START, ''], [lex.TokenType.TEXT, 'line 3\\\n'], [lex.TokenType.TAG_CLOSE, ''],