// Patch summary (src/Lexer.cpp, src/Lexer.h):
//  * Cursor bookkeeping (currentIndex / currentLine / currentColumn) moves out of
//    Lexer::getTokens() into the new Lexer::advanceWithToken(), which match(),
//    matchInteger() and matchIdentifier() call whenever they produce a token.
//  * Lexer::nextToken() learns to skip `//` line comments and nestable `/* */`
//    block comments. A line comment yields the NEW_LINE (or end) token that
//    terminates it; a block comment yields the first NEW_LINE seen inside it,
//    otherwise lexing resumes after it; an unterminated block comment yields INVALID.
//  * "\r\n" is no longer matched as NEW_LINE and '\r' is no longer a separator.
//  * Lexer.h: matcher declarations return shared_ptr instead of Token; match() and
//    advanceWithToken() are declared; isNewLine/matchNewLine/matchSymbol/matchKeyword go away.
//
// Review fixes folded into the comment-skipping hunk (added-line count is still 63,
// so the `+36,69` hunk header remains valid):
//  * Removed `currentIndex += 2;` after matching "//". match() already advances past
//    the lexeme through advanceWithToken(), so the extra bump skipped two more
//    characters -- possibly the terminating "\n", or past the end of `source`, where
//    source.compare() would throw (assuming `source` is a std::string).
//  * The line-comment skip loop now bumps currentColumn together with currentIndex,
//    like the whitespace and block-comment loops do.
//  * After matching "*/" the block-comment loop now `continue`s instead of falling
//    through to the skip step; previously a nested close skipped one extra character,
//    so input such as `/* /* */*/` never saw its final "*/".
//  * `continue;;` -> `continue;`, and two comment typos corrected.
//
// NOTE(review): template arguments look stripped in this copy (`vector>`, bare
// `shared_ptr` / `make_shared`); presumably shared_ptr<Token> -- confirm against the repo.
diff --git a/src/Lexer.cpp b/src/Lexer.cpp index 974a248..507cdda 100644 --- a/src/Lexer.cpp +++ b/src/Lexer.cpp @@ -20,14 +20,6 @@ vector> Lexer::getTokens() { exit(1); } - currentIndex += token->getLexme().length(); - currentColumn += token->getLexme().length(); - - if (token->getKind() == Token::Kind::NEW_LINE) { - currentLine++; - currentColumn = 0; - } - // filter out multiple new lines if (tokens.empty() || token->getKind() != Token::Kind::NEW_LINE || tokens.back()->getKind() != token->getKind()) tokens.push_back(token); @@ -35,7 +27,8 @@ vector> Lexer::getTokens() { return tokens; } -shared_ptr Lexer::nextToken() { +shared_ptr Lexer::nextToken() { + // Ignore white spaces while (currentIndex < source.length() && isWhiteSpace(currentIndex)) { currentIndex++; currentColumn++; @@ -43,6 +36,69 @@ shared_ptr Lexer::nextToken() { shared_ptr token; + // ignore // comment + token = match(Token::Kind::INVALID, "//", false); + if (token) { + do { + // new line + token = match(Token::Kind::NEW_LINE, "\n", false); + if (token != nullptr) + return token; + + // eof + token = matchEnd(); + if (token != nullptr) + return token; + + // if either not found, go to the next character + currentIndex++; + currentColumn++; + } while(true); + } + + // ignore /* */ comment + token = match(Token::Kind::INVALID, "/*", false); + if (token) { + shared_ptr newLineToken = nullptr; // we want to return the first new line we come across + int depth = 1; // so we can embed comments inside each other + do { + // new line + token = match(Token::Kind::NEW_LINE, "\n", false); + newLineToken = newLineToken ? 
newLineToken : token; + if (token) { + continue; + } + + // eof + token = matchEnd(); + if (token) + return make_shared(Token::Kind::INVALID, "", currentLine, currentColumn); + + // go deeper + token = match(Token::Kind::INVALID, "/*", false); + if (token) { + depth++; + continue; + } + + // go back + token = match(Token::Kind::INVALID, "*/", false); + if (token) { + depth--; + continue; + } + + // any other character belongs to the comment body + currentIndex++; + currentColumn++; + } while(depth > 0); + + if (newLineToken) + return newLineToken; + else + return nextToken(); // gets rid of remaining white spaces without repeating the code + } + // arithmetic token = match(Token::Kind::PLUS, "+", false); if (token != nullptr) @@ -126,15 +182,11 @@ shared_ptr Lexer::nextToken() { return token; // new line - token = match(Token::Kind::NEW_LINE, "\r\n", false); - if (token != nullptr) - return token; - token = match(Token::Kind::NEW_LINE, "\n", false); if (token != nullptr) return token; - // other + // eof token = matchEnd(); if (token != nullptr) return token; @@ -146,10 +198,12 @@ shared_ptr Lexer::match(Token::Kind kind, string lexme, bool needsSeparat bool isMatching = source.compare(currentIndex, lexme.length(), lexme) == 0; bool isSeparatorSatisfied = !needsSeparator || isSeparator(currentIndex + lexme.length()); - if (isMatching && isSeparatorSatisfied) - return make_shared(kind, lexme, currentLine, currentColumn); - else + if (!isMatching || !isSeparatorSatisfied) return nullptr; + + shared_ptr token = make_shared(kind, lexme, currentLine, currentColumn); + advanceWithToken(token); + return token; } shared_ptr Lexer::matchInteger() { @@ -162,7 +216,9 @@ shared_ptr Lexer::matchInteger() { return nullptr; string lexme = source.substr(currentIndex, nextIndex - currentIndex); - return make_shared(Token::Kind::INTEGER, lexme, currentLine, currentColumn); + shared_ptr token = make_shared(Token::Kind::INTEGER, lexme, currentLine, currentColumn); + advanceWithToken(token); + return token; } shared_ptr 
Lexer::matchIdentifier() { @@ -175,7 +231,9 @@ shared_ptr Lexer::matchIdentifier() { return nullptr; string lexme = source.substr(currentIndex, nextIndex - currentIndex); - return make_shared(Token::Kind::IDENTIFIER, lexme, currentLine, currentColumn); + shared_ptr token = make_shared(Token::Kind::IDENTIFIER, lexme, currentLine, currentColumn); + advanceWithToken(token); + return token; } shared_ptr Lexer::matchEnd() { @@ -229,9 +287,18 @@ bool Lexer::isSeparator(int index) { case ' ': case '\t': case '\n': - case '\r'; return true; default: return false; } } + + void Lexer::advanceWithToken(shared_ptr token) { + if (token->getKind() == Token::Kind::NEW_LINE) { + currentLine++; + currentColumn = 0; + } else { + currentColumn += token->getLexme().length(); + } + currentIndex += token->getLexme().length(); + } diff --git a/src/Lexer.h b/src/Lexer.h index b18dbb6..3785ada 100644 --- a/src/Lexer.h +++ b/src/Lexer.h @@ -14,24 +14,22 @@ private: int currentLine = 0; int currentColumn = 0; - Token nextToken(); - Token matchEnd(); - Token matchNewLine(); - Token matchInvalid(); - Token matchSymbol(char symbol, Token::Kind kind); - Token matchKeyword(string keyword, Token::Kind kind); - Token matchInteger(); - Token matchIdentifier(); + shared_ptr nextToken(); + shared_ptr match(Token::Kind kind, string lexme, bool needsSeparator); + shared_ptr matchInteger(); + shared_ptr matchIdentifier(); + shared_ptr matchEnd(); + shared_ptr matchInvalid(); bool isWhiteSpace(int index); - bool isNewLine(int index); bool isDigit(int index); bool isIdentifier(int index); bool isSeparator(int index); + void advanceWithToken(shared_ptr token); public: Lexer(string source); - vector getTokens(); + vector> getTokens(); }; #endif \ No newline at end of file