Better error reporting

2025-05-31 23:24:21 +09:00
parent 2b78b5bf23
commit 61e648e55b
9 changed files with 198 additions and 108 deletions
--- a/src/Lexer.cpp
+++ b/src/Lexer.cpp
@@ -1,92 +1,140 @@
 #include "Lexer.h"

-Lexer::Lexer(std::string source) : source(source) {
+Lexer::Lexer(string source): source(source) {
 }

-std::vector<Token> Lexer::getTokens() {
-    std::vector<Token> tokens;
+vector<Token> Lexer::getTokens() {
+    vector<Token> tokens;
    do {
        Token token = nextToken();
-        currentIndex += token.getLexme().length();

-        if (token.getKind() == Token::Kind::NEW_LINE)
+        // Abort scanning if we got an error
+        if (token.getKind() == Token::Kind::INVALID) {
+            cerr << "Unexpected character '" << token.getLexme() << "' at " << token.getLine() << ":" << token.getColumn() << endl;
+            return vector<Token>();
+         }
+        
+        currentIndex += token.getLexme().length();
+        currentColumn += token.getLexme().length();
+
+        if (token.getKind() == Token::Kind::NEW_LINE) {
            currentLine++;
+            currentColumn = 0;
+        }
        
        // filter out multiple new lines
-        if (tokens.empty() || token.getKind() != Token::Kind::NEW_LINE || tokens.back() != token)
+        if (tokens.empty() || token.getKind() != Token::Kind::NEW_LINE || tokens.back().getKind() != token.getKind())
            tokens.push_back(token);
    } while (tokens.back().getKind() != Token::Kind::END);
    return tokens;
 }

 Token Lexer::nextToken() {
-    Token token = Token::Invalid;
-
-    while (currentIndex < source.length() && isWhiteSpace(currentIndex))
+    while (currentIndex < source.length() && isWhiteSpace(currentIndex)) {
        currentIndex++;
+        currentColumn++;
+    }

-    do {
-        if ((token = matchEnd()) != Token::Invalid)
-            break;
-    
-        if ((token = matchSymbol('+', Token::Kind::PLUS)) != Token::Invalid)
-            break;
-        
-        if ((token = matchSymbol('-', Token::Kind::MINUS)) != Token::Invalid)
-            break;
+    {
+        Token token = matchEnd();
+        if (token.isValid())
+            return token;
+    }

-        if ((token = matchSymbol('*', Token::Kind::STAR)) != Token::Invalid)
-            break;
+    {
+        Token token = matchSymbol('+', Token::Kind::PLUS);
+        if (token.isValid())
+            return token;
+    }

-        if ((token = matchSymbol('/', Token::Kind::SLASH)) != Token::Invalid)
-            break;
+    {
+        Token token = matchSymbol('-', Token::Kind::MINUS);
+        if (token.isValid())
+            return token;
+    }

-        if ((token = matchSymbol('%', Token::Kind::PERCENT)) != Token::Invalid)
-            break;
+    {
+        Token token = matchSymbol('*', Token::Kind::STAR);
+        if (token.isValid())
+            return token;
+    }

-        if ((token = matchSymbol('(', Token::Kind::LEFT_PAREN)) != Token::Invalid)
-            break;
+    {
+        Token token = matchSymbol('/', Token::Kind::SLASH);
+        if (token.isValid())
+            return token;
+    }

-        if ((token = matchSymbol(')', Token::Kind::RIGHT_PAREN)) != Token::Invalid)
-            break;
+    {
+        Token token =matchSymbol('%', Token::Kind::PERCENT);
+        if (token.isValid())
+            return token;
+    }

-        if ((token = matchSymbol('.', Token::Kind::DOT)) != Token::Invalid)
-            break;
+    {
+        Token token = matchSymbol('(', Token::Kind::LEFT_PAREN);
+        if (token.isValid())
+            return token;
+    }

-        if ((token = matchSymbol(',', Token::Kind::COMMA)) != Token::Invalid)
-            break;
+    {
+        Token token = matchSymbol(')', Token::Kind::RIGHT_PAREN);
+        if (token.isValid())
+            return token;
+    }

-        if ((token = matchInteger()) != Token::Invalid)
-            break;
+    {
+        Token token =matchSymbol('.', Token::Kind::DOT);
+        if (token.isValid())
+            return token;
+    }

-        if ((token = matchNewLine()) != Token::Invalid)
-            break;
-        
-        token = matchInvalid();
-    } while(false);
+    {
+        Token token = matchSymbol(',', Token::Kind::COMMA);
+        if (token.isValid())
+            return token;
+    }

-    return token;
+    {
+        Token token = matchInteger();
+        if (token.isValid())
+            return token;
+    }
+
+    {
+        Token token = matchKeyword("fun", Token::Kind::FUNCTION);
+        if (token.isValid())
+            return token;
+    }
+
+    {
+        Token token = matchNewLine();
+        if (token.isValid())
+            return token;
+    }
+
+    return matchInvalid();
 }

 Token Lexer::matchEnd() {
    if (currentIndex >= source.length())
-        return Token(Token::Kind::END, "");
+        return Token(Token::Kind::END, "", currentLine, currentColumn);
    
-    return Token::Invalid;
+    return Token(Token::Kind::INVALID, source.substr(currentIndex, 1), currentLine, currentColumn);
 }

 Token Lexer::matchNewLine() {
    if (isNewLine(currentIndex))
-        return Token(Token::Kind::NEW_LINE, "\n");
+        return Token(Token::Kind::NEW_LINE, "\n", currentLine, currentColumn);

-    return Token::Invalid;
+    return Token(Token::Kind::INVALID, source.substr(currentIndex, 1), currentLine, currentColumn);
 }

 Token Lexer::matchSymbol(char symbol, Token::Kind kind) {
    if (source.at(currentIndex) == symbol)
-        return Token(kind, std::string(1, symbol));
+        return Token(kind, string(1, symbol), currentLine, currentColumn);

-    return Token::Invalid;
+    return Token(Token::Kind::INVALID, source.substr(currentIndex, 1), currentLine, currentColumn);
 }

 Token Lexer::matchInteger() {
@@ -96,15 +144,24 @@ Token Lexer::matchInteger() {
        nextIndex++;
    
    if (nextIndex == currentIndex)
-        return Token::Invalid;
+        return Token(Token::Kind::INVALID, source.substr(currentIndex, 1), currentLine, currentColumn);
    
-    std::string lexme = source.substr(currentIndex, nextIndex - currentIndex);
-    return Token(Token::Kind::INTEGER, lexme);
+    string lexme = source.substr(currentIndex, nextIndex - currentIndex);
+    return Token(Token::Kind::INTEGER, lexme, currentLine, currentColumn);
+}
+
+Token Lexer::matchKeyword(string keyword, Token::Kind kind) {
+    bool isMatching = source.compare(currentIndex, keyword.length(), keyword) == 0;
+    bool isSeparated = (currentIndex + keyword.length() >= source.length()) || isWhiteSpace(currentIndex + keyword.length()) || isNewLine(currentIndex + keyword.length());
+
+    if (isMatching && isSeparated)
+        return Token(Token::Kind::FUNCTION, keyword, currentLine, currentColumn);
+    else
+        return Token(Token::Kind::INVALID, source.substr(currentIndex, 1), currentLine, currentColumn);
 }

 Token Lexer::matchInvalid() {
-    char symbol = source.at(currentIndex);
-    return Token(Token::Kind::INVALID, std::string(1, symbol));
+    return Token(Token::Kind::INVALID, source.substr(currentIndex, 1), currentLine, currentColumn);
 }

 bool Lexer::isWhiteSpace(int index) {