Tokenizing input

2025-05-27 22:38:44 +09:00
parent 838dbbeb03
commit 69bf54a62d
8 changed files with 225 additions and 24 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,6 @@
 .DS_Store
 *.o
-brb
+brb
+.vscode/settings.json
+*.dSYM
+*.brc
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -0,0 +1,16 @@
+{
+    // Use IntelliSense to learn about possible attributes.
+    // Hover to view descriptions of existing attributes.
+    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
+    "version": "0.2.0",
+    "configurations": [
+        {
+            "name": "Debug Bits Runner Builder",
+            "type": "lldb-dap",
+            "request": "launch",
+            "program": "${workspaceFolder}/brb",
+            "args": ["${workspaceFolder}/test.brc"]
+        }
+
+    ]
+}
--- a/Lexer.cpp
+++ b/Lexer.cpp
@@ -1,9 +1,123 @@
 #include "Lexer.h"
-#include "Token.h"

+// Builds a lexer that keeps its own copy of the given source text.
 Lexer::Lexer(std::string source) : source(source) {
 }

-std::vector<Token> Lexer::tokens() {
-    return { Token::integer, Token::real, Token::integer, Token::eof };
-}
+// Tokenizes the source from the current position up to and including the END
+// token. After each token the cursor (currentIndex) is advanced by the length
+// of that token's lexme, and currentLine is bumped for every NEW_LINE seen.
+// A NEW_LINE that directly follows another NEW_LINE in the output is dropped.
+std::vector<Token> Lexer::getTokens() {
+    std::vector<Token> result;
+    for (;;) {
+        Token next = nextToken();
+        currentIndex += next.getLexme().length();
+
+        const bool isLineBreak = next.getKind() == Token::Kind::NEW_LINE;
+        if (isLineBreak)
+            currentLine++;
+
+        // Collapse a run of line breaks into the single NEW_LINE already stored.
+        const bool repeatsLineBreak = isLineBreak && !result.empty() && result.back() == next;
+        if (!repeatsLineBreak)
+            result.push_back(next);
+
+        // The first token is always stored, so back() is valid here.
+        if (result.back().getKind() == Token::Kind::END)
+            break;
+    }
+    return result;
+}
+
+// Recognizes the token that starts at the cursor, after first moving the
+// cursor past any spaces and tabs. The token's own characters are NOT
+// consumed here; the caller advances the cursor by the lexme length.
+//
+// Matchers are tried in a fixed order: end of input, single-character
+// symbols, integer, new line. If none applies, the current character is
+// wrapped in an INVALID token.
+Token Lexer::nextToken() {
+    while (currentIndex < source.length() && isWhiteSpace(currentIndex))
+        currentIndex++;
+
+    // Must come first: every later matcher reads the character at the cursor.
+    Token candidate = matchEnd();
+    if (candidate != Token::Invalid)
+        return candidate;
+
+    struct SymbolRule {
+        char symbol;
+        Token::Kind kind;
+    };
+    const SymbolRule symbolRules[] = {
+        {'+', Token::Kind::PLUS},
+        {'-', Token::Kind::MINUS},
+        {'*', Token::Kind::STAR},
+        {'/', Token::Kind::SLASH},
+        {'%', Token::Kind::PERCENT},
+        {'(', Token::Kind::LEFT_PAREN},
+        {')', Token::Kind::RIGHT_PAREN},
+        {'.', Token::Kind::DOT},
+        {',', Token::Kind::COMMA},
+    };
+    for (const SymbolRule &rule : symbolRules) {
+        candidate = matchSymbol(rule.symbol, rule.kind);
+        if (candidate != Token::Invalid)
+            return candidate;
+    }
+
+    candidate = matchInteger();
+    if (candidate != Token::Invalid)
+        return candidate;
+
+    candidate = matchNewLine();
+    if (candidate != Token::Invalid)
+        return candidate;
+
+    return matchInvalid();
+}
+
+// Yields an END token with an empty lexme once the cursor is at or past the
+// end of the source; otherwise signals "no match" with Token::Invalid.
+Token Lexer::matchEnd() {
+    const bool exhausted = currentIndex >= source.length();
+    return exhausted ? Token(Token::Kind::END, "") : Token::Invalid;
+}
+
+// Yields a NEW_LINE token when the character at the cursor is '\n';
+// otherwise signals "no match" with Token::Invalid.
+Token Lexer::matchNewLine() {
+    if (!isNewLine(currentIndex))
+        return Token::Invalid;
+
+    return Token(Token::Kind::NEW_LINE, "\n");
+}
+
+// Yields a token of the requested kind when the character at the cursor equals
+// `symbol`; otherwise signals "no match" with Token::Invalid.
+// Uses the bounds-checked std::string::at, so the cursor must be inside the source.
+Token Lexer::matchSymbol(char symbol, Token::Kind kind) {
+    const char current = source.at(currentIndex);
+    if (current != symbol)
+        return Token::Invalid;
+
+    return Token(kind, std::string(1, symbol));
+}
+
+// Yields an INTEGER token covering the longest run of decimal digits that
+// starts at the cursor; signals "no match" with Token::Invalid when the
+// character at the cursor is not a digit.
+Token Lexer::matchInteger() {
+    int digitCount = 0;
+    while (currentIndex + digitCount < source.length() && isDigit(currentIndex + digitCount))
+        digitCount++;
+
+    if (digitCount == 0)
+        return Token::Invalid;
+
+    return Token(Token::Kind::INTEGER, source.substr(currentIndex, digitCount));
+}
+
+// Fallback matcher: wraps the single character at the cursor in an INVALID token.
+Token Lexer::matchInvalid() {
+    return Token(Token::Kind::INVALID, std::string(1, source.at(currentIndex)));
+}
+
+// True when the source character at `index` is a space or a tab.
+// A new line is deliberately not whitespace here: it is a token of its own.
+bool Lexer::isWhiteSpace(int index) {
+    switch (source.at(index)) {
+        case ' ':
+        case '\t':
+            return true;
+        default:
+            return false;
+    }
+}
+
+// True when the source character at `index` is a line feed ('\n').
+bool Lexer::isNewLine(int index) {
+    return source.at(index) == '\n';
+}
+
+// True when the source character at `index` is one of '0'..'9'.
+bool Lexer::isDigit(int index) {
+    const char candidate = source.at(index);
+    return '0' <= candidate && candidate <= '9';
+}
--- a/Lexer.h
+++ b/Lexer.h
@@ -2,16 +2,28 @@
 #define LEXER_H

 #include <vector>
-
-class Token;
+#include "Token.h"

+// Turns a source string into a flat sequence of Token values.
 class Lexer {
 private:
    std::string source;
+    // Position in `source` of the next character to examine.
+    int currentIndex = 0;
+    // Incremented once for every NEW_LINE token produced; starts at 0.
+    int currentLine = 0;
+
+    // Skips spaces/tabs, then returns the token starting at currentIndex
+    // without consuming its characters.
+    Token nextToken();
+    // Each match* helper returns the token it recognizes at currentIndex,
+    // or Token::Invalid when it does not apply.
+    Token matchEnd();
+    Token matchNewLine();
+    // Fallback: wraps the current character in an INVALID token.
+    Token matchInvalid();
+    Token matchSymbol(char symbol, Token::Kind kind);
+    Token matchInteger();
+
+    // Character classification of the source character at `index`.
+    bool isWhiteSpace(int index);
+    bool isNewLine(int index);
+    bool isDigit(int index);

 public:
    Lexer(std::string source);
-    std::vector<Token> tokens();
+    // Tokenizes the source; the returned sequence ends with an END token.
+    std::vector<Token> getTokens();
 };

 #endif
--- a/Token.cpp
+++ b/Token.cpp
@@ -1,18 +1,53 @@
 #include "Token.h"

-Token::Token(Kind kind): kind(kind) {
+// Shared sentinel meaning "no match": kind INVALID with an empty lexme.
+Token Token::Invalid(Token::Kind::INVALID, "");
+
+// Creates a token of the given kind that carries the given source text.
+Token::Token(Kind kind, std::string lexme)
+    : kind(kind),
+      lexme(lexme) {
+}
+
+// Returns the category of this token.
+Token::Kind Token::getKind() {
+    return this->kind;
+}
+
+// Returns a copy of the source text this token was created with.
+std::string Token::getLexme() {
+    return this->lexme;
+}
+
+// Two tokens are equal when they have the same kind; the lexme is ignored.
+bool Token::operator==(Token const& other) {
+    return other.kind == this->kind;
+}
+
+// Exact negation of operator==: tokens differ when their kinds differ.
+bool Token::operator!=(Token const& other) {
+    return !(*this == other);
 }

 std::string Token::toString() {
    switch (kind) {
-        case integer:
+        case PLUS:
+            return "PLUS";
+        case MINUS:
+            return "MINUS";
+        case STAR:
+            return "STAR";
+        case SLASH:
+            return "SLASH";
+        case PERCENT:
+            return "PERCENT";
+        case LEFT_PAREN:
+            return "LEFT_PARENT";
+        case RIGHT_PAREN:
+            return "RIGHT_PAREN";
+        case DOT:
+            return "DOT";
+        case COMMA:
+            return "COMMA";
+        case INTEGER:
            return "INTEGER";
-            break;
-        case real:
-            return "REAL";
-            break;
-        case eof:
-            return "EOF";
-            break;
+        case NEW_LINE:
+            return "NEW_LINE";
+        case END:
+            return "END";
+        case INVALID:
+            return "INVALID";
    }
 }
--- a/Token.h
+++ b/Token.h
@@ -5,18 +5,39 @@

+// A single lexical unit: its kind plus the source text (lexme) it was built from.
 class Token {
 public:
-    enum Kind {
-        integer,
-        real,
-        eof
+    // Every category of token the lexer can produce.
+    enum Kind {        
+        // Arithmetic operators: + - * / %
+        PLUS,
+        MINUS,
+        STAR,
+        SLASH,
+        PERCENT,
+        
+        // Punctuation: ( ) . ,
+        LEFT_PAREN,
+        RIGHT_PAREN,
+        DOT,
+        COMMA,
+
+        // A run of decimal digits.
+        INTEGER,
+
+        // A '\n' character.
+        NEW_LINE,
+
+        // END marks the end of the source; INVALID marks an unrecognized
+        // character and is also the kind of the Token::Invalid sentinel.
+        END,
+        INVALID
    };

 private:
    Kind kind;
+    // Source text this token was created with (empty for END and Token::Invalid).
+    std::string lexme;

 public:
-    Token(Kind kind);
+    Token(Kind kind, std::string lexme);
+    Kind getKind();
+    std::string getLexme();
+    // Comparison looks at the kind only; the lexme is ignored.
+    bool operator==(Token const& other);
+    bool operator!=(Token const& other);
    std::string toString();
+
+    // Sentinel returned by the lexer's matchers to mean "no match".
+    static Token Invalid;
 };

 #endif
--- a/main.cpp
+++ b/main.cpp
@@ -25,7 +25,7 @@ int main(int argc, char **argv) {

    std::string source = readFile(std::string(argv[1]));
    Lexer lexer(source);
-    std::vector<Token> tokens = lexer.tokens();
+    std::vector<Token> tokens = lexer.getTokens();
    for (Token &token : tokens)
        std::cout << token.toString() << " ";
    std::cout << std::endl;
--- a/make.sh
+++ b/make.sh
@@ -1,3 +1,3 @@
 #!/bin/bash

-cc -std=c++17 -lc++ *.cpp -o brb
+cc -g -std=c++17 -lc++ *.cpp -o brb