From 69bf54a62d8dfb0a6b84693871479b39b410df31 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafa=C5=82=20Grodzi=C5=84ski?= Date: Tue, 27 May 2025 22:38:44 +0900 Subject: [PATCH] Tokenizing input --- .gitignore | 5 +- .vscode/launch.json | 16 ++++++ Lexer.cpp | 122 ++++++++++++++++++++++++++++++++++++++++++-- Lexer.h | 18 +++++-- Token.cpp | 53 +++++++++++++++---- Token.h | 31 +++++++++-- main.cpp | 2 +- make.sh | 2 +- 8 files changed, 225 insertions(+), 24 deletions(-) create mode 100644 .vscode/launch.json diff --git a/.gitignore b/.gitignore index 48d31f4..06204aa 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,6 @@ .DS_Store *.o -brb \ No newline at end of file +brb +.vscode/settings.json +*.dSYM +*.brc \ No newline at end of file diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 0000000..e8a73f9 --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,16 @@ +{ + // Use IntelliSense to learn about possible attributes. + // Hover to view descriptions of existing attributes. + // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 + "version": "0.2.0", + "configurations": [ + { + "name": "Debug Bits Runner Builder", + "type": "lldb-dap", + "request": "launch", + "program": "${workspaceFolder}/brb", + "args": ["${workspaceFolder}/test.brc"] + } + + ] +} \ No newline at end of file diff --git a/Lexer.cpp b/Lexer.cpp index 8f005a8..16b5150 100644 --- a/Lexer.cpp +++ b/Lexer.cpp @@ -1,9 +1,123 @@ #include "Lexer.h" -#include "Token.h" Lexer::Lexer(std::string source) : source(source) { } -std::vector Lexer::tokens() { - return { Token::integer, Token::real, Token::integer, Token::eof }; -} \ No newline at end of file +std::vector Lexer::getTokens() { + std::vector tokens; + do { + Token token = nextToken(); + currentIndex += token.getLexme().length(); + + if (token.getKind() == Token::Kind::NEW_LINE) + currentLine++; + + // filter out multiple new lines + if (tokens.empty() || token.getKind() != Token::Kind::NEW_LINE || tokens.back() != token) + tokens.push_back(token); + } while (tokens.back().getKind() != Token::Kind::END); + return tokens; +} + +Token Lexer::nextToken() { + Token token = Token::Invalid; + + while (currentIndex < source.length() && isWhiteSpace(currentIndex)) + currentIndex++; + + do { + if ((token = matchEnd()) != Token::Invalid) + break; + + if ((token = matchSymbol('+', Token::Kind::PLUS)) != Token::Invalid) + break; + + if ((token = matchSymbol('-', Token::Kind::MINUS)) != Token::Invalid) + break; + + if ((token = matchSymbol('*', Token::Kind::STAR)) != Token::Invalid) + break; + + if ((token = matchSymbol('/', Token::Kind::SLASH)) != Token::Invalid) + break; + + if ((token = matchSymbol('%', Token::Kind::PERCENT)) != Token::Invalid) + break; + + if ((token = matchSymbol('(', Token::Kind::LEFT_PAREN)) != Token::Invalid) + break; + + if ((token = matchSymbol(')', Token::Kind::RIGHT_PAREN)) != Token::Invalid) + break; + + if ((token = matchSymbol('.', Token::Kind::DOT)) != Token::Invalid) + break; + + if ((token = matchSymbol(',', Token::Kind::COMMA)) != Token::Invalid) + break; + + if ((token = matchInteger()) != Token::Invalid) + break; + + if ((token = matchNewLine()) != Token::Invalid) + break; + + token = matchInvalid(); + } while(false); + + return token; +} + +Token Lexer::matchEnd() { + if (currentIndex >= source.length()) + return Token(Token::Kind::END, ""); + + return Token::Invalid; +} + +Token Lexer::matchNewLine() { + if (isNewLine(currentIndex)) + return Token(Token::Kind::NEW_LINE, "\n"); + + return Token::Invalid; +} + +Token Lexer::matchSymbol(char symbol, Token::Kind kind) { + if (source.at(currentIndex) == symbol) + return Token(kind, std::string(1, symbol)); + + return Token::Invalid; +} + +Token Lexer::matchInteger() { + int nextIndex = currentIndex; + + while (nextIndex < source.length() && isDigit(nextIndex)) + nextIndex++; + + if (nextIndex == currentIndex) + return Token::Invalid; + + std::string lexme = source.substr(currentIndex, nextIndex - currentIndex); + return Token(Token::Kind::INTEGER, lexme); +} + +Token Lexer::matchInvalid() { + char symbol = source.at(currentIndex); + return Token(Token::Kind::INVALID, std::string(1, symbol)); +} + +bool Lexer::isWhiteSpace(int index) { + char character = source.at(index); + return character == ' ' || character == '\t'; +} + +bool Lexer::isNewLine(int index) { + char character = source.at(index); + return character == '\n'; +} + +bool Lexer::isDigit(int index) { + char character = source.at(index); + return character >= '0' && character <= '9'; +} diff --git a/Lexer.h b/Lexer.h index 4b3f413..4225f2c 100644 --- a/Lexer.h +++ b/Lexer.h @@ -2,16 +2,28 @@ #define LEXER_H #include - -class Token; +#include "Token.h" class Lexer { private: std::string source; + int currentIndex = 0; + int currentLine = 0; + + Token nextToken(); + Token matchEnd(); + Token matchNewLine(); + Token matchInvalid(); + Token matchSymbol(char symbol, Token::Kind kind); + Token matchInteger(); + + bool isWhiteSpace(int index); + bool isNewLine(int index); + bool isDigit(int index); public: Lexer(std::string source); - std::vector tokens(); + std::vector getTokens(); }; #endif \ No newline at end of file diff --git a/Token.cpp b/Token.cpp index 3aa468f..e6c99af 100644 --- a/Token.cpp +++ b/Token.cpp @@ -1,18 +1,53 @@ #include "Token.h" -Token::Token(Kind kind): kind(kind) { +Token Token::Invalid = Token(Token::Kind::INVALID, ""); + +Token::Token(Kind kind, std::string lexme): kind(kind), lexme(lexme) { +} + +Token::Kind Token::getKind() { + return kind; +} + +std::string Token::getLexme() { + return lexme; +} + +bool Token::operator==(Token const& other) { + return kind == other.kind; +} + +bool Token::operator!=(Token const& other) { + return kind != other.kind; } std::string Token::toString() { switch (kind) { - case integer: + case PLUS: + return "PLUS"; + case MINUS: + return "MINUS"; + case STAR: + return "STAR"; + case SLASH: + return "SLASH"; + case PERCENT: + return "PERCENT"; + case LEFT_PAREN: + return "LEFT_PARENT"; + case RIGHT_PAREN: + return "RIGHT_PAREN"; + case DOT: + return "DOT"; + case COMMA: + return "COMMA"; + case INTEGER: return "INTEGER"; - break; - case real: - return "REAL"; - break; - case eof: - return "EOF"; - break; + case NEW_LINE: + return "NEW_LINE"; + case END: + return "END"; + case INVALID: + return "INVALID"; } } \ No newline at end of file diff --git a/Token.h b/Token.h index 618072b..12d05eb 100644 --- a/Token.h +++ b/Token.h @@ -5,18 +5,39 @@ class Token { public: - enum Kind { - integer, - real, - eof + enum Kind { + PLUS, + MINUS, + STAR, + SLASH, + PERCENT, + + LEFT_PAREN, + RIGHT_PAREN, + DOT, + COMMA, + + INTEGER, + + NEW_LINE, + + END, + INVALID }; private: Kind kind; + std::string lexme; public: - Token(Kind kind); + Token(Kind kind, std::string lexme); + Kind getKind(); + std::string getLexme(); + bool operator==(Token const& other); + bool operator!=(Token const& other); std::string toString(); + + static Token Invalid; }; #endif \ No newline at end of file diff --git a/main.cpp b/main.cpp index 07d8ec0..6f2c9c1 100644 --- a/main.cpp +++ b/main.cpp @@ -25,7 +25,7 @@ int main(int argc, char **argv) { std::string source = readFile(std::string(argv[1])); Lexer lexer(source); - std::vector tokens = lexer.tokens(); + std::vector tokens = lexer.getTokens(); for (Token &token : tokens) std::cout << token.toString() << " "; std::cout << std::endl; diff --git a/make.sh b/make.sh index c85dd32..8d241b8 100755 --- a/make.sh +++ b/make.sh @@ -1,3 +1,3 @@ #!/bin/bash -cc -std=c++17 -lc++ *.cpp -o brb \ No newline at end of file +cc -g -std=c++17 -lc++ *.cpp -o brb \ No newline at end of file