Tokenizing input

This commit is contained in:
Rafał Grodziński
2025-05-27 22:38:44 +09:00
parent 838dbbeb03
commit 69bf54a62d
8 changed files with 225 additions and 24 deletions

5
.gitignore vendored
View File

@@ -1,3 +1,6 @@
.DS_Store
*.o
brb
brb
.vscode/settings.json
*.dSYM
*.brc

16
.vscode/launch.json vendored Normal file
View File

@@ -0,0 +1,16 @@
{
// Use IntelliSense to learn about possible attributes.
// Hover to view descriptions of existing attributes.
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
{
"name": "Debug Bits Runner Builder",
"type": "lldb-dap",
"request": "launch",
"program": "${workspaceFolder}/brb",
"args": ["${workspaceFolder}/test.brc"]
}
]
}

122
Lexer.cpp
View File

@@ -1,9 +1,123 @@
#include "Lexer.h"
#include "Token.h"
Lexer::Lexer(std::string source) : source(source) {
}
std::vector<Token> Lexer::tokens() {
return { Token::integer, Token::real, Token::integer, Token::eof };
}
std::vector<Token> Lexer::getTokens() {
std::vector<Token> tokens;
do {
Token token = nextToken();
currentIndex += token.getLexme().length();
if (token.getKind() == Token::Kind::NEW_LINE)
currentLine++;
// filter out multiple new lines
if (tokens.empty() || token.getKind() != Token::Kind::NEW_LINE || tokens.back() != token)
tokens.push_back(token);
} while (tokens.back().getKind() != Token::Kind::END);
return tokens;
}
// Returns the token that starts at the current position.
//
// Spaces and tabs are skipped first (this moves `currentIndex`). Matchers are
// then tried in a fixed order: end of input, the single-character symbols,
// integer, new line. The first one that yields something other than
// Token::Invalid wins; if none does, the current character is wrapped in an
// INVALID token. Consuming the returned lexme is left to the caller.
Token Lexer::nextToken() {
    for (; currentIndex < source.length() && isWhiteSpace(currentIndex); currentIndex++) {
    }

    // End of input must be checked first: the remaining matchers read
    // source.at(currentIndex).
    Token candidate = matchEnd();
    if (candidate != Token::Invalid)
        return candidate;

    struct SymbolRule {
        char symbol;
        Token::Kind kind;
    };
    static const SymbolRule symbolRules[] = {
        { '+', Token::Kind::PLUS },
        { '-', Token::Kind::MINUS },
        { '*', Token::Kind::STAR },
        { '/', Token::Kind::SLASH },
        { '%', Token::Kind::PERCENT },
        { '(', Token::Kind::LEFT_PAREN },
        { ')', Token::Kind::RIGHT_PAREN },
        { '.', Token::Kind::DOT },
        { ',', Token::Kind::COMMA },
    };
    for (const SymbolRule &rule : symbolRules) {
        candidate = matchSymbol(rule.symbol, rule.kind);
        if (candidate != Token::Invalid)
            return candidate;
    }

    candidate = matchInteger();
    if (candidate != Token::Invalid)
        return candidate;

    candidate = matchNewLine();
    if (candidate != Token::Invalid)
        return candidate;

    return matchInvalid();
}
// Returns an END token with an empty lexme once `currentIndex` has reached or
// passed the end of the source; otherwise returns Token::Invalid.
Token Lexer::matchEnd() {
    const bool hasInputLeft = currentIndex < source.length();
    if (hasInputLeft)
        return Token::Invalid;
    return Token(Token::Kind::END, "");
}
// Returns a NEW_LINE token with lexme "\n" when the current character is a
// line feed; otherwise returns Token::Invalid.
Token Lexer::matchNewLine() {
    if (!isNewLine(currentIndex))
        return Token::Invalid;
    return Token(Token::Kind::NEW_LINE, "\n");
}
// Returns a token of `kind` whose lexme is the single character `symbol` when
// the current character equals `symbol`; otherwise returns Token::Invalid.
// The current character is read with std::string::at, so `currentIndex` must
// be inside the source.
Token Lexer::matchSymbol(char symbol, Token::Kind kind) {
    const char current = source.at(currentIndex);
    if (current != symbol)
        return Token::Invalid;
    return Token(kind, std::string(1, symbol));
}
// Returns an INTEGER token covering the run of decimal digits that starts at
// the current position, or Token::Invalid when the current character is not a
// digit. `currentIndex` itself is not modified.
Token Lexer::matchInteger() {
    int endIndex = currentIndex;
    for (; endIndex < source.length() && isDigit(endIndex); endIndex++) {
    }

    const int digitCount = endIndex - currentIndex;
    if (digitCount == 0)
        return Token::Invalid;

    return Token(Token::Kind::INTEGER, source.substr(currentIndex, digitCount));
}
// Wraps the current character in an INVALID token. The character is read with
// std::string::at, so `currentIndex` must be inside the source.
Token Lexer::matchInvalid() {
    return Token(Token::Kind::INVALID, std::string(1, source.at(currentIndex)));
}
// True when the source character at `index` is a space or a tab.
// `index` is bounds-checked by std::string::at.
bool Lexer::isWhiteSpace(int index) {
    switch (source.at(index)) {
        case ' ':
        case '\t':
            return true;
        default:
            return false;
    }
}
// True when the source character at `index` is a line feed ('\n').
// `index` is bounds-checked by std::string::at.
bool Lexer::isNewLine(int index) {
    return source.at(index) == '\n';
}
// True when the source character at `index` is one of '0'..'9'.
// `index` is bounds-checked by std::string::at.
bool Lexer::isDigit(int index) {
    const char candidate = source.at(index);
    if (candidate < '0')
        return false;
    return candidate <= '9';
}

18
Lexer.h
View File

@@ -2,16 +2,28 @@
#define LEXER_H
#include <vector>
class Token;
#include "Token.h"
// Turns a source string into a sequence of Token values.
class Lexer {
private:
// Text being tokenized.
std::string source;
// Position in `source` of the next character to examine.
int currentIndex = 0;
// Incremented by getTokens() for every NEW_LINE token it reads.
int currentLine = 0;
// Skips spaces/tabs, then returns the first token the matchers below
// recognise at `currentIndex`.
Token nextToken();
// Each matcher returns its token on success and Token::Invalid otherwise
// (matchInvalid always returns an INVALID token holding the current character).
Token matchEnd();
Token matchNewLine();
Token matchInvalid();
Token matchSymbol(char symbol, Token::Kind kind);
Token matchInteger();
// Character classification helpers for the source character at `index`.
bool isWhiteSpace(int index);
bool isNewLine(int index);
bool isDigit(int index);
public:
// Stores a copy of `source` for tokenizing.
Lexer(std::string source);
// Returns a fixed, hard-coded token list; does not read `source`.
std::vector<Token> tokens();
// Tokenizes `source` through the END token, collapsing consecutive NEW_LINE tokens.
std::vector<Token> getTokens();
};
#endif

View File

@@ -1,18 +1,53 @@
#include "Token.h"
Token::Token(Kind kind): kind(kind) {
// Definition of the static sentinel token: kind INVALID with an empty lexme.
Token Token::Invalid = Token(Token::Kind::INVALID, "");
Token::Token(Kind kind, std::string lexme): kind(kind), lexme(lexme) {
}
// Returns the kind this token was constructed with.
Token::Kind Token::getKind() {
    return this->kind;
}
std::string Token::getLexme() {
return lexme;
}
// Two tokens compare equal when their kinds match; the lexme is not compared.
bool Token::operator==(Token const& other) {
    const bool sameKind = (this->kind == other.kind);
    return sameKind;
}
// Two tokens differ when their kinds differ; the lexme is not compared.
bool Token::operator!=(Token const& other) {
    return !(this->kind == other.kind);
}
// Returns the upper-case name of this token's kind, e.g. "PLUS" or "INTEGER".
//
// Each name matches its enumerator exactly (LEFT_PAREN prints "LEFT_PAREN").
// If `kind` holds a value outside the handled enumerators, "UNKNOWN" is
// returned so that control never runs off the end of the function.
std::string Token::toString() {
    switch (kind) {
        case PLUS:
            return "PLUS";
        case MINUS:
            return "MINUS";
        case STAR:
            return "STAR";
        case SLASH:
            return "SLASH";
        case PERCENT:
            return "PERCENT";
        case LEFT_PAREN:
            return "LEFT_PAREN";
        case RIGHT_PAREN:
            return "RIGHT_PAREN";
        case DOT:
            return "DOT";
        case COMMA:
            return "COMMA";
        case INTEGER:
            return "INTEGER";
        case NEW_LINE:
            return "NEW_LINE";
        case END:
            return "END";
        case INVALID:
            return "INVALID";
    }
    // Reached only for a kind value not covered by the cases above.
    return "UNKNOWN";
}

31
Token.h
View File

@@ -5,18 +5,39 @@
// A single lexical token: a kind plus the text (lexme) it was built from.
class Token {
public:
// Token categories.
// NOTE(review): two "enum Kind {" openers appear below with a single closing
// "};" — this looks like an older enumerator list (integer, real, eof)
// interleaved with the current one; confirm which list is intended.
enum Kind {
integer,
real,
eof
enum Kind {
PLUS,
MINUS,
STAR,
SLASH,
PERCENT,
LEFT_PAREN,
RIGHT_PAREN,
DOT,
COMMA,
INTEGER,
NEW_LINE,
END,
INVALID
};
private:
// Category of this token.
Kind kind;
// Text associated with this token.
std::string lexme;
public:
Token(Kind kind);
// Builds a token of `kind` carrying `lexme`.
Token(Kind kind, std::string lexme);
// Accessors for the stored kind and lexme.
Kind getKind();
std::string getLexme();
// Comparisons look at the kind only; the lexme is ignored.
bool operator==(Token const& other);
bool operator!=(Token const& other);
// Returns the name of this token's kind as text.
std::string toString();
// Shared sentinel of kind INVALID with an empty lexme.
static Token Invalid;
};
#endif

View File

@@ -25,7 +25,7 @@ int main(int argc, char **argv) {
std::string source = readFile(std::string(argv[1]));
Lexer lexer(source);
std::vector<Token> tokens = lexer.tokens();
std::vector<Token> tokens = lexer.getTokens();
for (Token &token : tokens)
std::cout << token.toString() << " ";
std::cout << std::endl;

View File

@@ -1,3 +1,3 @@
#!/bin/bash
# Compile every .cpp file in the current directory as C++17 into the `brb` executable.
cc -std=c++17 -lc++ *.cpp -o brb
# Same build with -g added so the executable carries debug information.
cc -g -std=c++17 -lc++ *.cpp -o brb