Updated lexer

This commit is contained in:
Rafał Grodziński
2025-06-02 17:57:21 +09:00
parent 2ef888e374
commit 5e3ef33f15
4 changed files with 177 additions and 152 deletions

View File

@@ -3,189 +3,190 @@
Lexer::Lexer(string source): source(source) { Lexer::Lexer(string source): source(source) {
} }
vector<Token> Lexer::getTokens() { vector<shared_ptr<Token>> Lexer::getTokens() {
vector<Token> tokens; vector<shared_ptr<Token>> tokens;
shared_ptr<Token> token = nullptr;
do { do {
Token token = nextToken(); token = nextToken();
// Got a nullptr, shouldn't have happened
// Abort scanning if we got an error if (!token) {
if (!token.isValid()) { cerr << "Failed to scan tokens" << endl;
cerr << "Unexpected character '" << token.getLexme() << "' at " << token.getLine() << ":" << token.getColumn() << endl; exit(1);
return vector<Token>();
} }
currentIndex += token.getLexme().length(); // Abort scanning if we got an error
currentColumn += token.getLexme().length(); if (!token->isValid()) {
cerr << "Unexpected character '" << token->getLexme() << "' at " << token->getLine() << ":" << token->getColumn() << endl;
exit(1);
}
if (token.getKind() == Token::Kind::NEW_LINE) { currentIndex += token->getLexme().length();
currentColumn += token->getLexme().length();
if (token->getKind() == Token::Kind::NEW_LINE) {
currentLine++; currentLine++;
currentColumn = 0; currentColumn = 0;
} }
// filter out multiple new lines // filter out multiple new lines
if (tokens.empty() || token.getKind() != Token::Kind::NEW_LINE || tokens.back().getKind() != token.getKind()) if (tokens.empty() || token->getKind() != Token::Kind::NEW_LINE || tokens.back()->getKind() != token->getKind())
tokens.push_back(token); tokens.push_back(token);
} while (tokens.back().getKind() != Token::Kind::END); } while (token->getKind() != Token::Kind::END);
return tokens; return tokens;
} }
Token Lexer::nextToken() { shared_ptr<Token> Lexer::nextToken() {
while (currentIndex < source.length() && isWhiteSpace(currentIndex)) { while (currentIndex < source.length() && isWhiteSpace(currentIndex)) {
currentIndex++; currentIndex++;
currentColumn++; currentColumn++;
} }
{ shared_ptr<Token> token;
Token token = matchEnd();
if (token.isValid())
return token;
}
{ // arithmetic
Token token = matchSymbol('+', Token::Kind::PLUS); token = match(Token::Kind::PLUS, "+", false);
if (token.isValid()) if (token != nullptr)
return token; return token;
}
{ token = match(Token::Kind::MINUS, "-", false);
Token token = matchSymbol('-', Token::Kind::MINUS); if (token != nullptr)
if (token.isValid())
return token; return token;
}
{ token = match(Token::Kind::STAR, "*", false);
Token token = matchSymbol('*', Token::Kind::STAR); if (token != nullptr)
if (token.isValid())
return token; return token;
}
{ token = match(Token::Kind::SLASH, "/", false);
Token token = matchSymbol('/', Token::Kind::SLASH); if (token != nullptr)
if (token.isValid())
return token; return token;
}
{ token = match(Token::Kind::PERCENT, "%", false);
Token token =matchSymbol('%', Token::Kind::PERCENT); if (token != nullptr)
if (token.isValid())
return token; return token;
}
{ // logical
Token token = matchSymbol('(', Token::Kind::LEFT_PAREN); token = match(Token::Kind::NOT_EQUAL, "!=", false);
if (token.isValid()) if (token != nullptr)
return token; return token;
}
{ token = match(Token::Kind::EQUAL, "=", false);
Token token = matchSymbol(')', Token::Kind::RIGHT_PAREN); if (token != nullptr)
if (token.isValid())
return token; return token;
}
{ token = match(Token::Kind::LESS_EQUAL, "<=", false);
Token token = matchSymbol(':', Token::Kind::COLON); if (token != nullptr)
if (token.isValid())
return token; return token;
}
{ token = match(Token::Kind::LESS, "<", false);
Token token = matchSymbol(';', Token::Kind::SEMICOLON); if (token != nullptr)
if (token.isValid())
return token; return token;
}
{ token = match(Token::Kind::GREATER_EQUAL, ">=", false);
Token token = matchKeyword("fun", Token::Kind::FUNCTION); if (token != nullptr)
if (token.isValid())
return token; return token;
}
{ token = match(Token::Kind::GREATER, ">", false);
Token token = matchKeyword("ret", Token::Kind::RETURN); if (token != nullptr)
if (token.isValid())
return token; return token;
}
{ // structural
Token token = matchInteger(); token = match(Token::Kind::LEFT_PAREN, "(", false);
if (token.isValid()) if (token != nullptr)
return token; return token;
}
{ token = match(Token::Kind::RIGHT_PAREN, ")", false);
Token token = matchIdentifier(); if (token != nullptr)
if (token.isValid())
return token; return token;
}
{ token = match(Token::Kind::COLON, ":", false);
Token token = matchNewLine(); if (token != nullptr)
if (token.isValid()) return token;
token = match(Token::Kind::SEMICOLON, ";", false);
if (token != nullptr)
return token;
// keywords
token = match(Token::Kind::FUNCTION, "fun", true);
if (token != nullptr)
return token;
token = match(Token::Kind::RETURN, "ret", true);
if (token != nullptr)
return token;
// literal
token = matchInteger();
if (token != nullptr)
return token;
// identifier
token = matchIdentifier();
if (token != nullptr)
return token;
// new line
token = match(Token::Kind::NEW_LINE, "\r\n", false);
if (token != nullptr)
return token;
token = match(Token::Kind::NEW_LINE, "\n", false);
if (token != nullptr)
return token;
// other
token = matchEnd();
if (token != nullptr)
return token; return token;
}
return matchInvalid(); return matchInvalid();
} }
Token Lexer::matchEnd() { shared_ptr<Token> Lexer::match(Token::Kind kind, string lexme, bool needsSeparator) {
if (currentIndex >= source.length()) bool isMatching = source.compare(currentIndex, lexme.length(), lexme) == 0;
return Token(Token::Kind::END, "", currentLine, currentColumn); bool isSeparatorSatisfied = !needsSeparator || isSeparator(currentIndex + lexme.length());
return Token(Token::Kind::INVALID, source.substr(currentIndex, 1), currentLine, currentColumn); if (isMatching && isSeparatorSatisfied)
} return make_shared<Token>(kind, lexme, currentLine, currentColumn);
Token Lexer::matchNewLine() {
if (isNewLine(currentIndex))
return Token(Token::Kind::NEW_LINE, "\n", currentLine, currentColumn);
return Token(Token::Kind::INVALID, source.substr(currentIndex, 1), currentLine, currentColumn);
}
Token Lexer::matchSymbol(char symbol, Token::Kind kind) {
if (source.at(currentIndex) == symbol)
return Token(kind, string(1, symbol), currentLine, currentColumn);
return Token(Token::Kind::INVALID, source.substr(currentIndex, 1), currentLine, currentColumn);
}
Token Lexer::matchKeyword(string keyword, Token::Kind kind) {
bool isMatching = source.compare(currentIndex, keyword.length(), keyword) == 0;
if (isMatching && isSeparator(currentIndex + keyword.length()))
return Token(kind, keyword, currentLine, currentColumn);
else else
return Token(Token::Kind::INVALID, source.substr(currentIndex, 1), currentLine, currentColumn); return nullptr;
} }
Token Lexer::matchInteger() { shared_ptr<Token> Lexer::matchInteger() {
int nextIndex = currentIndex; int nextIndex = currentIndex;
while (nextIndex < source.length() && isDigit(nextIndex)) while (nextIndex < source.length() && isDigit(nextIndex))
nextIndex++; nextIndex++;
if (nextIndex == currentIndex || !isSeparator(nextIndex)) if (nextIndex == currentIndex || !isSeparator(nextIndex))
return Token(Token::Kind::INVALID, source.substr(currentIndex, 1), currentLine, currentColumn); return nullptr;
string lexme = source.substr(currentIndex, nextIndex - currentIndex); string lexme = source.substr(currentIndex, nextIndex - currentIndex);
return Token(Token::Kind::INTEGER, lexme, currentLine, currentColumn); return make_shared<Token>(Token::Kind::INTEGER, lexme, currentLine, currentColumn);
} }
Token Lexer::matchIdentifier() { shared_ptr<Token> Lexer::matchIdentifier() {
int nextIndex = currentIndex; int nextIndex = currentIndex;
while (nextIndex < source.length() && isIdentifier(nextIndex)) while (nextIndex < source.length() && isIdentifier(nextIndex))
nextIndex++; nextIndex++;
if (nextIndex == currentIndex || !isSeparator(nextIndex)) if (nextIndex == currentIndex || !isSeparator(nextIndex))
return Token(Token::Kind::INVALID, source.substr(currentIndex, 1), currentLine, currentColumn); return nullptr;
string lexme = source.substr(currentIndex, nextIndex - currentIndex); string lexme = source.substr(currentIndex, nextIndex - currentIndex);
return Token(Token::Kind::IDENTIFIER, lexme, currentLine, currentColumn); return make_shared<Token>(Token::Kind::IDENTIFIER, lexme, currentLine, currentColumn);
} }
Token Lexer::matchInvalid() { shared_ptr<Token> Lexer::matchEnd() {
return Token(Token::Kind::INVALID, source.substr(currentIndex, 1), currentLine, currentColumn); if (currentIndex >= source.length())
return make_shared<Token>(Token::Kind::END, "", currentLine, currentColumn);
return nullptr;
}
shared_ptr<Token> Lexer::matchInvalid() {
return make_shared<Token>(Token::Kind::INVALID, source.substr(currentIndex, 1), currentLine, currentColumn);
} }
bool Lexer::isWhiteSpace(int index) { bool Lexer::isWhiteSpace(int index) {
@@ -193,11 +194,6 @@ bool Lexer::isWhiteSpace(int index) {
return character == ' ' || character == '\t'; return character == ' ' || character == '\t';
} }
bool Lexer::isNewLine(int index) {
char character = source.at(index);
return character == '\n';
}
bool Lexer::isDigit(int index) { bool Lexer::isDigit(int index) {
char character = source.at(index); char character = source.at(index);
return character >= '0' && character <= '9'; return character >= '0' && character <= '9';
@@ -223,12 +219,17 @@ bool Lexer::isSeparator(int index) {
case '*': case '*':
case '/': case '/':
case '%': case '%':
case '=':
case '<':
case '>':
case '(': case '(':
case ')': case ')':
case ':': case ':':
case ';':
case ' ': case ' ':
case '\t': case '\t':
case '\n': case '\n':
case '\r';
return true; return true;
default: default:
return false; return false;

View File

@@ -1,6 +1,7 @@
#include "Token.h" #include "Token.h"
Token::Token(Kind kind, string lexme, int line, int column): kind(kind), lexme(lexme), line(line), column(column) { Token::Token(Kind kind, string lexme, int line, int column):
kind(kind), lexme(lexme), line(line), column(column) {
} }
Token::Kind Token::getKind() { Token::Kind Token::getKind() {
@@ -44,6 +45,20 @@ string Token::toString() {
return "SLASH"; return "SLASH";
case PERCENT: case PERCENT:
return "PERCENT"; return "PERCENT";
case EQUAL:
return "EQUAL";
case NOT_EQUAL:
return "NOT_EQUAL";
case LESS:
return "LESS";
case LESS_EQUAL:
return "LESS_EQUAL";
case GREATER:
return "GREATER";
case GREATER_EQUAL:
return "GREATER_EQUAL";
case LEFT_PAREN: case LEFT_PAREN:
return "LEFT_PAREN"; return "LEFT_PAREN";
case RIGHT_PAREN: case RIGHT_PAREN:
@@ -52,14 +67,17 @@ string Token::toString() {
return "COLON"; return "COLON";
case SEMICOLON: case SEMICOLON:
return "SEMICOLON"; return "SEMICOLON";
case INTEGER: case INTEGER:
return "INTEGER"; return "INTEGER(" + lexme + ")";
case IDENTIFIER: case IDENTIFIER:
return "IDENTIFIER"; return "IDENTIFIER(" + lexme + ")";
case FUNCTION: case FUNCTION:
return "FUNCTION"; return "FUNCTION";
case RETURN: case RETURN:
return "RETURN"; return "RETURN";
case NEW_LINE: case NEW_LINE:
return "NEW_LINE"; return "NEW_LINE";
case END: case END:

View File

@@ -14,17 +14,24 @@ public:
SLASH, SLASH,
PERCENT, PERCENT,
EQUAL,
NOT_EQUAL,
LESS,
LESS_EQUAL,
GREATER,
GREATER_EQUAL,
LEFT_PAREN, LEFT_PAREN,
RIGHT_PAREN, RIGHT_PAREN,
COLON, COLON,
SEMICOLON, SEMICOLON,
INTEGER,
IDENTIFIER,
FUNCTION, FUNCTION,
RETURN, RETURN,
INTEGER,
IDENTIFIER,
NEW_LINE, NEW_LINE,
END, END,

View File

@@ -35,31 +35,30 @@ int main(int argc, char **argv) {
std::string source = readFile(std::string(argv[1])); std::string source = readFile(std::string(argv[1]));
Lexer lexer(source); Lexer lexer(source);
std::vector<Token> tokens = lexer.getTokens(); std::vector<shared_ptr<Token>> tokens = lexer.getTokens();
if (tokens.empty()) { for (int i=0; i<tokens.size(); i++) {
exit(1); std::cout << tokens.at(i)->toString();
if (i < tokens.size() - 1)
std::cout << " ";
} }
for (Token &token : tokens)
std::cout << token.toString() << " ";
std::cout << std::endl; std::cout << std::endl;
Parser parser(tokens); //Parser parser(tokens);
vector<shared_ptr<Statement>> statements = parser.getStatements(); //vector<shared_ptr<Statement>> statements = parser.getStatements();
if (statements.empty()) { //if (statements.empty()) {
exit(1); // exit(1);
} //}
for (shared_ptr<Statement> &statement : statements) { //for (shared_ptr<Statement> &statement : statements) {
cout << statement->toString(); // cout << statement->toString();
cout << endl; // cout << endl;
} //}
//ModuleBuilder moduleBuilder(expression); //ModuleBuilder moduleBuilder(statements);
ModuleBuilder moduleBuilder(statements); //shared_ptr<llvm::Module> module = moduleBuilder.getModule();
shared_ptr<llvm::Module> module = moduleBuilder.getModule(); //module->print(llvm::outs(), nullptr);
module->print(llvm::outs(), nullptr);
CodeGenerator codeGenerator(module); //CodeGenerator codeGenerator(module);
codeGenerator.generateObjectFile("dummy.s"); //codeGenerator.generateObjectFile("dummy.s");
return 0; return 0;
} }