I'm creating a parser, and I have just finished my lexer. I wanted to ask if there is anything I should change, add, or reconsider in my code! (I don't think the grammar matters much, since it is only a lexer.)
lexical_analyzer.h
#pragma once
#include <iostream>
#include <string>
#include <utility>
#include <vector>
// Token kinds the lexer can attach to a lexeme.
// Kept as a plain (unscoped) enum inside a namespace so that callers can write
// la_enum::AND and stream the value as an integer.
namespace la_enum
{
    enum token
    {
        STRING            = 0,  // anything that isn't one of the operators below
        AND               = 1,  // +
        CHAR_REPEATED     = 2,  // *
        LEFT_PARENTHESIS  = 3,  // (
        RIGHT_PARENTHESIS = 4,  // )
        ANY_CHAR          = 5,  // .
        COUNTER           = 6,  // {N}
        IGNORE_CASE       = 7,  // \I
        SINGLE_CAPTURE    = 8,  // \O{N}
    };
}
// Lexer for the pattern language: splits an input pattern into lexemes and
// records, for each lexeme, the matching la_enum::token kind.
class lexical_analyzer
{
public:
    // Stores the pattern and immediately runs addLexemes() on it.
    // The argument is taken by value and moved into the member, so passing a
    // temporary (or std::move-ing a string in) does not copy the text twice.
    lexical_analyzer(std::string patternInput) :
        pattern(std::move(patternInput))
    {
        addLexemes();
    }

private:
    void addLexemes();          // adds lexemes and tokens from the pattern to the vectors
    bool isSingleSymbol(char);  // checks if it is an operator with ONE symbol
    void addString();           // stores the buffered string (operand), if any
    void addOperator(la_enum::token, std::string&, int, int);  // stores an operator

    std::string pattern;          // input pattern
    std::string characterBuffer;  // collects the characters of the string (operand) being read

    // 'lexemes' and 'tokens' have synched indexes:
    // tokens[i] is the kind of lexemes[i]
    std::vector<std::string> lexemes;    // stores lexemes
    std::vector<la_enum::token> tokens;  // stores tokens
};
lexical_analyzer.cpp
#include "lexical_analyzer.h"
// adds lexemes and tokens from the pattern to the vectors
void lexical_analyzer::addLexemes()
{
for (int i = 0; i != pattern.size(); i++)
{
// adds lexems and tokens from the char buffer (strings)
if (isSingleSymbol(pattern[i]))
addString();
switch (pattern[i])
{
case ('+') :
addOperator(la_enum::AND, pattern, i, 1);
break;
case ('*') :
addOperator(la_enum::CHAR_REPEATED, pattern, i, 1);
break;
case ('.') :
addOperator(la_enum::ANY_CHAR, pattern, i, 1);
break;
case ('(') :
addOperator(la_enum::LEFT_PARENTHESIS, pattern, i, 1);
break;
case (')') :
addOperator(la_enum::RIGHT_PARENTHESIS, pattern, i, 1);
break;
default :
if (pattern[i] == '{')
{
// checks if it's the right syntax '{N}'...
if (isdigit(pattern[i + 1]) && pattern[i + 2] == '}')
{
addString();
addOperator(la_enum::COUNTER, pattern, i, 3);
i += 2;
}
else
{ // ...otherwise it counts as a string and is added to the buffer
characterBuffer.push_back(pattern[i]);
}
}
else if (pattern[i] == '\\')
{
// checks if it's the right syntax '\I'...
if (pattern[i + 1] == 'I')
{
addString();
addOperator(la_enum::IGNORE_CASE, pattern, i, 2);
i++;
}
// checks if it's the right syntax '\O{N}'...
else if (pattern[i + 1] == 'O' && pattern[i + 2] == '{' && isdigit(pattern[i + 3]) && pattern[i + 4] == '}')
{
addString();
addOperator(la_enum::SINGLE_CAPTURE, pattern, i, 5);
i += 4;
}
else
{ // ...otherwise it counts as a string and is added to the buffer
characterBuffer.push_back(pattern[i]);
}
}
else // If the symbol isn't one of those above
// it counts as a string (operand) and is added to the buffer
{
characterBuffer.push_back(pattern[i]);
}
}
}
// check one last time if the buffer has content which is then added to the vectors
addString();
// prints tokens and lexemes
for (int i = 0; i != lexemes.size(); i++)
{
std::cout << "Token: \"" << tokens[i] << "\" Lexeme: \"" << lexemes[i] << "\"" << std::endl;
}
}
// Returns true when 'c' is one of the one-character operators: + * ( ) .
bool lexical_analyzer::isSingleSymbol(char c)
{
    switch (c)
    {
    case '+':
    case '*':
    case '(':
    case ')':
    case '.':
        return true;
    default:
        return false;
    }
}
// Flushes the character buffer: if it has content, that content is stored as
// a STRING lexeme/token pair. Does nothing when the buffer is empty.
void lexical_analyzer::addString()
{
    if (characterBuffer.empty())
        return;

    lexemes.push_back(characterBuffer);
    tokens.push_back(la_enum::STRING);

    // Empty the buffer once it has been stored, so that calling addString()
    // again before any new characters arrive cannot add the same lexeme twice.
    characterBuffer.clear();
}
// Stores an operator: the 'sz' characters of 'str' starting at 'pos' become
// the lexeme, 'tok' becomes its token, and the character buffer is emptied.
void lexical_analyzer::addOperator(la_enum::token tok, std::string& str, int pos, int sz)
{
    const std::string::size_type start = pos;
    const std::string::size_type count = sz;

    lexemes.push_back(str.substr(start, count));
    tokens.push_back(tok);
    characterBuffer.clear();
}
main.cpp
#include <iostream>
#include "lexical_analyzer.h"
// Runs the lexer on a sample pattern; the lexer prints the tokens it finds.
int main()
{
    //std::string in;
    //std::getline(std::cin, in);
    //lexical_analyzer(std::move(in));

    // The backslashes are doubled on purpose: "\I" and "\O" are not valid C++
    // escape sequences, so written with a single backslash the lexer would
    // only see "I" and "O{0}" and never produce IGNORE_CASE / SINGLE_CAPTURE.
    lexical_analyzer("Hell. (MY)\\I n..e (is+was) Melwin.\\O{0}");
    return 0;
}
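output (produced with the pattern literal written with single backslashes; "\I" and "\O{0}" are not valid C++ escape sequences, so the lexer received "I" and "O{0}" and emitted no IGNORE_CASE or SINGLE_CAPTURE token):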
Token: "0" Lexeme: "Hell"
Token: "5" Lexeme: "."
Token: "0" Lexeme: " "
Token: "3" Lexeme: "("
Token: "0" Lexeme: "MY"
Token: "4" Lexeme: ")"
Token: "0" Lexeme: "I n"
Token: "5" Lexeme: "."
Token: "5" Lexeme: "."
Token: "0" Lexeme: "e "
Token: "3" Lexeme: "("
Token: "0" Lexeme: "is"
Token: "1" Lexeme: "+"
Token: "0" Lexeme: "was"
Token: "4" Lexeme: ")"
Token: "0" Lexeme: " Melwin"
Token: "5" Lexeme: "."
Token: "0" Lexeme: "O"
Token: "6" Lexeme: "{0}"
`map` to get from `char` to `enum` rather than an enum defined in one place and then a big switch statement. The map might be more sophisticated, with lambdas/callbacks or similar, for a design which could scale to something more complex?