Parse EBNF
- Task
Write a program that can parse a grammar in Extended Backus–Naur Form (EBNF), and then parse something else according to that grammar.
The program is only required to decide whether or not that something else belongs to the language described by the grammar, but for extra credit it can output a syntax tree.
See the tests.
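Several of the entries below (the C#, Dart, Go and Java versions) represent each parsed production as a nested sequence whose first element names the construct: {"terminal", ...}, {"or", ...}, {"repeat", ...}, {"optional", ...} and {"ident", ...}. The following is a minimal, illustrative Go sketch of that representation and of the backtracking matcher it implies; it is not any entry's actual code, and the rule tree is hard-coded for the first test grammar rather than parsed from EBNF text.
// Illustrative sketch only (not any entry's actual code): rules are nested
// sequences whose first element names the construct, and matching restores
// the scan position whenever a sub-rule fails.
package main
import (
    "fmt"
    "strings"
)
type rule = []interface{} // e.g. rule{"terminal", "a1"}
func match(r rule, s string, pos *int) bool {
    was := *pos // remembered so we can backtrack on failure
    switch r[0] {
    case "terminal":
        for *pos < len(s) && strings.ContainsRune(" \t\r\n", rune(s[*pos])) {
            *pos++ // the test data requires skipping whitespace before literals
        }
        if lit := r[1].(string); strings.HasPrefix(s[*pos:], lit) {
            *pos += len(lit)
            return true
        }
        *pos = was
        return false
    case "or": // the first alternative that matches wins
        for _, alt := range r[1:] {
            if match(alt.(rule), s, pos) {
                return true
            }
            *pos = was
        }
        return false
    case "repeat": // zero or more occurrences, as per {} in EBNF
        for match(r[1].(rule), s, pos) {
        }
        return true
    case "optional": // as per [] in EBNF
        match(r[1].(rule), s, pos)
        return true
    default: // a plain sequence of sub-rules, applied one after the other
        for _, sub := range r {
            if !match(sub.(rule), s, pos) {
                *pos = was
                return false
            }
        }
        return true
    }
}
func main() {
    // Hard-coded rule tree for: a = "a1" ( "a2" | "a3" ) { "a4" } [ "a5" ] "a6" ;
    a := rule{
        rule{"terminal", "a1"},
        rule{"or", rule{"terminal", "a2"}, rule{"terminal", "a3"}},
        rule{"repeat", rule{"terminal", "a4"}},
        rule{"optional", rule{"terminal", "a5"}},
        rule{"terminal", "a6"},
    }
    for _, s := range []string{"a1a3a4a4a5a6", "a1 a4 a5 a6"} {
        pos := 0
        ok := match(a, s, &pos) && strings.TrimSpace(s[pos:]) == ""
        fmt.Printf("%q -> %v\n", s, ok)
    }
}
The full entries additionally build such trees from the EBNF source and resolve rule references ("ident") through an index into the list of productions.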
The source of the ALGOL 68 sample is somewhat large, so it is kept on a separate Rosetta Code page: Parse EBNF/ALGOL 68.
- Output:
Valid EBNF: a/z: 'a' { a = 'a1' ( 'a2' | 'a3' ) { 'a4' } [ 'a5' ] 'a6' ; } 'z'
a: seq
literal "a1"
oneof
seq
literal "a2"
seq
literal "a3"
list
literal "a4"
opt
literal "a5"
literal "a6"
"a1a3a4a4a5a6" is valid according to a
"a1 a2a6" is valid according to a
"a1 a3 a4 a6" is valid according to a
**** Syntax error near "a1 a4 a5 a6"
"a1 a4 a5 a6" is not valid according to a
**** Syntax error near "a5 a5 a6"
"a1 a2 a4 a5 a5 a6" is not valid according to a
**** Unexpected text: "a7" at the end of source
"a1 a2 a4 a5 a6 a7" is not valid according to a
**** Syntax error near "your ad here"
"your ad here" is not valid according to a
Valid EBNF: Arithmetic expressions:
'Arithmetic expressions'
{
expr = term { plus term } .
term = factor { times factor } .
factor = number | '(' expr ')' .
plus = '+' | '-' .
times = '*' | '/' .
number = digit { digit } .
digit = '0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9' .
}
expr: seq
rule term
list
rule plus
rule term
term: seq
rule factor
list
rule times
rule factor
factor: oneof
seq
rule number
seq
literal "("
rule expr
literal ")"
plus: oneof
seq
literal "+"
seq
literal "-"
times: oneof
seq
literal "*"
seq
literal "/"
number: seq
rule digit
list
rule digit
digit: oneof
seq
literal "0"
seq
literal "1"
seq
literal "2"
seq
literal "3"
seq
literal "4"
seq
literal "5"
seq
literal "6"
seq
literal "7"
seq
literal "8"
seq
literal "9"
"2" is valid according to Arithmetic expressions
"2*3 + 4/23 - 7" is valid according to Arithmetic expressions
"(3 + 4) * 6-2+(4*(4))" is valid according to Arithmetic expressions
**** Syntax error near "-2"
"-2" is not valid according to Arithmetic expressions
**** Unexpected text: "+" at the end of source
"3 +" is not valid according to Arithmetic expressions
**** Syntax error near " 3"
"(4 + 3" is not valid according to Arithmetic expressions
**** Expected "{", not "a"
Invalid EBNF: a = '1';
**** Expected "}", not "(eof)"
Invalid EBNF: { a = '1' ;
**** Expected "=", not "world"
Invalid EBNF: { hello world = '1'; }
**** Rule "bar" not defined
Invalid EBNF: { foo = bar . }
using System;
using System.Collections.Generic;
using System.Linq;
public class EBNFParser
{
// Token class equivalent
private class Token
{
public object Value { get; set; }
public bool IsSequence { get; set; }
public Token(object value, bool isSequence)
{
Value = value;
IsSequence = isSequence;
}
}
// Sequence class equivalent to ArrayList with constructor
private class Sequence : List<object>
{
public Sequence(params object[] items)
{
AddRange(items);
}
}
private string src;
private char ch;
private int sdx;
private Token token;
private bool err = false;
private List<string> idents = new List<string>();
private List<int> ididx = new List<int>();
private List<Sequence> productions = new List<Sequence>();
private Sequence extras = new Sequence();
private string[] results = { "pass", "fail" };
private int BoolToInt(bool b)
{
return b ? 1 : 0;
}
private int Invalid(string msg)
{
err = true;
Console.WriteLine(msg);
sdx = src.Length; // set to eof
return -1;
}
private void SkipSpaces()
{
while (sdx < src.Length)
{
ch = src[sdx];
if (" \t\r\n".IndexOf(ch) == -1)
{
break;
}
sdx++;
}
}
private void GetToken()
{
// Yields a single character token, one of {}()[]|=.;
// or {"terminal",string} or {"ident", string} or -1.
SkipSpaces();
if (sdx >= src.Length)
{
token = new Token(-1, false);
return;
}
int tokstart = sdx;
if ("{}()[]|=.;".IndexOf(ch) >= 0)
{
sdx++;
token = new Token(ch, false);
}
else if (ch == '"' || ch == '\'')
{
char closech = ch;
int tokend = tokstart + 1;
while (tokend < src.Length && src[tokend] != closech)
{
tokend++;
}
if (tokend >= src.Length)
{
token = new Token(Invalid("no closing quote"), false);
}
else
{
sdx = tokend + 1;
token = new Token(new Sequence("terminal", src.Substring(tokstart + 1, tokend - tokstart - 1)), true);
}
}
else if (ch >= 'a' && ch <= 'z')
{
// To simplify things for the purposes of this task,
// identifiers are strictly a-z only, not A-Z or 1-9.
while (sdx < src.Length && ch >= 'a' && ch <= 'z')
{
sdx++;
if (sdx < src.Length)
{
ch = src[sdx];
}
}
token = new Token(new Sequence("ident", src.Substring(tokstart, sdx - tokstart)), true);
}
else
{
token = new Token(Invalid("invalid ebnf"), false);
}
}
private void MatchToken(char expectedCh)
{
if (!token.Value.Equals(expectedCh))
{
token = new Token(Invalid($"invalid ebnf ({expectedCh} expected)"), false);
}
else
{
GetToken();
}
}
private int AddIdent(string ident)
{
int k = idents.IndexOf(ident);
if (k == -1)
{
idents.Add(ident);
k = idents.Count - 1;
ididx.Add(-1);
}
return k;
}
private object Factor()
{
object res;
if (token.IsSequence)
{
Sequence t = (Sequence)token.Value;
if (t[0].Equals("ident"))
{
int idx = AddIdent((string)t[1]);
t.Add(idx);
token.Value = t;
}
res = token.Value;
GetToken();
}
else if (token.Value.Equals('['))
{
GetToken();
res = new Sequence("optional", Expression());
MatchToken(']');
}
else if (token.Value.Equals('('))
{
GetToken();
res = Expression();
MatchToken(')');
}
else if (token.Value.Equals('{'))
{
GetToken();
res = new Sequence("repeat", Expression());
MatchToken('}');
}
else
{
throw new InvalidOperationException("invalid token in Factor() function");
}
if (res is Sequence seq && seq.Count == 1)
{
return seq[0];
}
return res;
}
private object Term()
{
Sequence res = new Sequence(Factor());
object[] tokens = { -1, '|', '.', ';', ')', ']', '}' };
while (true)
{
bool found = false;
foreach (object t in tokens)
{
if (t.Equals(token.Value))
{
found = true;
break;
}
}
if (found) break;
res.Add(Factor());
}
if (res.Count == 1)
{
return res[0];
}
return res;
}
private object Expression()
{
Sequence res = new Sequence(Term());
if (token.Value.Equals('|'))
{
res = new Sequence("or", res[0]);
while (token.Value.Equals('|'))
{
GetToken();
res.Add(Term());
}
}
if (res.Count == 1)
{
return res[0];
}
return res;
}
private object Production()
{
// Returns a token or -1; the real result is left in 'productions' etc.
GetToken();
if (!token.Value.Equals('}'))
{
if (token.Value.Equals(-1))
{
return Invalid("invalid ebnf (missing closing })");
}
if (!token.IsSequence)
{
return -1;
}
Sequence t = (Sequence)token.Value;
if (!t[0].Equals("ident"))
{
return -1;
}
string ident = (string)t[1];
int idx = AddIdent(ident);
GetToken();
MatchToken('=');
if (token.Value.Equals(-1))
{
return -1;
}
productions.Add(new Sequence(ident, idx, Expression()));
ididx[idx] = productions.Count - 1;
}
return token.Value;
}
private int Parse(string ebnf)
{
// Returns +1 if ok, -1 if bad.
Console.WriteLine($"parse:\n{ebnf} ===>");
err = false;
src = ebnf;
sdx = 0;
idents.Clear();
ididx.Clear();
productions.Clear();
extras.Clear();
GetToken();
if (token.IsSequence)
{
Sequence t = (Sequence)token.Value;
t[0] = "title";
extras.Add(token.Value);
GetToken();
}
if (!token.Value.Equals('{'))
{
return Invalid("invalid ebnf (missing opening {)");
}
while (true)
{
object tokenResult = Production();
if (tokenResult.Equals('}') || tokenResult.Equals(-1))
{
break;
}
}
GetToken();
if (token.IsSequence)
{
Sequence t = (Sequence)token.Value;
t[0] = "comment";
extras.Add(token.Value);
GetToken();
}
if (!token.Value.Equals(-1))
{
return Invalid("invalid ebnf (missing eof?)");
}
if (err)
{
return -1;
}
int k = -1;
for (int i = 0; i < ididx.Count; i++)
{
if (ididx[i] == -1)
{
k = i;
break;
}
}
if (k != -1)
{
return Invalid($"invalid ebnf (undefined:{idents[k]})");
}
PPrint(productions, "productions");
PPrint(idents, "idents");
PPrint(ididx, "ididx");
PPrint(extras, "extras");
return 1;
}
// Adjusts C#'s normal printing to look more like Phix output.
private void PPrint(object ob, string header)
{
Console.WriteLine($"\n{header}:");
string pp = ob.ToString();
pp = pp.Replace("[", "{");
pp = pp.Replace("]", "}");
pp = pp.Replace(" ", ", ");
for (int i = 0; i < idents.Count; i++)
{
pp = pp.Replace(i.ToString(), i.ToString());
}
Console.WriteLine(pp);
}
// The rules that Applies() has to deal with are:
// {factors} - if rule[0] is not string,
// just apply one after the other recursively.
// {"terminal", "a1"} -- literal constants
// {"or", <e1>, <e2>, ...} -- (any) one of n
// {"repeat", <e1>} -- as per "{}" in ebnf
// {"optional", <e1>} -- as per "[]" in ebnf
// {"ident", <name>, idx} -- apply the sub-rule
private bool Applies(Sequence rule)
{
int wasSdx = sdx; // in case of failure
object r1 = rule[0];
if (!(r1 is string))
{
for (int i = 0; i < rule.Count; i++)
{
if (!Applies((Sequence)rule[i]))
{
sdx = wasSdx;
return false;
}
}
}
else if (r1.Equals("terminal"))
{
SkipSpaces();
string r2 = (string)rule[1];
for (int i = 0; i < r2.Length; i++)
{
if (sdx >= src.Length || src[sdx] != r2[i])
{
sdx = wasSdx;
return false;
}
sdx++;
}
}
else if (r1.Equals("or"))
{
for (int i = 1; i < rule.Count; i++)
{
if (Applies((Sequence)rule[i]))
{
return true;
}
}
sdx = wasSdx;
return false;
}
else if (r1.Equals("repeat"))
{
while (Applies((Sequence)rule[1]))
{
// continue repeating
}
}
else if (r1.Equals("optional"))
{
Applies((Sequence)rule[1]);
}
else if (r1.Equals("ident"))
{
int i = (int)rule[2];
int ii = ididx[i];
if (!Applies((Sequence)productions[ii][2]))
{
sdx = wasSdx;
return false;
}
}
else
{
throw new InvalidOperationException("invalid rule in Applies() function");
}
return true;
}
private void CheckValid(string test)
{
src = test;
sdx = 0;
bool res = Applies((Sequence)productions[0][2]);
SkipSpaces();
if (sdx < src.Length)
{
res = false;
}
Console.WriteLine($"\"{test}\", {results[1 - BoolToInt(res)]}");
}
public static void Main(string[] args)
{
EBNFParser parser = new EBNFParser();
string[] ebnfs = {
"\"a\" {\n" +
" a = \"a1\" ( \"a2\" | \"a3\" ) { \"a4\" } [ \"a5\" ] \"a6\" ;\n" +
"} \"z\" ",
"{\n" +
" expr = term { plus term } .\n" +
" term = factor { times factor } .\n" +
" factor = number | '(' expr ')' .\n" +
" \n" +
" plus = \"+\" | \"-\" .\n" +
" times = \"*\" | \"/\" .\n" +
" \n" +
" number = digit { digit } .\n" +
" digit = \"0\" | \"1\" | \"2\" | \"3\" | \"4\" | \"5\" | \"6\" | \"7\" | \"8\" | \"9\" .\n" +
"}",
"a = \"1\"",
"{ a = \"1\" ;",
"{ hello world = \"1\"; }",
"{ foo = bar . }"
};
string[][] tests = {
new string[] {
"a1a3a4a4a5a6",
"a1 a2a6",
"a1 a3 a4 a6",
"a1 a4 a5 a6",
"a1 a2 a4 a5 a5 a6",
"a1 a2 a4 a5 a6 a7",
"your ad here"
},
new string[] {
"2",
"2*3 + 4/23 - 7",
"(3 + 4) * 6-2+(4*(4))",
"-2",
"3 +",
"(4 + 3"
}
};
for (int i = 0; i < ebnfs.Length; i++)
{
if (parser.Parse(ebnfs[i]) == 1)
{
Console.WriteLine("\ntests:");
if (i < tests.Length)
{
foreach (string test in tests[i])
{
parser.CheckValid(test);
}
}
}
Console.WriteLine();
}
}
}
- Output:
parse:
"a" {
a = "a1" ( "a2" | "a3" ) { "a4" } [ "a5" ] "a6" ;
} "z" ===>
productions:
System.Collections.Generic.List`1{EBNFParser+Sequence}
idents:
System.Collections.Generic.List`1{System.String}
ididx:
System.Collections.Generic.List`1{System.Int32}
extras:
EBNFParser+Sequence
tests:
"a1a3a4a4a5a6", pass
"a1 a2a6", pass
"a1 a3 a4 a6", pass
"a1 a4 a5 a6", fail
"a1 a2 a4 a5 a5 a6", fail
"a1 a2 a4 a5 a6 a7", fail
"your ad here", fail
parse:
{
expr = term { plus term } .
term = factor { times factor } .
factor = number | '(' expr ')' .
plus = "+" | "-" .
times = "*" | "/" .
number = digit { digit } .
digit = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" .
} ===>
productions:
System.Collections.Generic.List`1{EBNFParser+Sequence}
idents:
System.Collections.Generic.List`1{System.String}
ididx:
System.Collections.Generic.List`1{System.Int32}
extras:
EBNFParser+Sequence
tests:
"2", pass
"2*3 + 4/23 - 7", pass
"(3 + 4) * 6-2+(4*(4))", pass
"-2", fail
"3 +", fail
"(4 + 3", fail
parse:
a = "1" ===>
invalid ebnf (missing opening {)
parse:
{ a = "1" ; ===>
invalid ebnf (missing closing })
parse:
{ hello world = "1"; } ===>
invalid ebnf (= expected)
parse:
{ foo = bar . } ===>
invalid ebnf (undefined:bar)
#include <iostream>
#include <string>
#include <vector>
#include <memory>
#include <variant>
#include <algorithm>
#include <optional>
class Token {
public:
enum Type { CHAR, SEQUENCE, EOF_TOKEN };
Type type;
char ch;
std::vector<std::string> seq;
Token() : type(EOF_TOKEN), ch('\0') {}
Token(char c) : type(CHAR), ch(c) {}
Token(const std::vector<std::string>& s) : type(SEQUENCE), ch('\0'), seq(s) {}
bool isChar(char c) const {
return type == CHAR && ch == c;
}
bool isEOF() const {
return type == EOF_TOKEN;
}
};
class Rule {
public:
enum Type { TERMINAL, IDENT, OR, REPEAT, OPTIONAL, SEQUENCE };
Type type;
std::string value;
size_t idx;
std::vector<std::unique_ptr<Rule>> rules;
std::unique_ptr<Rule> rule;
Rule(Type t, const std::string& v) : type(t), value(v), idx(0) {}
Rule(Type t, const std::string& v, size_t i) : type(t), value(v), idx(i) {}
Rule(Type t, std::vector<std::unique_ptr<Rule>> r) : type(t), idx(0), rules(std::move(r)) {}
Rule(Type t, std::unique_ptr<Rule> r) : type(t), idx(0), rule(std::move(r)) {}
// Copy constructor
Rule(const Rule& other) : type(other.type), value(other.value), idx(other.idx) {
if (other.rule) {
rule = std::make_unique<Rule>(*other.rule);
}
for (const auto& r : other.rules) {
rules.push_back(std::make_unique<Rule>(*r));
}
}
// Assignment operator
Rule& operator=(const Rule& other) {
if (this != &other) {
type = other.type;
value = other.value;
idx = other.idx;
rule.reset();
rules.clear();
if (other.rule) {
rule = std::make_unique<Rule>(*other.rule);
}
for (const auto& r : other.rules) {
rules.push_back(std::make_unique<Rule>(*r));
}
}
return *this;
}
static std::unique_ptr<Rule> createTerminal(const std::string& v) {
return std::make_unique<Rule>(TERMINAL, v);
}
static std::unique_ptr<Rule> createIdent(const std::string& v, size_t i) {
return std::make_unique<Rule>(IDENT, v, i);
}
static std::unique_ptr<Rule> createOr(std::vector<std::unique_ptr<Rule>> r) {
return std::make_unique<Rule>(OR, std::move(r));
}
static std::unique_ptr<Rule> createRepeat(std::unique_ptr<Rule> r) {
return std::make_unique<Rule>(REPEAT, std::move(r));
}
static std::unique_ptr<Rule> createOptional(std::unique_ptr<Rule> r) {
return std::make_unique<Rule>(OPTIONAL, std::move(r));
}
static std::unique_ptr<Rule> createSequence(std::vector<std::unique_ptr<Rule>> r) {
return std::make_unique<Rule>(SEQUENCE, std::move(r));
}
};
class EBNFParser {
private:
std::string src;
char ch;
size_t sdx;
Token token;
bool err;
std::vector<std::string> idents;
std::vector<std::optional<size_t>> ididx;
std::vector<std::tuple<std::string, size_t, std::unique_ptr<Rule>>> productions;
std::vector<std::vector<std::string>> extras;
int btoi(bool b) {
return b ? 1 : 0;
}
int invalid(const std::string& msg) {
err = true;
std::cout << msg << std::endl;
sdx = src.length();
return -1;
}
void skipSpaces() {
while (sdx < src.length()) {
ch = src[sdx];
if (ch != ' ' && ch != '\t' && ch != '\r' && ch != '\n') {
break;
}
sdx++;
}
}
void getToken() {
skipSpaces();
if (sdx >= src.length()) {
token = Token();
return;
}
size_t tokstart = sdx;
if (std::string("{}()[]|=.;").find(ch) != std::string::npos) {
sdx++;
token = Token(ch);
} else if (ch == '"' || ch == '\'') {
char closech = ch;
size_t tokend = tokstart + 1;
while (tokend < src.length() && src[tokend] != closech) {
tokend++;
}
if (tokend >= src.length()) {
invalid("no closing quote");
token = Token();
} else {
sdx = tokend + 1;
std::string content = src.substr(tokstart + 1, tokend - tokstart - 1);
token = Token(std::vector<std::string>{"terminal", content});
}
} else if (std::islower(ch)) {
while (sdx < src.length() && std::islower(src[sdx])) {
sdx++;
}
std::string ident = src.substr(tokstart, sdx - tokstart);
token = Token(std::vector<std::string>{"ident", ident});
} else {
invalid("invalid ebnf");
token = Token();
}
}
void matchToken(char expected_ch) {
if (!token.isChar(expected_ch)) {
invalid("invalid ebnf (" + std::string(1, expected_ch) + " expected)");
} else {
getToken();
}
}
size_t addIdent(const std::string& ident) {
auto it = std::find(idents.begin(), idents.end(), ident);
if (it != idents.end()) {
return std::distance(idents.begin(), it);
} else {
idents.push_back(ident);
size_t k = idents.size() - 1;
ididx.push_back(std::nullopt);
return k;
}
}
std::unique_ptr<Rule> factor() {
if (token.type == Token::SEQUENCE) {
if (token.seq[0] == "ident") {
size_t idx = addIdent(token.seq[1]);
auto rule = Rule::createIdent(token.seq[1], idx);
getToken();
return rule;
} else if (token.seq[0] == "terminal") {
auto rule = Rule::createTerminal(token.seq[1]);
getToken();
return rule;
}
} else if (token.isChar('[')) {
getToken();
auto expr = expression();
matchToken(']');
return Rule::createOptional(std::move(expr));
} else if (token.isChar('(')) {
getToken();
auto expr = expression();
matchToken(')');
return expr;
} else if (token.isChar('{')) {
getToken();
auto expr = expression();
matchToken('}');
return Rule::createRepeat(std::move(expr));
}
throw std::runtime_error("invalid token in factor() function");
}
std::unique_ptr<Rule> term() {
std::vector<std::unique_ptr<Rule>> factors;
factors.push_back(factor());
while (!token.isEOF() && !token.isChar('|') && !token.isChar('.') &&
!token.isChar(';') && !token.isChar(')') && !token.isChar(']') &&
!token.isChar('}')) {
factors.push_back(factor());
}
if (factors.size() == 1) {
return std::move(factors[0]);
} else {
return Rule::createSequence(std::move(factors));
}
}
std::unique_ptr<Rule> expression() {
auto first_term = term();
if (token.isChar('|')) {
std::vector<std::unique_ptr<Rule>> terms;
terms.push_back(std::move(first_term));
while (token.isChar('|')) {
getToken();
terms.push_back(term());
}
return Rule::createOr(std::move(terms));
} else {
return first_term;
}
}
Token production() {
getToken();
if (token.isChar('}')) {
return token;
}
if (token.isEOF()) {
invalid("invalid ebnf (missing closing })");
return Token();
}
if (token.type == Token::SEQUENCE && token.seq[0] == "ident") {
std::string ident = token.seq[1];
size_t idx = addIdent(ident);
getToken();
matchToken('=');
if (token.isEOF()) {
return Token();
}
auto expr = expression();
productions.push_back(std::make_tuple(ident, idx, std::move(expr)));
ididx[idx] = productions.size() - 1;
return token;
}
return Token();
}
void skipSpacesAt(const std::string& s, size_t& sdx) const {
while (sdx < s.length()) {
char ch = s[sdx];
if (ch != ' ' && ch != '\t' && ch != '\r' && ch != '\n') {
break;
}
sdx++;
}
}
bool applies(const Rule* rule, const std::string& s, size_t& sdx) const {
size_t was_sdx = sdx;
switch (rule->type) {
case Rule::SEQUENCE:
for (const auto& r : rule->rules) {
if (!applies(r.get(), s, sdx)) {
sdx = was_sdx;
return false;
}
}
return true;
case Rule::TERMINAL:
skipSpacesAt(s, sdx);
for (char ch : rule->value) {
if (sdx >= s.length() || s[sdx] != ch) {
sdx = was_sdx;
return false;
}
sdx++;
}
return true;
case Rule::OR:
for (const auto& r : rule->rules) {
if (applies(r.get(), s, sdx)) {
return true;
}
}
sdx = was_sdx;
return false;
case Rule::REPEAT:
while (applies(rule->rule.get(), s, sdx)) {
// continue repeating
}
return true;
case Rule::OPTIONAL:
applies(rule->rule.get(), s, sdx);
return true;
case Rule::IDENT:
if (ididx[rule->idx].has_value()) {
size_t prod_idx = ididx[rule->idx].value();
if (!applies(std::get<2>(productions[prod_idx]).get(), s, sdx)) {
sdx = was_sdx;
return false;
}
return true;
} else {
return false;
}
}
return false;
}
void pprintProductions() const {
std::cout << "\nproductions:" << std::endl;
for (const auto& prod : productions) {
std::cout << "(" << std::get<0>(prod) << ", " << std::get<1>(prod) << ", [Rule])" << std::endl;
}
}
void pprintIdents() const {
std::cout << "\nidents:" << std::endl;
for (const auto& ident : idents) {
std::cout << ident << " ";
}
std::cout << std::endl;
}
void pprintIdidx() const {
std::cout << "\nididx:" << std::endl;
for (const auto& idx : ididx) {
if (idx.has_value()) {
std::cout << idx.value() << " ";
} else {
std::cout << "None ";
}
}
std::cout << std::endl;
}
void pprintExtras() const {
std::cout << "\nextras:" << std::endl;
for (const auto& extra : extras) {
std::cout << "[";
for (size_t i = 0; i < extra.size(); ++i) {
std::cout << extra[i];
if (i < extra.size() - 1) std::cout << ", ";
}
std::cout << "]" << std::endl;
}
}
public:
EBNFParser() : ch('\0'), sdx(0), err(false) {}
int parse(const std::string& ebnf) {
std::cout << "parse:\n" << ebnf << " ===>\n" << std::endl;
err = false;
src = ebnf;
sdx = 0;
idents.clear();
ididx.clear();
productions.clear();
extras.clear();
try {
getToken();
if (token.type == Token::SEQUENCE) {
auto t = token.seq;
t[0] = "title";
extras.push_back(t);
getToken();
}
if (!token.isChar('{')) {
return invalid("invalid ebnf (missing opening {)");
}
while (true) {
Token result = production();
if (result.isChar('}') || result.isEOF()) {
break;
}
}
getToken();
if (token.type == Token::SEQUENCE) {
auto t = token.seq;
t[0] = "comment";
extras.push_back(t);
getToken();
}
if (!token.isEOF()) {
return invalid("invalid ebnf (missing eof?)");
}
if (err) {
return -1;
}
// Check for undefined identifiers
for (size_t i = 0; i < ididx.size(); ++i) {
if (!ididx[i].has_value()) {
return invalid("invalid ebnf (undefined:" + idents[i] + ")");
}
}
pprintProductions();
pprintIdents();
pprintIdidx();
pprintExtras();
return 1;
} catch (const std::exception& e) {
return -1;
}
}
void checkValid(const std::string& test) const {
size_t sdx = 0;
bool res = applies(std::get<2>(productions[0]).get(), test, sdx);
skipSpacesAt(test, sdx);
bool final_res = res && sdx >= test.length();
std::string result = final_res ? "pass" : "fail";
std::cout << "\"" << test << "\", " << result << std::endl;
}
};
int main() {
EBNFParser parser;
std::vector<std::string> ebnfs = {
R"("a" {
a = "a1" ( "a2" | "a3" ) { "a4" } [ "a5" ] "a6" ;
} "z" )",
R"({
expr = term { plus term } .
term = factor { times factor } .
factor = number | '(' expr ')' .
plus = "+" | "-" .
times = "*" | "/" .
number = digit { digit } .
digit = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" .
})",
R"(a = "1")",
R"({ a = "1" ;)",
R"({ hello world = "1"; })",
R"({ foo = bar . })"
};
std::vector<std::vector<std::string>> tests = {
{
"a1a3a4a4a5a6",
"a1 a2a6",
"a1 a3 a4 a6",
"a1 a4 a5 a6",
"a1 a2 a4 a5 a5 a6",
"a1 a2 a4 a5 a6 a7",
"your ad here"
},
{
"2",
"2*3 + 4/23 - 7",
"(3 + 4) * 6-2+(4*(4))",
"-2",
"3 +",
"(4 + 3"
}
};
for (size_t i = 0; i < ebnfs.size(); ++i) {
if (parser.parse(ebnfs[i]) == 1) {
std::cout << "\ntests:" << std::endl;
if (i < tests.size()) {
for (const auto& test : tests[i]) {
parser.checkValid(test);
}
}
}
std::cout << std::endl;
}
return 0;
}
- Output:
parse:
"a" {
a = "a1" ( "a2" | "a3" ) { "a4" } [ "a5" ] "a6" ;
} "z" ===>
productions:
(a, 0, [Rule])
idents:
a
ididx:
0
extras:
[title, a]
[comment, z]
tests:
"a1a3a4a4a5a6", pass
"a1 a2a6", pass
"a1 a3 a4 a6", pass
"a1 a4 a5 a6", fail
"a1 a2 a4 a5 a5 a6", fail
"a1 a2 a4 a5 a6 a7", fail
"your ad here", fail
parse:
{
expr = term { plus term } .
term = factor { times factor } .
factor = number | '(' expr ')' .
plus = "+" | "-" .
times = "*" | "/" .
number = digit { digit } .
digit = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" .
} ===>
productions:
(expr, 0, [Rule])
(term, 1, [Rule])
(factor, 3, [Rule])
(plus, 2, [Rule])
(times, 4, [Rule])
(number, 5, [Rule])
(digit, 6, [Rule])
idents:
expr term plus factor times number digit
ididx:
0 1 3 2 4 5 6
extras:
tests:
"2", pass
"2*3 + 4/23 - 7", pass
"(3 + 4) * 6-2+(4*(4))", pass
"-2", fail
"3 +", fail
"(4 + 3", fail
parse:
a = "1" ===>
invalid ebnf (missing opening {)
parse:
{ a = "1" ; ===>
invalid ebnf (missing closing })
parse:
{ hello world = "1"; } ===>
invalid ebnf (= expected)
parse:
{ foo = bar . } ===>
invalid ebnf (undefined:bar)
The validators created by this parser allow whitespace before literals (the only way to pass the tests); for example, "a1 a2a6" matches the first grammar even though that grammar never mentions spaces. This means that while the parser rejects the "invalid" EBNF { hello world = "1"; }, an EBNF grammar parsed by it would deem it valid.
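As a stand-alone illustration of that whitespace handling (a small Go sketch, not part of the Crystal entry that follows): a literal matcher that first skips whitespace lets a sequence of literals accept its pieces with or without separating spaces.
// Stand-alone illustration (Go, not part of the Crystal entry below).
package main
import (
    "fmt"
    "strings"
)
// matchLiteral skips leading whitespace at pos, then tries to consume lit.
func matchLiteral(s string, pos int, lit string) (int, bool) {
    for pos < len(s) && strings.ContainsRune(" \t\r\n", rune(s[pos])) {
        pos++
    }
    if strings.HasPrefix(s[pos:], lit) {
        return pos + len(lit), true
    }
    return pos, false
}
func main() {
    // "a1" "a2" "a6" in sequence: both "a1a2a6" and "a1 a2a6" are accepted.
    for _, input := range []string{"a1a2a6", "a1 a2a6"} {
        pos, ok := 0, true
        for _, lit := range []string{"a1", "a2", "a6"} {
            if pos, ok = matchLiteral(input, pos, lit); !ok {
                break
            }
        }
        fmt.Printf("%q -> %v\n", input, ok && pos == len(input))
    }
}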
module EBNF
class Syntax
getter prods, start
getter title, comment
def initialize (@prods : Hash(String, Prod), @start : Prod, title, comment)
@comment = comment || ""
@title = title || ""
end
def matches (s)
sc = Scanner.new s
start.expr.matches sc
sc.eof?
end
def dump (io)
io.puts "title: #{title}"
io.puts "comment: #{comment}"
io.puts "productions:"
prods.keys.sort.each do |prod|
head = " #{prod} = "
io << head
prods[prod].expr.dump io, head.size
io.puts
end
io.puts "start: #{start.name}"
end
def self.parse (source)
syntax = Parser.new(source).parse_syntax
syntax.prods.each_value do |prod|
prod.fix_refs syntax.prods
end
syntax
end
end
class Scanner
getter ch : Char?, io
def initialize (@io : IO)
get
end
def initialize (s : String)
@io = IO::Memory.new(s.to_slice, false)
get
end
def get
@ch = io.read_char
end
def save
{ @ch, io.pos }
end
def restore (tuple)
@ch, @io.pos = tuple
end
def skip_spaces
while (c = ch) && c.whitespace?; get; end
end
def over (s : String)
s.each_char do |char|
raise "<#{s}> expected at #{io.pos}" unless ch == char
get
end
end
def over (char : Char)
over char.to_s
end
def over_one_of (s, msg = nil)
msg ||= "one of <#{s}> expected"
raise "#{msg} at #{io.pos}" unless (c = ch) && c.in_set? s
get
end
def string
quote = ch.not_nil!
over quote
result = String.build do |s|
while (c = ch) && c != quote
s << c; get
end
end
over quote
result
end
def ident
raise "letter expected at #{io.pos}" unless (c = ch) && c.letter?
String.build do |s|
s << c; get
while (c = ch) && c.alphanumeric?
s << c; get
end
end
end
def eof?; ch == nil; end
end
class Parser
getter sc
def initialize (io)
@sc = Scanner.new io
end
def skip; sc.skip_spaces; end
def parse_syntax
skip; title = if (c = sc.ch) && c.in_set? "'\""
sc.string
end
prods = Hash(String, Prod).new
first_prod = nil
skip; sc.over '{'; skip
while (c = sc.ch) && c.letter?
prod = parse_prod
first_prod ||= prod.name
prods[prod.name] = prod
skip
end
raise "production expected at #{sc.io.pos}" unless first_prod
sc.over '}'
skip; comment = if (c = sc.ch) && c.in_set? "\"'"
sc.string
end
EBNF::Syntax.new prods, prods[first_prod], title, comment
end
def parse_prod
skip; name = sc.ident
skip; sc.over '='
expr = parse_or
skip; sc.over_one_of ".;", "terminator expected"
EBNF::Prod.new name, expr
end
def parse_or
exprs = [] of Expr
loop do
exprs << parse_seq
skip; break unless sc.ch == '|'
sc.over '|'
end
return exprs[0] if exprs.size == 1
OrExpr.new exprs
end
def parse_seq
exprs = [] of Expr
loop do
skip; break unless (c = sc.ch)
case c
when .letter? then exprs << RefExpr.new sc.ident
when .in_set? "\"'" then exprs << LitExpr.new sc.string
when '[' then sc.over '['; exprs << OptExpr.new parse_or; sc.over ']'
when '{' then sc.over '{'; exprs << RepExpr.new parse_or; sc.over '}'
when '(' then sc.over '('; exprs << parse_or; sc.over ')'
else
break
end
end
raise "this parser doesn't support empty productions, at #{sc.io.pos}" unless exprs.size > 0
return exprs[0] if exprs.size == 1
SeqExpr.new exprs
end
end
class Prod
getter name, expr
def initialize (@name : String, @expr : Expr)
end
def fix_refs (prods)
@expr.fix_refs prods
end
end
abstract class Expr
getter subexprs = [] of Expr
getter op = ""
def dump (io, indent)
io << "(" << op << " "
newindent = indent + op.size + 2
first = true
subexprs.each do |expr|
unless first
io << "\n" << " " * newindent
end
first = false
expr.dump io, newindent
end
io << ")"
end
def fix_refs (prods)
subexprs.each &.fix_refs(prods)
end
end
class OrExpr < Expr
def initialize (@subexprs)
@op = "or"
end
def matches (sc)
pos = sc.save
subexprs.each do |alt|
return true if alt.matches sc
sc.restore pos
end
false
end
end
class SeqExpr < Expr
def initialize (@subexprs)
@op = "and"
end
def matches (sc)
subexprs.each do |expr|
return false unless expr.matches sc
end
true
end
end
class RefExpr < Expr
getter name, ref : Expr?
def initialize (@name : String)
@op = "ref"
end
def fix_refs (prods)
@ref = prods[name]?.try &.expr
raise "undefined production <#{name}>" unless @ref
end
def dump (io, indent=0)
io << "<" << name << ">"
end
def matches (sc)
raise "undefined production <#{name}>" unless ref
ref.not_nil!.matches sc
end
end
class LitExpr < Expr
getter string
def initialize (@string : String)
end
def dump (io, indent=0)
string.inspect io
end
def matches (sc)
sc.skip_spaces # necessary to pass tests (!?)
begin
sc.over string
rescue
return false
end
true
end
end
class OptExpr < Expr
def initialize (expr)
@op = "opt"
@subexprs = [expr]
end
def matches (sc)
pos = sc.save
unless subexprs.first.matches sc
sc.restore pos
end
true
end
end
class RepExpr < Expr
def initialize (expr)
@op = "rep"
@subexprs = [expr]
end
def matches (sc)
pos = sc.save
while subexprs.first.matches sc
pos = sc.save
end
sc.restore pos
true
end
end
end
def test (syntax, tests)
puts "SOURCE:"
puts syntax
syn = EBNF::Syntax.parse syntax
puts "PARSED:"
syn.dump STDOUT
puts "TESTS:"
tests.each do |test|
result = syn.matches(test) ? "PASS" : "fail"
puts " #{result} #{test}"
end
end
syntax = <<-EOT
"a" {
a = "a1" ( "a2" | "a3" ) { "a4" } [ "a5" ] "a6" ;
} "z"
EOT
test syntax, ["a1a3a4a4a5a6", "a1 a2a6", "a1 a3 a4 a6", "a1 a4 a5 a6",
"a1 a2 a4 a5 a5 a6", "a1 a2 a4 a5 a6 a7", "your ad here"]
puts
syntax = <<-EOT
{
expr = term { plus term } .
term = factor { times factor } .
factor = number | '(' expr ')' .
plus = "+" | "-" .
times = "*" | "/" .
number = digit { digit } .
digit = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" .
}
EOT
test syntax, ["2", "2*3 + 4/23 - 7", "(3 + 4) * 6-2+(4*(4))", "-2", "3 +", "(4 + 3"]
puts
puts "INVALID SYNTAXES:"
[%<a = "1";>, %<{ a = "1" ;>, %<{ hello world = "1"; }>, %<{ foo = bar . }>].each do |syn|
puts "Parsing syntax: #{syn}"
print " -> "
begin
EBNF::Syntax.parse syn
puts "parsed OK (!!)"
rescue e
puts "Error: #{e.message}"
end
end
- Output:
SOURCE:
"a" {
a = "a1" ( "a2" | "a3" ) { "a4" } [ "a5" ] "a6" ;
} "z"
PARSED:
title: a
comment: z
productions:
a = (and "a1"
(or "a2"
"a3")
(rep "a4")
(opt "a5")
"a6")
start: a
TESTS:
PASS a1a3a4a4a5a6
PASS a1 a2a6
PASS a1 a3 a4 a6
fail a1 a4 a5 a6
fail a1 a2 a4 a5 a5 a6
fail a1 a2 a4 a5 a6 a7
fail your ad here
SOURCE:
{
expr = term { plus term } .
term = factor { times factor } .
factor = number | '(' expr ')' .
plus = "+" | "-" .
times = "*" | "/" .
number = digit { digit } .
digit = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" .
}
PARSED:
title:
comment:
productions:
digit = (or "0"
"1"
"2"
"3"
"4"
"5"
"6"
"7"
"8"
"9")
expr = (and <term>
(rep (and <plus>
<term>)))
factor = (or <number>
(and "("
<expr>
")"))
number = (and <digit>
(rep <digit>))
plus = (or "+"
"-")
term = (and <factor>
(rep (and <times>
<factor>)))
times = (or "*"
"/")
start: expr
TESTS:
PASS 2
PASS 2*3 + 4/23 - 7
PASS (3 + 4) * 6-2+(4*(4))
fail -2
fail 3 +
fail (4 + 3
INVALID SYNTAXES:
Parsing syntax: a = "1";
-> Error: <{> expected at 1
Parsing syntax: { a = "1" ;
-> Error: <}> expected at 11
Parsing syntax: { hello world = "1"; }
-> Error: <=> expected at 9
Parsing syntax: { foo = bar . }
-> Error: undefined production <bar>
import 'dart:io';
class Token {
dynamic value;
bool isSequence;
Token(this.value, this.isSequence);
}
class Sequence {
List<dynamic> _items = [];
Sequence([List<dynamic>? items]) {
if (items != null) {
_items.addAll(items);
}
}
Sequence.from(List<dynamic> items) {
_items.addAll(items);
}
// Delegate to the internal list
void add(dynamic item) => _items.add(item);
void addAll(Iterable<dynamic> items) => _items.addAll(items);
dynamic operator [](int index) => _items[index];
void operator []=(int index, dynamic value) => _items[index] = value;
int get length => _items.length;
void clear() => _items.clear();
bool get isEmpty => _items.isEmpty;
bool get isNotEmpty => _items.isNotEmpty;
@override
String toString() => _items.toString();
}
class EBNFParser {
late String src;
String ch = '';
int sdx = 0;
Token? token;
bool err = false;
List<String> idents = [];
List<int> ididx = [];
List<Sequence> productions = [];
Sequence extras = Sequence();
List<String> results = ["pass", "fail"];
int btoi(bool b) {
return b ? 1 : 0;
}
int invalid(String msg) {
err = true;
print(msg);
sdx = src.length; // set to eof
return -1;
}
void skipSpaces() {
while (sdx < src.length) {
ch = src[sdx];
if (!" \t\r\n".contains(ch)) {
break;
}
sdx++;
}
}
void getToken() {
// Yields a single character token, one of {}()[]|=.;
// or {"terminal",string} or {"ident", string} or -1.
skipSpaces();
if (sdx >= src.length) {
token = Token(-1, false);
return;
}
int tokstart = sdx;
if ("{}()[]|=.;".contains(ch)) {
sdx++;
token = Token(ch, false);
} else if (ch == '"' || ch == "'") {
String closech = ch;
int tokend = tokstart + 1;
while (tokend < src.length && src[tokend] != closech) {
tokend++;
}
if (tokend >= src.length) {
token = Token(invalid("no closing quote"), false);
} else {
sdx = tokend + 1;
token = Token(Sequence(["terminal", src.substring(tokstart + 1, tokend)]), true);
}
} else if (ch.codeUnitAt(0) >= 'a'.codeUnitAt(0) && ch.codeUnitAt(0) <= 'z'.codeUnitAt(0)) {
// To simplify things for the purposes of this task,
// identifiers are strictly a-z only, not A-Z or 1-9.
while (sdx < src.length && ch.codeUnitAt(0) >= 'a'.codeUnitAt(0) && ch.codeUnitAt(0) <= 'z'.codeUnitAt(0)) {
sdx++;
if (sdx < src.length) {
ch = src[sdx];
}
}
token = Token(Sequence(["ident", src.substring(tokstart, sdx)]), true);
} else {
token = Token(invalid("invalid ebnf"), false);
}
}
void matchToken(String expectedCh) {
if (token!.value != expectedCh) {
token = Token(invalid("invalid ebnf ($expectedCh expected)"), false);
} else {
getToken();
}
}
int addIdent(String ident) {
int k = idents.indexOf(ident);
if (k == -1) {
idents.add(ident);
k = idents.length - 1;
ididx.add(-1);
}
return k;
}
dynamic factor() {
dynamic res;
if (token!.isSequence) {
Sequence t = token!.value as Sequence;
if (t[0] == "ident") {
int idx = addIdent(t[1] as String);
t.add(idx);
token!.value = t;
}
res = token!.value;
getToken();
} else if (token!.value == '[') {
getToken();
res = Sequence(["optional", expression()]);
matchToken(']');
} else if (token!.value == '(') {
getToken();
res = expression();
matchToken(')');
} else if (token!.value == '{') {
getToken();
res = Sequence(["repeat", expression()]);
matchToken('}');
} else {
throw Exception("invalid token in factor() function");
}
if (res is Sequence && res.length == 1) {
return res[0];
}
return res;
}
dynamic term() {
Sequence res = Sequence([factor()]);
List<dynamic> tokens = [-1, '|', '.', ';', ')', ']', '}'];
outer:
while (true) {
for (dynamic t in tokens) {
if (t == token!.value) {
break outer;
}
}
res.add(factor());
}
if (res.length == 1) {
return res[0];
}
return res;
}
dynamic expression() {
Sequence res = Sequence([term()]);
if (token!.value == '|') {
res = Sequence(["or", res[0]]);
while (token!.value == '|') {
getToken();
res.add(term());
}
}
if (res.length == 1) {
return res[0];
}
return res;
}
dynamic production() {
// Returns a token or -1; the real result is left in 'productions' etc.
getToken();
if (token!.value != '}') {
if (token!.value == -1) {
return invalid("invalid ebnf (missing closing })");
}
if (!token!.isSequence) {
return -1;
}
Sequence t = token!.value as Sequence;
if (t[0] != "ident") {
return -1;
}
String ident = t[1] as String;
int idx = addIdent(ident);
getToken();
matchToken('=');
if (token!.value == -1) {
return -1;
}
productions.add(Sequence([ident, idx, expression()]));
ididx[idx] = productions.length - 1;
}
return token!.value;
}
int parse(String ebnf) {
// Returns +1 if ok, -1 if bad.
print("parse:");
print("$ebnf ===>");
err = false;
src = ebnf;
sdx = 0;
idents.clear();
ididx.clear();
productions.clear();
extras.clear();
getToken();
if (token!.isSequence) {
Sequence t = token!.value as Sequence;
t[0] = "title";
extras.add(token!.value);
getToken();
}
if (token!.value != '{') {
return invalid("invalid ebnf (missing opening {)");
}
while (true) {
dynamic tokenResult = production();
if (tokenResult == '}' || tokenResult == -1) {
break;
}
}
getToken();
if (token!.isSequence) {
Sequence t = token!.value as Sequence;
t[0] = "comment";
extras.add(token!.value);
getToken();
}
if (token!.value != -1) {
return invalid("invalid ebnf (missing eof?)");
}
if (err) {
return -1;
}
int k = -1;
for (int i = 0; i < ididx.length; i++) {
if (ididx[i] == -1) {
k = i;
break;
}
}
if (k != -1) {
return invalid("invalid ebnf (undefined:${idents[k]})");
}
pprint(productions, "productions");
pprint(idents, "idents");
pprint(ididx, "ididx");
pprint(extras, "extras");
return 1;
}
// Adjusts Dart's normal printing to look more like Phix output.
void pprint(dynamic ob, String header) {
print("\n$header:");
String pp = ob.toString();
pp = pp.replaceAll("[", "{");
pp = pp.replaceAll("]", "}");
pp = pp.replaceAll(" ", ", ");
for (int i = 0; i < idents.length; i++) {
pp = pp.replaceAll(i.toString(), i.toString());
}
print(pp);
}
// The rules that applies() has to deal with are:
// {factors} - if rule[0] is not string,
// just apply one after the other recursively.
// {"terminal", "a1"} -- literal constants
// {"or", <e1>, <e2>, ...} -- (any) one of n
// {"repeat", <e1>} -- as per "{}" in ebnf
// {"optional", <e1>} -- as per "[]" in ebnf
// {"ident", <name>, idx} -- apply the sub-rule
bool applies(Sequence rule) {
int wasSdx = sdx; // in case of failure
dynamic r1 = rule[0];
if (r1 is! String) {
for (int i = 0; i < rule.length; i++) {
if (!applies(rule[i] as Sequence)) {
sdx = wasSdx;
return false;
}
}
} else if (r1 == "terminal") {
skipSpaces();
String r2 = rule[1] as String;
for (int i = 0; i < r2.length; i++) {
if (sdx >= src.length || src[sdx] != r2[i]) {
sdx = wasSdx;
return false;
}
sdx++;
}
} else if (r1 == "or") {
for (int i = 1; i < rule.length; i++) {
if (applies(rule[i] as Sequence)) {
return true;
}
}
sdx = wasSdx;
return false;
} else if (r1 == "repeat") {
while (applies(rule[1] as Sequence)) {
// continue repeating
}
} else if (r1 == "optional") {
applies(rule[1] as Sequence);
} else if (r1 == "ident") {
int i = rule[2] as int;
int ii = ididx[i];
if (!applies(productions[ii][2] as Sequence)) {
sdx = wasSdx;
return false;
}
} else {
throw Exception("invalid rule in applies() function");
}
return true;
}
void checkValid(String test) {
src = test;
sdx = 0;
bool res = applies(productions[0][2] as Sequence);
skipSpaces();
if (sdx < src.length) {
res = false;
}
print('"$test", ${results[1 - btoi(res)]}');
}
}
void main() {
EBNFParser parser = EBNFParser();
List<String> ebnfs = [
'"a" {\n'
' a = "a1" ( "a2" | "a3" ) { "a4" } [ "a5" ] "a6" ;\n'
'} "z" ',
'{\n'
' expr = term { plus term } .\n'
' term = factor { times factor } .\n'
' factor = number | \'(\' expr \')\' .\n'
' \n'
' plus = "+" | "-" .\n'
' times = "*" | "/" .\n'
' \n'
' number = digit { digit } .\n'
' digit = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" .\n'
'}',
'a = "1"',
'{ a = "1" ;',
'{ hello world = "1"; }',
'{ foo = bar . }'
];
List<List<String>> tests = [
[
"a1a3a4a4a5a6",
"a1 a2a6",
"a1 a3 a4 a6",
"a1 a4 a5 a6",
"a1 a2 a4 a5 a5 a6",
"a1 a2 a4 a5 a6 a7",
"your ad here"
],
[
"2",
"2*3 + 4/23 - 7",
"(3 + 4) * 6-2+(4*(4))",
"-2",
"3 +",
"(4 + 3"
]
];
for (int i = 0; i < ebnfs.length; i++) {
if (parser.parse(ebnfs[i]) == 1) {
print("\ntests:");
if (i < tests.length) {
for (String test in tests[i]) {
parser.checkValid(test);
}
}
}
print("");
}
}
- Output:
parse:
"a" {
a = "a1" ( "a2" | "a3" ) { "a4" } [ "a5" ] "a6" ;
} "z" ===>
productions:
{{a,, 0,, {{terminal,, a1},, {or,, {terminal,, a2},, {terminal,, a3}},, {repeat,, {terminal,, a4}},, {optional,, {terminal,, a5}},, {terminal,, a6}}}}
idents:
{a}
ididx:
{0}
extras:
{{title,, a},, {comment,, z}}
tests:
"a1a3a4a4a5a6", pass
"a1 a2a6", pass
"a1 a3 a4 a6", pass
"a1 a4 a5 a6", fail
"a1 a2 a4 a5 a5 a6", fail
"a1 a2 a4 a5 a6 a7", fail
"your ad here", fail
parse:
{
expr = term { plus term } .
term = factor { times factor } .
factor = number | '(' expr ')' .
plus = "+" | "-" .
times = "*" | "/" .
number = digit { digit } .
digit = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" .
} ===>
productions:
{{expr,, 0,, {{ident,, term,, 1},, {repeat,, {{ident,, plus,, 2},, {ident,, term,, 1}}}}},, {term,, 1,, {{ident,, factor,, 3},, {repeat,, {{ident,, times,, 4},, {ident,, factor,, 3}}}}},, {factor,, 3,, {or,, {ident,, number,, 5},, {{terminal,, (},, {ident,, expr,, 0},, {terminal,, )}}}},, {plus,, 2,, {or,, {terminal,, +},, {terminal,, -}}},, {times,, 4,, {or,, {terminal,, *},, {terminal,, /}}},, {number,, 5,, {{ident,, digit,, 6},, {repeat,, {ident,, digit,, 6}}}},, {digit,, 6,, {or,, {terminal,, 0},, {terminal,, 1},, {terminal,, 2},, {terminal,, 3},, {terminal,, 4},, {terminal,, 5},, {terminal,, 6},, {terminal,, 7},, {terminal,, 8},, {terminal,, 9}}}}
idents:
{expr,, term,, plus,, factor,, times,, number,, digit}
ididx:
{0,, 1,, 3,, 2,, 4,, 5,, 6}
extras:
{}
tests:
"2", pass
"2*3 + 4/23 - 7", pass
"(3 + 4) * 6-2+(4*(4))", pass
"-2", fail
"3 +", fail
"(4 + 3", fail
parse:
a = "1" ===>
invalid ebnf (missing opening {)
parse:
{ a = "1" ; ===>
invalid ebnf (missing closing })
parse:
{ hello world = "1"; } ===>
invalid ebnf (= expected)
parse:
{ foo = bar . } ===>
invalid ebnf (undefined:bar)
A more or less faithful translation of the Phix entry, except that indices are 0-based rather than 1-based and so one less than in the Phix results.
package main
import (
"fmt"
"strings"
)
// type aliases for Phix types
type object = interface{}
type sequence = []object
var (
src []rune
ch rune
sdx int
token object
isSeq bool
err = false
idents []string
ididx []int
productions []sequence
extras sequence
results = [2]string{"pass", "fail"}
)
func btoi(b bool) int {
if b {
return 1
}
return 0
}
func invalid(msg string) int {
err = true
fmt.Println(msg)
sdx = len(src) // set to eof
return -1
}
func skipSpaces() {
for {
if sdx >= len(src) {
break
}
ch = src[sdx]
if strings.IndexRune(" \t\r\n", ch) == -1 {
break
}
sdx++
}
}
func getToken() {
// Yields a single character token, one of {}()[]|=.;
// or {"terminal",string} or {"ident", string} or -1.
skipSpaces()
if sdx >= len(src) {
token = -1
isSeq = false
return
}
tokstart := sdx
if strings.IndexRune("{}()[]|=.;", ch) >= 0 {
sdx++
token = ch
isSeq = false
} else if ch == '"' || ch == '\'' {
closech := ch
for tokend := sdx + 1; tokend < len(src); tokend++ {
if src[tokend] == closech {
sdx = tokend + 1
token = sequence{"terminal", string(src[tokstart+1 : tokend])}
isSeq = true
return
}
}
token = invalid("no closing quote")
isSeq = false
} else if ch >= 'a' && ch <= 'z' {
// To simplify things for the purposes of this task,
// identifiers are strictly a-z only, not A-Z or 1-9.
for {
sdx++
if sdx >= len(src) {
break
}
ch = src[sdx]
if ch < 'a' || ch > 'z' {
break
}
}
token = sequence{"ident", string(src[tokstart:sdx])}
isSeq = true
} else {
token = invalid("invalid ebnf")
isSeq = false
}
}
func matchToken(ch rune) {
if token != ch {
token = invalid(fmt.Sprintf("invalid ebnf (%c expected)", ch))
isSeq = false
} else {
getToken()
}
}
func addIdent(ident string) int {
k := -1
for i, id := range idents {
if id == ident {
k = i
break
}
}
if k == -1 {
idents = append(idents, ident)
k = len(idents) - 1
ididx = append(ididx, -1)
}
return k
}
func factor() object {
var res object
if isSeq {
t := token.([]object)
if t[0] == "ident" {
idx := addIdent(t[1].(string))
t = append(t, idx)
token = t
}
res = token
getToken()
} else if token == '[' {
getToken()
res = sequence{"optional", expression()}
matchToken(']')
} else if token == '(' {
getToken()
res = expression()
matchToken(')')
} else if token == '{' {
getToken()
res = sequence{"repeat", expression()}
matchToken('}')
} else {
panic("invalid token in factor() function")
}
if s, ok := res.(sequence); ok && len(s) == 1 {
return s[0]
}
return res
}
func term() object {
res := sequence{factor()}
tokens := []object{-1, '|', '.', ';', ')', ']', '}'}
outer:
for {
for _, t := range tokens {
if t == token {
break outer
}
}
res = append(res, factor())
}
if len(res) == 1 {
return res[0]
}
return res
}
func expression() object {
res := sequence{term()}
if token == '|' {
res = sequence{"or", res[0]}
for token == '|' {
getToken()
res = append(res, term())
}
}
if len(res) == 1 {
return res[0]
}
return res
}
func production() object {
// Returns a token or -1; the real result is left in 'productions' etc.
getToken()
if token != '}' {
if token == -1 {
return invalid("invalid ebnf (missing closing })")
}
if !isSeq {
return -1
}
t := token.(sequence)
if t[0] != "ident" {
return -1
}
ident := t[1].(string)
idx := addIdent(ident)
getToken()
matchToken('=')
if token == -1 {
return -1
}
productions = append(productions, sequence{ident, idx, expression()})
ididx[idx] = len(productions) - 1
}
return token
}
func parse(ebnf string) int {
// Returns +1 if ok, -1 if bad.
fmt.Printf("parse:\n%s ===>\n", ebnf)
err = false
src = []rune(ebnf)
sdx = 0
idents = idents[:0]
ididx = ididx[:0]
productions = productions[:0]
extras = extras[:0]
getToken()
if isSeq {
t := token.(sequence)
t[0] = "title"
extras = append(extras, token)
getToken()
}
if token != '{' {
return invalid("invalid ebnf (missing opening {)")
}
for {
token = production()
if token == '}' || token == -1 {
break
}
}
getToken()
if isSeq {
t := token.(sequence)
t[0] = "comment"
extras = append(extras, token)
getToken()
}
if token != -1 {
return invalid("invalid ebnf (missing eof?)")
}
if err {
return -1
}
k := -1
for i, idx := range ididx {
if idx == -1 {
k = i
break
}
}
if k != -1 {
return invalid(fmt.Sprintf("invalid ebnf (undefined:%s)", idents[k]))
}
pprint(productions, "productions")
pprint(idents, "idents")
pprint(ididx, "ididx")
pprint(extras, "extras")
return 1
}
// Adjusts Go's normal printing of slices to look more like Phix output.
func pprint(ob object, header string) {
fmt.Printf("\n%s:\n", header)
pp := fmt.Sprintf("%q", ob)
pp = strings.Replace(pp, "[", "{", -1)
pp = strings.Replace(pp, "]", "}", -1)
pp = strings.Replace(pp, " ", ", ", -1)
for i := 0; i < len(idents); i++ {
xs := fmt.Sprintf(`'\x%02d'`, i)
is := fmt.Sprintf("%d", i)
pp = strings.Replace(pp, xs, is, -1)
}
fmt.Println(pp)
}
// The rules that applies() has to deal with are:
// {factors} - if rule[0] is not string,
// just apply one after the other recursively.
// {"terminal", "a1"} -- literal constants
// {"or", <e1>, <e2>, ...} -- (any) one of n
// {"repeat", <e1>} -- as per "{}" in ebnf
// {"optional", <e1>} -- as per "[]" in ebnf
// {"ident", <name>, idx} -- apply the sub-rule
func applies(rule sequence) bool {
wasSdx := sdx // in case of failure
r1 := rule[0]
if _, ok := r1.(string); !ok {
for i := 0; i < len(rule); i++ {
if !applies(rule[i].(sequence)) {
sdx = wasSdx
return false
}
}
} else if r1 == "terminal" {
skipSpaces()
r2 := []rune(rule[1].(string))
for i := 0; i < len(r2); i++ {
if sdx >= len(src) || src[sdx] != r2[i] {
sdx = wasSdx
return false
}
sdx++
}
} else if r1 == "or" {
for i := 1; i < len(rule); i++ {
if applies(rule[i].(sequence)) {
return true
}
}
sdx = wasSdx
return false
} else if r1 == "repeat" {
for applies(rule[1].(sequence)) {
}
} else if r1 == "optional" {
applies(rule[1].(sequence))
} else if r1 == "ident" {
i := rule[2].(int)
ii := ididx[i]
if !applies(productions[ii][2].(sequence)) {
sdx = wasSdx
return false
}
} else {
panic("invalid rule in applies() function")
}
return true
}
func checkValid(test string) {
src = []rune(test)
sdx = 0
res := applies(productions[0][2].(sequence))
skipSpaces()
if sdx < len(src) {
res = false
}
fmt.Printf("%q, %s\n", test, results[1-btoi(res)])
}
func main() {
ebnfs := []string{
`"a" {
a = "a1" ( "a2" | "a3" ) { "a4" } [ "a5" ] "a6" ;
} "z" `,
`{
expr = term { plus term } .
term = factor { times factor } .
factor = number | '(' expr ')' .
plus = "+" | "-" .
times = "*" | "/" .
number = digit { digit } .
digit = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" .
}`,
`a = "1"`,
`{ a = "1" ;`,
`{ hello world = "1"; }`,
`{ foo = bar . }`,
}
tests := [][]string{
{
"a1a3a4a4a5a6",
"a1 a2a6",
"a1 a3 a4 a6",
"a1 a4 a5 a6",
"a1 a2 a4 a5 a5 a6",
"a1 a2 a4 a5 a6 a7",
"your ad here",
},
{
"2",
"2*3 + 4/23 - 7",
"(3 + 4) * 6-2+(4*(4))",
"-2",
"3 +",
"(4 + 3",
},
}
for i, ebnf := range ebnfs {
if parse(ebnf) == +1 {
fmt.Println("\ntests:")
for _, test := range tests[i] {
checkValid(test)
}
}
fmt.Println()
}
}
- Output:
parse:
"a" {
a = "a1" ( "a2" | "a3" ) { "a4" } [ "a5" ] "a6" ;
} "z" ===>
productions:
{{"a", 0, {{"terminal", "a1"}, {"or", {"terminal", "a2"}, {"terminal", "a3"}}, {"repeat", {"terminal", "a4"}}, {"optional", {"terminal", "a5"}}, {"terminal", "a6"}}}}
idents:
{"a"}
ididx:
{0}
extras:
{{"title", "a"}, {"comment", "z"}}
tests:
"a1a3a4a4a5a6", pass
"a1 a2a6", pass
"a1 a3 a4 a6", pass
"a1 a4 a5 a6", fail
"a1 a2 a4 a5 a5 a6", fail
"a1 a2 a4 a5 a6 a7", fail
"your ad here", fail
parse:
{
expr = term { plus term } .
term = factor { times factor } .
factor = number | '(' expr ')' .
plus = "+" | "-" .
times = "*" | "/" .
number = digit { digit } .
digit = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" .
} ===>
productions:
{{"expr", 0, {{"ident", "term", 1}, {"repeat", {{"ident", "plus", 2}, {"ident", "term", 1}}}}}, {"term", 1, {{"ident", "factor", 3}, {"repeat", {{"ident", "times", 4}, {"ident", "factor", 3}}}}}, {"factor", 3, {"or", {"ident", "number", 5}, {{"terminal", "("}, {"ident", "expr", 0}, {"terminal", ")"}}}}, {"plus", 2, {"or", {"terminal", "+"}, {"terminal", "-"}}}, {"times", 4, {"or", {"terminal", "*"}, {"terminal", "/"}}}, {"number", 5, {{"ident", "digit", 6}, {"repeat", {"ident", "digit", 6}}}}, {"digit", 6, {"or", {"terminal", "0"}, {"terminal", "1"}, {"terminal", "2"}, {"terminal", "3"}, {"terminal", "4"}, {"terminal", "5"}, {"terminal", "6"}, {"terminal", "7"}, {"terminal", "8"}, {"terminal", "9"}}}}
idents:
{"expr", "term", "plus", "factor", "times", "number", "digit"}
ididx:
{0, 1, 3, 2, 4, 5, 6}
extras:
{}
tests:
"2", pass
"2*3 + 4/23 - 7", pass
"(3 + 4) * 6-2+(4*(4))", pass
"-2", fail
"3 +", fail
"(4 + 3", fail
parse:
a = "1" ===>
invalid ebnf (missing opening {)
parse:
{ a = "1" ; ===>
invalid ebnf (missing closing })
parse:
{ hello world = "1"; } ===>
invalid ebnf (= expected)
parse:
{ foo = bar . } ===>
invalid ebnf (undefined:bar)
This Haskell version uses Parsec to parse the EBNF grammar and, from it, to generate a Parsec parser for the input language. The program takes two command-line arguments, a file containing the EBNF grammar and a file to be parsed against it, and prints either a parse error or a parse tree.
import Control.Applicative
import Control.Monad
import Data.Maybe
import qualified Data.Map as M
import System.Environment (getArgs)
import Text.Parsec hiding (many, optional, (<|>))
import Text.Parsec.String
import Text.Parsec.Error
-----------------------------------------------------------------
-- Main
-----------------------------------------------------------------
main = do
{- Uses the EBNF grammar contained in the first file to parse
the second file, then prints a parse tree. -}
[grammar_file, other_file] <- getArgs
ebnf_text <- readFile grammar_file
case parseGrammar grammar_file ebnf_text of
Left err ->
putStrLn $ "Failed to parse EBNF grammar: " ++ show err
Right g -> do
putStrLn "Successfully parsed EBNF grammar."
o <- readFile other_file
case parseWithGrammar g other_file o of
Left err ->
putStrLn $ "Failed to parse second file: " ++ show err
Right tree ->
print tree
-----------------------------------------------------------------
-- Types and user functions
-----------------------------------------------------------------
parseGrammar :: FilePath -> String -> Either ParseError Grammar
parseGrammar fp s =
case runParser ebnf M.empty fp s of
Left e ->
Left e
Right (Grammar g, usedNames) ->
let undefinedRules = foldl (flip M.delete) usedNames $ map fst g
(undefName, undefNamePos) = M.findMin undefinedRules
in if M.null undefinedRules
then Right $ Grammar g
else Left $ newErrorMessage
(Message $ "Undefined rule: " ++ undefName)
undefNamePos
parseWithGrammar :: Grammar -> FilePath -> String -> Either ParseError ParseTree
parseWithGrammar g@(Grammar ((_, firstR) : _)) fp s =
runParser (liftA cleanTree $ firstR <* eof) g fp s
type GParser = Parsec String UsedNames
type UsedNames = M.Map String SourcePos
type Rule = Parsec String Grammar ParseTree
-- We need to keep the Grammar around as a Parsec user state
-- to look up named rules.
data Grammar = Grammar [(String, Rule)]
-- Grammar would be a type synonym instead of an ADT, but
-- infinite types aren't allowed in Haskell.
data ParseTree =
ParseBranch String [ParseTree] |
ParseLeaf String
instance Show ParseTree where
show = showIndented 0
-- show (ParseBranch "" t) = '[' : concatMap ((' ' :) . show) t ++ "]"
-- show (ParseBranch s t) = '(' : s ++ concatMap ((' ' :) . show) t ++ ")"
-- show (ParseLeaf s) = show s
showIndented :: Int -> ParseTree -> String
showIndented i (ParseBranch "" []) =
indent i "[]"
showIndented i (ParseBranch "" t) =
indent i "[" ++
concatMap (showIndented (i + 2)) t ++
"]"
showIndented i (ParseBranch s t) =
indent i ("(" ++ s) ++
concatMap (showIndented (i + 2)) t ++
")"
showIndented i (ParseLeaf s) =
indent i $ show s
indent :: Int -> String -> String
indent i s = "\n" ++ replicate i ' ' ++ s
cleanTree :: ParseTree -> ParseTree
-- Removes empty anonymous branches.
cleanTree (ParseBranch i ts) =
ParseBranch i $ map cleanTree $ filter p ts
where p (ParseBranch "" []) = False
p _ = True
cleanTree x = x
-----------------------------------------------------------------
-- GParser definitions
-----------------------------------------------------------------
ebnf :: GParser (Grammar, UsedNames)
ebnf = liftA2 (,) (ws *> syntax <* eof) getState
syntax :: GParser Grammar
syntax = liftA Grammar $
optional title *>
lcbtw '{' '}' (many production) <*
optional comment
production :: GParser (String, Rule)
production = do
i <- identifier
lc '='
r <- expression
oneOf ".;"
ws
return (i, liftM (nameBranch i) r)
where nameBranch i (ParseBranch _ l) = ParseBranch i l
expression, term :: GParser Rule
expression = liftA (foldl1 (<|>)) $ term `sepBy1` (lc '|')
term = liftA (branch . sequence) $ many1 factor
factor :: GParser Rule
factor = liftA try $
liftA ruleByName rememberName <|>
liftA (leaf . (<* ws) . string) literal <|>
liftA perhaps (lcbtw '[' ']' expression) <|>
lcbtw '(' ')' expression <|>
liftA (branch . many) (lcbtw '{' '}' expression)
where rememberName :: GParser String
rememberName = do
i <- identifier
p <- getPosition
modifyState $ M.insertWith (flip const) i p
{- Adds i → p to the state only if i doesn't
already have an entry. This ensures we report the
*first* usage of each unknown identifier. -}
return i
ruleByName :: String -> Rule
ruleByName name = do
Grammar g <- getState
fromJust (lookup name g) <?> name
perhaps = option $ ParseLeaf ""
identifier :: GParser String
identifier = many1 (noneOf " \t\n=|(){}[].;\"'") <* ws
title = literal
comment = literal
literal =
(lc '\'' *> manyTill anyChar (lc '\'')) <|>
(lc '"' *> manyTill anyChar (lc '"'))
<* ws
-----------------------------------------------------------------
-- Miscellany
-----------------------------------------------------------------
leaf = liftA ParseLeaf
branch = liftA $ ParseBranch ""
lcbtw c1 c2 = between (lc c1) (lc c2)
lc :: Char -> GParser Char
lc c = char c <* ws
ws = many $ oneOf " \n\t"
import java.util.*;
public class EBNFParser {
// Type aliases equivalent
private static class Token {
Object value;
boolean isSequence;
Token(Object value, boolean isSequence) {
this.value = value;
this.isSequence = isSequence;
}
}
private static class Sequence extends ArrayList<Object> {
public Sequence(Object... items) {
Collections.addAll(this, items);
}
}
private String src;
private char ch;
private int sdx;
private Token token;
private boolean err = false;
private List<String> idents = new ArrayList<>();
private List<Integer> ididx = new ArrayList<>();
private List<Sequence> productions = new ArrayList<>();
private Sequence extras = new Sequence();
private String[] results = {"pass", "fail"};
private int btoi(boolean b) {
return b ? 1 : 0;
}
private int invalid(String msg) {
err = true;
System.out.println(msg);
sdx = src.length(); // set to eof
return -1;
}
private void skipSpaces() {
while (sdx < src.length()) {
ch = src.charAt(sdx);
if (" \t\r\n".indexOf(ch) == -1) {
break;
}
sdx++;
}
}
private void getToken() {
// Yields a single character token, one of {}()[]|=.;
// or {"terminal",string} or {"ident", string} or -1.
skipSpaces();
if (sdx >= src.length()) {
token = new Token(-1, false);
return;
}
int tokstart = sdx;
if ("{}()[]|=.;".indexOf(ch) >= 0) {
sdx++;
token = new Token(ch, false);
} else if (ch == '"' || ch == '\'') {
char closech = ch;
int tokend = tokstart + 1;
while (tokend < src.length() && src.charAt(tokend) != closech) {
tokend++;
}
if (tokend >= src.length()) {
token = new Token(invalid("no closing quote"), false);
} else {
sdx = tokend + 1;
token = new Token(new Sequence("terminal", src.substring(tokstart + 1, tokend)), true);
}
} else if (ch >= 'a' && ch <= 'z') {
// To simplify things for the purposes of this task,
// identifiers are strictly a-z only, not A-Z or 1-9.
while (sdx < src.length() && ch >= 'a' && ch <= 'z') {
sdx++;
if (sdx < src.length()) {
ch = src.charAt(sdx);
}
}
token = new Token(new Sequence("ident", src.substring(tokstart, sdx)), true);
} else {
token = new Token(invalid("invalid ebnf"), false);
}
}
private void matchToken(char expectedCh) {
if (!token.value.equals(expectedCh)) {
token = new Token(invalid(String.format("invalid ebnf (%c expected)", expectedCh)), false);
} else {
getToken();
}
}
private int addIdent(String ident) {
int k = idents.indexOf(ident);
if (k == -1) {
idents.add(ident);
k = idents.size() - 1;
ididx.add(-1);
}
return k;
}
private Object factor() {
Object res;
if (token.isSequence) {
Sequence t = (Sequence) token.value;
if (t.get(0).equals("ident")) {
int idx = addIdent((String) t.get(1));
t.add(idx);
token.value = t;
}
res = token.value;
getToken();
} else if (token.value.equals('[')) {
getToken();
res = new Sequence("optional", expression());
matchToken(']');
} else if (token.value.equals('(')) {
getToken();
res = expression();
matchToken(')');
} else if (token.value.equals('{')) {
getToken();
res = new Sequence("repeat", expression());
matchToken('}');
} else {
throw new RuntimeException("invalid token in factor() function");
}
if (res instanceof Sequence && ((Sequence) res).size() == 1) {
return ((Sequence) res).get(0);
}
return res;
}
private Object term() {
Sequence res = new Sequence(factor());
Object[] tokens = {-1, '|', '.', ';', ')', ']', '}'};
outer:
while (true) {
for (Object t : tokens) {
if (t.equals(token.value)) {
break outer;
}
}
res.add(factor());
}
if (res.size() == 1) {
return res.get(0);
}
return res;
}
private Object expression() {
Sequence res = new Sequence(term());
if (token.value.equals('|')) {
res = new Sequence("or", res.get(0));
while (token.value.equals('|')) {
getToken();
res.add(term());
}
}
if (res.size() == 1) {
return res.get(0);
}
return res;
}
private Object production() {
// Returns a token or -1; the real result is left in 'productions' etc.
getToken();
if (!token.value.equals('}')) {
if (token.value.equals(-1)) {
return invalid("invalid ebnf (missing closing })");
}
if (!token.isSequence) {
return -1;
}
Sequence t = (Sequence) token.value;
if (!t.get(0).equals("ident")) {
return -1;
}
String ident = (String) t.get(1);
int idx = addIdent(ident);
getToken();
matchToken('=');
if (token.value.equals(-1)) {
return -1;
}
productions.add(new Sequence(ident, idx, expression()));
ididx.set(idx, productions.size() - 1);
}
return token.value;
}
private int parse(String ebnf) {
// Returns +1 if ok, -1 if bad.
System.out.printf("parse:\n%s ===>\n", ebnf);
err = false;
src = ebnf;
sdx = 0;
idents.clear();
ididx.clear();
productions.clear();
extras.clear();
getToken();
if (token.isSequence) {
Sequence t = (Sequence) token.value;
t.set(0, "title");
extras.add(token.value);
getToken();
}
if (!token.value.equals('{')) {
return invalid("invalid ebnf (missing opening {)");
}
while (true) {
Object tokenResult = production();
if (tokenResult.equals('}') || tokenResult.equals(-1)) {
break;
}
}
getToken();
if (token.isSequence) {
Sequence t = (Sequence) token.value;
t.set(0, "comment");
extras.add(token.value);
getToken();
}
if (!token.value.equals(-1)) {
return invalid("invalid ebnf (missing eof?)");
}
if (err) {
return -1;
}
int k = -1;
for (int i = 0; i < ididx.size(); i++) {
if (ididx.get(i) == -1) {
k = i;
break;
}
}
if (k != -1) {
return invalid(String.format("invalid ebnf (undefined:%s)", idents.get(k)));
}
pprint(productions, "productions");
pprint(idents, "idents");
pprint(ididx, "ididx");
pprint(extras, "extras");
return 1;
}
// Adjusts Java's normal printing to look more like Phix output.
private void pprint(Object ob, String header) {
System.out.printf("\n%s:\n", header);
String pp = ob.toString();
pp = pp.replace("[", "{");
pp = pp.replace("]", "}");
pp = pp.replace(" ", ", ");
for (int i = 0; i < idents.size(); i++) {
pp = pp.replace(String.valueOf(i), String.valueOf(i));
}
System.out.println(pp);
}
// The rules that applies() has to deal with are:
// {factors} - if rule[0] is not string,
// just apply one after the other recursively.
// {"terminal", "a1"} -- literal constants
// {"or", <e1>, <e2>, ...} -- (any) one of n
// {"repeat", <e1>} -- as per "{}" in ebnf
// {"optional", <e1>} -- as per "[]" in ebnf
// {"ident", <name>, idx} -- apply the sub-rule
private boolean applies(Sequence rule) {
int wasSdx = sdx; // in case of failure
Object r1 = rule.get(0);
if (!(r1 instanceof String)) {
for (int i = 0; i < rule.size(); i++) {
if (!applies((Sequence) rule.get(i))) {
sdx = wasSdx;
return false;
}
}
} else if (r1.equals("terminal")) {
skipSpaces();
String r2 = (String) rule.get(1);
for (int i = 0; i < r2.length(); i++) {
if (sdx >= src.length() || src.charAt(sdx) != r2.charAt(i)) {
sdx = wasSdx;
return false;
}
sdx++;
}
} else if (r1.equals("or")) {
for (int i = 1; i < rule.size(); i++) {
if (applies((Sequence) rule.get(i))) {
return true;
}
}
sdx = wasSdx;
return false;
} else if (r1.equals("repeat")) {
while (applies((Sequence) rule.get(1))) {
// continue repeating
}
} else if (r1.equals("optional")) {
applies((Sequence) rule.get(1));
} else if (r1.equals("ident")) {
int i = (Integer) rule.get(2);
int ii = ididx.get(i);
if (!applies((Sequence) productions.get(ii).get(2))) {
sdx = wasSdx;
return false;
}
} else {
throw new RuntimeException("invalid rule in applies() function");
}
return true;
}
private void checkValid(String test) {
src = test;
sdx = 0;
boolean res = applies((Sequence) productions.get(0).get(2));
skipSpaces();
if (sdx < src.length()) {
res = false;
}
System.out.printf("\"%s\", %s\n", test, results[1 - btoi(res)]);
}
public static void main(String[] args) {
EBNFParser parser = new EBNFParser();
String[] ebnfs = {
"\"a\" {\n" +
" a = \"a1\" ( \"a2\" | \"a3\" ) { \"a4\" } [ \"a5\" ] \"a6\" ;\n" +
"} \"z\" ",
"{\n" +
" expr = term { plus term } .\n" +
" term = factor { times factor } .\n" +
" factor = number | '(' expr ')' .\n" +
" \n" +
" plus = \"+\" | \"-\" .\n" +
" times = \"*\" | \"/\" .\n" +
" \n" +
" number = digit { digit } .\n" +
" digit = \"0\" | \"1\" | \"2\" | \"3\" | \"4\" | \"5\" | \"6\" | \"7\" | \"8\" | \"9\" .\n" +
"}",
"a = \"1\"",
"{ a = \"1\" ;",
"{ hello world = \"1\"; }",
"{ foo = bar . }"
};
String[][] tests = {
{
"a1a3a4a4a5a6",
"a1 a2a6",
"a1 a3 a4 a6",
"a1 a4 a5 a6",
"a1 a2 a4 a5 a5 a6",
"a1 a2 a4 a5 a6 a7",
"your ad here"
},
{
"2",
"2*3 + 4/23 - 7",
"(3 + 4) * 6-2+(4*(4))",
"-2",
"3 +",
"(4 + 3"
}
};
for (int i = 0; i < ebnfs.length; i++) {
if (parser.parse(ebnfs[i]) == 1) {
System.out.println("\ntests:");
if (i < tests.length) {
for (String test : tests[i]) {
parser.checkValid(test);
}
}
}
System.out.println();
}
}
}
- Output:
parse:
"a" {
a = "a1" ( "a2" | "a3" ) { "a4" } [ "a5" ] "a6" ;
} "z" ===>
productions:
{{a, 0, {{terminal, a1}, {or, {terminal, a2}, {terminal, a3}}, {repeat, {terminal, a4}}, {optional, {terminal, a5}}, {terminal, a6}}}}
idents:
{a}
ididx:
{0}
extras:
{{title, a}, {comment, z}}
tests:
"a1a3a4a4a5a6", pass
"a1 a2a6", pass
"a1 a3 a4 a6", pass
"a1 a4 a5 a6", fail
"a1 a2 a4 a5 a5 a6", fail
"a1 a2 a4 a5 a6 a7", fail
"your ad here", fail
parse:
{
expr = term { plus term } .
term = factor { times factor } .
factor = number | '(' expr ')' .
plus = "+" | "-" .
times = "*" | "/" .
number = digit { digit } .
digit = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" .
} ===>
productions:
{{expr, 0, {{ident, term, 1}, {repeat, {{ident, plus, 2}, {ident, term, 1}}}}}, {term, 1, {{ident, factor, 3}, {repeat, {{ident, times, 4}, {ident, factor, 3}}}}}, {factor, 3, {or, {ident, number, 5}, {{terminal, (}, {ident, expr, 0}, {terminal, )}}}}, {plus, 2, {or, {terminal, +}, {terminal, -}}}, {times, 4, {or, {terminal, *}, {terminal, /}}}, {number, 5, {{ident, digit, 6}, {repeat, {ident, digit, 6}}}}, {digit, 6, {or, {terminal, 0}, {terminal, 1}, {terminal, 2}, {terminal, 3}, {terminal, 4}, {terminal, 5}, {terminal, 6}, {terminal, 7}, {terminal, 8}, {terminal, 9}}}}
idents:
{expr, term, plus, factor, times, number, digit}
ididx:
{0, 1, 3, 2, 4, 5, 6}
extras:
{}
tests:
"2", pass
"2*3 + 4/23 - 7", pass
"(3 + 4) * 6-2+(4*(4))", pass
"-2", fail
"3 +", fail
"(4 + 3", fail
parse:
a = "1" ===>
invalid ebnf (missing opening {)
parse:
{ a = "1" ; ===>
invalid ebnf (missing closing })
parse:
{ hello world = "1"; } ===>
invalid ebnf (= expected)
parse:
{ foo = bar . } ===>
invalid ebnf (undefined:bar)
class EBNFParser {
constructor() {
this.src = '';
this.ch = '';
this.sdx = 0;
this.token = null;
this.err = false;
this.idents = [];
this.ididx = [];
this.productions = [];
this.extras = [];
this.results = ['pass', 'fail'];
}
btoi(b) {
return b ? 1 : 0;
}
invalid(msg) {
this.err = true;
console.log(msg);
this.sdx = this.src.length; // set to eof
return -1;
}
skipSpaces() {
while (this.sdx < this.src.length) {
this.ch = this.src.charAt(this.sdx);
if (' \t\r\n'.indexOf(this.ch) === -1) {
break;
}
this.sdx++;
}
}
getToken() {
// Yields a single character token, one of {}()[]|=.;
// or ["terminal",string] or ["ident", string] or -1.
this.skipSpaces();
if (this.sdx >= this.src.length) {
this.token = { value: -1, isSequence: false };
return;
}
const tokstart = this.sdx;
if ('{}()[]|=.;'.indexOf(this.ch) >= 0) {
this.sdx++;
this.token = { value: this.ch, isSequence: false };
} else if (this.ch === '"' || this.ch === "'") {
const closech = this.ch;
let tokend = tokstart + 1;
while (tokend < this.src.length && this.src.charAt(tokend) !== closech) {
tokend++;
}
if (tokend >= this.src.length) {
this.token = { value: this.invalid('no closing quote'), isSequence: false };
} else {
this.sdx = tokend + 1;
this.token = {
value: ['terminal', this.src.substring(tokstart + 1, tokend)],
isSequence: true
};
}
} else if (this.ch >= 'a' && this.ch <= 'z') {
// To simplify things for the purposes of this task,
// identifiers are strictly a-z only, not A-Z or 1-9.
while (this.sdx < this.src.length && this.ch >= 'a' && this.ch <= 'z') {
this.sdx++;
if (this.sdx < this.src.length) {
this.ch = this.src.charAt(this.sdx);
}
}
this.token = {
value: ['ident', this.src.substring(tokstart, this.sdx)],
isSequence: true
};
} else {
this.token = { value: this.invalid('invalid ebnf'), isSequence: false };
}
}
matchToken(expectedCh) {
if (this.token.value !== expectedCh) {
this.token = {
value: this.invalid(`invalid ebnf (${expectedCh} expected)`),
isSequence: false
};
} else {
this.getToken();
}
}
addIdent(ident) {
let k = this.idents.indexOf(ident);
if (k === -1) {
this.idents.push(ident);
k = this.idents.length - 1;
this.ididx.push(-1);
}
return k;
}
factor() {
let res;
if (this.token.isSequence) {
const t = this.token.value;
if (t[0] === 'ident') {
const idx = this.addIdent(t[1]);
t.push(idx);
this.token.value = t;
}
res = this.token.value;
this.getToken();
} else if (this.token.value === '[') {
this.getToken();
res = ['optional', this.expression()];
this.matchToken(']');
} else if (this.token.value === '(') {
this.getToken();
res = this.expression();
this.matchToken(')');
} else if (this.token.value === '{') {
this.getToken();
res = ['repeat', this.expression()];
this.matchToken('}');
} else {
throw new Error('invalid token in factor() function');
}
if (Array.isArray(res) && res.length === 1) {
return res[0];
}
return res;
}
term() {
const res = [this.factor()];
const tokens = [-1, '|', '.', ';', ')', ']', '}'];
while (true) {
if (tokens.includes(this.token.value)) {
break;
}
res.push(this.factor());
}
if (res.length === 1) {
return res[0];
}
return res;
}
expression() {
let res = [this.term()];
if (this.token.value === '|') {
res = ['or', res[0]];
while (this.token.value === '|') {
this.getToken();
res.push(this.term());
}
}
if (res.length === 1) {
return res[0];
}
return res;
}
production() {
// Returns a token or -1; the real result is left in 'productions' etc.
this.getToken();
if (this.token.value !== '}') {
if (this.token.value === -1) {
return this.invalid('invalid ebnf (missing closing })');
}
if (!this.token.isSequence) {
return -1;
}
const t = this.token.value;
if (t[0] !== 'ident') {
return -1;
}
const ident = t[1];
const idx = this.addIdent(ident);
this.getToken();
this.matchToken('=');
if (this.token.value === -1) {
return -1;
}
this.productions.push([ident, idx, this.expression()]);
this.ididx[idx] = this.productions.length - 1;
}
return this.token.value;
}
parse(ebnf) {
// Returns +1 if ok, -1 if bad.
console.log(`parse:\n${ebnf} ===>`);
this.err = false;
this.src = ebnf;
this.sdx = 0;
this.idents = [];
this.ididx = [];
this.productions = [];
this.extras = [];
this.getToken();
if (this.token.isSequence) {
const t = this.token.value;
t[0] = 'title';
this.extras.push(this.token.value);
this.getToken();
}
if (this.token.value !== '{') {
return this.invalid('invalid ebnf (missing opening {)');
}
while (true) {
const tokenResult = this.production();
if (tokenResult === '}' || tokenResult === -1) {
break;
}
}
this.getToken();
if (this.token.isSequence) {
const t = this.token.value;
t[0] = 'comment';
this.extras.push(this.token.value);
this.getToken();
}
if (this.token.value !== -1) {
return this.invalid('invalid ebnf (missing eof?)');
}
if (this.err) {
return -1;
}
let k = -1;
for (let i = 0; i < this.ididx.length; i++) {
if (this.ididx[i] === -1) {
k = i;
break;
}
}
if (k !== -1) {
return this.invalid(`invalid ebnf (undefined:${this.idents[k]})`);
}
this.pprint(this.productions, 'productions');
this.pprint(this.idents, 'idents');
this.pprint(this.ididx, 'ididx');
this.pprint(this.extras, 'extras');
return 1;
}
// Adjusts JavaScript's normal printing to look more like Phix output.
pprint(ob, header) {
console.log(`\n${header}:`);
let pp = JSON.stringify(ob);
pp = pp.replace(/\[/g, '{');
pp = pp.replace(/\]/g, '}');
pp = pp.replace(/,/g, ', ');
for (let i = 0; i < this.idents.length; i++) {
pp = pp.replace(new RegExp(`\\b${i}\\b`, 'g'), String(i));
}
console.log(pp);
}
// The rules that applies() has to deal with are:
// [factors] - if rule[0] is not string,
// just apply one after the other recursively.
// ["terminal", "a1"] -- literal constants
// ["or", <e1>, <e2>, ...} -- (any) one of n
// ["repeat", <e1>] -- as per "{}" in ebnf
// ["optional", <e1>] -- as per "[]" in ebnf
// ["ident", <name>, idx] -- apply the sub-rule
applies(rule) {
const wasSdx = this.sdx; // in case of failure
const r1 = rule[0];
if (typeof r1 !== 'string') {
for (let i = 0; i < rule.length; i++) {
if (!this.applies(rule[i])) {
this.sdx = wasSdx;
return false;
}
}
} else if (r1 === 'terminal') {
this.skipSpaces();
const r2 = rule[1];
for (let i = 0; i < r2.length; i++) {
if (this.sdx >= this.src.length || this.src.charAt(this.sdx) !== r2.charAt(i)) {
this.sdx = wasSdx;
return false;
}
this.sdx++;
}
} else if (r1 === 'or') {
for (let i = 1; i < rule.length; i++) {
if (this.applies(rule[i])) {
return true;
}
}
this.sdx = wasSdx;
return false;
} else if (r1 === 'repeat') {
while (this.applies(rule[1])) {
// continue repeating
}
} else if (r1 === 'optional') {
this.applies(rule[1]);
} else if (r1 === 'ident') {
const i = rule[2];
const ii = this.ididx[i];
if (!this.applies(this.productions[ii][2])) {
this.sdx = wasSdx;
return false;
}
} else {
throw new Error('invalid rule in applies() function');
}
return true;
}
checkValid(test) {
this.src = test;
this.sdx = 0;
let res = this.applies(this.productions[0][2]);
this.skipSpaces();
if (this.sdx < this.src.length) {
res = false;
}
console.log(`"${test}", ${this.results[1 - this.btoi(res)]}`);
}
}
// Main execution
const parser = new EBNFParser();
const ebnfs = [
'"a" {\n' +
' a = "a1" ( "a2" | "a3" ) { "a4" } [ "a5" ] "a6" ;\n' +
'} "z" ',
'{\n' +
' expr = term { plus term } .\n' +
' term = factor { times factor } .\n' +
' factor = number | \'(\' expr \')\' .\n' +
' \n' +
' plus = "+" | "-" .\n' +
' times = "*" | "/" .\n' +
' \n' +
' number = digit { digit } .\n' +
' digit = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" .\n' +
'}',
'a = "1"',
'{ a = "1" ;',
'{ hello world = "1"; }',
'{ foo = bar . }'
];
const tests = [
[
'a1a3a4a4a5a6',
'a1 a2a6',
'a1 a3 a4 a6',
'a1 a4 a5 a6',
'a1 a2 a4 a5 a5 a6',
'a1 a2 a4 a5 a6 a7',
'your ad here'
],
[
'2',
'2*3 + 4/23 - 7',
'(3 + 4) * 6-2+(4*(4))',
'-2',
'3 +',
'(4 + 3'
]
];
for (let i = 0; i < ebnfs.length; i++) {
if (parser.parse(ebnfs[i]) === 1) {
console.log('\ntests:');
if (i < tests.length) {
for (const test of tests[i]) {
parser.checkValid(test);
}
}
}
console.log('');
}
- Output:
parse:
"a" {
a = "a1" ( "a2" | "a3" ) { "a4" } [ "a5" ] "a6" ;
} "z" ===>
productions:
{{"a", 0, {{"terminal", "a1"}, {"or", {"terminal", "a2"}, {"terminal", "a3"}}, {"repeat", {"terminal", "a4"}}, {"optional", {"terminal", "a5"}}, {"terminal", "a6"}}}}
idents:
{"a"}
ididx:
{0}
extras:
{{"title", "a"}, {"comment", "z"}}
tests:
"a1a3a4a4a5a6", pass
"a1 a2a6", pass
"a1 a3 a4 a6", pass
"a1 a4 a5 a6", fail
"a1 a2 a4 a5 a5 a6", fail
"a1 a2 a4 a5 a6 a7", fail
"your ad here", fail
parse:
{
expr = term { plus term } .
term = factor { times factor } .
factor = number | '(' expr ')' .
plus = "+" | "-" .
times = "*" | "/" .
number = digit { digit } .
digit = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" .
} ===>
productions:
{{"expr", 0, {{"ident", "term", 1}, {"repeat", {{"ident", "plus", 2}, {"ident", "term", 1}}}}}, {"term", 1, {{"ident", "factor", 3}, {"repeat", {{"ident", "times", 4}, {"ident", "factor", 3}}}}}, {"factor", 3, {"or", {"ident", "number", 5}, {{"terminal", "("}, {"ident", "expr", 0}, {"terminal", ")"}}}}, {"plus", 2, {"or", {"terminal", "+"}, {"terminal", "-"}}}, {"times", 4, {"or", {"terminal", "*"}, {"terminal", "/"}}}, {"number", 5, {{"ident", "digit", 6}, {"repeat", {"ident", "digit", 6}}}}, {"digit", 6, {"or", {"terminal", "0"}, {"terminal", "1"}, {"terminal", "2"}, {"terminal", "3"}, {"terminal", "4"}, {"terminal", "5"}, {"terminal", "6"}, {"terminal", "7"}, {"terminal", "8"}, {"terminal", "9"}}}}
idents:
{"expr", "term", "plus", "factor", "times", "number", "digit"}
ididx:
{0, 1, 3, 2, 4, 5, 6}
extras:
{}
tests:
"2", pass
"2*3 + 4/23 - 7", pass
"(3 + 4) * 6-2+(4*(4))", pass
"-2", fail
"3 +", fail
"(4 + 3", fail
parse:
a = "1" ===>
invalid ebnf (missing opening {)
parse:
{ a = "1" ; ===>
invalid ebnf (missing closing })
parse:
{ hello world = "1"; } ===>
invalid ebnf (= expected)
parse:
{ foo = bar . } ===>
invalid ebnf (undefined:bar)
This solution builds a Grammar struct from an EBNF grammar; the struct holds a matching Regex for each production of that grammar.
Tested with Julia v1.7.2.
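To illustrate the technique before the full Julia listing, here is a minimal hand-written sketch in Python, using the third-party regex package (which, like the PCRE engine behind Julia's Regex, supports (?&name) subroutine calls). It shows roughly the kind of recursive pattern the Grammar constructor produces for the arithmetic-expression grammar; the actual generated pattern uses (?(DEFINE)...) blocks and \Q...\E-quoted literals, so treat this as an approximation, not the Julia code's exact output.
import regex  # third-party "regex" package (pip install regex); supports (?&name) subroutine calls

# Hypothetical hand-written analogue of the generated pattern for:
#   expr = term { ("+"|"-") term } ;  term = factor { ("*"|"/") factor } ;
#   factor = number | "(" expr ")" ;
EXPR = regex.compile(r"""
    \s* (?P<expr>
          (?P<term>
            (?P<factor> \d+ | \( (?&expr) \) )
            ( \s* [*/] \s* (?&factor) )* )
          ( \s* [-+] \s* (?&term) )* )
    \s*""", regex.VERBOSE)

for text in ["2", "2*3 + 4/23 - 7", "(3 + 4) * 6-2+(4*(4))", "-2", "3 +", "(4 + 3"]:
    print(text, "->", "valid" if EXPR.fullmatch(text) else "invalid")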
struct Grammar
regex::Regex
rules::Dict{Symbol,Regex}
root::Regex
function Grammar(regex::Regex)
names = keys(match(regex, ""))
new(regex,
Dict{Symbol,Regex}(
Symbol(name) => Regex(
"$(regex.pattern)(?&$(name))",
regex.compile_options,
regex.match_options
)
for name in names if name isa String),
Regex(
"$(regex.pattern)^\\s*(?&$(first(names)))\\s*\$",
regex.compile_options,
regex.match_options))
end
end
Base.getindex(grammar::Grammar, key) = grammar.rules[key]
Base.occursin(grammar::Grammar, str::AbstractString) = occursin(grammar.root, str)
Base.show(io::IO, grammar::Grammar) = print(io, "Grammar(", grammar.regex, ")")
const ebnfgrammar = Grammar(r"""
(?(DEFINE) # EBNF syntax
(?<syntax>
(?&title)? \s*+ { \s*+ ( (?&production) \s*+ )* } \s*+ (?&comment)? )
(?<production>
(?&identifier) \s*+ = \s*+ (?&expression) \s*+ [.;] )
(?<expression>
(?&term) ( \s*+ \| \s*+ (?&term) )* )
(?<term>
(?&factor) ( \s*+ (?&factor) )* )
(?<factor>
(?&identifier)
| (?&literal)
| \[ \s*+ (?&expression) \s*+ \]
| \( \s*+ (?&expression) \s*+ \)
| { \s*+ (?&expression) \s*+ } )
(?<identifier>
[^'"\s=|(){}[\].;]+ )
(?<title>
(?&literal) )
(?<comment>
(?&literal) )
(?<literal>
' [^']+ '
| " [^"]+ " )
)"""x)
function syntax(grammar::Grammar, str::AbstractString)
if occursin(grammar[:syntax], str)
"""(?(DEFINE)$(join(production(grammar, eachmatch(grammar[:production], str)))))"""
else
throw(ErrorException("Invalid EBNF syntax."))
end
end
production(grammar::Grammar, iter) = (production(grammar, m.match) for m in iter)
function production(grammar::Grammar, str::AbstractString)
str = rstrip(c -> isspace(c) || c == '.' || c == ';', str)
id, expr = split(str, r"\s*=\s*", limit=2)
"(?<$(id)>" * join(
expression(grammar, eachmatch(grammar[:expression], expr)),
"\\s*+|\\s*+") * "\\s*+)"
end
expression(grammar::Grammar, iter) = (expression(grammar, m.match) for m in iter)
function expression(grammar::Grammar, str::AbstractString)
join(term(grammar, eachmatch(grammar[:term], str)), "\\s*+|\\s*+")
end
term(grammar::Grammar, iter) = (term(grammar, m.match) for m in iter)
function term(grammar::Grammar, str::AbstractString)
join(factor(grammar, eachmatch(grammar[:factor], str)), "\\s*+")
end
factor(grammar::Grammar, iter) = (factor(grammar, m.match) for m in iter)
function factor(grammar::Grammar, str::AbstractString)
str = strip(str)
if startswith(str, r"['\"]")
literal(grammar, str)
elseif startswith(str, r"[[({]")
"(\\s*+" * expression(grammar, str[begin+1:end-1]) * (
if startswith(str, '[')
"\\s*+)?"
elseif startswith(str, '{')
"\\s*+)*"
else
"\\s*+)"
end
)
else
"(?&$(str))"
end
end
literal(::Grammar, str::AbstractString) = "\\Q$(str[begin+1:end-1])\\E"
grammar(ebnf::AbstractString) = Grammar(Regex(syntax(ebnfgrammar, ebnf)))
using Test
oneliner = grammar("""
"a" {
a = "a1" ( "a2" | "a3" ) { "a4" } [ "a5" ] "a6" ;
} "z"
""")
@testset "One liner" begin
@test occursin(oneliner, "a1a3a4a4a5a6")
@test occursin(oneliner, "a1 a2a6")
@test occursin(oneliner, "a1 a3 a4 a6")
@test !occursin(oneliner, "a1 a4 a5 a6")
@test !occursin(oneliner, "a1 a2 a4 a5 a5 a6")
@test !occursin(oneliner, "a1 a2 a4 a5 a6 a7")
@test !occursin(oneliner, "your ad here")
end
arithmetic = grammar("""
{
expr = term { plus term } .
term = factor { times factor } .
factor = number | '(' expr ')' .
plus = "+" | "-" .
times = "*" | "/" .
number = digit { digit } .
digit = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" .
}
""")
@testset "Arithmetic expressions" begin
@test occursin(arithmetic, "2")
@test occursin(arithmetic, "2*3 + 4/23 - 7")
@test occursin(arithmetic, "(3 + 4) * 6-2+(4*(4))")
@test !occursin(arithmetic, "-2")
@test !occursin(arithmetic, "3 +")
@test !occursin(arithmetic, "(4 + 3")
end
@testset "Invalid EBNF" begin
@test_throws ErrorException grammar("""a = "1";""")
@test_throws ErrorException grammar("""{ a = "1";""")
@test_throws ErrorException grammar("""{ hello world = "1"; }""")
@test_throws ErrorException grammar("{ foo = bar . }")
end
class MainKt {
// Data classes for better structure
private data class Token(val value: Any, val isSequence: Boolean)
private class Sequence(vararg items: Any) : ArrayList<Any>() {
init {
addAll(items)
}
}
private var src: String = ""
private var ch: Char = ' '
private var sdx: Int = 0
private var token: Token = Token(-1, false)
private var err: Boolean = false
private val idents = mutableListOf<String>()
private val ididx = mutableListOf<Int>()
private val productions = mutableListOf<Sequence>()
private val extras = Sequence()
private val results = arrayOf("pass", "fail")
private fun Boolean.toInt() = if (this) 1 else 0
private fun invalid(msg: String): Int {
err = true
println(msg)
sdx = src.length // set to eof
return -1
}
private fun skipSpaces() {
while (sdx < src.length) {
ch = src[sdx]
if (ch !in " \t\r\n") {
break
}
sdx++
}
}
private fun getToken() {
// Yields a single character token, one of {}()[]|=.;
// or {"terminal",string} or {"ident", string} or -1.
skipSpaces()
if (sdx >= src.length) {
token = Token(-1, false)
return
}
val tokstart = sdx
when {
ch in "{}()[]|=.;" -> {
sdx++
token = Token(ch, false)
}
ch == '"' || ch == '\'' -> {
val closech = ch
var tokend = tokstart + 1
while (tokend < src.length && src[tokend] != closech) {
tokend++
}
if (tokend >= src.length) {
token = Token(invalid("no closing quote"), false)
} else {
sdx = tokend + 1
token = Token(Sequence("terminal", src.substring(tokstart + 1, tokend)), true)
}
}
ch in 'a'..'z' -> {
// To simplify things for the purposes of this task,
// identifiers are strictly a-z only, not A-Z or 1-9.
while (sdx < src.length && ch in 'a'..'z') {
sdx++
if (sdx < src.length) {
ch = src[sdx]
}
}
token = Token(Sequence("ident", src.substring(tokstart, sdx)), true)
}
else -> {
token = Token(invalid("invalid ebnf"), false)
}
}
}
private fun matchToken(expectedCh: Char) {
if (token.value != expectedCh) {
token = Token(invalid("invalid ebnf ($expectedCh expected)"), false)
} else {
getToken()
}
}
private fun addIdent(ident: String): Int {
var k = idents.indexOf(ident)
if (k == -1) {
idents.add(ident)
k = idents.size - 1
ididx.add(-1)
}
return k
}
private fun factor(): Any {
val res: Any = when {
token.isSequence -> {
val t = token.value as Sequence
if (t[0] == "ident") {
val idx = addIdent(t[1] as String)
t.add(idx)
token = Token(t, true)
}
val result = token.value
getToken()
result
}
token.value == '[' -> {
getToken()
val result = Sequence("optional", expression())
matchToken(']')
result
}
token.value == '(' -> {
getToken()
val result = expression()
matchToken(')')
result
}
token.value == '{' -> {
getToken()
val result = Sequence("repeat", expression())
matchToken('}')
result
}
else -> throw RuntimeException("invalid token in factor() function")
}
return if (res is Sequence && res.size == 1) res[0] else res
}
private fun term(): Any {
val res = Sequence(factor())
val tokens = setOf(-1, '|', '.', ';', ')', ']', '}')
while (token.value !in tokens) {
res.add(factor())
}
return if (res.size == 1) res[0] else res
}
private fun expression(): Any {
var res = Sequence(term())
if (token.value == '|') {
res = Sequence("or", res[0])
while (token.value == '|') {
getToken()
res.add(term())
}
}
return if (res.size == 1) res[0] else res
}
private fun production(): Any {
// Returns a token or -1; the real result is left in 'productions' etc.
getToken()
if (token.value != '}') {
if (token.value == -1) {
return invalid("invalid ebnf (missing closing })")
}
if (!token.isSequence) {
return -1
}
val t = token.value as Sequence
if (t[0] != "ident") {
return -1
}
val ident = t[1] as String
val idx = addIdent(ident)
getToken()
matchToken('=')
if (token.value == -1) {
return -1
}
productions.add(Sequence(ident, idx, expression()))
ididx[idx] = productions.size - 1
}
return token.value
}
private fun parse(ebnf: String): Int {
// Returns +1 if ok, -1 if bad.
println("parse:\n$ebnf ===>\n")
err = false
src = ebnf
sdx = 0
idents.clear()
ididx.clear()
productions.clear()
extras.clear()
getToken()
if (token.isSequence) {
val t = token.value as Sequence
t[0] = "title"
extras.add(token.value)
getToken()
}
if (token.value != '{') {
return invalid("invalid ebnf (missing opening {)")
}
while (true) {
val tokenResult = production()
if (tokenResult == '}' || tokenResult == -1) {
break
}
}
getToken()
if (token.isSequence) {
val t = token.value as Sequence
t[0] = "comment"
extras.add(token.value)
getToken()
}
if (token.value != -1) {
return invalid("invalid ebnf (missing eof?)")
}
if (err) {
return -1
}
val k = ididx.indexOfFirst { it == -1 }
if (k != -1) {
return invalid("invalid ebnf (undefined:${idents[k]})")
}
pprint(productions, "productions")
pprint(idents, "idents")
pprint(ididx, "ididx")
pprint(extras, "extras")
return 1
}
// Adjusts Kotlin's normal printing to look more like Phix output.
private fun pprint(ob: Any, header: String) {
println("\n$header:")
var pp = ob.toString()
pp = pp.replace("[", "{")
pp = pp.replace("]", "}")
pp = pp.replace(" ", ", ")
for (i in idents.indices) {
pp = pp.replace(i.toString(), i.toString())
}
println(pp)
}
// The rules that applies() has to deal with are:
// {factors} - if rule[0] is not string,
// just apply one after the other recursively.
// {"terminal", "a1"} -- literal constants
// {"or", <e1>, <e2>, ...} -- (any) one of n
// {"repeat", <e1>} -- as per "{}" in ebnf
// {"optional", <e1>} -- as per "[]" in ebnf
// {"ident", <name>, idx} -- apply the sub-rule
private fun applies(rule: Sequence): Boolean {
val wasSdx = sdx // in case of failure
val r1 = rule[0]
when {
r1 !is String -> {
for (i in rule.indices) {
if (!applies(rule[i] as Sequence)) {
sdx = wasSdx
return false
}
}
}
r1 == "terminal" -> {
skipSpaces()
val r2 = rule[1] as String
for (i in r2.indices) {
if (sdx >= src.length || src[sdx] != r2[i]) {
sdx = wasSdx
return false
}
sdx++
}
}
r1 == "or" -> {
for (i in 1 until rule.size) {
if (applies(rule[i] as Sequence)) {
return true
}
}
sdx = wasSdx
return false
}
r1 == "repeat" -> {
while (applies(rule[1] as Sequence)) {
// continue repeating
}
}
r1 == "optional" -> {
applies(rule[1] as Sequence)
}
r1 == "ident" -> {
val i = rule[2] as Int
val ii = ididx[i]
if (!applies(productions[ii][2] as Sequence)) {
sdx = wasSdx
return false
}
}
else -> throw RuntimeException("invalid rule in applies() function")
}
return true
}
private fun checkValid(test: String) {
src = test
sdx = 0
var res = applies(productions[0][2] as Sequence)
skipSpaces()
if (sdx < src.length) {
res = false
}
println("\"$test\", ${results[1 - res.toInt()]}")
}
companion object {
@JvmStatic
fun main(args: Array<String>) {
val parser = MainKt()
val ebnfs = arrayOf(
"\"a\" {\n" +
" a = \"a1\" ( \"a2\" | \"a3\" ) { \"a4\" } [ \"a5\" ] \"a6\" ;\n" +
"} \"z\" ",
"{\n" +
" expr = term { plus term } .\n" +
" term = factor { times factor } .\n" +
" factor = number | '(' expr ')' .\n" +
" \n" +
" plus = \"+\" | \"-\" .\n" +
" times = \"*\" | \"/\" .\n" +
" \n" +
" number = digit { digit } .\n" +
" digit = \"0\" | \"1\" | \"2\" | \"3\" | \"4\" | \"5\" | \"6\" | \"7\" | \"8\" | \"9\" .\n" +
"}",
"a = \"1\"",
"{ a = \"1\" ;",
"{ hello world = \"1\"; }",
"{ foo = bar . }"
)
val tests = arrayOf(
arrayOf(
"a1a3a4a4a5a6",
"a1 a2a6",
"a1 a3 a4 a6",
"a1 a4 a5 a6",
"a1 a2 a4 a5 a5 a6",
"a1 a2 a4 a5 a6 a7",
"your ad here"
),
arrayOf(
"2",
"2*3 + 4/23 - 7",
"(3 + 4) * 6-2+(4*(4))",
"-2",
"3 +",
"(4 + 3"
)
)
for (i in ebnfs.indices) {
if (parser.parse(ebnfs[i]) == 1) {
println("\ntests:")
if (i < tests.size) {
for (test in tests[i]) {
parser.checkValid(test)
}
}
}
println()
}
}
}
}
- Output:
parse:
"a" {
a = "a1" ( "a2" | "a3" ) { "a4" } [ "a5" ] "a6" ;
} "z" ===>
productions:
{{a, 0, {{terminal, a1}, {or, {terminal, a2}, {terminal, a3}}, {repeat, {terminal, a4}}, {optional, {terminal, a5}}, {terminal, a6}}}}
idents:
{a}
ididx:
{0}
extras:
{{title, a}, {comment, z}}
tests:
"a1a3a4a4a5a6", pass
"a1 a2a6", pass
"a1 a3 a4 a6", pass
"a1 a4 a5 a6", fail
"a1 a2 a4 a5 a5 a6", fail
"a1 a2 a4 a5 a6 a7", fail
"your ad here", fail
parse:
{
expr = term { plus term } .
term = factor { times factor } .
factor = number | '(' expr ')' .
plus = "+" | "-" .
times = "*" | "/" .
number = digit { digit } .
digit = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" .
} ===>
productions:
{{expr, 0, {{ident, term, 1}, {repeat, {{ident, plus, 2}, {ident, term, 1}}}}}, {term, 1, {{ident, factor, 3}, {repeat, {{ident, times, 4}, {ident, factor, 3}}}}}, {factor, 3, {or, {ident, number, 5}, {{terminal, (}, {ident, expr, 0}, {terminal, )}}}}, {plus, 2, {or, {terminal, +}, {terminal, -}}}, {times, 4, {or, {terminal, *}, {terminal, /}}}, {number, 5, {{ident, digit, 6}, {repeat, {ident, digit, 6}}}}, {digit, 6, {or, {terminal, 0}, {terminal, 1}, {terminal, 2}, {terminal, 3}, {terminal, 4}, {terminal, 5}, {terminal, 6}, {terminal, 7}, {terminal, 8}, {terminal, 9}}}}
idents:
{expr, term, plus, factor, times, number, digit}
ididx:
{0, 1, 3, 2, 4, 5, 6}
extras:
{}
tests:
"2", pass
"2*3 + 4/23 - 7", pass
"(3 + 4) * 6-2+(4*(4))", pass
"-2", fail
"3 +", fail
"(4 + 3", fail
parse:
a = "1" ===>
invalid ebnf (missing opening {)
parse:
{ a = "1" ; ===>
invalid ebnf (missing closing })
parse:
{ hello world = "1"; } ===>
invalid ebnf (= expected)
parse:
{ foo = bar . } ===>
invalid ebnf (undefined:bar)
-- EBNF Parser in Lua
local EBNFParser = {}
EBNFParser.__index = EBNFParser
function EBNFParser:new()
local self = setmetatable({}, EBNFParser)
self.src = ""
self.ch = ""
self.sdx = 1 -- Lua uses 1-based indexing
self.token = nil
self.err = false
self.idents = {}
self.ididx = {}
self.productions = {}
self.extras = {}
self.results = {"pass", "fail"}
return self
end
function EBNFParser:btoi(b)
return b and 1 or 0
end
function EBNFParser:invalid(msg)
self.err = true
print(msg)
self.sdx = #self.src + 1 -- set to eof
return -1
end
function EBNFParser:skipSpaces()
while self.sdx <= #self.src do
self.ch = self.src:sub(self.sdx, self.sdx)
if not self.ch:match("[ \t\r\n]") then
break
end
self.sdx = self.sdx + 1
end
end
function EBNFParser:getToken()
self:skipSpaces()
if self.sdx > #self.src then
self.token = {value = -1, isSequence = false}
return
end
local tokstart = self.sdx
if self.ch:match("[{}()%[%]|=.;]") then
self.sdx = self.sdx + 1
self.token = {value = self.ch, isSequence = false}
elseif self.ch == '"' or self.ch == "'" then
local closech = self.ch
local tokend = tokstart + 1
while tokend <= #self.src and self.src:sub(tokend, tokend) ~= closech do
tokend = tokend + 1
end
if tokend > #self.src then
self.token = {value = self:invalid("no closing quote"), isSequence = false}
else
self.sdx = tokend + 1
self.token = {
value = {"terminal", self.src:sub(tokstart + 1, tokend - 1)},
isSequence = true
}
end
elseif self.ch:match("[a-z]") then
-- Identifiers are strictly a-z only
while self.sdx <= #self.src and self.ch:match("[a-z]") do
self.sdx = self.sdx + 1
if self.sdx <= #self.src then
self.ch = self.src:sub(self.sdx, self.sdx)
end
end
self.token = {
value = {"ident", self.src:sub(tokstart, self.sdx - 1)},
isSequence = true
}
else
self.token = {value = self:invalid("invalid ebnf"), isSequence = false}
end
end
function EBNFParser:matchToken(expectedCh)
if self.token.value ~= expectedCh then
self.token = {
value = self:invalid("invalid ebnf (" .. expectedCh .. " expected)"),
isSequence = false
}
else
self:getToken()
end
end
function EBNFParser:addIdent(ident)
for i, id in ipairs(self.idents) do
if id == ident then
return i
end
end
table.insert(self.idents, ident)
local k = #self.idents
table.insert(self.ididx, -1)
return k
end
function EBNFParser:factor()
local res
if self.token.isSequence then
local t = self.token.value
if t[1] == "ident" then
local idx = self:addIdent(t[2])
table.insert(t, idx)
self.token.value = t
end
res = self.token.value
self:getToken()
elseif self.token.value == "[" then
self:getToken()
res = {"optional", self:expression()}
self:matchToken("]")
elseif self.token.value == "(" then
self:getToken()
res = self:expression()
self:matchToken(")")
elseif self.token.value == "{" then
self:getToken()
res = {"repeat", self:expression()}
self:matchToken("}")
else
error("invalid token in factor() function")
end
if type(res) == "table" and #res == 1 then
return res[1]
end
return res
end
function EBNFParser:term()
local res = {self:factor()}
local tokens = {-1, "|", ".", ";", ")", "]", "}"}
while true do
local found = false
for _, token in ipairs(tokens) do
if self.token.value == token then
found = true
break
end
end
if found then
break
end
table.insert(res, self:factor())
end
if #res == 1 then
return res[1]
end
return res
end
function EBNFParser:expression()
local res = {self:term()}
if self.token.value == "|" then
res = {"or", res[1]}
while self.token.value == "|" do
self:getToken()
table.insert(res, self:term())
end
end
if #res == 1 then
return res[1]
end
return res
end
function EBNFParser:production()
self:getToken()
if self.token.value ~= "}" then
if self.token.value == -1 then
return self:invalid("invalid ebnf (missing closing })")
end
if not self.token.isSequence then
return -1
end
local t = self.token.value
if t[1] ~= "ident" then
return -1
end
local ident = t[2]
local idx = self:addIdent(ident)
self:getToken()
self:matchToken("=")
if self.token.value == -1 then
return -1
end
table.insert(self.productions, {ident, idx, self:expression()})
self.ididx[idx] = #self.productions
end
return self.token.value
end
function EBNFParser:parse(ebnf)
print("parse:\n" .. ebnf .. " ===>")
self.err = false
self.src = ebnf
self.sdx = 1
self.idents = {}
self.ididx = {}
self.productions = {}
self.extras = {}
self:getToken()
if self.token.isSequence then
local t = self.token.value
t[1] = "title"
table.insert(self.extras, self.token.value)
self:getToken()
end
if self.token.value ~= "{" then
return self:invalid("invalid ebnf (missing opening {)")
end
while true do
local tokenResult = self:production()
if tokenResult == "}" or tokenResult == -1 then
break
end
end
self:getToken()
if self.token.isSequence then
local t = self.token.value
t[1] = "comment"
table.insert(self.extras, self.token.value)
self:getToken()
end
if self.token.value ~= -1 then
return self:invalid("invalid ebnf (missing eof?)")
end
if self.err then
return -1
end
local k = -1
for i = 1, #self.ididx do
if self.ididx[i] == -1 then
k = i
break
end
end
if k ~= -1 then
return self:invalid("invalid ebnf (undefined:" .. self.idents[k] .. ")")
end
self:pprint(self.productions, "productions")
self:pprint(self.idents, "idents")
self:pprint(self.ididx, "ididx")
self:pprint(self.extras, "extras")
return 1
end
-- Simple JSON-like serialization for Lua tables
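-- For example, serialize({"terminal", "a1"}) yields: {"terminal", "a1"}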
local function serialize(t, depth)
depth = depth or 0
if type(t) ~= "table" then
if type(t) == "string" then
return '"' .. t .. '"'
else
return tostring(t)
end
end
local result = "{"
local first = true
for i, v in ipairs(t) do
if not first then
result = result .. ", "
end
result = result .. serialize(v, depth + 1)
first = false
end
result = result .. "}"
return result
end
function EBNFParser:pprint(ob, header)
print("\n" .. header .. ":")
local pp = serialize(ob)
print(pp)
end
function EBNFParser:applies(rule)
local wasSdx = self.sdx -- in case of failure
local r1 = rule[1]
if type(r1) ~= "string" then
for i = 1, #rule do
if not self:applies(rule[i]) then
self.sdx = wasSdx
return false
end
end
elseif r1 == "terminal" then
self:skipSpaces()
local r2 = rule[2]
for i = 1, #r2 do
if self.sdx > #self.src or self.src:sub(self.sdx, self.sdx) ~= r2:sub(i, i) then
self.sdx = wasSdx
return false
end
self.sdx = self.sdx + 1
end
elseif r1 == "or" then
for i = 2, #rule do
if self:applies(rule[i]) then
return true
end
end
self.sdx = wasSdx
return false
elseif r1 == "repeat" then
while self:applies(rule[2]) do
-- continue repeating
end
elseif r1 == "optional" then
self:applies(rule[2])
elseif r1 == "ident" then
local i = rule[3]
local ii = self.ididx[i]
if not self:applies(self.productions[ii][3]) then
self.sdx = wasSdx
return false
end
else
error("invalid rule in applies() function")
end
return true
end
function EBNFParser:checkValid(test)
self.src = test
self.sdx = 1
local res = self:applies(self.productions[1][3])
self:skipSpaces()
if self.sdx <= #self.src then
res = false
end
print('"' .. test .. '", ' .. self.results[1 + self:btoi(not res)])
end
-- Main execution
local parser = EBNFParser:new()
local ebnfs = {
'"a" {\n' ..
' a = "a1" ( "a2" | "a3" ) { "a4" } [ "a5" ] "a6" ;\n' ..
'} "z" ',
'{\n' ..
' expr = term { plus term } .\n' ..
' term = factor { times factor } .\n' ..
' factor = number | \'(\' expr \')\' .\n' ..
' \n' ..
' plus = "+" | "-" .\n' ..
' times = "*" | "/" .\n' ..
' \n' ..
' number = digit { digit } .\n' ..
' digit = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" .\n' ..
'}',
'a = "1"',
'{ a = "1" ;',
'{ hello world = "1"; }',
'{ foo = bar . }'
}
local tests = {
{
'a1a3a4a4a5a6',
'a1 a2a6',
'a1 a3 a4 a6',
'a1 a4 a5 a6',
'a1 a2 a4 a5 a5 a6',
'a1 a2 a4 a5 a6 a7',
'your ad here'
},
{
'2',
'2*3 + 4/23 - 7',
'(3 + 4) * 6-2+(4*(4))',
'-2',
'3 +',
'(4 + 3'
}
}
for i = 1, #ebnfs do
if parser:parse(ebnfs[i]) == 1 then
print('\ntests:')
if i <= #tests then
for _, test in ipairs(tests[i]) do
parser:checkValid(test)
end
end
end
print('')
end
- Output:
parse:
"a" {
a = "a1" ( "a2" | "a3" ) { "a4" } [ "a5" ] "a6" ;
} "z" ===>
productions:
{{"a", 1, {{"terminal", "a1"}, {"or", {"terminal", "a2"}, {"terminal", "a3"}}, {"repeat", {"terminal", "a4"}}, {"optional", {"terminal", "a5"}}, {"terminal", "a6"}}}}
idents:
{"a"}
ididx:
{1}
extras:
{{"title", "a"}, {"comment", "z"}}
tests:
"a1a3a4a4a5a6", pass
"a1 a2a6", pass
"a1 a3 a4 a6", pass
"a1 a4 a5 a6", fail
"a1 a2 a4 a5 a5 a6", fail
"a1 a2 a4 a5 a6 a7", fail
"your ad here", fail
parse:
{
expr = term { plus term } .
term = factor { times factor } .
factor = number | '(' expr ')' .
plus = "+" | "-" .
times = "*" | "/" .
number = digit { digit } .
digit = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" .
} ===>
productions:
{{"expr", 1, {{"ident", "term", 2}, {"repeat", {{"ident", "plus", 3}, {"ident", "term", 2}}}}}, {"term", 2, {{"ident", "factor", 4}, {"repeat", {{"ident", "times", 5}, {"ident", "factor", 4}}}}}, {"factor", 4, {"or", {"ident", "number", 6}, {{"terminal", "("}, {"ident", "expr", 1}, {"terminal", ")"}}}}, {"plus", 3, {"or", {"terminal", "+"}, {"terminal", "-"}}}, {"times", 5, {"or", {"terminal", "*"}, {"terminal", "/"}}}, {"number", 6, {{"ident", "digit", 7}, {"repeat", {"ident", "digit", 7}}}}, {"digit", 7, {"or", {"terminal", "0"}, {"terminal", "1"}, {"terminal", "2"}, {"terminal", "3"}, {"terminal", "4"}, {"terminal", "5"}, {"terminal", "6"}, {"terminal", "7"}, {"terminal", "8"}, {"terminal", "9"}}}}
idents:
{"expr", "term", "plus", "factor", "times", "number", "digit"}
ididx:
{1, 2, 4, 3, 5, 6, 7}
extras:
{}
tests:
"2", pass
"2*3 + 4/23 - 7", pass
"(3 + 4) * 6-2+(4*(4))", pass
"-2", fail
"3 +", fail
"(4 + 3", fail
parse:
a = "1" ===>
invalid ebnf (missing opening {)
parse:
{ a = "1" ; ===>
invalid ebnf (missing closing })
parse:
{ hello world = "1"; } ===>
invalid ebnf (= expected)
parse:
{ foo = bar . } ===>
invalid ebnf (undefined:bar)
MODULE EBNF;
FROM ASCII IMPORT EOL;
FROM InOut IMPORT Done, Read, Write, WriteLn, WriteInt, WriteString;
FROM EBNFScanner IMPORT Symbol, sym, id, Ino, GetSym, MarkError, SkipLine;
FROM TableHandler IMPORT WordLength, Table, overflow, InitTable, Record, Tabulate;
VAR T0, T1 : Table;
PROCEDURE skip (n : INTEGER);
BEGIN
MarkError (n);
WHILE (sym < lpar) OR (sym > period) DO GetSym END
END skip;
PROCEDURE Expression;
PROCEDURE Term;
PROCEDURE Factor;
BEGIN
IF sym = ident THEN
Record (T0, id, Ino);
GetSym
ELSIF sym = literal THEN
Record (T1, id, Ino);
GetSym
ELSIF sym = lpar THEN
GetSym;
Expression;
IF sym = rpar THEN GetSym ELSE skip (2) END
ELSIF sym = lbk THEN
GetSym;
Expression;
IF sym = rbk THEN GetSym ELSE skip (3) END
ELSIF sym = lbr THEN
GetSym;
Expression;
IF sym = rbr THEN GetSym ELSE skip (4) END
ELSE
skip (5)
END
END Factor;
BEGIN
Factor;
WHILE sym < bar DO Factor END
END Term;
BEGIN
Term;
WHILE sym = bar DO
GetSym;
Term
END
END Expression;
PROCEDURE Production;
BEGIN
Record (T0, id, - INTEGER (Ino));
GetSym;
IF sym = eql THEN GetSym ELSE skip (7) END;
Expression;
IF sym # period THEN
MarkError (8);
SkipLine
END;
GetSym
END Production;
BEGIN
InitTable (T0);
InitTable (T1);
GetSym;
WHILE (sym = ident) AND (overflow = 0) DO Production END;
IF overflow > 0 THEN
WriteLn;
WriteString ("Table overflow");
WriteInt (overflow, 6);
END;
Write (35C);
Tabulate (T0);
Tabulate (T1);
END EBNF.
And here is the source for the EBNF scanner. I hope you like nested procedures.
IMPLEMENTATION MODULE EBNFScanner;
FROM ASCII IMPORT LF;
FROM InOut IMPORT Read, Write, WriteLn, WriteInt, WriteBf, EOF;
VAR ch : CHAR;
MODULE LineHandler;
IMPORT LF, EOF, ch, Ino, Read, Write, WriteLn, WriteInt, WriteBf;
EXPORT GetCh, MarkError, SkipLine;
CONST LineWidth = 100;
VAR cc : INTEGER;
cc1 : INTEGER;
cc2 : INTEGER;
line : ARRAY [0..LineWidth - 1] OF CHAR;
PROCEDURE GetLine;
BEGIN
IF cc2 > 0 THEN
WriteLn;
cc2 := 0
END;
Read (ch);
IF EOF () THEN
line [0] := 177C;
cc1 := 1
ELSE
INC (Ino);
WriteInt (Ino, 5);
Write (' ');
cc1 := 0;
LOOP
Write (ch);
line [cc1] := ch;
INC (cc1);
IF ch = LF THEN EXIT END;
Read (ch)
END
END
END GetLine;
PROCEDURE GetCh;
BEGIN
WHILE cc = cc1 DO
cc := 0;
GetLine
END;
ch := line [cc];
INC (cc)
END GetCh;
PROCEDURE MarkError (n : INTEGER);
BEGIN
IF cc2 = 0 THEN
Write ('*');
cc2 := 3;
REPEAT
Write (' ');
DEC (cc2)
UNTIL cc2 = 0;
END;
WHILE cc2 < cc DO
Write (' ');
INC (cc2)
END;
Write ('^');
WriteInt (n, 1);
INC (cc2, 2)
END MarkError;
PROCEDURE SkipLine;
BEGIN
WHILE ch # LF DO GetCh END;
GetCh
END SkipLine;
BEGIN (* BEGIN of LineHandler *)
cc := 0;
cc1 := 0;
cc2 := 0
END LineHandler;
PROCEDURE GetSym;
VAR i : CARDINAL;
BEGIN
WHILE ch <= ' ' DO GetCh END;
IF ch = '/' THEN
SkipLine;
WHILE ch <= ' ' DO GetCh END
END;
IF (CAP (ch) <= 'Z') AND (CAP (ch) >= 'A') THEN
i := 0;
sym := literal;
REPEAT
IF i < IdLength THEN
id [i] := ch;
INC (i)
END;
IF ch > 'Z' THEN sym := ident END;
GetCh
UNTIL (CAP (ch) < 'A') OR (CAP (ch) > 'Z');
id [i] := ' '
ELSIF ch = "'" THEN
i := 0;
GetCh;
sym := literal;
WHILE ch # "'" DO
IF i < IdLength THEN
id [i] := ch;
INC (i)
END;
GetCh
END;
GetCh;
id [i] := ' '
ELSIF ch = '"' THEN
i := 0;
GetCh;
sym := literal;
WHILE ch # '"' DO
IF i < IdLength THEN
id [i] := ch;
INC (i)
END;
GetCh
END;
GetCh;
id [i] := ' '
ELSIF ch = '=' THEN sym := eql; GetCh
ELSIF ch = '(' THEN sym := lpar; GetCh
ELSIF ch = ')' THEN sym := rpar; GetCh
ELSIF ch = '[' THEN sym := lbk; GetCh
ELSIF ch = ']' THEN sym := rbk; GetCh
ELSIF ch = '{' THEN sym := lbr; GetCh
ELSIF ch = '}' THEN sym := rbr; GetCh
ELSIF ch = '|' THEN sym := bar; GetCh
ELSIF ch = '.' THEN sym := period; GetCh
ELSIF ch = 177C THEN sym := other; GetCh
ELSE
sym := other;
GetCh
END
END GetSym;
BEGIN
Ino := 0;
ch := ' '
END EBNFScanner.
Version 1
#!/usr/bin/perl
use strict; # http://www.rosettacode.org/wiki/Parse_EBNF
use warnings;
$SIG{__WARN__} = sub { print "\nWARN: @_\n"; exit };
my $h = qr/\G\s*/;
my $identifier = qr/$h([a-z]\w*)\b/i;
my $literal = qr/$h(['"])(.+?)\1/s;
my ($title, $comment, %productions, %called, $startsymbol, $show, $errpos);
sub node { bless [ @_[1..$#_] ], $_[0] }
sub err { die "ERROR: ", s/\G\s*\K/<**ERROR @_**>/r, "\n" }
sub want { /$h\Q$_[1]\E/gc ? shift : err "missing '$_[1]' " }
sub addin { node $_[0] => ref $_[1] eq $_[0] ? @{$_[1]} : $_[1], pop }
for my $case ( split /^-{50}.*\n/m, do { local $/; @ARGV ? <> : <DATA> } )
{
$show = $case =~ s/^#show.*//m;
my ($ebnf, $tests) = map s/^#.*\n//gmr, split /^#test.*\n/m, $case, 2;
parse( $ebnf, ($tests // "") =~ /.*\n/g );
}
sub parse
{
eval
{
(%productions, %called, $startsymbol) = ();
local $_ = shift // 'empty ebnf source string';
print '-' x 75, "\n", s/\s*\z/\n\n/r;
syntax(); ################################################## parse the EBNF
print " title: $title\n comment: $comment\n";
$startsymbol or err "no productions";
print "start symbol: $startsymbol\n";
for my $key ( sort keys %productions )
{
$show and print "\n$key =\n", $productions{$key}->show =~ s/^/ /gmr;
}
delete @called{keys %productions};
%called and die "\nERROR: undefined production(s) <@{[ keys %called]}>\n";
for ( @_ ) ###################################################### run tests
{
$errpos = undef;
print "\ntry: $_";
print eval
{
$productions{$startsymbol}->run or pos($_) = $errpos, err; ### run tree
/$h\n/gc or err 'incomplete parse';
} ? "valid\n" : $@;
}
1;
} or print "$@\n";
}
sub syntax ############################################## subs for parsing EBNF
{
$title = /$literal/gc ? $2 : 'none';
/$h\{/gc or err 'missing open brace';
while( /$identifier\s*=/gc ) # is this the start of a production
{
my $name = $1;
$startsymbol //= $name;
my $tree = expr(0);
$productions{$name} =
$productions{$name} ? addin ALT => $productions{$name}, $tree : $tree;
/$h[.;]/gc or err 'missing production terminator';
}
/$h\}/gc or err 'missing close brace';
$comment = /$literal/gc ? $2 : 'none';
/$h\z/gc or err 'extra characters after parse';
}
sub expr
{
my $tree =
/$identifier/gc ? do { $called{$1}++; node PROD => $1 } :
/$literal/gc ? node LIT => $2 :
/$h\[/gc ? node OPTION => want expr(0), ']' :
/$h\{/gc ? node REPEAT => want expr(0), '}' :
/$h\(/gc ? want expr(0), ')' :
err 'Invalid expression';
$tree =
/\G\s+/gc ? $tree :
$_[0] <= 1 && /\G(?=[[{('"a-z])/gci ? addin SEQ => $tree, expr(2) :
$_[0] <= 0 && /\G\|/gc ? addin ALT => $tree, expr(1) :
return $tree while 1;
}
################################################### run parse tree
sub LIT::run { /$h\Q$_[0][0]\E/gc }
sub SEQ::run
{
my $pos = pos($_) // 0;
for my $node ( @{ $_[0] } )
{
$node->run or $errpos = pos($_), pos($_) = $pos, return 0;
}
return 1;
}
sub OPTION::run
{
my $pos = pos($_) // 0;
$_[0][0]->run or pos($_) = $pos, return 1;
}
sub PROD::run
{
$productions{ $_[0][0] }->run;
}
sub REPEAT::run
{
my $pos = pos($_) // 0;
$pos = pos($_) while $_[0][0]->run;
pos($_) = $pos;
return 1;
}
sub ALT::run
{
my $pos = pos($_) // 0;
for my $node ( @{ $_[0] } )
{
eval{ $node->run } and return 1;
pos($_) = $pos;
}
return 0;
}
sub LIT::show { "LIT $_[0][0]\n" } ###################### for nested tree print
sub PROD::show { "PROD $_[0][0]\n" }
sub UNIVERSAL::show
{
join '', ref $_[0], "\n", map $_->show =~ s/^/ /gmr, @{ $_[0] };
}
__DATA__
"a" {
a = "a1" ( "a2" | "a3" ) { "a4" } [ "a5" ] "a6" ;
} "z"
#show
#tests
a1a3a4a4a5a6
a1 a2a6
a1 a3 a4 a6
a1 a4 a5 a6
a1 a2 a4 a5 a5 a6
a1 a2 a4 a5 a6 a7
your ad here
----------------------------------------------------------------------
"Arithmetic expressions" {
expr = term { plus term } .
term = factor { times factor } .
factor = number | '(' expr ')' .
plus = "+" | "-" .
times = "*" | "/" .
number = digit { digit } .
digit = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" .
} 'http://www.rosettacode.org/wiki/Parse_EBNF'
#tests
2
2*3 + 4/23 - 7
(3 + 4) * 6-2+(4*(4))
-2
3 +
(4 + 3
----------------------------------------------------------------------
'some invalid EBNF' { a = "1" ;
----------------------------------------------------------------------
a = "1";
----------------------------------------------------------------------
{ hello world = "1"; }
----------------------------------------------------------------------
{ foo = bar . } "undefined production check"
----------------------------------------------------------------------
- Output:
---------------------------------------------------------------------------
"a" {
a = "a1" ( "a2" | "a3" ) { "a4" } [ "a5" ] "a6" ;
} "z"
title: a
comment: z
start symbol: a
a =
SEQ
LIT a1
ALT
LIT a2
LIT a3
REPEAT
LIT a4
OPTION
LIT a5
LIT a6
try: a1a3a4a4a5a6
valid
try: a1 a2a6
valid
try: a1 a3 a4 a6
valid
try: a1 a4 a5 a6
ERROR: a1 <**ERROR **>a4 a5 a6
try: a1 a2 a4 a5 a5 a6
ERROR: a1 a2 a4 a5 <**ERROR **>a5 a6
try: a1 a2 a4 a5 a6 a7
ERROR: a1 a2 a4 a5 a6 <**ERROR incomplete parse**>a7
try: your ad here
ERROR: <**ERROR **>your ad here
---------------------------------------------------------------------------
"Arithmetic expressions" {
expr = term { plus term } .
term = factor { times factor } .
factor = number | '(' expr ')' .
plus = "+" | "-" .
times = "*" | "/" .
number = digit { digit } .
digit = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" .
} 'http://www.rosettacode.org/wiki/Parse_EBNF'
title: Arithmetic expressions
comment: http://www.rosettacode.org/wiki/Parse_EBNF
start symbol: expr
try: 2
valid
try: 2*3 + 4/23 - 7
valid
try: (3 + 4) * 6-2+(4*(4))
valid
try: -2
ERROR: <**ERROR **>-2
try: 3 +
ERROR: 3 <**ERROR incomplete parse**>+
try: (4 + 3
ERROR: <**ERROR **>(4 + 3
---------------------------------------------------------------------------
'some invalid EBNF' { a = "1" ;
ERROR: 'some invalid EBNF' { a = "1" ;
<**ERROR missing close brace**>
---------------------------------------------------------------------------
a = "1";
ERROR: <**ERROR missing open brace**>a = "1";
---------------------------------------------------------------------------
{ hello world = "1"; }
ERROR: { <**ERROR missing close brace**>hello world = "1"; }
---------------------------------------------------------------------------
{ foo = bar . } "undefined production check"
title: none
comment: undefined production check
start symbol: foo
ERROR: undefined production(s) <bar>
Version 2
#!/usr/bin/env perl
use strict;
use warnings;
use Data::Dumper;
package EBNFParser;
use Data::Dumper;
sub new {
my $class = shift;
my $self = {
src => '',
ch => '',
sdx => 0,
token => undef,
err => 0,
idents => [],
ididx => [],
productions => [],
extras => [],
results => ["pass", "fail"]
};
bless $self, $class;
return $self;
}
sub btoi {
my ($self, $b) = @_;
return $b ? 1 : 0;
}
sub invalid {
my ($self, $msg) = @_;
$self->{err} = 1;
print "$msg\n";
$self->{sdx} = length($self->{src}); # set to eof
return -1;
}
sub skip_spaces {
my $self = shift;
while ($self->{sdx} < length($self->{src})) {
$self->{ch} = substr($self->{src}, $self->{sdx}, 1);
last unless $self->{ch} =~ /[ \t\r\n]/;
$self->{sdx}++;
}
}
sub get_token {
my $self = shift;
# Yields a single character token, one of {}()[]|=.;
# or ["terminal", string] or ["ident", string] or -1.
$self->skip_spaces();
if ($self->{sdx} >= length($self->{src})) {
$self->{token} = -1;
return;
}
my $tokstart = $self->{sdx};
if ($self->{ch} =~ /[{}()\[\]|=.;]/) {
$self->{sdx}++;
$self->{token} = $self->{ch};
} elsif ($self->{ch} eq '"' || $self->{ch} eq "'") {
my $closech = $self->{ch};
my $tokend = $tokstart + 1;
while ($tokend < length($self->{src}) && substr($self->{src}, $tokend, 1) ne $closech) {
$tokend++;
}
if ($tokend >= length($self->{src})) {
$self->{token} = $self->invalid("no closing quote");
} else {
$self->{sdx} = $tokend + 1;
$self->{token} = ["terminal", substr($self->{src}, $tokstart + 1, $tokend - $tokstart - 1)];
}
} elsif ($self->{ch} =~ /[a-z]/) {
# To simplify things for the purposes of this task,
# identifiers are strictly a-z only, not A-Z or 1-9.
while ($self->{sdx} < length($self->{src}) && substr($self->{src}, $self->{sdx}, 1) =~ /[a-z]/) {
$self->{sdx}++;
if ($self->{sdx} < length($self->{src})) {
$self->{ch} = substr($self->{src}, $self->{sdx}, 1);
}
}
$self->{token} = ["ident", substr($self->{src}, $tokstart, $self->{sdx} - $tokstart)];
} else {
$self->{token} = $self->invalid("invalid ebnf");
}
}
sub match_token {
my ($self, $expected_ch) = @_;
if ($self->{token} ne $expected_ch) {
$self->{token} = $self->invalid(sprintf("invalid ebnf (%s expected)", $expected_ch));
} else {
$self->get_token();
}
}
sub add_ident {
my ($self, $ident) = @_;
my $k = -1;
for my $i (0..$#{$self->{idents}}) {
if ($self->{idents}->[$i] eq $ident) {
$k = $i;
last;
}
}
if ($k == -1) {
push @{$self->{idents}}, $ident;
$k = $#{$self->{idents}};
push @{$self->{ididx}}, -1;
}
return $k;
}
sub factor {
my $self = shift;
my $res;
if (ref($self->{token}) eq 'ARRAY') {
my $t = $self->{token};
if ($t->[0] eq "ident") {
my $idx = $self->add_ident($t->[1]);
push @$t, $idx;
$self->{token} = $t;
}
$res = $self->{token};
$self->get_token();
} elsif ($self->{token} eq '[') {
$self->get_token();
$res = ["optional", $self->expression()];
$self->match_token(']');
} elsif ($self->{token} eq '(') {
$self->get_token();
$res = $self->expression();
$self->match_token(')');
} elsif ($self->{token} eq '{') {
$self->get_token();
$res = ["repeat", $self->expression()];
$self->match_token('}');
} else {
die "invalid token in factor() function";
}
if (ref($res) eq 'ARRAY' && @$res == 1) {
return $res->[0];
}
return $res;
}
sub term {
my $self = shift;
my $res = [$self->factor()];
my @tokens = (-1, '|', '.', ';', ')', ']', '}');
while (1) {
my $found = 0;
for my $t (@tokens) {
if (defined($self->{token}) && $t eq $self->{token}) {
$found = 1;
last;
}
}
last if $found;
push @$res, $self->factor();
}
if (@$res == 1) {
return $res->[0];
}
return $res;
}
sub expression {
my $self = shift;
my $res = [$self->term()];
if (defined($self->{token}) && $self->{token} eq '|') {
$res = ["or", $res->[0]];
while (defined($self->{token}) && $self->{token} eq '|') {
$self->get_token();
push @$res, $self->term();
}
}
if (@$res == 1) {
return $res->[0];
}
return $res;
}
sub production {
my $self = shift;
# Returns a token or -1; the real result is left in 'productions' etc.
$self->get_token();
if (defined($self->{token}) && $self->{token} ne '}') {
if ($self->{token} eq -1) {
return $self->invalid("invalid ebnf (missing closing })");
}
if (ref($self->{token}) ne 'ARRAY') {
return -1;
}
my $t = $self->{token};
if ($t->[0] ne "ident") {
return -1;
}
my $ident = $t->[1];
my $idx = $self->add_ident($ident);
$self->get_token();
$self->match_token('=');
if ($self->{token} eq -1) {
return -1;
}
push @{$self->{productions}}, [$ident, $idx, $self->expression()];
$self->{ididx}->[$idx] = $#{$self->{productions}};
}
return $self->{token};
}
sub parse {
my ($self, $ebnf) = @_;
# Returns +1 if ok, -1 if bad.
printf "parse:\n%s ===>\n", $ebnf;
$self->{err} = 0;
$self->{src} = $ebnf;
$self->{sdx} = 0;
$self->{idents} = [];
$self->{ididx} = [];
$self->{productions} = [];
$self->{extras} = [];
$self->get_token();
if (ref($self->{token}) eq 'ARRAY') {
my $t = $self->{token};
$t->[0] = "title";
push @{$self->{extras}}, $self->{token};
$self->get_token();
}
if (!defined($self->{token}) || $self->{token} ne '{') {
return $self->invalid("invalid ebnf (missing opening {)");
}
while (1) {
my $token_result = $self->production();
last if (defined($token_result) && ($token_result eq '}' || $token_result eq -1));
}
$self->get_token();
if (ref($self->{token}) eq 'ARRAY') {
my $t = $self->{token};
$t->[0] = "comment";
push @{$self->{extras}}, $self->{token};
$self->get_token();
}
if (defined($self->{token}) && $self->{token} ne -1) {
return $self->invalid("invalid ebnf (missing eof?)");
}
if ($self->{err}) {
return -1;
}
my $k = -1;
for my $i (0..$#{$self->{ididx}}) {
if ($self->{ididx}->[$i] == -1) {
$k = $i;
last;
}
}
if ($k != -1) {
return $self->invalid(sprintf("invalid ebnf (undefined:%s)", $self->{idents}->[$k]));
}
$self->pprint($self->{productions}, "productions");
$self->pprint($self->{idents}, "idents");
$self->pprint($self->{ididx}, "ididx");
$self->pprint($self->{extras}, "extras");
return 1;
}
# Adjusts Perl's normal printing to look more like Phix output.
sub pprint {
my ($self, $ob, $header) = @_;
printf "\n%s:\n", $header;
local $Data::Dumper::Terse = 1;
local $Data::Dumper::Indent = 0;
my $pp = Dumper($ob);
$pp =~ s/\[/\{/g;
$pp =~ s/\]/\}/g;
$pp =~ s/,/, /g;
chomp $pp;
print "$pp\n";
}
# The rules that applies() has to deal with are:
# [factors] - if rule[0] is not string,
# just apply one after the other recursively.
# ["terminal", "a1"] -- literal constants
# ["or", <e1>, <e2>, ...] -- (any) one of n
# ["repeat", <e1>] -- as per "{}" in ebnf
# ["optional", <e1>] -- as per "[]" in ebnf
# ["ident", <name>, idx] -- apply the sub-rule
sub applies {
my ($self, $rule) = @_;
my $was_sdx = $self->{sdx}; # in case of failure
my $r1 = ref($rule) eq 'ARRAY' ? $rule->[0] : $rule;
if (ref($rule) eq 'ARRAY' && ref($r1) ne '') {
for my $i (0..$#{$rule}) {
if (!$self->applies($rule->[$i])) {
$self->{sdx} = $was_sdx;
return 0;
}
}
} elsif (ref($rule) eq 'ARRAY' && $r1 eq "terminal") {
$self->skip_spaces();
my $r2 = $rule->[1];
for my $i (0..length($r2)-1) {
if ($self->{sdx} >= length($self->{src}) ||
substr($self->{src}, $self->{sdx}, 1) ne substr($r2, $i, 1)) {
$self->{sdx} = $was_sdx;
return 0;
}
$self->{sdx}++;
}
} elsif (ref($rule) eq 'ARRAY' && $r1 eq "or") {
for my $i (1..$#{$rule}) {
if ($self->applies($rule->[$i])) {
return 1;
}
}
$self->{sdx} = $was_sdx;
return 0;
} elsif (ref($rule) eq 'ARRAY' && $r1 eq "repeat") {
while ($self->applies($rule->[1])) {
# continue repeating
}
} elsif (ref($rule) eq 'ARRAY' && $r1 eq "optional") {
$self->applies($rule->[1]);
} elsif (ref($rule) eq 'ARRAY' && $r1 eq "ident") {
my $i = $rule->[2];
my $ii = $self->{ididx}->[$i];
if (!$self->applies($self->{productions}->[$ii]->[2])) {
$self->{sdx} = $was_sdx;
return 0;
}
} else {
die "invalid rule in applies() function";
}
return 1;
}
sub check_valid {
my ($self, $test) = @_;
$self->{src} = $test;
$self->{sdx} = 0;
my $res = $self->applies($self->{productions}->[0]->[2]);
$self->skip_spaces();
if ($self->{sdx} < length($self->{src})) {
$res = 0;
}
printf "\"%s\", %s\n", $test, $self->{results}->[1 - $self->btoi($res)];
}
# Main execution
package main;
my $parser = EBNFParser->new();
my @ebnfs = (
"\"a\" {\n" .
" a = \"a1\" ( \"a2\" | \"a3\" ) { \"a4\" } [ \"a5\" ] \"a6\" ;\n" .
"} \"z\" ",
"{\n" .
" expr = term { plus term } .\n" .
" term = factor { times factor } .\n" .
" factor = number | '(' expr ')' .\n" .
" \n" .
" plus = \"+\" | \"-\" .\n" .
" times = \"*\" | \"/\" .\n" .
" \n" .
" number = digit { digit } .\n" .
" digit = \"0\" | \"1\" | \"2\" | \"3\" | \"4\" | \"5\" | \"6\" | \"7\" | \"8\" | \"9\" .\n" .
"}",
"a = \"1\"",
"{ a = \"1\" ;",
"{ hello world = \"1\"; }",
"{ foo = bar . }"
);
my @tests = (
[
"a1a3a4a4a5a6",
"a1 a2a6",
"a1 a3 a4 a6",
"a1 a4 a5 a6",
"a1 a2 a4 a5 a5 a6",
"a1 a2 a4 a5 a6 a7",
"your ad here"
],
[
"2",
"2*3 + 4/23 - 7",
"(3 + 4) * 6-2+(4*(4))",
"-2",
"3 +",
"(4 + 3"
]
);
for my $i (0..$#ebnfs) {
if ($parser->parse($ebnfs[$i]) == 1) {
print "\ntests:\n";
if ($i < @tests) {
for my $test (@{$tests[$i]}) {
$parser->check_valid($test);
}
}
}
print "\n";
}
- Output:
parse:
"a" {
a = "a1" ( "a2" | "a3" ) { "a4" } [ "a5" ] "a6" ;
} "z" ===>
productions:
{{'a', 0, {{'terminal', 'a1'}, {'or', {'terminal', 'a2'}, {'terminal', 'a3'}}, {'repeat', {'terminal', 'a4'}}, {'optional', {'terminal', 'a5'}}, {'terminal', 'a6'}}}}
idents:
{'a'}
ididx:
{0}
extras:
{{'title', 'a'}, {'comment', 'z'}}
tests:
"a1a3a4a4a5a6", pass
"a1 a2a6", pass
"a1 a3 a4 a6", pass
"a1 a4 a5 a6", fail
"a1 a2 a4 a5 a5 a6", fail
"a1 a2 a4 a5 a6 a7", fail
"your ad here", fail
parse:
{
expr = term { plus term } .
term = factor { times factor } .
factor = number | '(' expr ')' .
plus = "+" | "-" .
times = "*" | "/" .
number = digit { digit } .
digit = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" .
} ===>
productions:
{{'expr', 0, {{'ident', 'term', 1}, {'repeat', {{'ident', 'plus', 2}, {'ident', 'term', 1}}}}}, {'term', 1, {{'ident', 'factor', 3}, {'repeat', {{'ident', 'times', 4}, {'ident', 'factor', 3}}}}}, {'factor', 3, {'or', {'ident', 'number', 5}, {{'terminal', '('}, {'ident', 'expr', 0}, {'terminal', ')'}}}}, {'plus', 2, {'or', {'terminal', '+'}, {'terminal', '-'}}}, {'times', 4, {'or', {'terminal', '*'}, {'terminal', '/'}}}, {'number', 5, {{'ident', 'digit', 6}, {'repeat', {'ident', 'digit', 6}}}}, {'digit', 6, {'or', {'terminal', '0'}, {'terminal', '1'}, {'terminal', '2'}, {'terminal', '3'}, {'terminal', '4'}, {'terminal', '5'}, {'terminal', '6'}, {'terminal', '7'}, {'terminal', '8'}, {'terminal', '9'}}}}
idents:
{'expr', 'term', 'plus', 'factor', 'times', 'number', 'digit'}
ididx:
{0, 1, 3, 2, 4, 5, 6}
extras:
{}
tests:
"2", pass
"2*3 + 4/23 - 7", pass
"(3 + 4) * 6-2+(4*(4))", pass
"-2", fail
"3 +", fail
"(4 + 3", fail
parse:
a = "1" ===>
invalid ebnf (missing opening {)
parse:
{ a = "1" ; ===>
invalid ebnf (missing closing })
parse:
{ hello world = "1"; } ===>
invalid ebnf (= expected)
parse:
{ foo = bar . } ===>
invalid ebnf (undefined:bar)
with javascript_semantics
string src
integer ch, sdx
object token
bool error = false

function invalid(string msg)
    error = true
    ?msg
    sdx = length(src)+1 -- (set to eof)
    return -1
end function

procedure skip_spaces()
    while 1 do
        if sdx>length(src) then exit end if
        ch = src[sdx]
        if not find(ch,{' ','\t','\r','\n'}) then exit end if
        sdx += 1
    end while
end procedure

procedure get_token()
-- yields a single character token, one of {}()[]|=.;
-- or {"terminal",string} or {"ident", string} or -1.
    skip_spaces()
    if sdx>length(src) then
        token = -1
        return
    end if
    integer tokstart = sdx
    if find(ch,"{}()[]|=.;") then
        sdx += 1
        token = ch
    elsif ch='\"' or ch='\'' then
        integer closech = ch
        for tokend=sdx+1 to length(src) do
            if src[tokend] = closech then
                sdx = tokend+1
                token = {"terminal",src[tokstart+1..tokend-1]}
                return
            end if
        end for
        token = invalid("no closing quote")
    elsif ch>='a' and ch<='z' then
        -- to simplify things for the purposes of this task,
        -- identifiers are strictly a-z only, not A-Z or 1-9
        while 1 do
            sdx += 1
            if sdx>length(src) then exit end if
            ch = src[sdx]
            if ch<'a' or ch>'z' then exit end if
        end while
        token = {"ident",src[tokstart..sdx-1]}
    else
        token = invalid("invalid ebnf")
    end if
end procedure

procedure match_token(integer ch)
    if token!=ch then
        token = invalid(sprintf("invalid ebnf (%c expected)",ch))
    else
        get_token()
    end if
end procedure

sequence idents = {}, ididx = {}

function add_ident(string ident)
    integer k = find(ident,idents)
    if k=0 then
        idents = append(idents,ident)
        k = length(idents)
        ididx = append(ididx,0)
    end if
    return k
end function

forward function expression()

function factor()
    object res
    if sequence(token) then
        if token[1]="ident" then
            token &= add_ident(token[2])
        end if
        res = token
        get_token()
    elsif token='[' then
        get_token()
        res = {"optional",expression()}
        match_token(']')
    elsif token='(' then
        get_token()
        res = expression()
        match_token(')')
    elsif token='{' then
        get_token()
        res = {"repeat",expression()}
        match_token('}')
    else
        ?9/0 -- erm?? -- res = -1 -- ""
    end if
    return res
end function

function term()
--  sequence res = {"factors",factor()} -- (opted against this)
    sequence res = {factor()}
    while not find(token,{-1,'|','.',';',')',']','}'}) do
        res = append(res,factor())
    end while
    if length(res)=1 then res = res[1] end if
    return res
end function

function expression()
    object res = term()
    if token='|' then
        res = {"or",res}
        while token='|' do
            get_token()
            res = append(res,term())
        end while
    end if
    return res
end function

sequence productions = {}

function production()
-- returns a token or -1; the real result is left in productions[] etc.
    get_token()
    if token!='}' then
        if token=-1 then
            return invalid("invalid ebnf (missing closing })")
        end if
        if not sequence(token) or token[1]!="ident" then
            return -1
        end if
        string ident = token[2]
        integer idx = add_ident(ident)
        get_token()
        match_token('=')
        if token=-1 then return -1 end if
        sequence res = expression()
        productions = append(productions,{ident,idx,res})
        ididx[idx] = length(productions)
    end if
    return token
end function

sequence extras = {}

function parse(string ebnf)
-- returns: +1 if ok, -1 if bad
    puts(1,"parse: "&ebnf&" ===>\n")
    error = false
    src = ebnf
    sdx = 1
    idents = {}
    ididx = {}
    productions = {}
    extras = {}
    get_token()
    if sequence(token) then
        token[1] = "title"
        extras = append(extras,token)
        get_token()
    end if
    if token!='{' then
        return invalid("invalid ebnf (missing opening {)")
    end if
    while 1 do
        token = production()
        if token='}' or token=-1 then exit end if
    end while
    get_token()
    if sequence(token) then
        token[1] = "comment"
        extras = append(extras,token)
        get_token()
    end if
    if token!=-1 then
        return invalid("invalid ebnf (missing eof?)")
    end if
    if error then return -1 end if
    integer k = find(0,ididx)
    if k!=0 then
        return invalid("invalid ebnf (undefined:"&idents[k]&")")
    end if
    ppOpt({pp_Pause,0})
    pp(productions)
--  pp(idents)
--  pp(ididx)
--  pp(extras)
    return 1
end function

-- The rules that applies() has to deal with are:
--  {factors}               - if rule[1] is not string, just apply one after the other,
--                            recursively. (As per term() above, originally had "factors",
--                            but getting rid of it made the syntax tree much clearer.)
--  {"terminal", "a1"}      -- literal constants
--  {"or", <e1>, <e2>, ...} -- (any) one of n
--  {"repeat", <e1>}        -- as per "{}" in ebnf
--  {"optional", <e1>}      -- as per "[]" in ebnf
--  {"ident", <name>, idx}  -- apply the sub-rule

function applies(sequence rule)
    integer was_sdx = sdx -- in case of failure
    object r1 = rule[1]
    if not string(r1) then
        for i=1 to length(rule) do
            if not applies(rule[i]) then
                sdx = was_sdx
                return false
            end if
        end for
    elsif r1="terminal" then
        skip_spaces()
        for i=1 to length(rule[2]) do
            if sdx>length(src) or src[sdx]!=rule[2][i] then
                sdx = was_sdx
                return false
            end if
            sdx += 1
        end for
    elsif r1="or" then
        for i=2 to length(rule) do
            if applies(rule[i]) then return true end if
        end for
        sdx = was_sdx
        return false
    elsif r1="repeat" then
        while applies(rule[2]) do end while
    elsif r1="optional" then
        if applies(rule[2]) then end if
    elsif r1="ident" then
        integer i = rule[3], ii = ididx[i]
        if not applies(productions[ii][3]) then
            sdx = was_sdx
            return false
        end if
    else
        ?9/0
    end if
    return true
end function

procedure check_valid(string test)
    src = test
    sdx = 1
    bool res = applies(productions[1][3])
    skip_spaces()
    if sdx<=length(src) then res = false end if
    ?{test,{"pass","fail"}[2-res]}
end procedure

constant ebnf = {"""
"a" {
    a = "a1" ( "a2" | "a3" ) { "a4" } [ "a5" ] "a6" ;
} "z" """,
"""
{
    expr = term { plus term } .
    term = factor { times factor } .
    factor = number | '(' expr ')' .
    plus = "+" | "-" .
    times = "*" | "/" .
    number = digit { digit } .
    digit = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" .
}""",
`a = "1"`,
`{ a = "1" ;`,
`{ hello world = "1"; }`,
`{ foo = bar . }` }

constant tests = {{"a1a3a4a4a5a6", "a1 a2a6", "a1 a3 a4 a6", "a1 a4 a5 a6",
                   "a1 a2 a4 a5 a5 a6", "a1 a2 a4 a5 a6 a7", "your ad here"},
                  {"2", "2*3 + 4/23 - 7", "(3 + 4) * 6-2+(4*(4))", "-2", "3 +", "(4 + 3"}}

for i=1 to length(ebnf) do
    if parse(ebnf[i])=+1 then
        ?"tests:"
        for j=1 to length(tests[i]) do
            check_valid(tests[i][j])
        end for
    end if
end for
In real use, I would be tempted to use numeric literals rather than string tags in such structures, but the latter certainly make things ten times easier to debug, plus I got an instantly legible syntax tree dump (the bit just after "===>" below) practically for free.
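As a purely illustrative fragment (not part of the entry above, and using made-up constant names), the numeric-literal alternative would look something like:
constant NODE_OR = 1, NODE_TERMINAL = 2
?{NODE_OR, {NODE_TERMINAL,"+"}, {NODE_TERMINAL,"-"}}   -- prints something like {1,{2,"+"},{2,"-"}}
?{"or", {"terminal","+"}, {"terminal","-"}}            -- whereas the string-tagged form reads itself
The string-tagged form is what appears in the dump below.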
- Output:
parse:
"a" {
a = "a1" ( "a2" | "a3" ) { "a4" } [ "a5" ] "a6" ;
} "z" ===>
{{"a", 1,
{{"terminal", "a1"}, {"or", {"terminal", "a2"}, {"terminal", "a3"}},
{"repeat", {"terminal", "a4"}}, {"optional", {"terminal", "a5"}},
{"terminal", "a6"}}}}
"tests:"
{"a1a3a4a4a5a6","pass"}
{"a1 a2a6","pass"}
{"a1 a3 a4 a6","pass"}
{"a1 a4 a5 a6","fail"}
{"a1 a2 a4 a5 a5 a6","fail"}
{"a1 a2 a4 a5 a6 a7","fail"}
{"your ad here","fail"}
parse:
{
expr = term { plus term } .
term = factor { times factor } .
factor = number | '(' expr ')' .
plus = "+" | "-" .
times = "*" | "/" .
number = digit { digit } .
digit = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" .
} ===>
{{"expr", 1,
{{"ident", "term", 2},
{"repeat", {{"ident", "plus", 3}, {"ident", "term", 2}}}}},
{"term", 2,
{{"ident", "factor", 4},
{"repeat", {{"ident", "times", 5}, {"ident", "factor", 4}}}}},
{"factor", 4,
{"or", {"ident", "number", 6},
{{"terminal", "("}, {"ident", "expr", 1}, {"terminal", ")"}}}},
{"plus", 3, {"or", {"terminal", "+"}, {"terminal", "-"}}},
{"times", 5, {"or", {"terminal", "*"}, {"terminal", "/"}}},
{"number", 6, {{"ident", "digit", 7}, {"repeat", {"ident", "digit", 7}}}},
{"digit", 7,
{"or", {"terminal", "0"}, {"terminal", "1"}, {"terminal", "2"},
{"terminal", "3"}, {"terminal", "4"}, {"terminal", "5"},
{"terminal", "6"}, {"terminal", "7"}, {"terminal", "8"},
{"terminal", "9"}}}}
"tests:"
{"2","pass"}
{"2*3 + 4/23 - 7","pass"}
{"(3 + 4) * 6-2+(4*(4))","pass"}
{"-2","fail"}
{"3 +","fail"}
{"(4 + 3","fail"}
parse: a = "1" ===>
"invalid ebnf (missing opening {)"
parse: { a = "1" ; ===>
"invalid ebnf (missing closing })"
parse: { hello world = "1"; } ===>
"invalid ebnf (= expected)"
parse: { foo = bar . } ===>
"invalid ebnf (undefined:bar)"
(de EBNF
"expr : term ( ( PLUS | MINUS ) term )* ;"
"term : factor ( ( MULT | DIV ) factor )* ;"
"factor : NUMBER ;" )
(for E EBNF
(use (@S @E)
(unless (and (match '(@S : @E ;) (str E)) (not (cdr @S)))
(quit "Invalid EBNF" E) )
(put (car @S) 'ebnf @E) ) )(de matchEbnf (Pat)
(cond
((asoq Pat '((PLUS . +) (MINUS . -) (MULT . *) (DIV . /)))
(let Op (cdr @)
(when (= Op (car *Lst))
(pop '*Lst)
Op ) ) )
((== 'NUMBER Pat)
(cond
((num? (car *Lst))
(pop '*Lst)
@ )
((and (= "-" (car *Lst)) (num? (cadr *Lst)))
(setq *Lst (cddr *Lst))
(- @) ) ) )
((get Pat 'ebnf) (parseLst @))
((atom Pat))
(T
(loop
(T (matchEbnf (pop 'Pat)) @)
(NIL Pat)
(NIL (== '| (pop 'Pat)))
(NIL Pat) ) ) ) )
(de parseLst (Pat)
(let (P (pop 'Pat) X (matchEbnf P))
(loop
(NIL Pat)
(if (n== '* (cadr Pat))
(if (matchEbnf (pop 'Pat))
(setq X (list @ X))
(throw) )
(loop
(NIL *Lst)
(NIL (matchEbnf (car Pat)))
(setq X (list @ X (or (matchEbnf P) (throw)))) )
(setq Pat (cddr Pat)) ) )
X ) )
(de parseEbnf (Str)
(let *Lst (str Str "")
(catch NIL
(parseLst (get 'expr 'ebnf)) ) ) )
- Output:
: (parseEbnf "1 + 2 * -3 / 7 - 3 * 4") -> (- (+ 1 (/ (* 2 -3) 7)) (* 3 4))
import json
class EBNFParser:
def __init__(self):
self.src = ''
self.ch = ''
self.sdx = 0
self.token = None
self.err = False
self.idents = []
self.ididx = []
self.productions = []
self.extras = []
self.results = ['pass', 'fail']
def btoi(self, b):
return 1 if b else 0
def invalid(self, msg):
self.err = True
print(msg)
self.sdx = len(self.src) # set to eof
return -1
def skip_spaces(self):
while self.sdx < len(self.src):
self.ch = self.src[self.sdx]
if self.ch not in ' \t\r\n':
break
self.sdx += 1
def get_token(self):
# Yields a single character token, one of {}()[]|=.;
# or ["terminal",string] or ["ident", string] or -1.
self.skip_spaces()
if self.sdx >= len(self.src):
self.token = {'value': -1, 'is_sequence': False}
return
tokstart = self.sdx
if self.ch in '{}()[]|=.;':
self.sdx += 1
self.token = {'value': self.ch, 'is_sequence': False}
elif self.ch in '"\'':
closech = self.ch
tokend = tokstart + 1
while tokend < len(self.src) and self.src[tokend] != closech:
tokend += 1
if tokend >= len(self.src):
self.token = {'value': self.invalid('no closing quote'), 'is_sequence': False}
else:
self.sdx = tokend + 1
self.token = {
'value': ['terminal', self.src[tokstart + 1:tokend]],
'is_sequence': True
}
elif 'a' <= self.ch <= 'z':
# To simplify things for the purposes of this task,
# identifiers are strictly a-z only, not A-Z or 1-9.
while self.sdx < len(self.src) and 'a' <= self.ch <= 'z':
self.sdx += 1
if self.sdx < len(self.src):
self.ch = self.src[self.sdx]
self.token = {
'value': ['ident', self.src[tokstart:self.sdx]],
'is_sequence': True
}
else:
self.token = {'value': self.invalid('invalid ebnf'), 'is_sequence': False}
def match_token(self, expected_ch):
if self.token['value'] != expected_ch:
self.token = {
'value': self.invalid(f'invalid ebnf ({expected_ch} expected)'),
'is_sequence': False
}
else:
self.get_token()
def add_ident(self, ident):
try:
k = self.idents.index(ident)
except ValueError:
self.idents.append(ident)
k = len(self.idents) - 1
self.ididx.append(-1)
return k
def factor(self):
if self.token['is_sequence']:
t = self.token['value']
if t[0] == 'ident':
idx = self.add_ident(t[1])
t.append(idx)
self.token['value'] = t
res = self.token['value']
self.get_token()
elif self.token['value'] == '[':
self.get_token()
res = ['optional', self.expression()]
self.match_token(']')
elif self.token['value'] == '(':
self.get_token()
res = self.expression()
self.match_token(')')
elif self.token['value'] == '{':
self.get_token()
res = ['repeat', self.expression()]
self.match_token('}')
else:
raise ValueError('invalid token in factor() function')
if isinstance(res, list) and len(res) == 1:
return res[0]
return res
def term(self):
res = [self.factor()]
tokens = [-1, '|', '.', ';', ')', ']', '}']
while True:
if self.token['value'] in tokens:
break
res.append(self.factor())
if len(res) == 1:
return res[0]
return res
def expression(self):
res = [self.term()]
if self.token['value'] == '|':
res = ['or', res[0]]
while self.token['value'] == '|':
self.get_token()
res.append(self.term())
if len(res) == 1:
return res[0]
return res
def production(self):
        # Returns a token or -1; the real result is left in 'productions' etc.
self.get_token()
if self.token['value'] != '}':
if self.token['value'] == -1:
return self.invalid('invalid ebnf (missing closing })')
if not self.token['is_sequence']:
return -1
t = self.token['value']
if t[0] != 'ident':
return -1
ident = t[1]
idx = self.add_ident(ident)
self.get_token()
self.match_token('=')
if self.token['value'] == -1:
return -1
self.productions.append([ident, idx, self.expression()])
self.ididx[idx] = len(self.productions) - 1
return self.token['value']
def parse(self, ebnf):
# Returns +1 if ok, -1 if bad.
print(f'parse:\n{ebnf} ===>')
self.err = False
self.src = ebnf
self.sdx = 0
self.idents = []
self.ididx = []
self.productions = []
self.extras = []
self.get_token()
if self.token['is_sequence']:
t = self.token['value']
t[0] = 'title'
self.extras.append(self.token['value'])
self.get_token()
if self.token['value'] != '{':
return self.invalid('invalid ebnf (missing opening {)')
while True:
token_result = self.production()
if token_result == '}' or token_result == -1:
break
self.get_token()
if self.token['is_sequence']:
t = self.token['value']
t[0] = 'comment'
self.extras.append(self.token['value'])
self.get_token()
if self.token['value'] != -1:
return self.invalid('invalid ebnf (missing eof?)')
if self.err:
return -1
k = -1
for i in range(len(self.ididx)):
if self.ididx[i] == -1:
k = i
break
if k != -1:
return self.invalid(f'invalid ebnf (undefined:{self.idents[k]})')
self.pprint(self.productions, 'productions')
self.pprint(self.idents, 'idents')
self.pprint(self.ididx, 'ididx')
self.pprint(self.extras, 'extras')
return 1
def pprint(self, ob, header):
# Adjusts Python's normal printing to look more like Phix output.
print(f'\n{header}:')
pp = json.dumps(ob)
pp = pp.replace('[', '{')
pp = pp.replace(']', '}')
pp = pp.replace(',', ', ')
for i in range(len(self.idents)):
pp = pp.replace(f'\\b{i}\\b', str(i))
print(pp)
def applies(self, rule):
# The rules that applies() has to deal with are:
# [factors] - if rule[0] is not string,
# just apply one after the other recursively.
# ["terminal", "a1"] -- literal constants
# ["or", <e1>, <e2>, ...} -- (any) one of n
# ["repeat", <e1>] -- as per "{}" in ebnf
# ["optional", <e1>] -- as per "[]" in ebnf
# ["ident", <name>, idx] -- apply the sub-rule
was_sdx = self.sdx # in case of failure
r1 = rule[0]
if not isinstance(r1, str):
for i in range(len(rule)):
if not self.applies(rule[i]):
self.sdx = was_sdx
return False
elif r1 == 'terminal':
self.skip_spaces()
r2 = rule[1]
for i in range(len(r2)):
if self.sdx >= len(self.src) or self.src[self.sdx] != r2[i]:
self.sdx = was_sdx
return False
self.sdx += 1
elif r1 == 'or':
for i in range(1, len(rule)):
if self.applies(rule[i]):
return True
self.sdx = was_sdx
return False
elif r1 == 'repeat':
while self.applies(rule[1]):
# continue repeating
pass
elif r1 == 'optional':
self.applies(rule[1])
elif r1 == 'ident':
i = rule[2]
ii = self.ididx[i]
if not self.applies(self.productions[ii][2]):
self.sdx = was_sdx
return False
else:
raise ValueError('invalid rule in applies() function')
return True
def check_valid(self, test):
self.src = test
self.sdx = 0
res = self.applies(self.productions[0][2])
self.skip_spaces()
if self.sdx < len(self.src):
res = False
print(f'"{test}", {self.results[1 - self.btoi(res)]}')
# Main execution
if __name__ == "__main__":
parser = EBNFParser()
ebnfs = [
'"a" {\n' +
' a = "a1" ( "a2" | "a3" ) { "a4" } [ "a5" ] "a6" ;\n' +
'} "z" ',
'{\n' +
' expr = term { plus term } .\n' +
' term = factor { times factor } .\n' +
' factor = number | \'(\' expr \')\' .\n' +
' \n' +
' plus = "+" | "-" .\n' +
' times = "*" | "/" .\n' +
' \n' +
' number = digit { digit } .\n' +
' digit = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" .\n' +
'}',
'a = "1"',
'{ a = "1" ;',
'{ hello world = "1"; }',
'{ foo = bar . }'
]
tests = [
[
'a1a3a4a4a5a6',
'a1 a2a6',
'a1 a3 a4 a6',
'a1 a4 a5 a6',
'a1 a2 a4 a5 a5 a6',
'a1 a2 a4 a5 a6 a7',
'your ad here'
],
[
'2',
'2*3 + 4/23 - 7',
'(3 + 4) * 6-2+(4*(4))',
'-2',
'3 +',
'(4 + 3'
]
]
for i in range(len(ebnfs)):
if parser.parse(ebnfs[i]) == 1:
print('\ntests:')
if i < len(tests):
for test in tests[i]:
parser.check_valid(test)
print('')
- Output:
parse:
"a" {
a = "a1" ( "a2" | "a3" ) { "a4" } [ "a5" ] "a6" ;
} "z" ===>
productions:
{{"a", 0, {{"terminal", "a1"}, {"or", {"terminal", "a2"}, {"terminal", "a3"}}, {"repeat", {"terminal", "a4"}}, {"optional", {"terminal", "a5"}}, {"terminal", "a6"}}}}
idents:
{"a"}
ididx:
{0}
extras:
{{"title", "a"}, {"comment", "z"}}
tests:
"a1a3a4a4a5a6", pass
"a1 a2a6", pass
"a1 a3 a4 a6", pass
"a1 a4 a5 a6", fail
"a1 a2 a4 a5 a5 a6", fail
"a1 a2 a4 a5 a6 a7", fail
"your ad here", fail
parse:
{
expr = term { plus term } .
term = factor { times factor } .
factor = number | '(' expr ')' .
plus = "+" | "-" .
times = "*" | "/" .
number = digit { digit } .
digit = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" .
} ===>
productions:
{{"expr", 0, {{"ident", "term", 1}, {"repeat", {{"ident", "plus", 2}, {"ident", "term", 1}}}}}, {"term", 1, {{"ident", "factor", 3}, {"repeat", {{"ident", "times", 4}, {"ident", "factor", 3}}}}}, {"factor", 3, {"or", {"ident", "number", 5}, {{"terminal", "("}, {"ident", "expr", 0}, {"terminal", ")"}}}}, {"plus", 2, {"or", {"terminal", "+"}, {"terminal", "-"}}}, {"times", 4, {"or", {"terminal", "*"}, {"terminal", "/"}}}, {"number", 5, {{"ident", "digit", 6}, {"repeat", {"ident", "digit", 6}}}}, {"digit", 6, {"or", {"terminal", "0"}, {"terminal", "1"}, {"terminal", "2"}, {"terminal", "3"}, {"terminal", "4"}, {"terminal", "5"}, {"terminal", "6"}, {"terminal", "7"}, {"terminal", "8"}, {"terminal", "9"}}}}
idents:
{"expr", "term", "plus", "factor", "times", "number", "digit"}
ididx:
{0, 1, 3, 2, 4, 5, 6}
extras:
{}
tests:
"2", pass
"2*3 + 4/23 - 7", pass
"(3 + 4) * 6-2+(4*(4))", pass
"-2", fail
"3 +", fail
"(4 + 3", fail
parse:
a = "1" ===>
invalid ebnf (missing opening {)
parse:
{ a = "1" ; ===>
invalid ebnf (missing closing })
parse:
{ hello world = "1"; } ===>
invalid ebnf (= expected)
parse:
{ foo = bar . } ===>
invalid ebnf (undefined:bar)
You can run the code at https://rdrr.io/snippets/
# EBNF Parser in R
library(jsonlite)
EBNFParser <- function() {
# Initialize parser state
self <- list()
# Instance variables
self$src <- ""
self$ch <- ""
self$sdx <- 1 # R uses 1-based indexing
self$token <- NULL
self$err <- FALSE
self$idents <- character(0)
self$ididx <- integer(0)
self$productions <- list()
self$extras <- list()
self$results <- c("pass", "fail")
# Helper function to convert boolean to integer
self$btoi <- function(b) {
if (b) 1 else 0
}
# Error handling function
self$invalid <- function(msg) {
self$err <<- TRUE
cat(msg, "\n")
self$sdx <<- nchar(self$src) + 1 # set to eof
return(-1)
}
# Skip whitespace characters
self$skip_spaces <- function() {
while (self$sdx <= nchar(self$src)) {
self$ch <<- substr(self$src, self$sdx, self$sdx)
if (!grepl("[ \t\r\n]", self$ch)) {
break
}
self$sdx <<- self$sdx + 1
}
}
# Tokenizer function
self$get_token <- function() {
self$skip_spaces()
if (self$sdx > nchar(self$src)) {
self$token <<- list(value = -1, is_sequence = FALSE)
return()
}
tokstart <- self$sdx
if (grepl("[{}()\\[\\]|=.;]", self$ch)) {
self$sdx <<- self$sdx + 1
self$token <<- list(value = self$ch, is_sequence = FALSE)
} else if (self$ch %in% c('"', "'")) {
closech <- self$ch
tokend <- tokstart + 1
while (tokend <= nchar(self$src) && substr(self$src, tokend, tokend) != closech) {
tokend <- tokend + 1
}
if (tokend > nchar(self$src)) {
self$token <<- list(value = self$invalid("no closing quote"), is_sequence = FALSE)
} else {
self$sdx <<- tokend + 1
self$token <<- list(
value = list("terminal", substr(self$src, tokstart + 1, tokend - 1)),
is_sequence = TRUE
)
}
} else if (grepl("[a-z]", self$ch)) {
# Identifiers are strictly a-z only
while (self$sdx <= nchar(self$src) && grepl("[a-z]", self$ch)) {
self$sdx <<- self$sdx + 1
if (self$sdx <= nchar(self$src)) {
self$ch <<- substr(self$src, self$sdx, self$sdx)
}
}
self$token <<- list(
value = list("ident", substr(self$src, tokstart, self$sdx - 1)),
is_sequence = TRUE
)
} else {
self$token <<- list(value = self$invalid("invalid ebnf"), is_sequence = FALSE)
}
}
# Match expected token
self$match_token <- function(expected_ch) {
if (!identical(self$token$value, expected_ch)) {
self$token <<- list(
value = self$invalid(paste0("invalid ebnf (", expected_ch, " expected)")),
is_sequence = FALSE
)
} else {
self$get_token()
}
}
# Add identifier to symbol table
self$add_ident <- function(ident) {
k <- which(self$idents == ident)
if (length(k) == 0) {
self$idents <<- c(self$idents, ident)
k <- length(self$idents)
self$ididx <<- c(self$ididx, -1)
} else {
k <- k[1]
}
return(k)
}
# Parse factor (atomic elements)
self$factor <- function() {
if (self$token$is_sequence) {
t <- self$token$value
if (t[[1]] == "ident") {
idx <- self$add_ident(t[[2]])
t[[3]] <- idx
self$token$value <<- t
}
res <- self$token$value
self$get_token()
} else if (identical(self$token$value, "[")) {
self$get_token()
res <- list("optional", self$expression())
self$match_token("]")
} else if (identical(self$token$value, "(")) {
self$get_token()
res <- self$expression()
self$match_token(")")
} else if (identical(self$token$value, "{")) {
self$get_token()
res <- list("repeat", self$expression())
self$match_token("}")
} else {
stop("invalid token in factor() function")
}
if (is.list(res) && length(res) == 1) {
return(res[[1]])
}
return(res)
}
# Parse term (sequence of factors)
self$term <- function() {
res <- list(self$factor())
tokens <- list(-1, "|", ".", ";", ")", "]", "}")
while (TRUE) {
if (any(sapply(tokens, function(x) identical(self$token$value, x)))) {
break
}
res[[length(res) + 1]] <- self$factor()
}
if (length(res) == 1) {
return(res[[1]])
}
return(res)
}
# Parse expression (alternatives)
self$expression <- function() {
res <- list(self$term())
if (identical(self$token$value, "|")) {
res <- list("or", res[[1]])
while (identical(self$token$value, "|")) {
self$get_token()
res[[length(res) + 1]] <- self$term()
}
}
if (length(res) == 1) {
return(res[[1]])
}
return(res)
}
# Parse production rule
self$production <- function() {
self$get_token()
if (!identical(self$token$value, "}")) {
if (identical(self$token$value, -1)) {
return(self$invalid("invalid ebnf (missing closing })"))
}
if (!self$token$is_sequence) {
return(-1)
}
t <- self$token$value
if (t[[1]] != "ident") {
return(-1)
}
ident <- t[[2]]
idx <- self$add_ident(ident)
self$get_token()
self$match_token("=")
if (identical(self$token$value, -1)) {
return(-1)
}
self$productions[[length(self$productions) + 1]] <<- list(ident, idx, self$expression())
self$ididx[idx] <<- length(self$productions)
}
return(self$token$value)
}
# Main parsing function
self$parse <- function(ebnf) {
cat("parse:\n", ebnf, " ===>\n", sep = "")
self$err <<- FALSE
self$src <<- ebnf
self$sdx <<- 1
self$idents <<- character(0)
self$ididx <<- integer(0)
self$productions <<- list()
self$extras <<- list()
self$get_token()
if (self$token$is_sequence) {
t <- self$token$value
t[[1]] <- "title"
self$extras[[length(self$extras) + 1]] <<- self$token$value
self$get_token()
}
if (!identical(self$token$value, "{")) {
return(self$invalid("invalid ebnf (missing opening {)"))
}
while (TRUE) {
token_result <- self$production()
if (identical(token_result, "}") || identical(token_result, -1)) {
break
}
}
self$get_token()
if (self$token$is_sequence) {
t <- self$token$value
t[[1]] <- "comment"
self$extras[[length(self$extras) + 1]] <<- self$token$value
self$get_token()
}
if (!identical(self$token$value, -1)) {
return(self$invalid("invalid ebnf (missing eof?)"))
}
if (self$err) {
return(-1)
}
k <- which(self$ididx == -1)
if (length(k) > 0) {
return(self$invalid(paste0("invalid ebnf (undefined:", self$idents[k[1]], ")")))
}
self$pprint(self$productions, "productions")
self$pprint(self$idents, "idents")
self$pprint(self$ididx, "ididx")
self$pprint(self$extras, "extras")
return(1)
}
# Pretty print function
self$pprint <- function(ob, header) {
cat("\n", header, ":\n", sep = "")
pp <- toJSON(ob, auto_unbox = TRUE)
pp <- gsub("\\[", "{", pp)
pp <- gsub("\\]", "}", pp)
pp <- gsub(",", ", ", pp)
for (i in seq_along(self$idents)) {
pp <- gsub(paste0("\\b", i-1, "\\b"), as.character(i-1), pp)
}
cat(pp, "\n")
}
# Apply grammar rules to input
self$applies <- function(rule) {
was_sdx <- self$sdx # in case of failure
r1 <- rule[[1]]
if (!is.character(r1)) {
for (i in seq_along(rule)) {
if (!self$applies(rule[[i]])) {
self$sdx <<- was_sdx
return(FALSE)
}
}
} else if (r1 == "terminal") {
self$skip_spaces()
r2 <- rule[[2]]
for (i in seq_len(nchar(r2))) {
if (self$sdx > nchar(self$src) || substr(self$src, self$sdx, self$sdx) != substr(r2, i, i)) {
self$sdx <<- was_sdx
return(FALSE)
}
self$sdx <<- self$sdx + 1
}
} else if (r1 == "or") {
for (i in 2:length(rule)) {
if (self$applies(rule[[i]])) {
return(TRUE)
}
}
self$sdx <<- was_sdx
return(FALSE)
} else if (r1 == "repeat") {
while (self$applies(rule[[2]])) {
# continue repeating
}
} else if (r1 == "optional") {
self$applies(rule[[2]])
} else if (r1 == "ident") {
i <- rule[[3]]
ii <- self$ididx[i]
if (!self$applies(self$productions[[ii]][[3]])) {
self$sdx <<- was_sdx
return(FALSE)
}
} else {
stop("invalid rule in applies() function")
}
return(TRUE)
}
# Check if input is valid according to grammar
self$check_valid <- function(test) {
self$src <<- test
self$sdx <<- 1
res <- self$applies(self$productions[[1]][[3]])
self$skip_spaces()
if (self$sdx <= nchar(self$src)) {
res <- FALSE
}
cat('"', test, '", ', self$results[1 + self$btoi(!res)], "\n", sep = "")
}
return(self)
}
# Main execution
parser <- EBNFParser()
ebnfs <- c(
'"a" {
a = "a1" ( "a2" | "a3" ) { "a4" } [ "a5" ] "a6" ;
} "z" ',
'{
expr = term { plus term } .
term = factor { times factor } .
factor = number | \'(\' expr \')\' .
plus = "+" | "-" .
times = "*" | "/" .
number = digit { digit } .
digit = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" .
}',
'a = "1"',
'{ a = "1" ;',
'{ hello world = "1"; }',
'{ foo = bar . }'
)
tests <- list(
c(
'a1a3a4a4a5a6',
'a1 a2a6',
'a1 a3 a4 a6',
'a1 a4 a5 a6',
'a1 a2 a4 a5 a5 a6',
'a1 a2 a4 a5 a6 a7',
'your ad here'
),
c(
'2',
'2*3 + 4/23 - 7',
'(3 + 4) * 6-2+(4*(4))',
'-2',
'3 +',
'(4 + 3'
)
)
for (i in seq_along(ebnfs)) {
if (parser$parse(ebnfs[i]) == 1) {
cat('\ntests:\n')
if (i <= length(tests)) {
for (test in tests[[i]]) {
parser$check_valid(test)
}
}
}
cat('\n')
}
- Output:
parse:
"a" {
a = "a1" ( "a2" | "a3" ) { "a4" } [ "a5" ] "a6" ;
} "z" ===>
invalid ebnf
invalid ebnf (= expected)
parse:
{
expr = term { plus term } .
term = factor { times factor } .
factor = number | '(' expr ')' .
plus = "+" | "-" .
times = "*" | "/" .
number = digit { digit } .
digit = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" .
} ===>
invalid ebnf
invalid ebnf (= expected)
parse:
a = "1" ===>
invalid ebnf
invalid ebnf (missing opening {)
parse:
{ a = "1" ; ===>
invalid ebnf
invalid ebnf (= expected)
parse:
{ hello world = "1"; } ===>
invalid ebnf (= expected)
parse:
{ foo = bar . } ===>
invalid ebnf
invalid ebnf (= expected)
(formerly Perl 6)
This parses the EBNF rule set using a Raku grammar, then if it parses as valid EBNF, constructs a grammar and parses the test strings with that. EBNF rule sets that are naively syntactically correct but missing rules will parse as valid but will give a runtime failure warning about missing methods. It is implemented and exercised using the flavor of EBNF and test cases specified on the test page.
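For example, the last rule set below, { foo = bar . }, is syntactically valid EBNF, so the action methods (see the EBNF::Actions class below) turn it into grammar source roughly of this shape (a reconstruction for illustration, not actual program output):
grammar unnamed {
    rule TOP {^[<foo>]+$}
    token foo {<bar>}
}
Nothing defines a token bar, so the generated grammar compiles, but parsing any test string with it dies at runtime with a "No such method 'bar'" error, as the last block of the output shows.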
# A Raku grammar to parse EBNF
grammar EBNF {
rule TOP { ^ <title>? '{' [ <production> ]+ '}' <comment>? $ }
rule production { <name> '=' <expression> <[.;]> }
rule expression { <term> +% "|" }
rule term { <factor>+ }
rule factor { <group> | <repeat> | <optional> | <identifier> | <literal> }
rule group { '(' <expression> ')' }
rule repeat { '{' <expression> '}' }
rule optional { '[' <expression> ']' }
token identifier { <-[\|\(\)\{\}\[\]\.\;\"\'\s]>+ } #"
token literal { ["'" <-[']>+ "'" | '"' <-["]>+ '"'] } #"
token title { <literal> }
token comment { <literal> }
token name { <identifier> <?before \h* '='> }
}
class EBNF::Actions {
method TOP($/) {
say "Syntax Tree:\n", $/; # Dump the syntax tree to STDOUT
make 'grammar ' ~
($<title> ?? $<title>.subst(/\W/, '', :g) !! 'unnamed') ~
" \{\n rule TOP \{^[<" ~ $/<production>[0]<name> ~
">]+\$\}\n " ~ $<production>>>.ast ~ "\}"
}
method production($/) {
make 'token ' ~ $<name> ~ ' {' ~
$<expression>.ast ~ "}\n"
}
method expression($/) { make join '|', $<term>>>.ast }
method term($/) { make join '\h*', $<factor>>>.ast }
method factor($/) {
make $<literal> ?? $<literal> !!
$<group> ?? '[' ~ $<group>.ast ~ ']' !!
$<repeat> ?? '[' ~ $<repeat>.ast ~ '\\s*]*' !!
$<optional> ?? '[' ~ $<optional>.ast ~ ']?' !!
'<' ~ $<identifier> ~ '>'
}
method repeat($/) { make $<expression>.ast }
method optional($/) { make $<expression>.ast }
method group($/) { make $<expression>.ast }
}
# An array of test cases
my @tests = (
{
ebnf =>
q<"a" {
a = "a1" ( "a2" | "a3" ) { "a4" } [ "a5" ] "a6" ;
} "z">
,
teststrings => [
'a1a3a4a4a5a6',
'a1 a2a6',
'a1 a3 a4 a6',
'a1 a4 a5 a6',
'a1 a2 a4 a4 a5 a6',
'a1 a2 a4 a5 a5 a6',
'a1 a2 a4 a5 a6 a7',
'your ad here'
]
},
{
ebnf =>
q<{
expr = term { plus term } .
term = factor { times factor } .
factor = number | '(' expr ')' .
plus = "+" | "-" .
times = "*" | "/" .
number = digit { digit } .
digit = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" .
}>
,
teststrings => [
'2',
'2*3 + 4/23 - 7',
'(3 + 4) * 6-2+(4*(4))',
'-2',
'3 +',
'(4 + 3'
]
},
{
ebnf => q<a = "1";>,
teststrings => ['foobar']
},
{
ebnf => q<{ a = "1" ;>,
teststrings => ['foobar']
},
{
ebnf => q<{ hello world = "1"; }>,
teststrings => ['foobar']
},
{
ebnf => q<{ foo = bar . }>,
teststrings => ['foobar']
}
);
# Test the parser.
my $i = 1;
for @tests -> $test {
unless EBNF.parse($test<ebnf>) {
say "Parsing EBNF grammar:\n";
say "{$test<ebnf>.subst(/^^\h*/,'',:g)}\n";
say "Invalid syntax. Can not be parsed.\n";
say '*' x 79;
next;
}
my $p = EBNF.parse($test<ebnf>, :actions(EBNF::Actions));
my $grammar = $p.ast;
$grammar ~~ m/^'grammar '(\w+)/;
my $title = $0;
my $fn = 'EBNFtest'~$i++;
my $fh = open($fn, :w) orelse .die;
$fh.say( "\{\n", $grammar );
$fh.say( qq|say "Parsing EBNF grammar '$title':\\n";| );
$fh.say( qq|say q<{$test<ebnf>.subst(/^^\h*/,'',:g)}>;| );
$fh.say( q|say "\nValid syntax.\n\nTesting:\n";| );
$fh.say( q|CATCH { default { say " - $_" } };| );
my $len = max $test<teststrings>.flat>>.chars;
for $test<teststrings>.flat -> $s {
$fh.say( qq|printf "%{$len}s", '{$s}';| ~
qq|printf " - %s\\n", {$title}.parse('{$s}')| ~
qq| ?? 'valid.' !! 'NOT valid.';|
);
}
$fh.say( qq| "\\n"} |);
$fh.close;
say qqx/raku $fn/;
say '*' x 79, "\n";
unlink $fn;
}
- Output:
Syntax Tree:
「"a" {
a = "a1" ( "a2" | "a3" ) { "a4" } [ "a5" ] "a6" ;
} "z"」
title => 「"a"」
literal => 「"a"」
production => 「a = "a1" ( "a2" | "a3" ) { "a4" } [ "a5" ] "a6" ;
」
name => 「a」
identifier => 「a」
expression => 「"a1" ( "a2" | "a3" ) { "a4" } [ "a5" ] "a6" 」
term => 「"a1" ( "a2" | "a3" ) { "a4" } [ "a5" ] "a6" 」
factor => 「"a1" 」
literal => 「"a1"」
factor => 「( "a2" | "a3" ) 」
group => 「( "a2" | "a3" ) 」
expression => 「"a2" | "a3" 」
term => 「"a2" 」
factor => 「"a2" 」
literal => 「"a2"」
term => 「 "a3" 」
factor => 「"a3" 」
literal => 「"a3"」
factor => 「{ "a4" } 」
repeat => 「{ "a4" } 」
expression => 「"a4" 」
term => 「"a4" 」
factor => 「"a4" 」
literal => 「"a4"」
factor => 「[ "a5" ] 」
optional => 「[ "a5" ] 」
expression => 「"a5" 」
term => 「"a5" 」
factor => 「"a5" 」
literal => 「"a5"」
factor => 「"a6" 」
literal => 「"a6"」
comment => 「"z"」
literal => 「"z"」
Parsing EBNF grammar 'a':
"a" {
a = "a1" ( "a2" | "a3" ) { "a4" } [ "a5" ] "a6" ;
} "z"
Valid syntax.
Testing:
a1a3a4a4a5a6 - valid.
a1 a2a6 - valid.
a1 a3 a4 a6 - valid.
a1 a4 a5 a6 - NOT valid.
a1 a2 a4 a4 a5 a6 - valid.
a1 a2 a4 a5 a5 a6 - NOT valid.
a1 a2 a4 a5 a6 a7 - NOT valid.
your ad here - NOT valid.
*******************************************************************************
Syntax Tree:
「{
expr = term { plus term } .
term = factor { times factor } .
factor = number | '(' expr ')' .
plus = "+" | "-" .
times = "*" | "/" .
number = digit { digit } .
digit = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" .
}」
production => 「expr = term { plus term } .
」
name => 「expr」
identifier => 「expr」
expression => 「term { plus term } 」
term => 「term { plus term } 」
factor => 「term 」
identifier => 「term」
factor => 「{ plus term } 」
repeat => 「{ plus term } 」
expression => 「plus term 」
term => 「plus term 」
factor => 「plus 」
identifier => 「plus」
factor => 「term 」
identifier => 「term」
production => 「term = factor { times factor } .
」
name => 「term」
identifier => 「term」
expression => 「factor { times factor } 」
term => 「factor { times factor } 」
factor => 「factor 」
identifier => 「factor」
factor => 「{ times factor } 」
repeat => 「{ times factor } 」
expression => 「times factor 」
term => 「times factor 」
factor => 「times 」
identifier => 「times」
factor => 「factor 」
identifier => 「factor」
production => 「factor = number | '(' expr ')' .
」
name => 「factor」
identifier => 「factor」
expression => 「number | '(' expr ')' 」
term => 「number 」
factor => 「number 」
identifier => 「number」
term => 「 '(' expr ')' 」
factor => 「'(' 」
literal => 「'('」
factor => 「expr 」
identifier => 「expr」
factor => 「')' 」
literal => 「')'」
production => 「plus = "+" | "-" .
」
name => 「plus」
identifier => 「plus」
expression => 「"+" | "-" 」
term => 「"+" 」
factor => 「"+" 」
literal => 「"+"」
term => 「 "-" 」
factor => 「"-" 」
literal => 「"-"」
production => 「times = "*" | "/" .
」
name => 「times」
identifier => 「times」
expression => 「"*" | "/" 」
term => 「"*" 」
factor => 「"*" 」
literal => 「"*"」
term => 「 "/" 」
factor => 「"/" 」
literal => 「"/"」
production => 「number = digit { digit } .
」
name => 「number」
identifier => 「number」
expression => 「digit { digit } 」
term => 「digit { digit } 」
factor => 「digit 」
identifier => 「digit」
factor => 「{ digit } 」
repeat => 「{ digit } 」
expression => 「digit 」
term => 「digit 」
factor => 「digit 」
identifier => 「digit」
production => 「digit = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" .
」
name => 「digit」
identifier => 「digit」
expression => 「"0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" 」
term => 「"0" 」
factor => 「"0" 」
literal => 「"0"」
term => 「 "1" 」
factor => 「"1" 」
literal => 「"1"」
term => 「 "2" 」
factor => 「"2" 」
literal => 「"2"」
term => 「 "3" 」
factor => 「"3" 」
literal => 「"3"」
term => 「 "4" 」
factor => 「"4" 」
literal => 「"4"」
term => 「 "5" 」
factor => 「"5" 」
literal => 「"5"」
term => 「 "6" 」
factor => 「"6" 」
literal => 「"6"」
term => 「 "7" 」
factor => 「"7" 」
literal => 「"7"」
term => 「 "8" 」
factor => 「"8" 」
literal => 「"8"」
term => 「 "9" 」
factor => 「"9" 」
literal => 「"9"」
Parsing EBNF grammar 'unnamed':
{
expr = term { plus term } .
term = factor { times factor } .
factor = number | '(' expr ')' .
plus = "+" | "-" .
times = "*" | "/" .
number = digit { digit } .
digit = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" .
}
Valid syntax.
Testing:
2 - valid.
2*3 + 4/23 - 7 - valid.
(3 + 4) * 6-2+(4*(4)) - valid.
-2 - NOT valid.
3 + - NOT valid.
(4 + 3 - NOT valid.
*******************************************************************************
Parsing EBNF grammar:
a = "1";
Invalid syntax. Can not be parsed.
*******************************************************************************
Parsing EBNF grammar:
{ a = "1" ;
Invalid syntax. Can not be parsed.
*******************************************************************************
Parsing EBNF grammar:
{ hello world = "1"; }
Invalid syntax. Can not be parsed.
*******************************************************************************
Syntax Tree:
「{ foo = bar . }」
production => 「foo = bar . 」
name => 「foo」
identifier => 「foo」
expression => 「bar 」
term => 「bar 」
factor => 「bar 」
identifier => 「bar」
Parsing EBNF grammar 'unnamed':
{ foo = bar . }
Valid syntax.
Testing:
foobar - No such method 'bar' for invocant of type 'unnamed'
*******************************************************************************
#--
# The tokenizer splits the input into Tokens like "identifier",
# ":", ")*" and so on. This design uses a StringScanner on each line of
# input, therefore a Token can never span more than one line.
#
# Each Token knows its original line and position, so an error message
# can locate a bad token.
#++
require 'strscan'
# A line of input.
# where:: A location like "file.txt:3"
# str:: String of this line
Line = Struct.new :where, :str
# A token.
# cat:: A category like :colon, :ident or so on
# str:: String of this token
# line:: Line containing this token
# pos:: Position of this token within this line
Token = Struct.new :cat, :str, :line, :pos
# Reads and returns the next Token. At end of file, returns nil.
#--
# Needs @filename and @in.
#++
def next_token
# Loop until we reach a Token.
loop do
# If at end of line, then get next line, or else declare end of
# file.
if @scanner.eos?
if s = @in.gets
# Each line needs a new Line object. Tokens can hold references
# to old Line objects.
@line = Line.new("#{@filename}:#{@in.lineno}", s)
@scanner.string = s
else
return nil # End of file
end
end
# Skip whitespace.
break unless @scanner.skip(/[[:space:]]+/)
end
# Read token by regular expression.
if s = @scanner.scan(/:/)
c = :colon
elsif s = @scanner.scan(/;/)
c = :semicolon
elsif s = @scanner.scan(/\(/)
c = :paren
elsif s = @scanner.scan(/\)\?/)
c = :option
elsif s = @scanner.scan(/\)\*/)
c = :repeat
elsif s = @scanner.scan(/\)/)
c = :group
elsif s = @scanner.scan(/\|/)
c = :bar
elsif s = @scanner.scan(/[[:alpha:]][[:alnum:]]*/)
c = :ident
elsif s = @scanner.scan(/'[^']*'|"[^"]*"/)
# Fix syntax highlighting for Rosetta Code. => '
c = :string
elsif s = @scanner.scan(/'[^']*|"[^"]*/)
c = :bad_string
elsif s = @scanner.scan(/.*/)
c = :unknown
end
Token.new(c, s, @line, (@scanner.pos - s.length))
end
# Prints a _message_ to standard error, along with location of _token_.
def error(token, message)
line = token.line
# We print a caret ^ pointing at the bad token. We make a very crude
# attempt to align the caret ^ in the correct column. If the input
# line has a non-[:print:] character, like a tab, then we print it as
# a space.
STDERR.puts <<EOF
#{line.where}: #{message}
#{line.str.gsub(/[^[:print:]]/, " ")}
#{" " * token.pos}^
EOF
end
#--
# The parser converts Tokens to a Grammar object. The parser also
# detects syntax errors.
#++
# A parsed EBNF grammar. It is an Array of Productions.
class Grammar < Array; end
# A production.
# ident:: The identifier
# alts:: An Array of Alternatives
Production = Struct.new :ident, :alts
# An array of Alternatives, as from "(a | b)".
class Group < Array; end
# An optional group, as from "(a | b)?".
class OptionalGroup < Group; end
# A repeated group, as from "(a | b)*".
class RepeatedGroup < Group; end
# An array of identifiers and string literals.
class Alternative < Array; end
#--
# Needs @filename and @in.
#++
def parse
# TODO: this only dumps the tokens.
while t = next_token
error(t, "#{t.cat}")
end
end
# Set @filename and @in. Parse input.
case ARGV.length
when 0 then @filename = "-"
when 1 then @filename = ARGV[0]
else fail "Too many arguments"
end
open(@filename) do |f|
@in = f
@scanner = StringScanner.new("")
parse
end
use std::collections::HashMap;
#[derive(Debug, Clone)]
enum Token {
Char(char),
Sequence(Vec<String>),
EOF,
}
#[derive(Debug, Clone)]
enum Rule {
Terminal(String),
Ident(String, usize),
Or(Vec<Rule>),
Repeat(Box<Rule>),
Optional(Box<Rule>),
Sequence(Vec<Rule>),
}
pub struct EBNFParser {
src: String,
ch: char,
sdx: usize,
token: Token,
err: bool,
idents: Vec<String>,
ididx: Vec<Option<usize>>,
productions: Vec<(String, usize, Rule)>,
extras: Vec<Vec<String>>,
}
impl EBNFParser {
pub fn new() -> Self {
EBNFParser {
src: String::new(),
ch: '\0',
sdx: 0,
token: Token::EOF,
err: false,
idents: Vec::new(),
ididx: Vec::new(),
productions: Vec::new(),
extras: Vec::new(),
}
}
fn btoi(&self, b: bool) -> usize {
if b { 1 } else { 0 }
}
fn invalid(&mut self, msg: &str) -> i32 {
self.err = true;
println!("{}", msg);
self.sdx = self.src.len(); // set to eof
-1
}
fn skip_spaces(&mut self) {
while self.sdx < self.src.len() {
self.ch = self.src.chars().nth(self.sdx).unwrap_or('\0');
if ![' ', '\t', '\r', '\n'].contains(&self.ch) {
break;
}
self.sdx += 1;
}
}
fn get_token(&mut self) {
// Yields a single character token, one of {}()[]|=.;
// or ["terminal",string] or ["ident", string] or EOF.
self.skip_spaces();
if self.sdx >= self.src.len() {
self.token = Token::EOF;
return;
}
let tokstart = self.sdx;
if "{}()[]|=.;".contains(self.ch) {
self.sdx += 1;
self.token = Token::Char(self.ch);
} else if self.ch == '"' || self.ch == '\'' {
let closech = self.ch;
let mut tokend = tokstart + 1;
while tokend < self.src.len() && self.src.chars().nth(tokend).unwrap_or('\0') != closech {
tokend += 1;
}
if tokend >= self.src.len() {
self.invalid("no closing quote");
self.token = Token::EOF;
} else {
self.sdx = tokend + 1;
let content = self.src[tokstart + 1..tokend].to_string();
self.token = Token::Sequence(vec!["terminal".to_string(), content]);
}
} else if self.ch.is_ascii_lowercase() {
// To simplify things for the purposes of this task,
// identifiers are strictly a-z only, not A-Z or 1-9.
while self.sdx < self.src.len() {
self.ch = self.src.chars().nth(self.sdx).unwrap_or('\0');
if !self.ch.is_ascii_lowercase() {
break;
}
self.sdx += 1;
}
let ident = self.src[tokstart..self.sdx].to_string();
self.token = Token::Sequence(vec!["ident".to_string(), ident]);
} else {
self.invalid("invalid ebnf");
self.token = Token::EOF;
}
}
fn match_token(&mut self, expected_ch: char) {
if !matches!(&self.token, Token::Char(ch) if *ch == expected_ch) {
self.invalid(&format!("invalid ebnf ({} expected)", expected_ch));
} else {
self.get_token();
}
}
fn add_ident(&mut self, ident: String) -> usize {
if let Some(k) = self.idents.iter().position(|x| x == &ident) {
k
} else {
self.idents.push(ident);
let k = self.idents.len() - 1;
self.ididx.push(None);
k
}
}
fn factor(&mut self) -> Result<Rule, String> {
match &self.token.clone() {
Token::Sequence(t) => {
if t[0] == "ident" {
let idx = self.add_ident(t[1].clone());
let rule = Rule::Ident(t[1].clone(), idx);
self.get_token();
Ok(rule)
} else if t[0] == "terminal" {
let rule = Rule::Terminal(t[1].clone());
self.get_token();
Ok(rule)
} else {
Err("invalid token sequence".to_string())
}
}
Token::Char('[') => {
self.get_token();
let expr = self.expression()?;
self.match_token(']');
Ok(Rule::Optional(Box::new(expr)))
}
Token::Char('(') => {
self.get_token();
let expr = self.expression()?;
self.match_token(')');
Ok(expr)
}
Token::Char('{') => {
self.get_token();
let expr = self.expression()?;
self.match_token('}');
Ok(Rule::Repeat(Box::new(expr)))
}
_ => Err("invalid token in factor() function".to_string()),
}
}
fn term(&mut self) -> Result<Rule, String> {
let mut factors = vec![self.factor()?];
let stop_tokens = [Token::EOF, Token::Char('|'), Token::Char('.'), Token::Char(';'),
Token::Char(')'), Token::Char(']'), Token::Char('}')];
while !stop_tokens.iter().any(|t| std::mem::discriminant(t) == std::mem::discriminant(&self.token)) {
factors.push(self.factor()?);
}
if factors.len() == 1 {
Ok(factors.into_iter().next().unwrap())
} else {
Ok(Rule::Sequence(factors))
}
}
fn expression(&mut self) -> Result<Rule, String> {
let first_term = self.term()?;
if matches!(self.token, Token::Char('|')) {
let mut terms = vec![first_term];
while matches!(self.token, Token::Char('|')) {
self.get_token();
terms.push(self.term()?);
}
Ok(Rule::Or(terms))
} else {
Ok(first_term)
}
}
fn production(&mut self) -> Result<Token, String> {
        // Returns a token or EOF; the real result is left in 'productions' etc.
self.get_token();
if matches!(self.token, Token::Char('}')) {
return Ok(self.token.clone());
}
if matches!(self.token, Token::EOF) {
self.invalid("invalid ebnf (missing closing })");
return Ok(Token::EOF);
}
if let Token::Sequence(t) = &self.token {
if t[0] == "ident" {
let ident = t[1].clone();
let idx = self.add_ident(ident.clone());
self.get_token();
self.match_token('=');
if matches!(self.token, Token::EOF) {
return Ok(Token::EOF);
}
let expr = self.expression()?;
self.productions.push((ident, idx, expr));
self.ididx[idx] = Some(self.productions.len() - 1);
return Ok(self.token.clone());
}
}
Ok(Token::EOF)
}
pub fn parse(&mut self, ebnf: &str) -> i32 {
// Returns +1 if ok, -1 if bad.
println!("parse:\n{} ===>\n", ebnf);
self.err = false;
self.src = ebnf.to_string();
self.sdx = 0;
self.idents.clear();
self.ididx.clear();
self.productions.clear();
self.extras.clear();
self.get_token();
if let Token::Sequence(mut t) = self.token.clone() {
t[0] = "title".to_string();
self.extras.push(t);
self.get_token();
}
if !matches!(self.token, Token::Char('{')) {
return self.invalid("invalid ebnf (missing opening {)");
}
loop {
match self.production() {
Ok(Token::Char('}')) | Ok(Token::EOF) => break,
Err(_) => return -1,
_ => continue,
}
}
self.get_token();
if let Token::Sequence(mut t) = self.token.clone() {
t[0] = "comment".to_string();
self.extras.push(t);
self.get_token();
}
if !matches!(self.token, Token::EOF) {
return self.invalid("invalid ebnf (missing eof?)");
}
if self.err {
return -1;
}
// Check for undefined identifiers
for (i, ididx_val) in self.ididx.iter().enumerate() {
if ididx_val.is_none() {
return self.invalid(&format!("invalid ebnf (undefined:{})", self.idents[i]));
}
}
self.pprint_productions();
self.pprint_idents();
self.pprint_ididx();
self.pprint_extras();
1
}
fn pprint_productions(&self) {
println!("\nproductions:");
println!("{:?}", self.productions);
}
fn pprint_idents(&self) {
println!("\nidents:");
println!("{:?}", self.idents);
}
fn pprint_ididx(&self) {
println!("\nididx:");
println!("{:?}", self.ididx);
}
fn pprint_extras(&self) {
println!("\nextras:");
println!("{:?}", self.extras);
}
fn applies(&self, rule: &Rule, src: &str, sdx: &mut usize) -> bool {
let was_sdx = *sdx;
match rule {
Rule::Sequence(rules) => {
for rule in rules {
if !self.applies(rule, src, sdx) {
*sdx = was_sdx;
return false;
}
}
true
}
Rule::Terminal(terminal) => {
self.skip_spaces_at(src, sdx);
for ch in terminal.chars() {
if *sdx >= src.len() || src.chars().nth(*sdx).unwrap_or('\0') != ch {
*sdx = was_sdx;
return false;
}
*sdx += 1;
}
true
}
Rule::Or(rules) => {
for rule in rules {
if self.applies(rule, src, sdx) {
return true;
}
}
*sdx = was_sdx;
false
}
Rule::Repeat(rule) => {
while self.applies(rule, src, sdx) {
// continue repeating
}
true
}
Rule::Optional(rule) => {
self.applies(rule, src, sdx);
true
}
Rule::Ident(_name, idx) => {
if let Some(prod_idx) = self.ididx[*idx] {
if !self.applies(&self.productions[prod_idx].2, src, sdx) {
*sdx = was_sdx;
return false;
}
true
} else {
false
}
}
}
}
fn skip_spaces_at(&self, src: &str, sdx: &mut usize) {
while *sdx < src.len() {
let ch = src.chars().nth(*sdx).unwrap_or('\0');
if ![' ', '\t', '\r', '\n'].contains(&ch) {
break;
}
*sdx += 1;
}
}
fn check_valid(&self, test: &str) {
let mut sdx = 0;
let res = self.applies(&self.productions[0].2, test, &mut sdx);
self.skip_spaces_at(test, &mut sdx);
let final_res = res && sdx >= test.len();
let result = if final_res { "pass" } else { "fail" };
println!("\"{}\", {}", test, result);
}
}
fn main() {
let mut parser = EBNFParser::new();
let ebnfs = [
r#""a" {
a = "a1" ( "a2" | "a3" ) { "a4" } [ "a5" ] "a6" ;
} "z" "#,
r#"{
expr = term { plus term } .
term = factor { times factor } .
factor = number | '(' expr ')' .
plus = "+" | "-" .
times = "*" | "/" .
number = digit { digit } .
digit = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" .
}"#,
r#"a = "1""#,
r#"{ a = "1" ;"#,
r#"{ hello world = "1"; }"#,
r#"{ foo = bar . }"#,
];
let tests = [
vec![
"a1a3a4a4a5a6",
"a1 a2a6",
"a1 a3 a4 a6",
"a1 a4 a5 a6",
"a1 a2 a4 a5 a5 a6",
"a1 a2 a4 a5 a6 a7",
"your ad here",
],
vec![
"2",
"2*3 + 4/23 - 7",
"(3 + 4) * 6-2+(4*(4))",
"-2",
"3 +",
"(4 + 3",
],
];
for (i, ebnf) in ebnfs.iter().enumerate() {
if parser.parse(ebnf) == 1 {
println!("\ntests:");
if i < tests.len() {
for test in &tests[i] {
parser.check_valid(test);
}
}
}
println!();
}
}
- Output:
parse:
"a" {
a = "a1" ( "a2" | "a3" ) { "a4" } [ "a5" ] "a6" ;
} "z" ===>
invalid ebnf (missing eof?)
parse:
{
expr = term { plus term } .
term = factor { times factor } .
factor = number | '(' expr ')' .
plus = "+" | "-" .
times = "*" | "/" .
number = digit { digit } .
digit = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" .
} ===>
invalid ebnf (= expected)
parse:
a = "1" ===>
invalid ebnf (missing opening {)
parse:
{ a = "1" ; ===>
invalid ebnf (missing closing })
parse:
{ hello world = "1"; } ===>
invalid ebnf (= expected)
parse:
{ foo = bar . } ===>
invalid ebnf (undefined:bar)
import scala.collection.mutable
import scala.util.{Try, Success, Failure}
case class Token(value: Any, isSequence: Boolean)
class EBNFParser {
type Sequence = mutable.ArrayBuffer[Any]
private var src: String = ""
private var ch: Char = ' '
private var sdx: Int = 0
private var token: Token = Token(-1, false)
private var err: Boolean = false
private val idents: mutable.ArrayBuffer[String] = mutable.ArrayBuffer[String]()
private val ididx: mutable.ArrayBuffer[Int] = mutable.ArrayBuffer[Int]()
private val productions: mutable.ArrayBuffer[Sequence] = mutable.ArrayBuffer[Sequence]()
private val extras: Sequence = mutable.ArrayBuffer[Any]()
private val results = Array("pass", "fail")
private def btoi(b: Boolean): Int = if (b) 1 else 0
private def invalid(msg: String): Int = {
err = true
println(msg)
sdx = src.length // set to eof
-1
}
private def skipSpaces(): Unit = {
while (sdx < src.length) {
ch = src(sdx)
if (!" \t\r\n".contains(ch)) {
return
}
sdx += 1
}
}
private def getToken(): Unit = {
// Yields a single character token, one of {}()[]|=.;
// or Sequence("terminal", string) or Sequence("ident", string) or -1.
skipSpaces()
if (sdx >= src.length) {
token = Token(-1, false)
return
}
val tokstart = sdx
if ("{}()[]|=.;".contains(ch)) {
sdx += 1
token = Token(ch, false)
} else if (ch == '"' || ch == '\'') {
val closech = ch
var tokend = tokstart + 1
while (tokend < src.length && src(tokend) != closech) {
tokend += 1
}
if (tokend >= src.length) {
token = Token(invalid("no closing quote"), false)
} else {
sdx = tokend + 1
val seq = mutable.ArrayBuffer[Any]("terminal", src.substring(tokstart + 1, tokend))
token = Token(seq, true)
}
} else if (ch >= 'a' && ch <= 'z') {
// To simplify things for the purposes of this task,
// identifiers are strictly a-z only, not A-Z or 1-9.
while (sdx < src.length && ch >= 'a' && ch <= 'z') {
sdx += 1
if (sdx < src.length) {
ch = src(sdx)
}
}
val seq = mutable.ArrayBuffer[Any]("ident", src.substring(tokstart, sdx))
token = Token(seq, true)
} else {
token = Token(invalid("invalid ebnf"), false)
}
}
private def matchToken(expectedCh: Char): Unit = {
if (token.value != expectedCh) {
token = Token(invalid(s"invalid ebnf ($expectedCh expected)"), false)
} else {
getToken()
}
}
private def addIdent(ident: String): Int = {
idents.indexOf(ident) match {
case -1 =>
idents += ident
val k = idents.length - 1
ididx += -1
k
case k => k
}
}
private def factor(): Any = {
val res = if (token.isSequence) {
val t = token.value.asInstanceOf[Sequence]
if (t(0) == "ident") {
val idx = addIdent(t(1).asInstanceOf[String])
t += idx
token = Token(t, true)
}
val result = token.value
getToken()
result
} else if (token.value == '[') {
getToken()
val seq = mutable.ArrayBuffer[Any]("optional", expression())
matchToken(']')
seq
} else if (token.value == '(') {
getToken()
val result = expression()
matchToken(')')
result
} else if (token.value == '{') {
getToken()
val seq = mutable.ArrayBuffer[Any]("repeat", expression())
matchToken('}')
seq
} else {
throw new RuntimeException("invalid token in factor() function")
}
res match {
case seq: mutable.ArrayBuffer[_] if seq.length == 1 => seq(0)
case other => other
}
}
private def term(): Any = {
val res = mutable.ArrayBuffer[Any](factor())
val tokens = Set[Any](-1, '|', '.', ';', ')', ']', '}')
while (!tokens.contains(token.value)) {
res += factor()
}
if (res.length == 1) res(0) else res
}
private def expression(): Any = {
var res = mutable.ArrayBuffer[Any](term())
if (token.value == '|') {
res = mutable.ArrayBuffer[Any]("or", res(0))
while (token.value == '|') {
getToken()
res += term()
}
}
if (res.length == 1) res(0) else res
}
private def production(): Any = {
// Returns a token or -1; the real result is left in 'productions' etc.
getToken()
if (token.value != '}') {
if (token.value == -1) {
return invalid("invalid ebnf (missing closing })")
}
if (!token.isSequence) {
return -1
}
val t = token.value.asInstanceOf[Sequence]
if (t(0) != "ident") {
return -1
}
val ident = t(1).asInstanceOf[String]
val idx = addIdent(ident)
getToken()
matchToken('=')
if (token.value == -1) {
return -1
}
val prod = mutable.ArrayBuffer[Any](ident, idx, expression())
productions += prod
ididx(idx) = productions.length - 1
}
token.value
}
def parse(ebnf: String): Int = {
// Returns +1 if ok, -1 if bad.
println(s"parse:\n$ebnf ===>\n")
err = false
src = ebnf
sdx = 0
idents.clear()
ididx.clear()
productions.clear()
extras.clear()
getToken()
if (token.isSequence) {
val t = token.value.asInstanceOf[Sequence]
t(0) = "title"
extras += token.value
getToken()
}
if (token.value != '{') {
return invalid("invalid ebnf (missing opening {)")
}
var continue = true
while (continue) {
val tokenResult = production()
if (tokenResult == '}' || tokenResult == -1) {
continue = false
}
}
getToken()
if (token.isSequence) {
val t = token.value.asInstanceOf[Sequence]
t(0) = "comment"
extras += token.value
getToken()
}
if (token.value != -1) {
return invalid("invalid ebnf (missing eof?)")
}
if (err) {
return -1
}
val k = ididx.indexOf(-1)
if (k != -1) {
return invalid(s"invalid ebnf (undefined:${idents(k)})")
}
pprint(productions, "productions")
pprint(idents, "idents")
pprint(ididx, "ididx")
pprint(extras, "extras")
1
}
// Adjusts Scala's normal printing to look more like Phix output.
private def pprint(ob: Any, header: String): Unit = {
println(s"\n$header:")
var pp = ob.toString
pp = pp.replace("ArrayBuffer(", "{")
pp = pp.replace(")", "}")
pp = pp.replace(" ", ", ")
// identifier indices already print as plain integers, so no further substitution is needed
println(pp)
}
// The rules that applies() has to deal with are:
// {factors} - if rule(0) is not string,
// just apply one after the other recursively.
// {"terminal", "a1"} -- literal constants
// {"or", <e1>, <e2>, ...} -- (any) one of n
// {"repeat", <e1>} -- as per "{}" in ebnf
// {"optional", <e1>} -- as per "[]" in ebnf
// {"ident", <name>, idx} -- apply the sub-rule
private def applies(rule: Sequence): Boolean = {
val wasSdx = sdx // in case of failure
val r1 = rule(0)
val result = r1 match {
case s: String => s match {
case "terminal" =>
skipSpaces()
val r2 = rule(1).asInstanceOf[String]
r2.forall { char =>
if (sdx >= src.length || src(sdx) != char) {
false
} else {
sdx += 1
true
}
}
case "or" =>
(1 until rule.length).exists(i => applies(rule(i).asInstanceOf[Sequence]))
case "repeat" =>
while (applies(rule(1).asInstanceOf[Sequence])) {
// continue repeating
}
true
case "optional" =>
applies(rule(1).asInstanceOf[Sequence])
true
case "ident" =>
val i = rule(2).asInstanceOf[Int]
val ii = ididx(i)
applies(productions(ii)(2).asInstanceOf[Sequence])
case _ =>
throw new RuntimeException("invalid rule in applies() function")
}
case _ =>
// {factors} - if rule(0) is not string, apply one after the other recursively
rule.forall(r => applies(r.asInstanceOf[Sequence]))
}
if (!result) {
sdx = wasSdx
}
result
}
private def checkValid(test: String): Unit = {
src = test
sdx = 0
var res = applies(productions(0)(2).asInstanceOf[Sequence])
skipSpaces()
if (sdx < src.length) {
res = false
}
println(s""""$test", ${results(1 - btoi(res))}""")
}
def main(args: Array[String]): Unit = {
val ebnfs = Array(
""""a" {
| a = "a1" ( "a2" | "a3" ) { "a4" } [ "a5" ] "a6" ;
|} "z" """.stripMargin,
"""{
| expr = term { plus term } .
| term = factor { times factor } .
| factor = number | '(' expr ')' .
|
| plus = "+" | "-" .
| times = "*" | "/" .
|
| number = digit { digit } .
| digit = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" .
|}""".stripMargin,
"""a = "1"""",
"""{ a = "1" ;""",
"""{ hello world = "1"; }""",
"""{ foo = bar . }"""
)
val tests = Array(
Array(
"a1a3a4a4a5a6",
"a1 a2a6",
"a1 a3 a4 a6",
"a1 a4 a5 a6",
"a1 a2 a4 a5 a5 a6",
"a1 a2 a4 a5 a6 a7",
"your ad here"
),
Array(
"2",
"2*3 + 4/23 - 7",
"(3 + 4) * 6-2+(4*(4))",
"-2",
"3 +",
"(4 + 3"
)
)
for (i <- ebnfs.indices) {
if (parse(ebnfs(i)) == 1) {
println("\ntests:")
if (i < tests.length) {
tests(i).foreach(checkValid)
}
}
println()
}
}
}
object EBNFParser {
def main(args: Array[String]): Unit = {
val parser = new EBNFParser()
parser.main(args)
}
}
- Output:
parse:
"a" {
a = "a1" ( "a2" | "a3" ) { "a4" } [ "a5" ] "a6" ;
} "z" ===>
productions:
{{a,, 0,, {{terminal,, a1},, {or,, {terminal,, a2},, {terminal,, a3}},, {repeat,, {terminal,, a4}},, {optional,, {terminal,, a5}},, {terminal,, a6}}}}
idents:
{a}
ididx:
{0}
extras:
{{title,, a},, {comment,, z}}
tests:
"a1a3a4a4a5a6", pass
"a1 a2a6", pass
"a1 a3 a4 a6", pass
"a1 a4 a5 a6", fail
"a1 a2 a4 a5 a5 a6", fail
"a1 a2 a4 a5 a6 a7", fail
"your ad here", fail
parse:
{
expr = term { plus term } .
term = factor { times factor } .
factor = number | '(' expr ')' .
plus = "+" | "-" .
times = "*" | "/" .
number = digit { digit } .
digit = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" .
} ===>
productions:
{{expr,, 0,, {{ident,, term,, 1},, {repeat,, {{ident,, plus,, 2},, {ident,, term,, 1}}}}},, {term,, 1,, {{ident,, factor,, 3},, {repeat,, {{ident,, times,, 4},, {ident,, factor,, 3}}}}},, {factor,, 3,, {or,, {ident,, number,, 5},, {{terminal,, (},, {ident,, expr,, 0},, {terminal,, }}}}},, {plus,, 2,, {or,, {terminal,, +},, {terminal,, -}}},, {times,, 4,, {or,, {terminal,, *},, {terminal,, /}}},, {number,, 5,, {{ident,, digit,, 6},, {repeat,, {ident,, digit,, 6}}}},, {digit,, 6,, {or,, {terminal,, 0},, {terminal,, 1},, {terminal,, 2},, {terminal,, 3},, {terminal,, 4},, {terminal,, 5},, {terminal,, 6},, {terminal,, 7},, {terminal,, 8},, {terminal,, 9}}}}
idents:
{expr,, term,, plus,, factor,, times,, number,, digit}
ididx:
{0,, 1,, 3,, 2,, 4,, 5,, 6}
extras:
{}
tests:
"2", pass
"2*3 + 4/23 - 7", pass
"(3 + 4) * 6-2+(4*(4))", pass
"-2", fail
"3 +", fail
"(4 + 3", fail
parse:
a = "1" ===>
invalid ebnf (missing opening {)
parse:
{ a = "1" ; ===>
invalid ebnf (missing closing })
parse:
{ hello world = "1"; } ===>
invalid ebnf (= expected)
parse:
{ foo = bar . } ===>
invalid ebnf (undefined:bar)
import Foundation
class EBNFParser {
// Data structures for better organization
private struct Token {
let value: Any
let isSequence: Bool
init(_ value: Any, _ isSequence: Bool) {
self.value = value
self.isSequence = isSequence
}
}
private class Sequence {
private var items: [Any] = []
init(_ items: Any...) {
self.items = Array(items)
}
subscript(index: Int) -> Any {
get { return items[index] }
set { items[index] = newValue }
}
func add(_ item: Any) {
items.append(item)
}
var count: Int {
return items.count
}
var size: Int {
return items.count
}
func contains(_ item: Any) -> Bool {
for i in items {
if let str1 = i as? String, let str2 = item as? String {
if str1 == str2 { return true }
}
if let int1 = i as? Int, let int2 = item as? Int {
if int1 == int2 { return true }
}
if let char1 = i as? Character, let char2 = item as? Character {
if char1 == char2 { return true }
}
}
return false
}
var description: String {
return items.map { "\($0)" }.joined(separator: ", ")
}
}
private var src: String = ""
private var ch: Character = " "
private var sdx: Int = 0
private var token: Token = Token(-1, false)
private var err: Bool = false
private var idents: [String] = []
private var ididx: [Int] = []
private var productions: [Sequence] = []
private var extras = Sequence()
private let results = ["pass", "fail"]
private func boolToInt(_ value: Bool) -> Int {
return value ? 1 : 0
}
private func invalid(_ msg: String) -> Int {
err = true
print(msg)
sdx = src.count // set to eof
return -1
}
private func skipSpaces() {
while sdx < src.count {
let index = src.index(src.startIndex, offsetBy: sdx)
ch = src[index]
if !(" \t\r\n".contains(ch)) {
break
}
sdx += 1
}
}
private func getToken() {
// Yields a single character token, one of {}()[]|=.;
// or {"terminal",string} or {"ident", string} or -1.
skipSpaces()
if sdx >= src.count {
token = Token(-1, false)
return
}
let tokstart = sdx
if "{}()[]|=.;".contains(ch) {
sdx += 1
token = Token(ch, false)
} else if ch == "\"" || ch == "'" {
let closech = ch
var tokend = tokstart + 1
while tokend < src.count {
let index = src.index(src.startIndex, offsetBy: tokend)
if src[index] == closech {
break
}
tokend += 1
}
if tokend >= src.count {
token = Token(invalid("no closing quote"), false)
} else {
sdx = tokend + 1
let startIndex = src.index(src.startIndex, offsetBy: tokstart + 1)
let endIndex = src.index(src.startIndex, offsetBy: tokend)
let substring = String(src[startIndex..<endIndex])
token = Token(Sequence("terminal", substring), true)
}
} else if ch.isLowercase && ch.isLetter {
// To simplify things for the purposes of this task,
// identifiers are strictly a-z only, not A-Z or 1-9.
while sdx < src.count {
let index = src.index(src.startIndex, offsetBy: sdx)
ch = src[index]
if !ch.isLowercase || !ch.isLetter {
break
}
sdx += 1
}
let startIndex = src.index(src.startIndex, offsetBy: tokstart)
let endIndex = src.index(src.startIndex, offsetBy: sdx)
let substring = String(src[startIndex..<endIndex])
token = Token(Sequence("ident", substring), true)
} else {
token = Token(invalid("invalid ebnf"), false)
}
}
private func matchToken(_ expectedCh: Character) {
if let tokenChar = token.value as? Character, tokenChar != expectedCh {
token = Token(invalid("invalid ebnf (\(expectedCh) expected)"), false)
} else if !(token.value is Character) {
token = Token(invalid("invalid ebnf (\(expectedCh) expected)"), false)
} else {
getToken()
}
}
private func addIdent(_ ident: String) -> Int {
if let k = idents.firstIndex(of: ident) {
return k
} else {
idents.append(ident)
let k = idents.count - 1
ididx.append(-1)
return k
}
}
private func factor() -> Any {
let res: Any
if token.isSequence {
let t = token.value as! Sequence
if let firstItem = t[0] as? String, firstItem == "ident" {
let identName = t[1] as! String
let idx = addIdent(identName)
t.add(idx)
token = Token(t, true)
}
res = token.value
getToken()
} else if let tokenChar = token.value as? Character {
switch tokenChar {
case "[":
getToken()
let result = Sequence("optional", expression())
matchToken("]")
res = result
case "(":
getToken()
let result = expression()
matchToken(")")
res = result
case "{":
getToken()
let result = Sequence("repeat", expression())
matchToken("}")
res = result
default:
fatalError("invalid token in factor() function")
}
} else {
fatalError("invalid token in factor() function")
}
if let sequence = res as? Sequence, sequence.count == 1 {
return sequence[0]
}
return res
}
private func term() -> Any {
let res = Sequence(factor())
let tokens: Set<Character> = ["|", ".", ";", ")", "]", "}"]
while true {
if let tokenChar = token.value as? Character, tokens.contains(tokenChar) {
break
}
if let tokenInt = token.value as? Int, tokenInt == -1 {
break
}
res.add(factor())
}
return res.count == 1 ? res[0] : res
}
private func expression() -> Any {
var res = Sequence(term())
if let tokenChar = token.value as? Character, tokenChar == "|" {
res = Sequence("or", res[0])
while let tokenChar = token.value as? Character, tokenChar == "|" {
getToken()
res.add(term())
}
}
return res.count == 1 ? res[0] : res
}
private func production() -> Any {
// Returns a token or -1; the real result is left in 'productions' etc.
getToken()
if let tokenChar = token.value as? Character, tokenChar != "}" {
if let tokenInt = token.value as? Int, tokenInt == -1 {
return invalid("invalid ebnf (missing closing })")
}
if !token.isSequence {
return -1
}
let t = token.value as! Sequence
if let firstItem = t[0] as? String, firstItem != "ident" {
return -1
}
let ident = t[1] as! String
let idx = addIdent(ident)
getToken()
matchToken("=")
if let tokenInt = token.value as? Int, tokenInt == -1 {
return -1
}
productions.append(Sequence(ident, idx, expression()))
ididx[idx] = productions.count - 1
}
return token.value
}
private func parse(_ ebnf: String) -> Int {
// Returns +1 if ok, -1 if bad.
print("parse:\n\(ebnf) ===>\n")
err = false
src = ebnf
sdx = 0
idents.removeAll()
ididx.removeAll()
productions.removeAll()
extras = Sequence()
getToken()
if token.isSequence {
let t = token.value as! Sequence
t[0] = "title"
extras.add(token.value)
getToken()
}
if let tokenChar = token.value as? Character, tokenChar != "{" {
return invalid("invalid ebnf (missing opening {)")
}
while true {
let tokenResult = production()
if let tokenChar = tokenResult as? Character, tokenChar == "}" {
break
}
if let tokenInt = tokenResult as? Int, tokenInt == -1 {
break
}
}
getToken()
if token.isSequence {
let t = token.value as! Sequence
t[0] = "comment"
extras.add(token.value)
getToken()
}
if let tokenInt = token.value as? Int, tokenInt != -1 {
return invalid("invalid ebnf (missing eof?)")
}
if err {
return -1
}
if let k = ididx.firstIndex(of: -1) {
return invalid("invalid ebnf (undefined:\(idents[k]))")
}
pprint(productions, "productions")
pprint(idents, "idents")
pprint(ididx, "ididx")
pprint(extras, "extras")
return 1
}
// Adjusts Swift's normal printing to look more like Phix output.
private func pprint(_ ob: Any, _ header: String) {
print("\n\(header):")
var pp = "\(ob)"
pp = pp.replacingOccurrences(of: "[", with: "{")
pp = pp.replacingOccurrences(of: "]", with: "}")
pp = pp.replacingOccurrences(of: " ", with: ", ")
// identifier indices already print as plain integers, so no further substitution is needed
print(pp)
}
// The rules that applies() has to deal with are:
// {factors} - if rule[0] is not string,
// just apply one after the other recursively.
// {"terminal", "a1"} -- literal constants
// {"or", <e1>, <e2>, ...} -- (any) one of n
// {"repeat", <e1>} -- as per "{}" in ebnf
// {"optional", <e1>} -- as per "[]" in ebnf
// {"ident", <name>, idx} -- apply the sub-rule
private func applies(_ rule: Sequence) -> Bool {
let wasSdx = sdx // in case of failure
let r1 = rule[0]
if let r1String = r1 as? String {
switch r1String {
case "terminal":
skipSpaces()
let r2 = rule[1] as! String
for char in r2 {
if sdx >= src.count {
sdx = wasSdx
return false
}
let index = src.index(src.startIndex, offsetBy: sdx)
if src[index] != char {
sdx = wasSdx
return false
}
sdx += 1
}
case "or":
for i in 1..<rule.count {
if applies(rule[i] as! Sequence) {
return true
}
}
sdx = wasSdx
return false
case "repeat":
while applies(rule[1] as! Sequence) {
// continue repeating
}
case "optional":
_ = applies(rule[1] as! Sequence)
case "ident":
let i = rule[2] as! Int
let ii = ididx[i]
if !applies(productions[ii][2] as! Sequence) {
sdx = wasSdx
return false
}
default:
fatalError("invalid rule in applies() function")
}
} else {
// r1 is not a string, so apply each rule in sequence
for i in 0..<rule.count {
if !applies(rule[i] as! Sequence) {
sdx = wasSdx
return false
}
}
}
return true
}
private func checkValid(_ test: String) {
src = test
sdx = 0
var res = applies(productions[0][2] as! Sequence)
skipSpaces()
if sdx < src.count {
res = false
}
print("\"\(test)\", \(results[1 - boolToInt(res)])")
}
func run() {
let ebnfs = [
"\"a\" {\n" +
" a = \"a1\" ( \"a2\" | \"a3\" ) { \"a4\" } [ \"a5\" ] \"a6\" ;\n" +
"} \"z\" ",
"{\n" +
" expr = term { plus term } .\n" +
" term = factor { times factor } .\n" +
" factor = number | '(' expr ')' .\n" +
" \n" +
" plus = \"+\" | \"-\" .\n" +
" times = \"*\" | \"/\" .\n" +
" \n" +
" number = digit { digit } .\n" +
" digit = \"0\" | \"1\" | \"2\" | \"3\" | \"4\" | \"5\" | \"6\" | \"7\" | \"8\" | \"9\" .\n" +
"}",
"a = \"1\"",
"{ a = \"1\" ;",
"{ hello world = \"1\"; }",
"{ foo = bar . }"
]
let tests = [
[
"a1a3a4a4a5a6",
"a1 a2a6",
"a1 a3 a4 a6",
"a1 a4 a5 a6",
"a1 a2 a4 a5 a5 a6",
"a1 a2 a4 a5 a6 a7",
"your ad here"
],
[
"2",
"2*3 + 4/23 - 7",
"(3 + 4) * 6-2+(4*(4))",
"-2",
"3 +",
"(4 + 3"
]
]
for i in ebnfs.indices {
if parse(ebnfs[i]) == 1 {
print("\ntests:")
if i < tests.count {
for test in tests[i] {
checkValid(test)
}
}
}
print()
}
}
}
// Usage
let parser = EBNFParser()
parser.run()
- Output:
parse:
"a" {
a = "a1" ( "a2" | "a3" ) { "a4" } [ "a5" ] "a6" ;
} "z" ===>
invalid ebnf
parse:
{
expr = term { plus term } .
term = factor { times factor } .
factor = number | '(' expr ')' .
plus = "+" | "-" .
times = "*" | "/" .
number = digit { digit } .
digit = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" .
} ===>
invalid ebnf
parse:
a = "1" ===>
invalid ebnf (missing opening {)
parse:
{ a = "1" ; ===>
productions:
{}
idents:
{}
ididx:
{}
extras:
Main.EBNFParser.(unknown, context, at, $5564c3abf008).Sequence
tests:
parse:
{ hello world = "1"; } ===>
productions:
{}
idents:
{}
ididx:
{}
extras:
Main.EBNFParser.(unknown, context, at, $5564c3abf008).Sequence
tests:
parse:
{ foo = bar . } ===>
productions:
{}
idents:
{}
ididx:
{}
extras:
Main.EBNFParser.(unknown, context, at, $5564c3abf008).Sequence
...
Demonstration lexer and parser. Note that this parser supports parenthesized expressions, making the grammar recursive.
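The grammar it implements corresponds roughly to:
expr = term { ( '+' | '-' ) term } .
term = factor { ( '*' | '/' ) factor } .
factor = number | '-' number | '(' expr ')' .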
package require Tcl 8.6
# Utilities to make the coroutine easier to use
proc provide args {while {![yield $args]} {yield}}
proc next lexer {$lexer 1}
proc pushback lexer {$lexer 0}
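# "next" resumes the lexer and returns the next {TYPE value index} triple; "pushback"
# makes the lexer deliver the same triple again on the following "next". For example,
# over "1 + 2" successive calls yield {NUMBER 1 0}, {PLUS + 2}, {NUMBER 2 4}, {EOT EOT 5}.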
# Lexical analyzer coroutine core
proc lexer {str} {
yield [info coroutine]
set symbols {+ PLUS - MINUS * MULT / DIV ( LPAR ) RPAR}
set idx 0
while 1 {
switch -regexp -matchvar m -- $str {
{^\s+} {
# No special action for whitespace
}
{^([-+*/()])} {
provide [dict get $symbols [lindex $m 1]] [lindex $m 1] $idx
}
{^(\d+)} {
provide NUMBER [lindex $m 1] $idx
}
{^$} {
provide EOT "EOT" $idx
return
}
. {
provide PARSE_ERROR [lindex $m 0] $idx
}
}
# Trim the matched string
set str [string range $str [string length [lindex $m 0]] end]
incr idx [string length [lindex $m 0]]
}
}
# Utility functions to help with making an LL(1) parser; ParseLoop handles
# EBNF looping constructs, ParseSeq handles sequence constructs.
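# For example, the EBNF rule  expr = term { ("+"|"-") term }  becomes one call to
# parse.term followed by  ParseLoop $lexer { PLUS - MINUS {...} }  (see parse.expr
# below); any token not listed falls through to the default arm, which pushes it
# back and ends the loop.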
proc ParseLoop {lexer def} {
upvar 1 token token payload payload index index
foreach {a b} $def {
if {$b ne "-"} {set b [list set c $b]}
lappend m $a $b
}
lappend m default {pushback $lexer; break}
while 1 {
lassign [next $lexer] token payload index
switch -- $token {*}$m
if {[set c [catch {uplevel 1 $c} res opt]]} {
dict set opt -level [expr {[dict get $opt -level]+1}]
return -options $opt $res
}
}
}
proc ParseSeq {lexer def} {
upvar 1 token token payload payload index index
foreach {t s} $def {
lassign [next $lexer] token payload index
switch -- $token $t {
if {[set c [catch {uplevel 1 $s} res opt]]} {
dict set opt -level [expr {[dict get $opt -level]+1}]
return -options $opt $res
}
} EOT {
throw SYNTAX "end of text at position $index"
} default {
throw SYNTAX "\"$payload\" at position $index"
}
}
}
# Main parser driver; contains "master" grammar that ensures that the whole
# text is matched and not just a prefix substring. Note also that the parser
# runs the lexer as a coroutine (with a fixed name in this basic demonstration
# code).
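# For example, [parse "1 2"] is rejected: parse.expr consumes the leading "1", the
# trailing NUMBER is not EOT, and the final throw reports
#     syntax error: "2" at position 2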
proc parse {str} {
set lexer [coroutine l lexer $str]
try {
set parsed [parse.expr $lexer]
ParseLoop $lexer {
EOT {
return $parsed
}
}
throw SYNTAX "\"$payload\" at position $index"
} trap SYNTAX msg {
return -code error "syntax error: $msg"
} finally {
catch {rename $lexer ""}
}
}
# Now the descriptions of how to match each production in the grammar...
proc parse.expr {lexer} {
set expr [parse.term $lexer]
ParseLoop $lexer {
PLUS - MINUS {
set expr [list $token $expr [parse.term $lexer]]
}
}
return $expr
}
proc parse.term {lexer} {
set term [parse.factor $lexer]
ParseLoop $lexer {
MULT - DIV {
set term [list $token $term [parse.factor $lexer]]
}
}
return $term
}
proc parse.factor {lexer} {
ParseLoop $lexer {
NUMBER {
return $payload
}
MINUS {
ParseSeq $lexer {
NUMBER {return -$payload}
}
}
LPAR {
set result [parse.expr $lexer]
ParseSeq $lexer {
RPAR {return $result}
}
break
}
EOT {
throw SYNTAX "end of text at position $index"
}
}
throw SYNTAX "\"$payload\" at position $index"
}
# Demonstration code
puts [parse "1 - 2 - -3 * 4 + 5"]
puts [parse "1 - 2 - -3 * (4 + 5)"]
- Output:
PLUS {MINUS {MINUS 1 2} {MULT -3 4}} 5
MINUS {MINUS 1 2} {MULT -3 {PLUS 4 5}}
Translated from the Go entry.
import "./fmt" for Conv, Fmt
import "./iterate" for Stepped
import "./str" for Char
import "./seq" for Lst
var src = ""
var ch = ""
var sdx = 0
var token = null
var isSeq = false
var err = false
var idents = []
var ididx = []
var productions = []
var extras = []
var results = ["pass", "fail"]
var invalid = Fn.new { |msg|
err = true
System.print(msg)
sdx = src.count // set to EOF
return -1
}
var skipSpaces = Fn.new {
while (true) {
if (sdx >= src.count) return
ch = src[sdx]
if (!" \t\r\n".contains(ch)) return
sdx = sdx + 1
}
}
var getToken = Fn.new {
// Yields a single character token, one of {}()[]|=.;
// or {"terminal",string} or {"ident", string} or -1.
skipSpaces.call()
if (sdx >= src.count) {
token = -1
isSeq = false
return
}
var tokstart = sdx
if ("{}()[]|=.;".contains(ch)) {
sdx = sdx + 1
token = ch
isSeq = false
} else if (ch == "\"" || ch == "'") {
var closech = ch
for (tokend in Stepped.ascend(sdx + 1...src.count)) {
if (src[tokend] == closech) {
sdx = tokend + 1
token = ["terminal", src[tokstart+1...tokend]]
isSeq = true
return
}
}
token = invalid.call("no closing quote")
isSeq = false
} else if (Char.isAsciiLower(ch)) {
// To simplify things for the purposes of this task,
// identifiers are strictly a-z only, not A-Z or 1-9.
while (true) {
sdx = sdx + 1
if (sdx >= src.count) break
ch = src[sdx]
if (!Char.isAsciiLower(ch)) break
}
token = ["ident", src[tokstart...sdx]]
isSeq = true
} else {
token = invalid.call("invalid ebnf")
isSeq = false
}
}
var matchToken = Fn.new { |ch|
if (token != ch) {
token = invalid.call("invalid ebnf (%(ch) expected)")
isSeq = false
} else {
getToken.call()
}
}
var addIdent = Fn.new { |ident|
var k = -1
var i = 0
for (id in idents) {
if (id == ident) {
k = i
break
}
i = i + 1
}
if (k == -1) {
idents.add(ident)
k = idents.count - 1
ididx.add(-1)
}
return k
}
var expression // forward declaration
var factor = Fn.new {
var res
if (isSeq) {
if (token[0] == "ident") {
var idx = addIdent.call(token[1])
token.add(idx)
}
res = token
getToken.call()
} else if (token == "[") {
getToken.call()
res = ["optional", expression.call()]
matchToken.call("]")
} else if (token == "(") {
getToken.call()
res = expression.call()
matchToken.call(")")
} else if (token == "{") {
getToken.call()
res = ["repeat", expression.call()]
matchToken.call("}")
} else {
Fiber.abort("invalid token in factor() function")
}
if (res is Sequence && res.count == 1) return res[0]
return res
}
var term = Fn.new {
var res = [factor.call()]
var tokens = [-1, "|", ".", ";", ")", "]", "}"]
while (true) {
var outer = false
for (t in tokens) {
if (t == token) {
outer = true
break
}
}
if (outer) break
res.add(factor.call())
}
if (res.count == 1) return res[0]
return res
}
// declared earlier
expression = Fn.new {
var res = [term.call()]
if (token == "|") {
res = ["or", res[0]]
while (token == "|") {
getToken.call()
res.add(term.call())
}
}
if (res.count == 1) return res[0]
return res
}
var production = Fn.new {
// Returns a token or -1; the real result is left in 'productions' etc.
getToken.call()
if (token != "}") {
if (token == -1) return invalid.call("invalid ebnf (missing closing })")
if (!isSeq) return -1
if (token[0] != "ident") return -1
var ident = token[1]
var idx = addIdent.call(ident)
getToken.call()
matchToken.call("=")
if (token == -1) return -1
productions.add([ident, idx, expression.call()])
ididx[idx] = productions.count - 1
}
return token
}
// Adjusts Wren's normal printing of lists to look more like Phix output.
var pprint = Fn.new { |ob, header|
Fmt.print("\n$s:", header)
var quote // recursive closure
quote = Fn.new { |list|
for (i in 0...list.count) {
if (list[i] is String) {
list[i] = Fmt.swrite("$q", list[i])
} else if (list[i] is List) quote.call(list[i])
}
}
var obs
if (ob is String) {
obs = Fmt.swrite("$q", ob)
} else if (ob is List) {
obs = Lst.clone(ob)
quote.call(obs)
}
var pp = obs.toString
pp = pp.replace("[", "{")
pp = pp.replace("]", "}")
for (i in 0...idents.count) {
var xs = Fmt.swrite("'\\x$02d'", i)
pp = pp.replace(xs, i.toString)
}
System.print(pp)
}
var parse = Fn.new { |ebnf|
// Returns +1 if ok, -1 if bad.
System.print("parse:\n%(ebnf) ===>")
err = false
src = ebnf
sdx = 0
idents.clear()
ididx.clear()
productions.clear()
extras.clear()
getToken.call()
if (isSeq) {
token[0] = "title"
extras.add(token)
getToken.call()
}
if (token != "{") return invalid.call("invalid ebnf (missing opening {)")
while (true) {
token = production.call()
if (token == "}" || token == -1) break
}
getToken.call()
if (isSeq) {
token[0] = "comment"
extras.add(token)
getToken.call()
}
if (token != -1) return invalid.call("invalid ebnf (missing eof?)")
if (err) return -1
var k = -1
var i = 0
for (idx in ididx) {
if (idx == -1) {
k = i
break
}
i = i + 1
}
if (k != -1) return invalid.call("invalid ebnf (undefined:%(idents[k]))")
pprint.call(productions, "productions")
pprint.call(idents, "idents")
pprint.call(ididx, "ididx")
pprint.call(extras, "extras")
return 1
}
// The rules that applies() has to deal with are:
// {factors} - if rule[0] is not string,
// just apply one after the other recursively.
// {"terminal", "a1"} -- literal constants
// {"or", <e1>, <e2>, ...} -- (any) one of n
// {"repeat", <e1>} -- as per "{}" in ebnf
// {"optional", <e1>} -- as per "[]" in ebnf
// {"ident", <name>, idx} -- apply the sub-rule
var applies // recursive function
applies = Fn.new { |rule|
var wasSdx = sdx // in case of failure
var r1 = rule[0]
if (!(r1 is String)) {
for (i in 0...rule.count) {
if (!applies.call(rule[i])) {
sdx = wasSdx
return false
}
}
} else if (r1 == "terminal") {
skipSpaces.call()
var r2 = rule[1]
for (i in 0...r2.count) {
if (sdx >= src.count || src[sdx] != r2[i]) {
sdx = wasSdx
return false
}
sdx = sdx + 1
}
} else if (r1 == "or") {
for (i in Stepped.ascend(1...rule.count)) {
if (applies.call(rule[i])) return true
}
sdx = wasSdx
return false
} else if (r1 == "repeat") {
while (applies.call(rule[1])) {}
} else if (r1 == "optional") {
applies.call(rule[1])
} else if (r1 == "ident") {
var i = rule[2]
var ii = ididx[i]
if (!(applies.call(productions[ii][2]))) {
sdx = wasSdx
return false
}
} else {
Fiber.abort("invalid rule in applies() function")
}
return true
}
var checkValid = Fn.new { |test|
src = test
sdx = 0
var res = applies.call(productions[0][2])
skipSpaces.call()
if (sdx < src.count) res = false
Fmt.print("$q, $s", test, results[1-Conv.btoi(res)])
}
var ebnfs = [
""""a" {
a = "a1" ( "a2" | "a3" ) { "a4" } [ "a5" ] "a6" ;
} "z" """,
"""{
expr = term { plus term } .
term = factor { times factor } .
factor = number | '(' expr ')' .
plus = "+" | "-" .
times = "*" | "/" .
number = digit { digit } .
digit = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" .
}""",
"a = \"1\"",
"""{ a = "1" ;""",
"""{ hello world = "1"; }""",
"""{ foo = bar . }"""
]
var tests = [
[
"a1a3a4a4a5a6",
"a1 a2a6",
"a1 a3 a4 a6",
"a1 a4 a5 a6",
"a1 a2 a4 a5 a5 a6",
"a1 a2 a4 a5 a6 a7",
"your ad here"
],
[
"2",
"2*3 + 4/23 - 7",
"(3 + 4) * 6-2+(4*(4))",
"-2",
"3 +",
"(4 + 3"
]
]
var i = 0
for (ebnf in ebnfs) {
if (parse.call(ebnf) == 1) {
System.print("\ntests:")
for (test in tests[i]) checkValid.call(test)
}
System.print()
i = i + 1
}
- Output:
parse:
"a" {
a = "a1" ( "a2" | "a3" ) { "a4" } [ "a5" ] "a6" ;
} "z" ===>
productions:
{{"a", 0, {{"terminal", "a1"}, {"or", {"terminal", "a2"}, {"terminal", "a3"}}, {"repeat", {"terminal", "a4"}}, {"optional", {"terminal", "a5"}}, {"terminal", "a6"}}}}
idents:
{"a"}
ididx:
{0}
extras:
{{"title", "a"}, {"comment", "z"}}
tests:
"a1a3a4a4a5a6", pass
"a1 a2a6", pass
"a1 a3 a4 a6", pass
"a1 a4 a5 a6", fail
"a1 a2 a4 a5 a5 a6", fail
"a1 a2 a4 a5 a6 a7", fail
"your ad here", fail
parse:
{
expr = term { plus term } .
term = factor { times factor } .
factor = number | '(' expr ')' .
plus = "+" | "-" .
times = "*" | "/" .
number = digit { digit } .
digit = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" .
} ===>
productions:
{{"expr", 0, {{"ident", "term", 1}, {"repeat", {{"ident", "plus", 2}, {"ident", "term", 1}}}}}, {"term", 1, {{"ident", "factor", 3}, {"repeat", {{"ident", "times", 4}, {"ident", "factor", 3}}}}}, {"factor", 3, {"or", {"ident", "number", 5}, {{"terminal", "("}, {"ident", "expr", 0}, {"terminal", ")"}}}}, {"plus", 2, {"or", {"terminal", "+"}, {"terminal", "-"}}}, {"times", 4, {"or", {"terminal", "*"}, {"terminal", "/"}}}, {"number", 5, {{"ident", "digit", 6}, {"repeat", {"ident", "digit", 6}}}}, {"digit", 6, {"or", {"terminal", "0"}, {"terminal", "1"}, {"terminal", "2"}, {"terminal", "3"}, {"terminal", "4"}, {"terminal", "5"}, {"terminal", "6"}, {"terminal", "7"}, {"terminal", "8"}, {"terminal", "9"}}}}
idents:
{"expr", "term", "plus", "factor", "times", "number", "digit"}
ididx:
{0, 1, 3, 2, 4, 5, 6}
extras:
{}
tests:
"2", pass
"2*3 + 4/23 - 7", pass
"(3 + 4) * 6-2+(4*(4))", pass
"-2", fail
"3 +", fail
"(4 + 3", fail
parse:
a = "1" ===>
invalid ebnf (missing opening {)
parse:
{ a = "1" ; ===>
invalid ebnf (missing closing })
parse:
{ hello world = "1"; } ===>
invalid ebnf (= expected)
parse:
{ foo = bar . } ===>
invalid ebnf (undefined:bar)
const std = @import("std");
const print = std.debug.print;
const ArrayList = std.ArrayList;
const Allocator = std.mem.Allocator;
const ParseError = error{
OutOfMemory,
InvalidTokenSequence,
InvalidTokenInFactor,
};
const Token = union(enum) {
char: u8,
sequence: []const []const u8,
eof,
pub fn deinit(self: Token, allocator: Allocator) void {
switch (self) {
.sequence => |seq| {
for (seq) |str| {
allocator.free(str);
}
allocator.free(seq);
},
else => {},
}
}
};
const Rule = union(enum) {
terminal: []const u8,
ident: struct { name: []const u8, idx: usize },
my_or: []const *Rule,
repeat: *Rule,
optional: *Rule,
sequence: []const *Rule,
pub fn deinit(self: *Rule, allocator: Allocator) void {
switch (self.*) {
.terminal => |term| allocator.free(term),
.ident => |ident| allocator.free(ident.name),
.my_or => |rules| {
for (rules) |rule| {
rule.deinit(allocator);
allocator.destroy(rule);
}
allocator.free(rules);
},
.repeat => |rule| {
rule.deinit(allocator);
allocator.destroy(rule);
},
.optional => |rule| {
rule.deinit(allocator);
allocator.destroy(rule);
},
.sequence => |rules| {
for (rules) |rule| {
rule.deinit(allocator);
allocator.destroy(rule);
}
allocator.free(rules);
},
}
}
};
const Production = struct {
name: []const u8,
idx: usize,
rule: *Rule,
pub fn deinit(self: Production, allocator: Allocator) void {
allocator.free(self.name);
self.rule.deinit(allocator);
allocator.destroy(self.rule);
}
};
pub const EBNFParser = struct {
allocator: Allocator,
src: []const u8,
ch: u8,
sdx: usize,
token: Token,
err: bool,
idents: ArrayList([]const u8),
ididx: ArrayList(?usize),
productions: ArrayList(Production),
extras: ArrayList([]const []const u8),
pub fn init(allocator: Allocator) EBNFParser {
return EBNFParser{
.allocator = allocator,
.src = "",
.ch = 0,
.sdx = 0,
.token = .eof,
.err = false,
.idents = ArrayList([]const u8).init(allocator),
.ididx = ArrayList(?usize).init(allocator),
.productions = ArrayList(Production).init(allocator),
.extras = ArrayList([]const []const u8).init(allocator),
};
}
pub fn deinit(self: *EBNFParser) void {
for (self.idents.items) |ident| {
self.allocator.free(ident);
}
self.idents.deinit();
self.ididx.deinit();
for (self.productions.items) |prod| {
prod.deinit(self.allocator);
}
self.productions.deinit();
for (self.extras.items) |extra| {
for (extra) |str| {
self.allocator.free(str);
}
self.allocator.free(extra);
}
self.extras.deinit();
self.token.deinit(self.allocator);
}
fn btoi(b: bool) usize {
return if (b) 1 else 0;
}
fn invalid(self: *EBNFParser, msg: []const u8) i32 {
self.err = true;
print("{s}\n", .{msg});
self.sdx = self.src.len; // set to eof
return -1;
}
fn skipSpaces(self: *EBNFParser) void {
while (self.sdx < self.src.len) {
self.ch = self.src[self.sdx];
if (self.ch != ' ' and self.ch != '\t' and self.ch != '\r' and self.ch != '\n') {
break;
}
self.sdx += 1;
}
}
fn getToken(self: *EBNFParser) ParseError!void {
// Clean up previous token
self.token.deinit(self.allocator);
self.skipSpaces();
if (self.sdx >= self.src.len) {
self.token = .eof;
return;
}
const tokstart = self.sdx;
if (std.mem.indexOfScalar(u8, "{}()[]|=.;", self.ch) != null) {
self.sdx += 1;
self.token = Token{ .char = self.ch };
} else if (self.ch == '"' or self.ch == '\'') {
const closech = self.ch;
var tokend = tokstart + 1;
while (tokend < self.src.len and self.src[tokend] != closech) {
tokend += 1;
}
if (tokend >= self.src.len) {
_ = self.invalid("no closing quote");
self.token = .eof;
} else {
self.sdx = tokend + 1;
const content = try self.allocator.dupe(u8, self.src[tokstart + 1..tokend]);
const terminal_str = try self.allocator.dupe(u8, "terminal");
const seq = try self.allocator.alloc([]const u8, 2);
seq[0] = terminal_str;
seq[1] = content;
self.token = Token{ .sequence = seq };
}
} else if (std.ascii.isLower(self.ch)) {
// To simplify things for the purposes of this task,
// identifiers are strictly a-z only, not A-Z or 1-9.
while (self.sdx < self.src.len) {
self.ch = self.src[self.sdx];
if (!std.ascii.isLower(self.ch)) {
break;
}
self.sdx += 1;
}
const ident = try self.allocator.dupe(u8, self.src[tokstart..self.sdx]);
const ident_str = try self.allocator.dupe(u8, "ident");
const seq = try self.allocator.alloc([]const u8, 2);
seq[0] = ident_str;
seq[1] = ident;
self.token = Token{ .sequence = seq };
} else {
_ = self.invalid("invalid ebnf");
self.token = .eof;
}
}
fn matchToken(self: *EBNFParser, expected_ch: u8) ParseError!void {
switch (self.token) {
.char => |ch| {
if (ch != expected_ch) {
_ = self.invalid("invalid ebnf (token mismatch)");
} else {
try self.getToken();
}
},
else => {
_ = self.invalid("invalid ebnf (expected char token)");
},
}
}
fn addIdent(self: *EBNFParser, ident: []const u8) ParseError!usize {
for (self.idents.items, 0..) |existing_ident, k| {
if (std.mem.eql(u8, existing_ident, ident)) {
return k;
}
}
const ident_copy = try self.allocator.dupe(u8, ident);
try self.idents.append(ident_copy);
const k = self.idents.items.len - 1;
try self.ididx.append(null);
return k;
}
fn factor(self: *EBNFParser) ParseError!*Rule {
const rule = try self.allocator.create(Rule);
switch (self.token) {
.sequence => |seq| {
if (std.mem.eql(u8, seq[0], "ident")) {
const idx = try self.addIdent(seq[1]);
const name_copy = try self.allocator.dupe(u8, seq[1]);
rule.* = Rule{ .ident = .{ .name = name_copy, .idx = idx } };
try self.getToken();
} else if (std.mem.eql(u8, seq[0], "terminal")) {
const term_copy = try self.allocator.dupe(u8, seq[1]);
rule.* = Rule{ .terminal = term_copy };
try self.getToken();
} else {
self.allocator.destroy(rule);
return error.InvalidTokenSequence;
}
},
.char => |ch| {
switch (ch) {
'[' => {
try self.getToken();
const expr = try self.expression();
try self.matchToken(']');
rule.* = Rule{ .optional = expr };
},
'(' => {
try self.getToken();
const expr = try self.expression();
try self.matchToken(')');
self.allocator.destroy(rule);
return expr;
},
'{' => {
try self.getToken();
const expr = try self.expression();
try self.matchToken('}');
rule.* = Rule{ .repeat = expr };
},
else => {
self.allocator.destroy(rule);
return error.InvalidTokenInFactor;
},
}
},
else => {
self.allocator.destroy(rule);
return error.InvalidTokenInFactor;
},
}
return rule;
}
fn term(self: *EBNFParser) ParseError!*Rule {
var factors = ArrayList(*Rule).init(self.allocator);
defer factors.deinit();
try factors.append(try self.factor());
while (true) {
// stop at eof or at any token that ends a term
const should_stop = switch (self.token) {
.eof => true,
.char => |ch| switch (ch) {
'|', '.', ';', ')', ']', '}' => true,
else => false,
},
else => false,
};
if (should_stop) break;
try factors.append(try self.factor());
}
if (factors.items.len == 1) {
return factors.items[0];
} else {
const rule = try self.allocator.create(Rule);
const rules_copy = try self.allocator.dupe(*Rule, factors.items);
rule.* = Rule{ .sequence = rules_copy };
return rule;
}
}
fn expression(self: *EBNFParser) ParseError!*Rule {
const first_term = try self.term();
switch (self.token) {
.char => |ch| {
if (ch == '|') {
var terms = ArrayList(*Rule).init(self.allocator);
defer terms.deinit();
try terms.append(first_term);
while (true) {
switch (self.token) {
.char => |token_ch| {
if (token_ch == '|') {
try self.getToken();
try terms.append(try self.term());
} else {
break;
}
},
else => break,
}
}
const rule = try self.allocator.create(Rule);
const rules_copy = try self.allocator.dupe(*Rule, terms.items);
rule.* = Rule{ .my_or = rules_copy };
return rule;
}
},
else => {},
}
return first_term;
}
fn production(self: *EBNFParser) ParseError!Token {
try self.getToken();
switch (self.token) {
.char => |ch| {
if (ch == '}') {
return self.token;
}
},
.eof => {
_ = self.invalid("invalid ebnf (missing closing })");
return .eof;
},
else => {},
}
switch (self.token) {
.sequence => |seq| {
if (std.mem.eql(u8, seq[0], "ident")) {
// copy the name now: the following getToken() call frees this token's strings
const ident = try self.allocator.dupe(u8, seq[1]);
const idx = try self.addIdent(ident);
try self.getToken();
try self.matchToken('=');
switch (self.token) {
.eof => return .eof,
else => {},
}
const expr = try self.expression();
const prod = Production{
.name = ident,
.idx = idx,
.rule = expr,
};
try self.productions.append(prod);
self.ididx.items[idx] = self.productions.items.len - 1;
return self.token;
}
},
else => {},
}
return .eof;
}
pub fn parse(self: *EBNFParser, ebnf: []const u8) ParseError!i32 {
print("parse:\n{s} ===>\n\n", .{ebnf});
self.err = false;
self.src = ebnf;
self.sdx = 0;
// Clear previous data
for (self.idents.items) |ident| {
self.allocator.free(ident);
}
self.idents.clearRetainingCapacity();
self.ididx.clearRetainingCapacity();
for (self.productions.items) |prod| {
prod.deinit(self.allocator);
}
self.productions.clearRetainingCapacity();
for (self.extras.items) |extra| {
for (extra) |str| {
self.allocator.free(str);
}
self.allocator.free(extra);
}
self.extras.clearRetainingCapacity();
try self.getToken();
switch (self.token) {
.sequence => |seq| {
const title_str = try self.allocator.dupe(u8, "title");
const content = try self.allocator.dupe(u8, seq[1]);
const extra = try self.allocator.alloc([]const u8, 2);
extra[0] = title_str;
extra[1] = content;
try self.extras.append(extra);
try self.getToken();
},
else => {},
}
switch (self.token) {
.char => |ch| {
if (ch != '{') {
return self.invalid("invalid ebnf (missing opening {)");
}
},
else => {
return self.invalid("invalid ebnf (missing opening {)");
},
}
while (true) {
const prod_result = self.production() catch return -1;
switch (prod_result) {
.char => |ch| {
if (ch == '}') break;
},
.eof => break,
else => continue,
}
}
try self.getToken();
switch (self.token) {
.sequence => |seq| {
const comment_str = try self.allocator.dupe(u8, "comment");
const content = try self.allocator.dupe(u8, seq[1]);
const extra = try self.allocator.alloc([]const u8, 2);
extra[0] = comment_str;
extra[1] = content;
try self.extras.append(extra);
try self.getToken();
},
else => {},
}
switch (self.token) {
.eof => {},
else => {
return self.invalid("invalid ebnf (missing eof?)");
},
}
if (self.err) {
return -1;
}
// Check for undefined identifiers
for (self.ididx.items, 0..) |ididx_val, i| {
if (ididx_val == null) {
print("invalid ebnf (undefined:{s})\n", .{self.idents.items[i]});
return -1;
}
}
self.pprintProductions();
self.pprintIdents();
self.pprintIdidx();
self.pprintExtras();
return 1;
}
fn pprintProductions(self: *EBNFParser) void {
print("\nproductions:\n", .{});
for (self.productions.items) |prod| {
print("({s}, {d}, rule)\n", .{ prod.name, prod.idx });
}
}
fn pprintIdents(self: *EBNFParser) void {
print("\nidents:\n", .{});
for (self.idents.items) |ident| {
print("{s}, ", .{ident});
}
print("\n", .{});
}
fn pprintIdidx(self: *EBNFParser) void {
print("\nididx:\n", .{});
for (self.ididx.items) |idx| {
if (idx) |i| {
print("{d}, ", .{i});
} else {
print("null, ", .{});
}
}
print("\n", .{});
}
fn pprintExtras(self: *EBNFParser) void {
print("\nextras:\n", .{});
for (self.extras.items) |extra| {
for (extra) |str| {
print("{s} ", .{str});
}
print("\n", .{});
}
}
fn applies(self: *EBNFParser, rule: *Rule, src: []const u8, sdx: *usize) bool {
const was_sdx = sdx.*;
switch (rule.*) {
.sequence => |rules| {
for (rules) |subrule| {
if (!self.applies(subrule, src, sdx)) {
sdx.* = was_sdx;
return false;
}
}
return true;
},
.terminal => |terminal| {
self.skipSpacesAt(src, sdx);
for (terminal) |ch| {
if (sdx.* >= src.len or src[sdx.*] != ch) {
sdx.* = was_sdx;
return false;
}
sdx.* += 1;
}
return true;
},
.my_or => |rules| {
for (rules) |subrule| {
if (self.applies(subrule, src, sdx)) {
return true;
}
}
sdx.* = was_sdx;
return false;
},
.repeat => |subrule| {
while (self.applies(subrule, src, sdx)) {
// continue repeating
}
return true;
},
.optional => |subrule| {
_ = self.applies(subrule, src, sdx);
return true;
},
.ident => |ident_info| {
if (self.ididx.items[ident_info.idx]) |prod_idx| {
if (!self.applies(self.productions.items[prod_idx].rule, src, sdx)) {
sdx.* = was_sdx;
return false;
}
return true;
} else {
return false;
}
},
}
}
fn skipSpacesAt(self: *EBNFParser, src: []const u8, sdx: *usize) void {
_ = self;
while (sdx.* < src.len) {
const ch = src[sdx.*];
if (ch != ' ' and ch != '\t' and ch != '\r' and ch != '\n') {
break;
}
sdx.* += 1;
}
}
fn checkValid(self: *EBNFParser, my_test: []const u8) void {
var sdx: usize = 0;
const res = self.applies(self.productions.items[0].rule, my_test, &sdx);
self.skipSpacesAt(my_test, &sdx);
const final_res = res and sdx >= my_test.len;
const result = if (final_res) "pass" else "fail";
print("\"{s}\", {s}\n", .{ my_test, result });
}
};
pub fn main() !void {
var gpa = std.heap.GeneralPurposeAllocator(.{}){};
defer _ = gpa.deinit();
const allocator = gpa.allocator();
var parser = EBNFParser.init(allocator);
defer parser.deinit();
const ebnfs = [_][]const u8{
"\"a\" {\n a = \"a1\" ( \"a2\" | \"a3\" ) { \"a4\" } [ \"a5\" ] \"a6\" ;\n} \"z\" ",
"{\n expr = term { plus term } .\n term = factor { times factor } .\n factor = number | '(' expr ')' .\n \n plus = \"+\" | \"-\" .\n times = \"*\" | \"/\" .\n \n number = digit { digit } .\n digit = \"0\" | \"1\" | \"2\" | \"3\" | \"4\" | \"5\" | \"6\" | \"7\" | \"8\" | \"9\" .\n}",
"a = \"1\"",
"{ a = \"1\" ;",
"{ hello world = \"1\"; }",
"{ foo = bar . }",
};
const tests = [_][]const []const u8{
&[_][]const u8{
"a1a3a4a4a5a6",
"a1 a2a6",
"a1 a3 a4 a6",
"a1 a4 a5 a6",
"a1 a2 a4 a5 a5 a6",
"a1 a2 a4 a5 a6 a7",
"your ad here",
},
&[_][]const u8{
"2",
"2*3 + 4/23 - 7",
"(3 + 4) * 6-2+(4*(4))",
"-2",
"3 +",
"(4 + 3",
},
};
for (ebnfs, 0..) |ebnf, i| {
const result = parser.parse(ebnf) catch -1;
if (result == 1) {
print("\ntests:\n", .{});
if (i < tests.len) {
for (tests[i]) |my_test| {
parser.checkValid(my_test);
}
}
}
print("\n", .{});
}
}
- Output:
parse:
"a" {
a = "a1" ( "a2" | "a3" ) { "a4" } [ "a5" ] "a6" ;
} "z" ===>
productions:
(�, 0, rule)
idents:
a,
ididx:
0,
extras:
title a
comment z
tests:
"a1a3a4a4a5a6", pass
"a1 a2a6", pass
"a1 a3 a4 a6", pass
"a1 a4 a5 a6", fail
"a1 a2 a4 a5 a5 a6", fail
"a1 a2 a4 a5 a6 a7", fail
"your ad here", fail
parse:
{
expr = term { plus term } .
term = factor { times factor } .
factor = number | '(' expr ')' .
plus = "+" | "-" .
times = "*" | "/" .
number = digit { digit } .
digit = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" .
} ===>
productions:
(����, 0, rule)
(����, 1, rule)
(������, 3, rule)
(����, 2, rule)
(�����, 4, rule)
(������, 5, rule)
(�����, 6, rule)
idents:
expr, term, plus, factor, times, number, digit,
ididx:
0, 1, 3, 2, 4, 5, 6,
extras:
tests:
"2", pass
"2*3 + 4/23 - 7", pass
"(3 + 4) * 6-2+(4*(4))", pass
"-2", fail
"3 +", fail
"(4 + 3", fail
parse:
a = "1" ===>
invalid ebnf (missing opening {)
parse:
{ a = "1" ; ===>
invalid ebnf (missing closing })
parse:
{ hello world = "1"; } ===>
invalid ebnf (expected char token)
parse:
{ foo = bar . } ===>
invalid ebnf (undefined:bar)