I have rewritten my tokenizer according to most of the suggestions from the previous question here.
API
It now reads all chars as long as they match the pattern. I use three types of attributes to achieve this.
- `Regex` - reads by regular expressions; this one requires a single group that is the value of the token; it can match more, but only the value of `Groups[1]` is used as the result
- `Const` - reads a constant pattern where the entire length must match
- `QText` - reads quoted text or falls back to regex. I chose not to use regex for quoted strings because this is pretty damn tricky.
They return a tuple where:
- `Success` - indicates whether a pattern was matched
- `Token` - the actual value of the token
- `Length` - the total length of the match; I use this to advance the index to the next token
These are the three attributes:
public delegate (bool Success, string Token, int Length) MatchDelegate(string value, int offset);
public abstract class MatcherAttribute : Attribute
{
public abstract (bool Success, string Token, int Length) Match(string value, int offset);
}
public class RegexAttribute : MatcherAttribute
{
private readonly Regex _regex;
public RegexAttribute([RegexPattern] string pattern)
{
_regex = new Regex(pattern);
}
public override (bool Success, string Token, int Length) Match(string value, int offset)
{
var match = _regex.Match(value, offset);
// Make sure the match was at the offset.
return (match.Success && match.Index == offset, match.Groups[1].Value, match.Length);
}
}
public class ConstAttribute : MatcherAttribute
{
private readonly string _pattern;
public ConstAttribute(string pattern) => _pattern = pattern;
public override (bool Success, string Token, int Length) Match(string value, int offset)
{
var matchCount = _pattern.TakeWhile((t, i) => value[offset + i].Equals(t)).Count();
// All characters have to be matched.
return (matchCount == _pattern.Length, _pattern, matchCount);
}
}
// "foo \"bar\" baz"
// ^ starts here ^ ends here
public class QTextAttribute : RegexAttribute
{
public static readonly IImmutableSet<char> Escapables = new[] { '\\', '"' }.ToImmutableHashSet();
public QTextAttribute([RegexPattern] string pattern) : base(pattern) { }
public override (bool Success, string Token, int Length) Match(string value, int offset)
{
return
value[offset] == '"'
? MatchQuoted(value, offset)
: base.Match(value, offset);
}
private (bool Success, string Token, int Length) MatchQuoted(string value, int offset)
{
var token = new StringBuilder();
var escapeSequence = false;
var quote = false;
for (var i = offset; i < value.Length; i++)
{
var c = value[i];
switch (c)
{
case '"' when !escapeSequence:
switch (i == offset)
{
// Entering quoted text.
case true:
quote = !quote;
continue; // Don't eat quotes.
// End of quoted text.
case false:
return (true, token.ToString(), i - offset + 1);
}
break; // Makes the compiler happy.
case '\\' when !escapeSequence:
escapeSequence = true;
break;
default:
switch (escapeSequence)
{
case true:
switch (Escapables.Contains(c))
{
case true:
// Remove escape char.
token.Length--;
break;
}
escapeSequence = false;
break;
}
break;
}
token.Append(c);
}
return (false, token.ToString(), 0);
}
}
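To make the behavior concrete, calling the matchers directly should give results like these (illustration only; normally the attributes are applied to enum fields, as shown further down):
// Illustration only - the attributes are normally applied to enum fields.
var scheme = new RegexAttribute(@"([a-z]+):");
var s = scheme.Match("scheme://host", 0);
// s == (Success: true, Token: "scheme", Length: 7) - the ':' counts toward Length but isn't part of the token.
var quoted = new QTextAttribute(@"([a-z0-9]*)");
var q = quoted.Match("\"foo \\\"bar\\\" baz\"", 0);
// q == (Success: true, Token: foo "bar" baz, Length: 17) - quotes count toward Length, escapes are unwrapped.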
The tokenizer is now an instantiable class with an interface. It can be used as-is or derived to create a specific tokenizer. When created, it turns the state transitions into a dictionary; this is what the `StateTransitionMapper` is for. The tokenizer picks the first non-empty token. I guess I should probably use the longest one instead - this is what various sources suggest - so I might change this later. What do you think? Would that be better?
It starts with the default state, which is by convention 0 because `TToken` is constrained to `Enum` and its default value is 0. I named this dummy state simply `Start`.
public static class StateTransitionMapper
{
public static IImmutableDictionary<TToken, IImmutableList<State<TToken>>> CreateTransitionMap<TToken>(IImmutableList<State<TToken>> states) where TToken : Enum
{
return states.Aggregate(ImmutableDictionary<TToken, IImmutableList<State<TToken>>>.Empty, (mappings, state) =>
{
var nextStates =
from n in state.Next
join s in states on n equals s.Token
select s;
return mappings.Add(state.Token, nextStates.ToImmutableList());
});
}
}
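For example, a handful of the `UriToken` states defined further down would produce a map like this (a minimal sketch, just to show what the tokenizer looks up):
// Sketch: the map tells the tokenizer which states may follow the one it just matched.
var states = new[]
{
    new State<UriToken>(default, UriToken.Scheme),
    new State<UriToken>(UriToken.Scheme, UriToken.AuthorityPrefix, UriToken.Path),
    new State<UriToken>(UriToken.AuthorityPrefix, UriToken.Path),
    new State<UriToken>(UriToken.Path),
}.ToImmutableList();
var map = StateTransitionMapper.CreateTransitionMap(states);
// map[default]                  -> [Scheme]
// map[UriToken.Scheme]          -> [AuthorityPrefix, Path]
// map[UriToken.AuthorityPrefix] -> [Path]
// map[UriToken.Path]            -> []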
public interface ITokenizer<TToken> where TToken : Enum
{
IEnumerable<Token<TToken>> Tokenize(string value);
}
public class Tokenizer<TToken> : ITokenizer<TToken> where TToken : Enum
{
private readonly IImmutableDictionary<TToken, IImmutableList<State<TToken>>> _transitions;
public Tokenizer(IImmutableList<State<TToken>> states)
{
_transitions = StateTransitionMapper.CreateTransitionMap(states);
}
public IEnumerable<Token<TToken>> Tokenize(string value)
{
var current = _transitions[default];
for (var i = 0; i < value.Length;)
{
var matches =
from state in current
let token = state.Consume(value, i)
// Consider only non-empty tokens.
where token.Length > 0
select (state, token);
if (matches.FirstOrDefault() is var match && match.token is null)
{
throw new ArgumentException($"Invalid character '{value[i]}' at {i}.");
}
else
{
if (match.state.IsToken)
{
yield return match.token;
}
i += match.token.Length;
current = _transitions[match.state.Token];
}
}
}
}
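To clarify what I mean by picking the longest match: the selection inside `Tokenize` would change to roughly this (just a sketch, I haven't switched to it yet):
// Sketch only - order the candidates by length and take the longest instead of the first.
var match =
    matches
        .OrderByDescending(m => m.token.Length)
        .FirstOrDefault();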
The tokenizer is supported by the `State` and `Token` classes, where `State` now reads all matching chars and caches the `MatchDelegate` it gets from the `MatcherAttribute`. The `IsToken` property is used to ignore matches that aren't real or usable tokens. I use this with the `CommandLineTokenizer`.
public class State<TToken> where TToken : Enum
{
private readonly MatchDelegate _match;
public State(TToken token, params TToken[] next)
{
Token = token;
Next = next;
_match =
typeof(TToken)
.GetField(token.ToString())
.GetCustomAttribute<MatcherAttribute>() is MatcherAttribute matcher
? (MatchDelegate)(matcher.Match)
: (MatchDelegate)((value, offset) => (false, string.Empty, 0));
}
public bool IsToken { get; set; } = true;
public TToken Token { get; }
public IEnumerable<TToken> Next { get; }
public Token<TToken> Consume(string value, int offset)
{
return new Token<TToken>(_match(value, offset))
{
Type = Token,
Index = offset
};
}
public override string ToString() => $"{Token} --> [{string.Join(", ", Next)}]";
}
public class Token<TToken> where TToken : Enum
{
public Token((bool Success, string Token, int Length) match)
{
Length = match.Success ? match.Length : 0;
Text = match.Success ? match.Token : string.Empty;
}
public int Index { get; set; }
public int Length { get; set; }
public string Text { get; set; }
public TToken Type { get; set; }
public override string ToString() => $"{Index}: {Text} ({Type})";
}
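For illustration, consuming a single state directly should work like this (using the `UriToken.Scheme` state from the `UriStringTokenizer` below; the tokenizer normally drives this via the transition map):
// Illustration only - the tokenizer calls Consume through the transition map.
var scheme = new State<UriToken>(UriToken.Scheme, UriToken.AuthorityPrefix, UriToken.Path);
var token = scheme.Consume("scheme://user@host", 0);
// token.Text == "scheme", token.Length == 7, token.Index == 0, token.Type == UriToken.Scheme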
Examples and tests
I tested it with two tokenizers. They are very simple because they just derive from `Tokenizer` and define their own state transitions and tokens.
One is for a `UriString`:
using static UriToken;
public class UriStringParserTest
{
private static readonly ITokenizer<UriToken> Tokenizer = new UriStringTokenizer();
[Theory]
[InlineData(
"scheme://user@host:123/pa/th?key-1=val-1&key-2=val-2#f",
"scheme //user host 123/pa/th key-1 val-1 key-2 val-2 f")]
[InlineData(
"scheme://user@host:123/pa/th?key-1=val-1&key-2=val-2",
"scheme //user host 123/pa/th key-1 val-1 key-2 val-2")]
[InlineData(
"scheme://user@host:123/pa/th?key-1=val-1",
"scheme //user host 123/pa/th key-1 val-1")]
[InlineData(
"scheme://user@host:123/pa/th",
"scheme //user host 123/pa/th")]
[InlineData(
"scheme:///pa/th",
"scheme ///pa/th"
)]
public void Can_tokenize_URIs(string uri, string expected)
{
var tokens = Tokenizer.Tokenize(uri).ToList();
var actual = string.Join("", tokens.Select(t => t.Text));
Assert.Equal(expected.Replace(" ", string.Empty), actual);
}
[Fact]
public void Throws_when_invalid_character()
{
// Using single letters for faster debugging.
var uri = "s://:u@h:1/p?k=v&k=v#f";
// ^ - invalid character
var ex = Assert.Throws<ArgumentException>(() => Tokenizer.Tokenize(uri).ToList());
Assert.Equal("Invalid character ':' at 4.", ex.Message);
}
}
public class UriStringTokenizer : Tokenizer<UriToken>
{
/*
scheme:[//[userinfo@]host[:port]]path[?key=value&key=value][#fragment]
[ ----- authority ----- ] [ ----- query ------ ]
scheme: ------------------------ '/'path ------------------------- --------- UriString
\ / \ /\ /
// --------- host ----- / ?key ------ &key ------ / #fragment
\ / \ / \ / \ /
userinfo@ :port =value =value
*/
private static readonly State<UriToken>[] States =
{
new State<UriToken>(default, Scheme),
new State<UriToken>(Scheme, AuthorityPrefix, Path),
new State<UriToken>(AuthorityPrefix, UserInfo, Host, Path),
new State<UriToken>(UserInfo, Host),
new State<UriToken>(Host, Port, Path),
new State<UriToken>(Port, Path),
new State<UriToken>(Path, Key, Fragment),
new State<UriToken>(Key, UriToken.Value, Fragment),
new State<UriToken>(UriToken.Value, Key, Fragment),
new State<UriToken>(Fragment, Fragment),
};
public UriStringTokenizer() : base(States.ToImmutableList()) { }
}
public enum UriToken
{
Start = 0,
[Regex(@"([a-z0-9\+\.\-]+):")]
Scheme,
[Const("//")]
AuthorityPrefix,
[Regex(@"([a-z0-9_][a-z0-9\.\-_:]+)@")]
UserInfo,
[Regex(@"([a-z0-9\.\-_]+)")]
Host,
[Regex(@":([0-9]*)")]
Port,
[Regex(@"(\/?[a-z_][a-z0-9\/:\.\-\%_@]+)")]
Path,
[Regex(@"[\?\&\;]([a-z0-9\-]*)")]
Key,
[Regex(@"=([a-z0-9\-]*)")]
Value,
[Regex(@"#([a-z]*)")]
Fragment,
}
and the other for a CommandLine:
using static CommandLineToken;
public class CommandLineTokenizerTest
{
private static readonly ITokenizer<CommandLineToken> Tokenizer = new CommandLineTokenizer();
[Theory]
[InlineData(
"command -argument value -argument",
"command argument value argument")]
[InlineData(
"command -argument value value",
"command argument value value")]
[InlineData(
"command -argument:value,value",
"command argument value value")]
[InlineData(
"command -argument=value",
"command argument value")]
[InlineData(
@"command -argument=""foo--bar"",value -argument value",
@"command argument foo--bar value argument value")]
[InlineData(
@"command -argument=""foo--\""bar"",value -argument value",
@"command argument foo-- ""bar value argument value")]
public void Can_tokenize_command_lines(string uri, string expected)
{
var tokens = Tokenizer.Tokenize(uri).ToList();
var actual = string.Join("", tokens.Select(t => t.Text));
Assert.Equal(expected.Replace(" ", string.Empty), actual);
}
}
public enum CommandLineToken
{
Start = 0,
[Regex(@"\s*(\?|[a-z0-9][a-z0-9\-_]*)")]
Command,
[Regex(@"\s*[\-\.\/]([a-z0-9][a-z\-_]*)")]
Argument,
[Regex(@"[\=\:\,\s]")]
ValueBegin,
[QText(@"([a-z0-9\.\;\-]*)")]
Value,
}
public class CommandLineTokenizer : Tokenizer<CommandLineToken>
{
/*
command [-argument][=value][,value]
command --------------------------- CommandLine
\ /
-argument ------ ------ /
\ / \ /
=value ,value
*/
private static readonly State<CommandLineToken>[] States =
{
new State<CommandLineToken>(default, Command),
new State<CommandLineToken>(Command, Argument),
new State<CommandLineToken>(Argument, Argument, ValueBegin),
new State<CommandLineToken>(ValueBegin, Value) { IsToken = false },
new State<CommandLineToken>(Value, Argument, ValueBegin),
};
public CommandLineTokenizer() : base(States.ToImmutableList()) { }
}
Questions
- Would you say this is an improvement?
- Maybe something is still too unconventional? I guess this is probably still not a true state machine because of the loop inside the tokenizer. Am I right?
- Did I miss any important suggestion, or misinterpret one?