I see you've already figured out how to replace reduce_while with explicit recursion in your answer. Here's a more idiomatic approach, the one most hand-written tokenizers in Elixir and Erlang use: pattern matching on binaries with one function clause per token kind. This approach can be much faster than a naive regex-based tokenizer, and it also allows adding logic that pure regex-based tokenizers cannot express (tracking line and column numbers, for example), although in this case you don't need that.
Here's the code with some inline comments:
defmodule Compiler.Tokenizer.Dogbert do
  def tokenize(code), do: tokenize(code, [])

  # We're done. Reverse the tokens since we collected them in reverse order.
  defp tokenize("", acc), do: Enum.reverse(acc)

  # Remove leading whitespace.
  defp tokenize(<<h, rest::binary>>, acc) when h in ' \t\r\n', do: tokenize(rest, acc)

  # Identifier
  defp tokenize(binary = <<h, _::binary>>, acc) when h in ?a..?z do
    {value, rest} = take_while(binary, fn b -> b in ?a..?z end)

    type =
      case value do
        "def" -> :def
        "end" -> :end
        _ -> :identifier
      end

    tokenize(rest, [%Compiler.Token{type: type, value: value} | acc])
  end

  # Number
  defp tokenize(binary = <<h, _::binary>>, acc) when h in ?0..?9 do
    {value, rest} = take_while(binary, fn b -> b in ?0..?9 end)
    tokenize(rest, [%Compiler.Token{type: :integer, value: value} | acc])
  end

  # (
  defp tokenize("(" <> rest, acc), do: tokenize(rest, [%Compiler.Token{type: :oparen, value: "("} | acc])

  # )
  defp tokenize(")" <> rest, acc), do: tokenize(rest, [%Compiler.Token{type: :cparen, value: ")"} | acc])

  # A simple helper that extracts the leading part of the binary for as long
  # as `fun` returns `true` for each byte, starting from the first byte.
  # It returns the extracted binary and the remaining binary.
  #
  # We track the position as a byte offset for efficiency: accumulating bytes
  # the way we would for a list would allocate memory on every step, which we
  # avoid by keeping just an offset and extracting the value with a single
  # sub-binary match at the end.
  defp take_while(binary, fun), do: take_while(binary, fun, 0)

  defp take_while(binary, fun, byte) do
    if byte < byte_size(binary) && fun.(:binary.at(binary, byte)) do
      take_while(binary, fun, byte + 1)
    else
      <<value::binary-size(byte), rest::binary>> = binary
      {value, rest}
    end
  end
end
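For contrast, here's a sketch of the accumulator-based variant that the comment above warns about. This take_while_acc is hypothetical (it's not part of the tokenizer, and would need to live inside the same module to compile): it builds the value by appending each matching byte to a binary accumulator instead of tracking an offset and slicing once. Given "abc123" and fn b -> b in ?a..?z end, either version returns {"abc", "123"}.

# Hypothetical accumulator-based variant, for comparison only: it appends
# each matching byte to a binary accumulator, potentially copying on every
# step, while the offset-based version above extracts the value just once.
defp take_while_acc(binary, fun), do: take_while_acc(binary, fun, <<>>)

defp take_while_acc(binary = <<h, rest::binary>>, fun, acc) do
  if fun.(h) do
    take_while_acc(rest, fun, <<acc::binary, h>>)
  else
    {acc, binary}
  end
end

defp take_while_acc(<<>>, _fun, acc), do: {acc, <<>>}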
Test:
code = "def f()
1
end"
IO.inspect Compiler.Tokenizer.Dogbert.tokenize(code)
Output:
[%Compiler.Token{type: :def, value: "def"},
%Compiler.Token{type: :identifier, value: "f"},
%Compiler.Token{type: :oparen, value: "("},
%Compiler.Token{type: :cparen, value: ")"},
%Compiler.Token{type: :integer, value: "1"},
%Compiler.Token{type: :end, value: "end"}]
Here's a benchmark using benchee comparing your implementation to mine. Your implementation has some easy-to-fix inefficiencies (it constructs each Regex on every run and appends to the accumulator with `++`; a sketch of both fixes follows the results at the end), but I'd expect a regex-based approach to always be slower than the one I used.
defmodule Compiler.Token do
  defstruct [:type, :value]
end

defmodule Compiler.Tokenizer.Dogbert do
  def tokenize(code), do: tokenize(code, [])

  # We're done. Reverse the tokens since we collected them in reverse order.
  defp tokenize("", acc), do: Enum.reverse(acc)

  # Remove leading whitespace.
  defp tokenize(<<h, rest::binary>>, acc) when h in ' \t\r\n', do: tokenize(rest, acc)

  # Identifier
  defp tokenize(binary = <<h, _::binary>>, acc) when h in ?a..?z do
    {value, rest} = take_while(binary, fn b -> b in ?a..?z end)

    type =
      case value do
        "def" -> :def
        "end" -> :end
        _ -> :identifier
      end

    tokenize(rest, [%Compiler.Token{type: type, value: value} | acc])
  end

  # Number
  defp tokenize(binary = <<h, _::binary>>, acc) when h in ?0..?9 do
    {value, rest} = take_while(binary, fn b -> b in ?0..?9 end)
    tokenize(rest, [%Compiler.Token{type: :integer, value: value} | acc])
  end

  # (
  defp tokenize("(" <> rest, acc), do: tokenize(rest, [%Compiler.Token{type: :oparen, value: "("} | acc])

  # )
  defp tokenize(")" <> rest, acc), do: tokenize(rest, [%Compiler.Token{type: :cparen, value: ")"} | acc])

  # A simple helper that extracts the leading part of the binary for as long
  # as `fun` returns `true` for each byte, starting from the first byte.
  # It returns the extracted binary and the remaining binary.
  #
  # We track the position as a byte offset for efficiency: accumulating bytes
  # the way we would for a list would allocate memory on every step, which we
  # avoid by keeping just an offset and extracting the value with a single
  # sub-binary match at the end.
  defp take_while(binary, fun), do: take_while(binary, fun, 0)

  defp take_while(binary, fun, byte) do
    if byte < byte_size(binary) && fun.(:binary.at(binary, byte)) do
      take_while(binary, fun, byte + 1)
    else
      <<value::binary-size(byte), rest::binary>> = binary
      {value, rest}
    end
  end
end
defmodule Compiler.Tokenizer.PaulRuescher do
  @token_types [
    {:def, "\\bdef\\b"},
    {:end, "\\bend\\b"},
    {:identifier, "\\b[a-zA-Z]+\\b"},
    {:integer, "\\b[0-9]+\\b"},
    {:oparen, "\\("},
    {:cparen, "\\)"}
  ]

  def tokenize(code_string) do
    call_tokenize(code_string, [])
  end

  def call_tokenize("", accumulator) do
    accumulator
  end

  def call_tokenize(code_string, accumulator) do
    {type, value} = attempt_tokenize(@token_types, code_string)
    base = byte_size(value)
    token = %Compiler.Token{type: type, value: value}

    binary_part(code_string, base, byte_size(code_string) - base)
    |> String.trim()
    |> call_tokenize(accumulator ++ [token])
  end

  def attempt_tokenize(token_types, code_string, index \\ 0) do
    {type, re} = Enum.at(token_types, index)

    case Regex.run(~r/\A#{re}/, code_string) do
      nil -> attempt_tokenize(token_types, code_string, index + 1)
      value -> {type, hd(value)}
    end
  end
end
code = String.duplicate("def f()
  1
end", 1000)

IO.inspect Compiler.Tokenizer.PaulRuescher.tokenize(code) == Compiler.Tokenizer.Dogbert.tokenize(code)

Benchee.run(%{
  "@paulruescher" => fn -> Compiler.Tokenizer.PaulRuescher.tokenize(code) end,
  "@Dogbert" => fn -> Compiler.Tokenizer.Dogbert.tokenize(code) end
})
Results:

true
...

Name                    ips        average  deviation         median
@Dogbert             442.18        2.26 ms    ±17.03%        2.43 ms
@paulruescher         11.78       84.92 ms     ±8.37%       83.67 ms

Comparison:
@Dogbert             442.18
@paulruescher         11.78 - 37.55x slower
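For reference, here's a sketch of those two fixes applied to your version. The module name and the exact shape are mine, and it assumes your Elixir/OTP release lets compiled regexes live in a module attribute (otherwise compile them once at runtime instead):

defmodule Compiler.Tokenizer.PaulRuescherTuned do
  # Fix 1: compile each regex once, at compile time, instead of
  # interpolating and recompiling `~r/\A#{re}/` on every attempt.
  @token_types [
    {:def, ~r/\A\bdef\b/},
    {:end, ~r/\A\bend\b/},
    {:identifier, ~r/\A\b[a-zA-Z]+\b/},
    {:integer, ~r/\A\b[0-9]+\b/},
    {:oparen, ~r/\A\(/},
    {:cparen, ~r/\A\)/}
  ]

  def tokenize(code_string), do: call_tokenize(code_string, [])

  # Fix 2: prepend each token and reverse once at the end;
  # `accumulator ++ [token]` copies the whole accumulator on every
  # step, which is quadratic in the number of tokens.
  def call_tokenize("", accumulator), do: Enum.reverse(accumulator)

  def call_tokenize(code_string, accumulator) do
    {type, value} = attempt_tokenize(@token_types, code_string)
    base = byte_size(value)
    token = %Compiler.Token{type: type, value: value}

    binary_part(code_string, base, byte_size(code_string) - base)
    |> String.trim()
    |> call_tokenize([token | accumulator])
  end

  # Walking the list head-first also avoids the repeated Enum.at lookups.
  def attempt_tokenize([{type, re} | rest], code_string) do
    case Regex.run(re, code_string) do
      nil -> attempt_tokenize(rest, code_string)
      [value | _] -> {type, value}
    end
  end
end

Even with these changes I'd still expect the binary-matching tokenizer above to come out ahead, since it makes a single pass over the bytes with no regex machinery at all.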