2 votes

I'm following along with a recent Destroy All Software screencast on Compilers. All of the example code is written in Ruby, but I'm trying to achieve the same outputs with Elixir.

In plain English, my approach goes something like this:

  1. pass a string of code to a recursive function (`call_tokenize`)
  2. loop over the token types and run a regex for the current token type
  3. if a match is found, append the result to an accumulator, remove it from the beginning of the string, and pass the remaining string plus the accumulator back to `call_tokenize`. Additionally, `:halt` the `reduce_while` loop and return the accumulated tokens.
  4. if no match is found, continue with the loop

I've written a couple of functions in Elixir that work, in the sense that I get the expected output for my input. However, I think there's already at least one issue with my code.

I'm using Enum.reduce_while, but I don't think I'm using it as it's intended to be used. My assumption is that there's a better way to rewrite the reduce_while as a plain recursive function.

If I had to summarize this in a single question, it would be: how do I achieve the same result without relying on the halt/continue behaviour that reduce_while gives me? Are there any other issues with this code sample that I should be aware of?
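
To be clear about what I mean by halt/continue, here's a minimal, unrelated example: the reduction stops as soon as it sees an element greater than 3.

Enum.reduce_while([1, 2, 4, 8], nil, fn x, _acc ->
  if x > 3, do: {:halt, x}, else: {:cont, nil}
end)
#=> 4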

Here is my code and expected output:

// Code
defmodule Compiler.Token do
  defstruct [:type, :value]
end

defmodule Compiler.Tokenizer do
  @token_types [
    {:def, "\\bdef\\b"},
    {:end, "\\bend\\b"},
    {:identifier, "\\b[a-zA-Z]+\\b"},
    {:integer, "\\b[0-9]+\\b"},
    {:oparen, "\\("},
    {:cparen, "\\)"}
  ]

  def tokenize() do
    code = "def f()
      1
    end"

    IO.inspect call_tokenize(code, [])
  end

  def call_tokenize("", accumulator) do
    accumulator
  end

  def call_tokenize(code, accumulator) do
    Enum.reduce_while(@token_types, "", fn {type, re}, acc ->
      result = Regex.run(~r/\A#{re}/, code)

      if result do
        value = hd(result)
        base = byte_size(value)
        token = %Compiler.Token{type: type, value: value}
        tokens = binary_part(code, base, byte_size(code) - base)
          |> String.trim()
          |> call_tokenize(accumulator ++ [token])
        {:halt, tokens}
      else
        {:cont, acc}
      end
    end)
  end
end


// Expected output
[%Compiler.Token{type: :def, value: "def"},
 %Compiler.Token{type: :identifier, value: "f"},
 %Compiler.Token{type: :oparen, value: "("},
 %Compiler.Token{type: :cparen, value: ")"},
 %Compiler.Token{type: :integer, value: "1"},
 %Compiler.Token{type: :end, value: "end"}]    

2 Answers

3 votes

I see you've already figured out how to replace reduce_while with explicit recursion in your answer. Here's a more idiomatic approach, the kind you'll see in most handwritten tokenizers in Elixir and Erlang. This approach can be much faster than naive regex-based tokenizers and also allows adding logic that pure regex-based tokenizers cannot (although in this case you don't need it).

Here's the code with some inline comments:

defmodule Compiler.Tokenizer.Dogbert do
  def tokenize(code), do: tokenize(code, [])

  # We're done. Reverse the tokens since we collected them in reverse order.
  defp tokenize("", acc), do: Enum.reverse(acc)
  # Remove leading whitespace.
  defp tokenize(<<h, rest::binary>>, acc) when h in ' \t\r\n', do: tokenize(rest, acc)
  # Identifier
  defp tokenize(binary = <<h, _::binary>>, acc) when h in ?a..?z do
    {value, rest} = take_while(binary, fn b -> b in ?a..?z end)
    type = case value do
      "def" -> :def
      "end" -> :end
      _ -> :identifier
    end
    tokenize(rest, [%Compiler.Token{type: type, value: value} | acc])
  end
  # Number
  defp tokenize(binary = <<h, _::binary>>, acc) when h in ?0..?9 do
    {value, rest} = take_while(binary, fn b -> b in ?0..?9 end)
    tokenize(rest, [%Compiler.Token{type: :integer, value: value} | acc])
  end
  # (
  defp tokenize("(" <> rest, acc), do: tokenize(rest, [%Compiler.Token{type: :oparen, value: "("} | acc])
  # )
  defp tokenize(")" <> rest, acc), do: tokenize(rest, [%Compiler.Token{type: :cparen, value: ")"} | acc])

  # A simple helper that extracts the leading part of the binary for as long as
  # `fun` returns `true` for each byte, starting from the first byte. It returns
  # the extracted part and the remaining binary.
  # We track the position with a byte offset for efficiency: accumulating bytes
  # the way we would for lists means allocating on every step, which we avoid by
  # keeping an offset and splitting the binary once at the end.
  defp take_while(binary, fun), do: take_while(binary, fun, 0)
  defp take_while(binary, fun, byte) do
    if byte < byte_size(binary) && fun.(:binary.at(binary, byte)) do
      take_while(binary, fun, byte + 1)
    else
      <<value::binary-size(byte), rest::binary>> = binary
      {value, rest}
    end
  end
end

Test:

code = "def f()
  1
end"

IO.inspect Compiler.Tokenizer.Dogbert.tokenize(code)

Output:

[%Compiler.Token{type: :def, value: "def"},
 %Compiler.Token{type: :identifier, value: "f"},
 %Compiler.Token{type: :oparen, value: "("},
 %Compiler.Token{type: :cparen, value: ")"},
 %Compiler.Token{type: :integer, value: "1"},
 %Compiler.Token{type: :end, value: "end"}]

Here's a benchmark using benchee comparing your implementation to mine. Your implementation has some easy-to-fix inefficiencies (it constructs the regexes on every call and appends to the accumulator with ++), but I'd expect a regex-based approach to always be slower than the one I used; see the sketch after the results for what those fixes might look like.

defmodule Compiler.Token do
  defstruct [:type, :value]
end

defmodule Compiler.Tokenizer.Dogbert do
  def tokenize(code), do: tokenize(code, [])

  # We're done. Reverse the tokens since we collected them in reverse order.
  defp tokenize("", acc), do: Enum.reverse(acc)
  # Remove leading whitespace.
  defp tokenize(<<h, rest::binary>>, acc) when h in ' \t\r\n', do: tokenize(rest, acc)
  # Identifier
  defp tokenize(binary = <<h, _::binary>>, acc) when h in ?a..?z do
    {value, rest} = take_while(binary, fn b -> b in ?a..?z end)
    type = case value do
      "def" -> :def
      "end" -> :end
      _ -> :identifier
    end
    tokenize(rest, [%Compiler.Token{type: type, value: value} | acc])
  end
  # Number
  defp tokenize(binary = <<h, _::binary>>, acc) when h in ?0..?9 do
    {value, rest} = take_while(binary, fn b -> b in ?0..?9 end)
    tokenize(rest, [%Compiler.Token{type: :integer, value: value} | acc])
  end
  # (
  defp tokenize("(" <> rest, acc), do: tokenize(rest, [%Compiler.Token{type: :oparen, value: "("} | acc])
  # )
  defp tokenize(")" <> rest, acc), do: tokenize(rest, [%Compiler.Token{type: :cparen, value: ")"} | acc])

  # A simple helper that extracts the leading part of the binary for as long as
  # `fun` returns `true` for each byte, starting from the first byte. It returns
  # the extracted part and the remaining binary.
  # We track the position with a byte offset for efficiency: accumulating bytes
  # the way we would for lists means allocating on every step, which we avoid by
  # keeping an offset and splitting the binary once at the end.
  defp take_while(binary, fun), do: take_while(binary, fun, 0)
  defp take_while(binary, fun, byte) do
    if byte < byte_size(binary) && fun.(:binary.at(binary, byte)) do
      take_while(binary, fun, byte + 1)
    else
      <<value::binary-size(byte), rest::binary>> = binary
      {value, rest}
    end
  end
end

defmodule Compiler.Tokenizer.PaulRuescher do
  @token_types [
    {:def, "\\bdef\\b"},
    {:end, "\\bend\\b"},
    {:identifier, "\\b[a-zA-Z]+\\b"},
    {:integer, "\\b[0-9]+\\b"},
    {:oparen, "\\("},
    {:cparen, "\\)"}
  ]

  def tokenize(code_string) do
    call_tokenize(code_string, [])
  end

  def call_tokenize("", accumulator) do
    accumulator
  end

  def call_tokenize(code_string, accumulator) do
    {type, value} = attempt_tokenize(@token_types, code_string)
    base = byte_size(value)
    token = %Compiler.Token{type: type, value: value}
    binary_part(code_string, base, byte_size(code_string) - base)
      |> String.trim()
      |> call_tokenize(accumulator ++ [token])
  end

  def attempt_tokenize(token_types, code_string, index \\ 0) do
    {type, re} = Enum.at(token_types, index)

    case Regex.run(~r/\A#{re}/, code_string) do
      nil -> attempt_tokenize(token_types, code_string, index + 1)
      value -> {type, hd(value)}
    end
  end
end

code = String.duplicate("def f()
  1
end", 1000)

IO.inspect Compiler.Tokenizer.PaulRuescher.tokenize(code) == Compiler.Tokenizer.Dogbert.tokenize(code)

Benchee.run(%{
  "@paulruescher" => fn -> Compiler.Tokenizer.PaulRuescher.tokenize(code) end,
  "@Dogbert" => fn -> Compiler.Tokenizer.Dogbert.tokenize(code) end
})

Results:

true
...

Name                    ips        average  deviation         median
@Dogbert             442.18        2.26 ms    ±17.03%        2.43 ms
@paulruescher         11.78       84.92 ms     ±8.37%       83.67 ms

Comparison:
@Dogbert             442.18
@paulruescher         11.78 - 37.55x slower
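
For reference, here's a rough sketch (untested; the module name is mine) of the two easy fixes mentioned above: compile the regexes once in the module attribute instead of on every call, and prepend tokens to the accumulator, reversing once at the end, instead of appending with ++. I've also made attempt_tokenize recurse over the pattern list directly rather than indexing with Enum.at, which is purely stylistic.

defmodule Compiler.Tokenizer.Precompiled do
  # Sketch only: same token set as above, but the regexes are compiled once at
  # compile time and stored in the module attribute.
  @token_types [
    {:def, ~r/\Adef\b/},
    {:end, ~r/\Aend\b/},
    {:identifier, ~r/\A[a-zA-Z]+\b/},
    {:integer, ~r/\A[0-9]+\b/},
    {:oparen, ~r/\A\(/},
    {:cparen, ~r/\A\)/}
  ]

  def tokenize(code_string), do: call_tokenize(code_string, [])

  # Input consumed: reverse once, since tokens were prepended.
  defp call_tokenize("", accumulator), do: Enum.reverse(accumulator)

  defp call_tokenize(code_string, accumulator) do
    {type, value} = attempt_tokenize(@token_types, code_string)
    base = byte_size(value)
    token = %Compiler.Token{type: type, value: value}

    code_string
    |> binary_part(base, byte_size(code_string) - base)
    |> String.trim()
    |> call_tokenize([token | accumulator])
  end

  # Walk the precompiled patterns until one matches at the start of the string.
  defp attempt_tokenize([{type, re} | rest], code_string) do
    case Regex.run(re, code_string) do
      nil -> attempt_tokenize(rest, code_string)
      [value | _] -> {type, value}
    end
  end
end
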
0 votes

I've refactored my previous code example and am already much happier with this. I can see a few edge cases, like the tokenizer never matching any of the regexes, but I think I'm OK with this for now (see the note after the code for one way to handle that).

defmodule Compiler.Token do
  defstruct [:type, :value]
end

defmodule Compiler.Tokenizer do
  @token_types [
    {:def, "\\bdef\\b"},
    {:end, "\\bend\\b"},
    {:identifier, "\\b[a-zA-Z]+\\b"},
    {:integer, "\\b[0-9]+\\b"},
    {:oparen, "\\("},
    {:cparen, "\\)"}
  ]

  def tokenize(code_string) do
    call_tokenize(code_string, [])
  end

  def call_tokenize("", accumulator) do
    accumulator
  end

  # Match the next token, strip it (and surrounding whitespace) from the front
  # of the string, then recurse on the remainder.
  def call_tokenize(code_string, accumulator) do
    {type, value} = attempt_tokenize(@token_types, code_string)
    base = byte_size(value)
    token = %Compiler.Token{type: type, value: value}
    binary_part(code_string, base, byte_size(code_string) - base)
      |> String.trim()
      |> call_tokenize(accumulator ++ [token])
  end

  # Try each token type in order until one of the regexes matches at the start
  # of the string.
  def attempt_tokenize(token_types, code_string, index \\ 0) do
    {type, re} = Enum.at(token_types, index)

    case Regex.run(~r/\A#{re}/, code_string) do
      nil -> attempt_tokenize(token_types, code_string, index + 1)
      value -> {type, hd(value)}
    end
  end
end
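
For the edge case mentioned above, where none of the regexes match, one possible tweak (just a sketch, I haven't needed it yet) would be to handle the nil from Enum.at explicitly and raise a clearer error instead of crashing on the tuple match:

def attempt_tokenize(token_types, code_string, index \\ 0) do
  case Enum.at(token_types, index) do
    nil ->
      # No token type matched the remaining input.
      raise "unable to tokenize: #{inspect(code_string)}"

    {type, re} ->
      case Regex.run(~r/\A#{re}/, code_string) do
        nil -> attempt_tokenize(token_types, code_string, index + 1)
        value -> {type, hd(value)}
      end
  end
end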