I am evaluating the use of Erlang ETS to store a large in-memory data set. My test data source is a CSV file that consumes only 350 MB of disk.
My parser reads the file row by row, splits each row into a list, then creates a tuple and stores it in ETS, using a "bag" configuration.
After loading all the data into ETS I noticed that my computer's 8 GB of RAM was all gone, and the OS had started using virtual memory, with total usage somewhere near 16 GB. The Erlang BEAM process seems to consume about 10 times more memory than the size of the data on disk.
Here is the test code:
-module(load_test_data).
-author("gextra").
%% API
-export([test/0]).
init_ets() ->
ets:new(memdatabase, [bag, named_table]).
parse(File) ->
{ok, F} = file:open(File, [read, raw]),
parse(F, file:read_line(F), []).
parse(F, eof, Done) ->
file:close(F),
lists:reverse(Done);
parse(F, Line, Done) ->
parse(F, file:read_line(F), [ parse_row_commodity_data(Line) | Done ]).
parse_row_commodity_data(Line) ->
{ok, Data} = Line,
%%io:fwrite(Data),
LineList = re:split(Data,"\,",[{return,list}]),
ReportingCountry = lists:nth(1, LineList),
YearPeriod = lists:nth(2, LineList),
Year = lists:nth(3, LineList),
Period = lists:nth(4, LineList),
TradeFlow = lists:nth(5, LineList),
Commodity = lists:nth(6, LineList),
PartnerCountry = lists:nth(7, LineList),
NetWeight = lists:nth(8, LineList),
Value = lists:nth(9, LineList),
IsReported = lists:nth(10, LineList),
ets:insert(memdatabase, {YearPeriod ++ ReportingCountry ++ Commodity , { ReportingCountry, Year, Period, TradeFlow, Commodity, PartnerCountry, NetWeight, Value, IsReported } }).
test() ->
init_ets(),
parse("/data/000-2010-1.csv").