-module(parse).
-export([bimodal_tokenize/2, csv_tok/1, csv/1]).


%% Bi-modal quote/escape tokenizer with single-character controls.
%% This structure seems quite common for ad-hoc languages.
%% Token stream structured as a left fold.

%% Input is either an igen.erl impure generator, or a source.erl lazy
%% stream.  
bimodal_tokenize(Config, {igen,_,_}=IGen) ->
    %% upk/1 is only called once for each stream head, so
    %% to_source_leaky/1 can be used as long as we close properly.
    try bimodal_tokenize(Config, igen:to_source_leaky(IGen))
    catch C:E -> igen:close(IGen), throw({C,E})
    end;

bimodal_tokenize(Config, InSrc) ->
    fun(Fun, Init) ->
            tok_fld(Config, normal, [], upk(InSrc), Fun, Init)
    end.

%% Input uses the source.erl interface, with the added promise to only
%% unpack each head once, meaning it can operate on igen as well.


%% Input is an igen.erl impure generator.
upk(Src) -> source:unpack(Src).

%% Left fold core routine.
tok_fld(_,normal,Stack,eof,F,S) -> atm(Stack,F,S);
tok_fld(C,normal,Stack,{Char,Rest},F,S) ->
    case maps:find(Char, C) of
        {ok, escape} ->
            error(bad_escape);
        {ok, quote} ->
            tok_fld(C, quote, Stack, upk(Rest), F, S);
        {ok, Token} ->
            tok_fld(C, normal, [], upk(Rest), F, F(Token,atm(Stack,F,S)));
        _ ->
            tok_fld(C, normal, [Char | Stack], upk(Rest), F, S)
    end;
tok_fld(C,quote,Stack,{Char,Rest},F,S) ->
    case maps:find(Char, C) of
        {ok, escape} ->
            {Char1,Rest1} = upk(Rest),
            CharTx = 
                case maps:find({escape,Char1},C) of
                    {ok, CharEsc} -> CharEsc;
                    _ -> Char1
                end,
            tok_fld(C,quote,[CharTx|Stack],upk(Rest1),F,S);
        {ok, quote} ->
            tok_fld(C,normal,Stack,upk(Rest),F,S);
        _ ->
            tok_fld(C,quote,[Char|Stack],upk(Rest),F,S)
            
    end.
%%atm([],    _, S) -> S; %% Skip empty?
atm(Stack, F, S) -> F({atom,lists:reverse(Stack)},S).


csv_tok({list, List}) ->
    csv_tok(source:from_list(List));
csv_tok({file, FileName}) ->
    {ok, Data} = file:read_file(FileName),
    csv_tok({list, binary_to_list(Data)});
csv_tok(InSrc) ->
    bimodal_tokenize(
      #{ %% Used by tokenizer
         $"  => quote,
         $\\ => escape,
         %% Left in output stream
         $,  => comma,
         $\n => lf,
         $\r => cr,
         %% Escaped characters
         {escape, $r} => 13,
         {escape, $n} => 10
       },
      InSrc).

csv(In) ->
    Tokens = fold:to_list(csv_tok(In)),
    csv_p(Tokens, [], []).

r(Q) -> lists:reverse(Q).
b(L) -> list_to_binary(L).

%% Not really a parser: doesn't need a stack to perform recursion.
%% Compare e.g. to gdb.erl msg_parse
    
%% I: input
%% C: colum state
%% R: row state

csv_p([{atom,""}],  [], R)   -> r(R); %% empty atom between last lf and eof.
csv_p([{atom,A}|I], C,  R)   -> csv_p(I, [b(A)|C], R);
csv_p([comma   |I], C,  R)   -> csv_p(I, C,        R); %% only used for tokenizing
csv_p([cr      |I], C,  R)   -> csv_p(I, C,        R); %% ignored, support CRLF as well
csv_p([lf      |I], C,  R)   -> csv_p(I, [],       [r(C)|R]); 

csv_p(Input, Queue, Stack) -> error({parse,Input,Queue,Stack}).