This is only part of my huffman tree generated using ocaml. The tree is represented as (char*int list) list:
[(' ', [0]); ('e', [1; 0]); ('t', [1; 1; 0]); ('a', [1; 1; 1; 0]);
('o', [1; 1; 1; 1; 0]); ('n', [1; 1; 1; 1; 1; 0]).....].
The (char*int list) is the code and the corresponding encoded bitstream. I'm wondering if this is a correct tree or I understood something wrong. In this way, the longest encoded ASC II code will be 255 bits. The original file is 213.3k and after encoding, it becomes 227k while in the instructions, I was told it should generate a file around 119k. I don't know where my problem is because I did everything following the instructions. Can someone tell me what is wrong in here?
My biggest problem is that: if I use huffman coding, only the 8 most frequent chars can save me space while the other 247 chars will cost extra space, is that true? If it isn't, why?
The codes I wrote was following the instructions in this link: http://www.cs.cornell.edu/Courses/cs3110/2012sp/hw/ps3/ps3.html
This is my code of encoding function:
type huffmantree = Node of huffmantree*(int*int)*huffmantree
| Leaf of char*int | Nil
type encoding = char * (int list)
let look_up (chr: char) (encl : encoding list) : int list =
let rec look_up_rec encl =
match encl with
| [] -> raise (Failure "Not found")
| (ch,theL)::tl -> if ch = chr then theL
else look_up_rec tl
in
look_up_rec encl
;;
let get_codes (hm : huffmantree): encoding list =
let rec get_codes_rec aTree word=
match aTree with
| Nil -> []
| Node (Leaf(lKey,lFreq),value,Nil) -> [(lKey,[0])]
| Node (Leaf(lKey,lFreq),value,Leaf(rKey,rFreq)) ->
[(lKey,List.append word [0]);(rKey,List.append word [1])]
| Node (Leaf(lKey,lFreq),value,rNode) ->
(lKey,List.append word [0])::(get_codes_rec rNode (List.append word [1]))
in
get_codes_rec hm []
;;
let encode (text : char list) : huffmantree * int list =
let sortedT = List.fast_sort (fun ch1 ch2->
if (int_of_char ch1)>=(int_of_char) ch2 then 1 else -1) text
in
let rec cre_freq_list aList m =
match aList with
| [] -> []
| hd::[] -> [(hd,m+1)]
| hd1::hd2::tl -> if hd1=hd2 then cre_freq_list (hd2::tl) (m+1)
else (hd1,(m+1))::(cre_freq_list (hd2::tl) 0)
in
let sortedF = List.fast_sort (fun (ch1,fr1) (ch2,fr2) ->
if fr1>=fr2 then 1 else -1) (cre_freq_list sortedT 0)
in
let rec createHuff sortedF=
match sortedF with
| [] -> Nil
| (ch,va)::[] -> Node (Leaf (ch,va),(256,va),Nil)
| (ach,aval)::tl ->
let rec creH_rec the_tl sib n freq=
match the_tl with
| (bch,bval)::[] -> Node(Leaf (bch,bval),(n,bval+freq),sib)
| (bch,bval)::btl -> creH_rec btl
(Node (Leaf (bch,bval),(n,bval+freq),sib)) (n+1)
(freq+bval)
in creH_rec tl (Leaf(ach,aval)) 256 aval
in
let huff = createHuff sortedF
in
let rec make_codes text =
match text with
| [] -> []
| hd::tl -> List.append (look_up hd (get_codes huff))
(make_codes tl)
in
(huff,(make_codes text))