2
votes

This is only part of my huffman tree generated using ocaml. The tree is represented as (char*int list) list:

[(' ', [0]); ('e', [1; 0]); ('t', [1; 1; 0]); ('a', [1; 1; 1; 0]);
 ('o', [1; 1; 1; 1; 0]); ('n', [1; 1; 1; 1; 1; 0]).....].

The (char*int list) is the code and the corresponding encoded bitstream. I'm wondering if this is a correct tree or I understood something wrong. In this way, the longest encoded ASC II code will be 255 bits. The original file is 213.3k and after encoding, it becomes 227k while in the instructions, I was told it should generate a file around 119k. I don't know where my problem is because I did everything following the instructions. Can someone tell me what is wrong in here?

My biggest problem is that: if I use huffman coding, only the 8 most frequent chars can save me space while the other 247 chars will cost extra space, is that true? If it isn't, why?

The codes I wrote was following the instructions in this link: http://www.cs.cornell.edu/Courses/cs3110/2012sp/hw/ps3/ps3.html

This is my code of encoding function:

type huffmantree = Node of huffmantree*(int*int)*huffmantree 
 | Leaf of char*int | Nil
type encoding = char * (int list)

let look_up (chr: char) (encl : encoding list) : int list =
  let rec look_up_rec encl = 
    match encl with
    | [] -> raise (Failure "Not found")
    | (ch,theL)::tl -> if ch = chr then theL
                       else look_up_rec tl
    in
    look_up_rec encl
;;

let get_codes (hm : huffmantree): encoding list = 
  let rec get_codes_rec aTree word=
  match aTree with
  | Nil -> []
  | Node (Leaf(lKey,lFreq),value,Nil) -> [(lKey,[0])]
  | Node (Leaf(lKey,lFreq),value,Leaf(rKey,rFreq)) ->  
    [(lKey,List.append word [0]);(rKey,List.append word [1])]
  | Node (Leaf(lKey,lFreq),value,rNode) -> 
    (lKey,List.append word [0])::(get_codes_rec rNode (List.append     word [1]))
  in
  get_codes_rec hm []
;;

let encode (text : char list) : huffmantree * int list = 
  let sortedT = List.fast_sort (fun ch1 ch2->   
    if (int_of_char ch1)>=(int_of_char) ch2 then 1 else -1) text
  in
  let rec cre_freq_list aList m = 
    match aList with
    | [] -> []
    | hd::[] -> [(hd,m+1)]
    | hd1::hd2::tl -> if hd1=hd2 then cre_freq_list (hd2::tl) (m+1)
                      else (hd1,(m+1))::(cre_freq_list  (hd2::tl) 0)
  in
  let sortedF = List.fast_sort (fun (ch1,fr1) (ch2,fr2) ->
    if fr1>=fr2 then 1 else -1) (cre_freq_list sortedT 0)
  in
  let rec createHuff sortedF= 
    match sortedF with
    | [] -> Nil
    | (ch,va)::[] -> Node (Leaf (ch,va),(256,va),Nil)
    | (ach,aval)::tl -> 
      let rec creH_rec the_tl sib n freq= 
        match the_tl with
        | (bch,bval)::[] -> Node(Leaf (bch,bval),(n,bval+freq),sib)
        | (bch,bval)::btl -> creH_rec btl 
          (Node (Leaf (bch,bval),(n,bval+freq),sib)) (n+1) 
          (freq+bval)
    in creH_rec tl (Leaf(ach,aval)) 256 aval
  in
  let huff = createHuff sortedF
  in
  let rec make_codes text = 
    match text with
    | [] -> []
    | hd::tl -> List.append (look_up hd (get_codes huff)) 
      (make_codes tl)
  in
  (huff,(make_codes text))
1
Give us your code so that we can help you. - Thomash
... and maybe the instructions as well... - gasche

1 Answers

3
votes

Looking at the resulting tree, it appears that you don't implement the Huffman's algorithm. I doubt the 'e' is more frequent in your text than any other letter. Without your code I can only guess but maybe when merging the two lightest trees you inserted the resulting tree at the end of the list of trees to merge instead of inserting it at the right place according to its weight.

In your code createHuff is declared recursive but there is no recursive call. Your function createHuff never compares the values inside the sortedF list don't you think this is a problem? It means that createHuff will always yield the same tree (with different labels but with the same structure).