0
votes

I am designing code for SAO filtering, and it is taking too long to synthesize. The module takes 66x66 pixels (1 CTU) as input and produces a 64x64 output, and it does this for a whole frame. I am assuming 8 CTUs in one frame.

   // saocalc -- SAO (sample adaptive offset) filtering of 8 CTUs that are held in
   // an internal memory image.
   //
   // Input image `mem`, loaded from "0-7ctus.txt": 8 records of 4357 bytes each.
   //   byte 0          : sao_type of the CTU
   //   bytes 1..4356   : 66x66 pixels, row-major
   // Output image `mem1`: 8 records of 64x64 bytes, written as
   //   mem1[row + col*64 + 4096*ctu] (same addressing as the original code).
   //
   // sao_type selects the operation applied to the pixel at [row][col]:
   //   0 : edge offset, neighbours [row][col-1]     and [row][col+1]
   //   1 : edge offset, neighbours [row-1][col]     and [row+1][col]
   //   2 : edge offset, neighbours [row-1][col-1]   and [row+1][col+1]
   //   3 : edge offset, neighbours [row-1][col+1]   and [row+1][col-1]
   //   4 : band offset on the pixel value
   //   other : sao_out is left untouched and saoval is forced to 3
   //
   // Ports:
   //   clk              clock; every stage below is registered on its rising edge
   //   sao_offset1..4   offsets added to (1, 2) or subtracted from (3, 4) a pixel
   //   saoval           the filtered pixel at [63][63] of the CTU processed last,
   //                    or 3 when sao_type is not 0..4
   //
   // The work is done in three registered stages (load -> filter -> store). The
   // CTU index travels with the data (bu_idx, out_idx) so that a result is stored
   // in the slot of the CTU it was computed from.
   //
   // NOTE(review): the loops are fully unrolled by synthesis, so this is still a
   // very large single-cycle datapath; mem1 is never read inside the module and
   // mem has no write port, so a synthesizer is free to remove both.
   // NOTE(review): mem1 is addressed row + col*64 while mem is read col + row*66;
   // the output is therefore stored transposed relative to the input -- confirm
   // that this is intended.
   module saocalc(input clk,
                  input [7:0] sao_offset1, input [7:0] sao_offset2,
                  input [7:0] sao_offset3, input [7:0] sao_offset4,
                  output reg [7:0] saoval);

      localparam NUM_CTU   = 8;     // CTUs in the memory image
      localparam IN_DIM    = 66;    // input CTU is IN_DIM x IN_DIM pixels
      localparam OUT_DIM   = 64;    // output CTU is OUT_DIM x OUT_DIM pixels
      localparam IN_WORDS  = 4357;  // 1 sao_type byte + 66*66 pixel bytes
      localparam OUT_WORDS = 4096;  // 64*64 pixel bytes

      reg [7:0] mem     [0:NUM_CTU*IN_WORDS-1];   // 0..34855
      reg [7:0] mem1    [0:NUM_CTU*OUT_WORDS-1];  // 0..32767
      reg [7:0] bu      [0:IN_DIM-1][0:IN_DIM-1];
      reg [7:0] sao_out [0:OUT_DIM-1][0:OUT_DIM-1];
      reg [7:0] sao_type;

      reg [7:0] k         = 8'd0;   // index of the next CTU to load
      reg [7:0] bu_idx    = 8'd0;   // CTU index of the data currently in bu
      reg       bu_valid  = 1'b0;   // bu was loaded on the previous edge
      reg [7:0] out_idx   = 8'd0;   // CTU index of the data currently in sao_out
      reg       out_valid = 1'b0;   // sao_out was computed from a freshly loaded bu

      initial
        begin
          $readmemh("0-7ctus.txt", mem);
        end

      // Edge-offset decision for one pixel `cur` against its two neighbours.
      // Reads the module inputs sao_offset1..4 directly.
      function [7:0] edge_offset;
        input [7:0] cur;
        input [7:0] nb_a;
        input [7:0] nb_b;
        begin
          if ((cur > nb_a) && (cur > nb_b))
            edge_offset = cur - sao_offset4;          // larger than both neighbours
          else if ((cur < nb_a) && (cur < nb_b))
            edge_offset = cur + sao_offset1;          // smaller than both neighbours
          else if (((cur < nb_a) && (cur == nb_b)) ||
                   ((cur == nb_a) && (cur < nb_b)))
            edge_offset = cur + sao_offset2;          // smaller than one, equal to the other
          else if (((cur == nb_a) && (cur > nb_b)) ||
                   ((cur > nb_a) && (cur == nb_b)))
            edge_offset = cur - sao_offset3;          // larger than one, equal to the other
          else
            edge_offset = cur;
        end
      endfunction

      // Band-offset decision for one pixel: four bands of width 8 from 8 to 39.
      function [7:0] band_offset;
        input [7:0] cur;
        begin
          if ((cur > 7) && (cur < 16))
            band_offset = cur + sao_offset1;
          else if ((cur > 15) && (cur < 24))
            band_offset = cur + sao_offset2;
          else if ((cur > 23) && (cur < 32))
            band_offset = cur - sao_offset3;
          else if ((cur > 31) && (cur < 40))
            band_offset = cur - sao_offset4;
          else
            band_offset = cur;
        end
      endfunction

      // Stage 1: copy CTU k from mem into bu, then advance k.
      // The original had "else k<=0; k<=k+1;" without begin/end, so the
      // increment always overrode the reset and k never wrapped after 8 CTUs.
      always @(posedge clk)
        begin : load_ctu
          integer r, c;   // private loop indices (no sharing between always blocks)
          if (k < NUM_CTU)
            begin
              for (r = 0; r < IN_DIM; r = r + 1)
                for (c = 0; c < IN_DIM; c = c + 1)
                  bu[r][c] <= mem[c + (r*IN_DIM) + IN_WORDS*k + 1];
              sao_type <= mem[IN_WORDS*k];
              bu_idx   <= k;
              bu_valid <= 1'b1;
              k        <= k + 8'd1;
            end
          else
            begin
              // One idle cycle after the last CTU, then start again at CTU 0.
              bu_valid <= 1'b0;
              k        <= 8'd0;
            end
        end

      // Stage 2: filter the CTU held in bu into sao_out.
      // sao_out and saoval use nonblocking assignments so that stage 3 and any
      // logic outside the module sample them without a simulation race.
      always @(posedge clk)
        begin : apply_sao
          integer r, c;
          reg [7:0] px;   // combinational temporary; holds the last pixel computed

          out_idx   <= bu_idx;
          out_valid <= bu_valid;

          case (sao_type)
            8'd0:   // horizontal neighbours; column 0 is copied unfiltered
              begin
                for (r = 0; r < OUT_DIM; r = r + 1)
                  begin
                    sao_out[r][0] <= bu[r][0];
                    for (c = 1; c < OUT_DIM; c = c + 1)
                      begin
                        px = edge_offset(bu[r][c], bu[r][c-1], bu[r][c+1]);
                        sao_out[r][c] <= px;
                      end
                  end
                saoval <= px;
              end

            8'd1:   // vertical neighbours; row 0 is copied unfiltered
              begin
                for (c = 0; c < OUT_DIM; c = c + 1)
                  begin
                    sao_out[0][c] <= bu[0][c];
                    for (r = 1; r < OUT_DIM; r = r + 1)
                      begin
                        px = edge_offset(bu[r][c], bu[r-1][c], bu[r+1][c]);
                        sao_out[r][c] <= px;
                      end
                  end
                saoval <= px;
              end

            8'd2:   // diagonal neighbours; row 0 and column 0 are copied unfiltered
              begin
                for (c = 0; c < OUT_DIM; c = c + 1)
                  sao_out[0][c] <= bu[0][c];
                for (r = 1; r < OUT_DIM; r = r + 1)
                  begin
                    sao_out[r][0] <= bu[r][0];
                    for (c = 1; c < OUT_DIM; c = c + 1)
                      begin
                        px = edge_offset(bu[r][c], bu[r-1][c-1], bu[r+1][c+1]);
                        sao_out[r][c] <= px;
                      end
                  end
                saoval <= px;
              end

            8'd3:   // anti-diagonal neighbours; row 0 and column 0 are copied unfiltered
              begin
                for (c = 0; c < OUT_DIM; c = c + 1)
                  sao_out[0][c] <= bu[0][c];
                for (r = 1; r < OUT_DIM; r = r + 1)
                  begin
                    sao_out[r][0] <= bu[r][0];
                    for (c = 1; c < OUT_DIM; c = c + 1)
                      begin
                        px = edge_offset(bu[r][c], bu[r-1][c+1], bu[r+1][c-1]);
                        sao_out[r][c] <= px;
                      end
                  end
                saoval <= px;
              end

            8'd4:   // band offset on every pixel
              begin
                for (r = 0; r < OUT_DIM; r = r + 1)
                  for (c = 0; c < OUT_DIM; c = c + 1)
                    begin
                      px = band_offset(bu[r][c]);
                      sao_out[r][c] <= px;
                    end
                saoval <= px;
              end

            default:
              saoval <= 8'd3;
          endcase
        end

      // Stage 3: store the filtered CTU in its own slot of mem1.
      // The original indexed mem1 with the load counter k, which by the time
      // sao_out is available already points at a later CTU (and at k == 8 falls
      // outside mem1). out_idx / out_valid keep the slot aligned with the data.
      always @(posedge clk)
        begin : store_ctu
          integer r, c;
          if (out_valid)
            for (r = 0; r < OUT_DIM; r = r + 1)
              for (c = 0; c < OUT_DIM; c = c + 1)
                mem1[r + (c*OUT_DIM) + (OUT_WORDS*out_idx)] <= sao_out[r][c];
        end

   endmodule
1
Synthesize with what software and target? - osgx
how long too long? how long does it take? how long do you expect it to take and why? - Greg

1 Answers

2
votes

An initial block is not recommended in synthesizable code.

mem and mem1 look like test bench code. mem does not get any input from outside the module, and mem1 does not drive any output.

If they are only used to load and store data you could move them outside the saocalc module.

  // Loader from the question: on each rising clock edge, copy CTU k out of the
  // memory image into the working buffer bu and pick up its sao_type.
  always@(posedge clk)
    begin
      if(k<8)
        begin
          // 66x66 pixels of CTU k; each CTU record is 4357 bytes long and the
          // pixel data starts one byte into the record.
          for(j=0;j<66;j=j+1)
            begin
              for(i=0;i<66;i=i+1)
                bu[j][i]<=mem[i+(j*66)+4357*k+1];
            end
          // The first byte of the record holds the CTU's sao_type.
          sao_type<=mem[4357*k];
          // Advance only while loading; an unconditional k<=k+1 after the
          // if/else would override the reset below.
          k<=k+1;
        end
      else
        // All 8 CTUs have been loaded: start again from the first one.
        k<=0;
    end

The code above is used to load the buffer. The synthesizer will unroll the loop, i.e. it will replace the loop with

bu[0][0] = mem[4357*k+1] ;
bu[0][1] = mem[1+4357*k+1] ;
.....
.....
 bu[65][65] = mem[65+66*65+4357*k+1] ;

creating 66*66 = 4356 lines of code. There are 6 other such loops in the code => 64*64*6 = 24576 more lines of code.[ not all will result in more registers as bu is a common element ]

It will connect mem to bu via an 8-bit 8-to-1 mux => 4356 (instances) * 8 * 8 * X = 278784*X (272K*X) gates, where X is the size of a mux. There are many more such muxes in the design.

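Number of flops: 34856*8 + 32768*8 + 66*66*8 + 64*64*8 = 608608 (about 594 K), counting every element of mem, mem1, bu and sao_out (an array declared [0:N] has N+1 entries).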

By now this block is rivaling a full chip ( multi-million gate) itself.

To make matters tougher for the synthesizer now, the sizes and blocks are not in powers of 2 - 66 , 34855 , 4357

The synthesizer now has its hands full trying to connect everything and work out the details of this design.

mem1 and mem again appear to be only for input and output storage, so they can be moved outside the block. mem and mem1 will finally (after all the hard work) be optimized out, as mem has no input into it and mem1 has no output. The block may also need to be rewritten using state machines to reduce interconnect complexity. This code attempts to perform the whole operation within a single clock cycle (and 8 operations within 8 clocks). The block needs a reset. The code will work functionally, but from a synthesis perspective it is huge.