3
votes

For a client project I need a simple sprite blitter that works on old hardware. OpenGL 1.1 seemed like an easy answer since I had old code I could reuse for that.

Anyway, everything works just fine, but to my great surprise, it turns out that for blitting moving sprites (i.e. rendering textured quads in an orthographic projection), the glBegin/glTexCoord2/glVertex2/glEnd mode is always as fast as glDrawArrays. This was tested on both old and new hardware, and I'm a bit puzzled, as I expected a different result!

To note though, both modes are certainly fast enough for the requirements (so this is really curious-nerd talk rather than serious-work talk!), but the demo for the client allowed pumping the sprite count up to 10000 or more, and then we noticed that enabling the glDrawArrays option usually gave the same speed, and on some machines was half as fast as glBegin/glEnd.

Below is the bit of the code that renders. Note that for this demo the vertex and texture arrays are global variables that are next to each other.

// Excerpt of the per-frame render loop: each sprite is updated and then drawn
// as one textured quad, either in immediate mode or through glDrawArrays.
for index:=0 to (sprite_list.count-1) do begin
 s:=sprite_list[index];
 // update is expected to refresh the global coords array read below
 s.update;
 // one texture bind and one colour change per sprite
 glBindTexture(GL_TEXTURE_2D,s.sprite_id);
 glColor4b(127,127,127,s.ialpha);
 if immediate then begin
  // immediate mode: four texcoord/vertex pairs taken from the global coords array
  glBegin(GL_QUADS);
   glTexCoord2f(0,0); glVertex2i(coords[0].x,coords[0].y);
   glTexCoord2f(0,1); glVertex2i(coords[1].x,coords[1].y);
   glTexCoord2f(1,1); glVertex2i(coords[2].x,coords[2].y);
   glTexCoord2f(1,0); glVertex2i(coords[3].x,coords[3].y);
  glEnd();
 end else 
  // vertex-array mode: a single draw call covering just this sprite's 4 vertices
  glDrawArrays(GL_QUADS, 0, 4);

edit: here's the code for the delphi unit. Make a new project with a form, add timer1 (enabled, interval=1) and timer2 (disabled, interval=1) objects on it, replace unit code with this, plug in form events: doubleclick/keydown/resize/destroy. Note that this was compiled in an older version of delphi so some opengl headers are added to the start of the unit. Also, press left/right to change number of sprites, and space to switch between glDrawArrays and glBegin/glEnd.

unit Unit1;

interface

uses
  Windows, Messages, SysUtils, Variants, Classes, Graphics, Controls, Forms,
  Dialogs, ExtCtrls;

type
  // Main (and only) form of the demo. The two timers and the event handlers
  // below must be wired up in the form designer, as described in the post.
  TForm1 = class(TForm)
    // Timer1 runs the one-shot initialisation (its handler disables it again).
    Timer1: TTimer;
    // Timer2 is enabled by Timer1's handler and renders one frame per tick.
    Timer2: TTimer;
    procedure Timer1Timer(Sender: TObject);
    procedure FormDestroy(Sender: TObject);
    procedure Timer2Timer(Sender: TObject);
    procedure FormResize(Sender: TObject);
    procedure FormKeyDown(Sender: TObject; var Key: Word; Shift: TShiftState);
    procedure FormDblClick(Sender: TObject);
  private
    { Private declarations }
  public
    { Public declarations }
  end;

var
  Form1: TForm1;

implementation

{$R *.dfm}

uses
 opengl;

// OpenGL 1.1 enum values declared locally; per the post, the opengl unit of the
// older Delphi version used here lacks some of the needed headers.
const
 GL_BGRA_EXT = $80E1;
 GL_VERTEX_ARRAY = $8074;
 GL_TEXTURE_COORD_ARRAY = $8078;

type
 PGLvoid = Pointer;

// Texture-object and vertex-array entry points, bound directly to the
// opengl32 library with the stdcall calling convention.
procedure glDeleteTextures(n: GLsizei; textures: pGLuint);stdcall;external opengl32;
procedure glGenTextures(n: GLsizei; textures: pGLuint);stdcall;external opengl32;
procedure glBindTexture(target: GLenum; texture: GLuint);stdcall;external opengl32;
procedure glEnableClientState(state: GLenum);stdcall;external opengl32;
procedure glDisableClientState(state: GLenum);stdcall;external opengl32;
procedure glTexCoordPointer(size: GLint; _type: GLenum; stride: GLsizei; const _pointer: PGLvoid);stdcall;external opengl32;
procedure glVertexPointer(size: GLint; _type: GLenum; stride: GLsizei; const _pointer: PGLvoid);stdcall;external opengl32;
procedure glDrawArrays(mode: GLenum; first: GLint; count: GLsizei);stdcall;external opengl32;

// Integer 2D point; four of these form the vertex array of one quad.
type
 tgeo_point=record
  x,y:longint;
 end;

var
 // Texture coordinates for the four quad corners as (s,t) pairs:
 // (0,0) (0,1) (1,1) (1,0). Registered with glTexCoordPointer in gl_init.
 gl_Texture_Coordinates:array [0..7] of single=(0,0,0,1,1,1,1,0);
 // Corner positions of the sprite currently being drawn. Rewritten by
 // tsprite.update and registered with glVertexPointer in gl_init.
 coords:array [0..3] of tgeo_point;
 // true = draw with glBegin/glEnd, false = draw with glDrawArrays
 // (toggled with the space key in FormKeyDown).
 immediate:boolean=false;

// One moving sprite: a position, a direction of travel per axis and an alpha value.
type
 tsprite=class
  private
   // current position (used as the first corner of the quad)
   ix,iy:longint;
   // direction of travel per axis, either 1 or -1
   ix_dir,iy_dir:longint;
   // alpha passed to glColor4b when drawing; the constructor picks random(128)
   ialpha:longint;
  public
   constructor create;
   destructor Destroy;override;

   // Moves the sprite one step inside a w-by-h area and writes its quad
   // corners into the global coords array.
   procedure update(w,h:longint);
 end;

// Unit-wide rendering state shared by the init, render and teardown routines.
var
 // device context of the form, obtained in gl_init and released in gl_un_init
 gl_dc:hdc;
 // pixel format index returned by choosepixelformat
 gl_pixel_format:longint;
 // rendering context returned by wglcreatecontext
 gl_context:longint;
 // texture name of the single shared sprite texture
 gl_sprite_id:cardinal;
 // pixel data for the sprite texture, filled in by sprite_init
 sprite:array [0..1023] of dword;
 sprite_width:longint=32;
 sprite_height:longint=32;
 // list of tsprite instances; created in Timer1Timer
 sprite_list:tlist;
 // gettickcount values of the most recent frames, newest at index 0;
 // used by Timer2Timer to compute the FPS shown in the caption
 times:array [0..10] of longint=(0,0,0,0,0,0,0,0,0,0,0);

// Creates the OpenGL context on form1 and sets all fixed render state:
// blending, texturing, an orthographic projection matching the client area,
// and the client-side vertex / texture-coordinate arrays.
// Failures are only reported through a message box; setup continues regardless.
procedure gl_init;
var
 wanted,actual:tpixelformatdescriptor;
 format_ok:boolean;
 view_w,view_h:longint;
begin
 gl_dc:=getdc(form1.handle);

 // Requested format: double-buffered 32-bit RGBA window surface on the main plane.
 fillchar(wanted,sizeof(wanted),0);
 with wanted do begin
  nSize:=sizeof(wanted);
  nVersion:=1;
  dwFlags:=PFD_DRAW_TO_WINDOW or PFD_SUPPORT_OPENGL or PFD_DOUBLEBUFFER;
  iPixelType:=PFD_TYPE_RGBA;
  cColorBits:=32;
  iLayerType:=PFD_MAIN_PLANE;
 end;

 gl_pixel_format:=choosepixelformat(gl_dc,@wanted);
 if gl_pixel_format=0 then
  showmessage('error');
 if not setpixelformat(gl_dc,gl_pixel_format,@wanted) then
  showmessage('error');

 // Read back what was actually selected and compare it with the request.
 describepixelformat(gl_dc,gl_pixel_format,sizeof(actual),actual);
 format_ok:=((wanted.dwFlags and actual.dwFlags)=wanted.dwFlags) and
            (wanted.iPixelType=actual.iPixelType) and
            (wanted.cColorBits=actual.cColorBits) and
            (wanted.iLayerType=actual.iLayerType);
 if not format_ok then
  showmessage('errrrror');

 gl_context:=wglcreatecontext(gl_dc);
 if gl_context=0 then
  showmessage('error');
 if not wglmakecurrent(gl_dc,gl_context) then
  showmessage('error');

 // Alpha-blended, textured rendering.
 glEnable(GL_TEXTURE_2D);
 glEnable(GL_BLEND);
 glBlendFunc(GL_SRC_ALPHA,GL_ONE_MINUS_SRC_ALPHA);

 // One GL unit per pixel of the client area.
 view_w:=form1.clientwidth;
 view_h:=form1.clientheight;
 glViewport(0,0,view_w,view_h);
 glMatrixMode(GL_PROJECTION);
 glLoadIdentity();
 glOrtho(0,view_w,0,view_h,-1,1);
 glMatrixMode(GL_MODELVIEW);

 glColor4f(1,1,1,1);

 // Both arrays are globals that stay registered for the lifetime of the context.
 glEnableClientState(GL_VERTEX_ARRAY);
 glVertexPointer(2,GL_INT,0,@coords);
 glEnableClientState(GL_TEXTURE_COORD_ARRAY);
 glTexCoordPointer(2,GL_FLOAT,0,@gl_Texture_Coordinates);

 // Present one cleared (black) frame.
 glClearColor(0,0,0,1);
 glClear(GL_COLOR_BUFFER_BIT);
 SwapBuffers(gl_dc);
end;

// Undoes gl_init: disables both client-side arrays, deletes the rendering
// context and releases the form's device context.
procedure gl_un_init;
begin
 glDisableClientState(GL_VERTEX_ARRAY);
 glDisableClientState(GL_TEXTURE_COORD_ARRAY);

 wglDeleteContext(gl_context);
 ReleaseDC(form1.handle,gl_dc);
end;

// Re-fits the viewport and the orthographic projection to form1's current
// client area, leaving the matrix mode on GL_MODELVIEW.
procedure gl_resize;
var
 view_w,view_h:longint;
begin
 view_w:=form1.clientwidth;
 view_h:=form1.clientheight;

 glViewport(0,0,view_w,view_h);

 glMatrixMode(GL_PROJECTION);
 glLoadIdentity();
 glOrtho(0,view_w,0,view_h,-1,1);

 glMatrixMode(GL_MODELVIEW);
end;

// Packs four channel values into one 32-bit colour laid out as $AARRGGBB.
// Only the low 8 bits of each argument are used.
// Each channel is converted to cardinal before shifting: shifting a longint
// alpha >= 128 by 24 bits yields a negative intermediate, which fails the
// assignment to the cardinal result when range checking is enabled.
function make_color(a,r,g,b:longint):cardinal;
begin
 result:=(cardinal(a and 255) shl 24) or
         (cardinal(r and 255) shl 16) or
         (cardinal(g and 255) shl 8) or
          cardinal(b and 255);
end;

// Fills the global sprite pixel buffer with white pixels whose alpha grows
// towards one corner, then uploads it as the shared texture gl_sprite_id
// (nearest filtering, clamped wrapping).
procedure sprite_init;
var
 col,row,alpha:longint;
begin
 for row:=0 to pred(sprite_height) do
  for col:=0 to pred(sprite_width) do begin
   // with the default 32x32 size this product runs from 0 up to 255
   alpha:=(col div 2+1)*(row div 2+1)-1;
   sprite[row*sprite_width+col]:=make_color(alpha,$ff,$ff,$ff);
  end;

 glGenTextures(1,@gl_sprite_id);
 glBindTexture(GL_TEXTURE_2D,gl_sprite_id);

 glTexParameteri(GL_TEXTURE_2D,GL_TEXTURE_MAG_FILTER,GL_NEAREST);
 glTexParameteri(GL_TEXTURE_2D,GL_TEXTURE_MIN_FILTER,GL_NEAREST);
 glTexParameteri(GL_TEXTURE_2D,GL_TEXTURE_WRAP_T,GL_CLAMP);
 glTexParameteri(GL_TEXTURE_2D,GL_TEXTURE_WRAP_S,GL_CLAMP);

 // internal format 4 = four colour components; source data is BGRA bytes
 glTexImage2D(GL_TEXTURE_2D,0,4,
              sprite_width,sprite_height,0,
              GL_BGRA_EXT,GL_UNSIGNED_BYTE,@sprite);
end;

// Deletes the shared sprite texture created by sprite_init.
procedure sprite_un_init;
begin
 glDeleteTextures(1,@gl_sprite_id);
end;

// Creates a sprite at a random position inside form1's client area, with a
// random direction (+1 or -1) on each axis and a random alpha in 0..127.
constructor tsprite.create;
begin
 inherited create;

 ix:=random(form1.clientwidth);
 iy:=random(form1.clientheight);

 // random(2) is 0 or 1, which maps onto -1 or +1
 ix_dir:=2*random(2)-1;
 iy_dir:=2*random(2)-1;

 ialpha:=random(128);
end;

// Nothing of its own to release; simply chains to the inherited destructor.
destructor tsprite.Destroy;
begin
 inherited;
end;

// Moves the sprite one pixel along each axis, bouncing at 0 and at w / h,
// then writes the four corners of its quad into the global coords array.
//   w, h - width and height of the area the sprite's position is kept within.
// Corner order matches gl_Texture_Coordinates: (0,0) (0,1) (1,1) (1,0).
procedure tsprite.update(w,h:longint);
begin
 // horizontal step and bounce
 if ix_dir=-1 then begin
  dec(ix);
  if ix<0 then begin
   ix:=0;
   ix_dir:=1;
  end;
 end else begin
  inc(ix);
  if ix>=w then begin
   ix:=w;
   ix_dir:=-1;
  end;
 end;

 // vertical step and bounce
 if iy_dir=-1 then begin
  dec(iy);
  if iy<0 then begin
   iy:=0;
   iy_dir:=1;
  end;
 end else begin
  inc(iy);
  if iy>=h then begin
   iy:=h;
   iy_dir:=-1;
  end;
 end;

 coords[0].x:=ix;
 coords[0].y:=iy;
 coords[1].x:=ix;
 coords[1].y:=iy+sprite_height;
 coords[2].x:=ix+sprite_width;
 coords[2].y:=iy+sprite_height;
 // x extent is the sprite WIDTH (the original used sprite_height here, which
 // only worked because the demo sprite is square)
 coords[3].x:=ix+sprite_width;
 coords[3].y:=iy;
end;

// Frees every sprite and the list itself, then tears down the texture and
// the OpenGL context.
procedure TForm1.FormDestroy(Sender: TObject);
var
 index:longint;
begin
 // sprite_list is only created by Timer1Timer; if the form is destroyed
 // before that one-shot timer has fired it is still nil and must not be touched.
 if sprite_list<>nil then begin
  for index:=0 to (sprite_list.count-1) do
   tsprite(sprite_list[index]).free;
  sprite_list.free;
  sprite_list:=nil;
 end;
 sprite_un_init;
 gl_un_init;
end;

// --nVidia video card memory
//const
// GL_GPU_MEM_INFO_TOTAL_AVAILABLE_MEM_NVX=$9048;
// GL_GPU_MEM_INFO_CURRENT_AVAILABLE_MEM_NVX=$9049;

// Shows the OpenGL vendor, renderer and version strings of the current
// context, followed by a memory line.
procedure TForm1.FormDblClick(Sender: TObject);
var
 a,b:longint;
begin
 // The NVX memory queries below are disabled, so a and b would otherwise be
 // printed uninitialised; give them a defined value.
 a:=0;
 b:=0;
// glGetIntegerv(GL_GPU_MEM_INFO_TOTAL_AVAILABLE_MEM_NVX,@a);
// glGetIntegerv(GL_GPU_MEM_INFO_CURRENT_AVAILABLE_MEM_NVX,@b);
 showmessage(
  glgetstring(GL_VENDOR)+#13#10+
 glgetstring(GL_RENDERER)+#13#10+
 glgetstring(GL_VERSION)
 +#13#10+'Memory: '+inttostr(b)+'/'+inttostr(a)
// +#13#10+glgetstring(GL_EXTENSIONS)
 );
end;

// Keyboard control of the demo:
//   space  - toggle between glBegin/glEnd and glDrawArrays
//   escape - close the form
//   left   - remove up to 100 sprites
//   right  - add 100 sprites
procedure TForm1.FormKeyDown(Sender: TObject; var Key: Word; Shift: TShiftState);
var
 index,first:longint;
begin
 case key of
  vk_space:immediate:=not immediate;
  vk_escape:form1.close;
  // sprite_list is nil until Timer1Timer has run, so both list operations guard on it
  vk_left:if (sprite_list<>nil) and (sprite_list.count>0) then begin
            // never walk below index 0 when fewer than 100 sprites remain
            first:=sprite_list.count-100;
            if first<0 then first:=0;
            for index:=(sprite_list.count-1) downto first do begin
             tsprite(sprite_list[index]).free;
             sprite_list.delete(index);
            end;
           end;
  vk_right:if sprite_list<>nil then
            for index:=1 to 100 do sprite_list.add(tsprite.create);
 end;
end;

// Keeps the GL viewport and projection in step with the form's client area.
procedure TForm1.FormResize(Sender: TObject);
begin
 gl_resize();
end;

// One-shot start-up: creates the GL context, the sprite texture and the
// sprite list, then starts the render timer.
procedure TForm1.Timer1Timer(Sender: TObject);
begin
 // make sure this handler runs only once
 timer1.enabled:=false;

 gl_init;
 sprite_init;
 sprite_list:=tlist.create;

 // Start rendering only after everything exists: gl_init may show a modal
 // error box, which pumps messages and would otherwise let Timer2Timer run
 // while sprite_list is still nil.
 timer2.enabled:=true;
end;

// Renders one frame: updates and draws every sprite using the currently
// selected submission mode, swaps buffers, and shows the mode, the sprite
// count and the FPS (averaged over the last 10 frames) in the form caption.
procedure TForm1.Timer2Timer(Sender: TObject);
var
 i,view_w,view_h,span_ms,fps:longint;
 spr:tsprite;
 mode_name:string;
begin
 glClear(GL_COLOR_BUFFER_BIT);

 view_w:=form1.clientwidth;
 view_h:=form1.clientheight;

 // every sprite shares the same texture, so it is bound once per frame
 glBindTexture(GL_TEXTURE_2D,gl_sprite_id);
 for i:=0 to pred(sprite_list.count) do begin
  spr:=sprite_list[i];
  // update rewrites the global coords array that both draw paths read
  spr.update(view_w,view_h);
  glColor4b(127,127,127,spr.ialpha);
  if not immediate then
   glDrawArrays(GL_QUADS,0,4)
  else begin
   glBegin(GL_QUADS);
    glTexCoord2f(0,0);
    glVertex2i(coords[0].x,coords[0].y);
    glTexCoord2f(0,1);
    glVertex2i(coords[1].x,coords[1].y);
    glTexCoord2f(1,1);
    glVertex2i(coords[2].x,coords[2].y);
    glTexCoord2f(1,0);
    glVertex2i(coords[3].x,coords[3].y);
   glEnd();
  end;
 end;
 glBindTexture(GL_TEXTURE_2D,0);

 SwapBuffers(gl_dc);

 // shift the timestamp history by one slot and record this frame at the front
 for i:=high(times) downto 1 do
  times[i]:=times[pred(i)];
 times[0]:=gettickcount;

 // time taken by the last 10 frames; never allow a division by zero
 span_ms:=times[0]-times[10];
 if span_ms=0 then
  span_ms:=1;
 fps:=10*1000 div span_ms;

 if immediate then
  mode_name:='glBegin/glEnd '
 else
  mode_name:='glDrawArrays  ';
 form1.caption:=mode_name+'Sprites: '+inttostr(sprite_list.count)+' / FPS: '+inttostr(fps);
end;

end.

edit2: Big apologies to everyone, I had forgotten to mention that in this case a single texture per sprite is essential to the end result, even though I simplified the code by removing it to focus the render loop on glBegin/glEnd vs glDrawArrays. Sorry for misleading by omission!

2
A driver can implement immediate mode on top of a VBO streaming architecture, using a fixed attribute layout and an optimized shader pipeline. It also was the standard for several years, so it's not surprising that it can be fast. – ratchet freak
I don't really understand what you just wrote, but we ran this experiment on machines both recent and up to 13 years old, and in the case shown glDrawArrays was never faster than glBegin/glEnd. Generally performance was about the same on old machines, while on newer machines glBegin/glEnd was generally much faster. – Marladu
glBegin always faster than glDrawArrays? Try rendering something more than just a couple of quads, let's say something like 1 million quads; then you should clearly be able to see that glDrawArrays is faster, a lot faster. – vallentin
Hum? I guess the post wasn't clear, but I was talking about having 10000 moving textured quads; there is no difference with fewer than 1000. When I get home tonight I'll post the code for the Delphi unit that makes the little demo. I think it's about ~400 lines long; can I put that many lines in the first post so everyone can download and check it out? – Marladu
OK, I added the code for testing the cheapo demo thing to the first post. If glDrawArrays can be made faster than glBegin/glEnd in this situation, I'm willing to learn how! – Marladu

2 Answers

1
votes

I was actually working with OpenGL 13 years ago and I can tell you even back then if your application was structured correctly vertex arrays were usually quicker.

We did not have Vertex Buffer Objects way back then, but we had Interleaved Vertex Arrays (I literally mean glInterleavedArrays (...)) and Compiled Vertex Arrays. NVIDIA later created an extension (Vertex Array Ranges) that allowed vertex data to be stored in virtual memory (the address range for this virtual memory was designed to allow efficient DMA transfer).

AMD, who was still known as ATI back then, also had its own extension that improved vertex array performance (by storing vertex data on the server-side) called Vertex Array Object. Do not confuse AMD's extension with what we call VAOs in modern OpenGL. In actuality AMD's extension laid the foundation for Vertex Buffer Objects, it just has the unfortunate honor of sharing its name with something completely unrelated.

Now, all of the things I just discussed actually chart the evolution of vertex arrays in OpenGL. In particular, they show that the trend has been toward (1) storing vertex data in a user-defined memory organization (2) stored in server (GPU) memory for maximal re-use and (3) with as few API calls as possible. Immediate mode (glBegin / glEnd) violates all of those principles, all you can do with immediate mode is put the commands into a display list and hope that the driver takes care of points 2 and 3.


Update:

Also note, since we are talking about hardware dating back to the OpenGL 1.1 era, that graphics hardware did not always handle vertex transform. For a long time the ancestors of "GPUs" only accelerated rasterization and vertex transform was handled on the CPU. Before we had GPUs that could implement the entire graphics pipeline the separation between vertex and rasterization meant that it really did not matter how efficient your vertices could be passed into the pipeline because some of the work was done on the CPU. Once hardware T&L came along, server-side vertices were crucial.

This is where the crux of your problem lies. Your software is not set up in such a way that it would benefit from hardware acceleration of vertex transform. You are doing all of the transformation on the CPU and sending the GPU new data any time something needs to change. Modern applications use vertex shaders and static vertex data to do the same thing while minimizing the amount of data that needs to be sent to the GPU each frame. In modern software, most transforms can be done by updating a 4x4 matrix or two for use in a vertex shader.

On top of that, unless your software is vertex bound increasing the vertex efficiency will not improve performance appreciably. "Sprite blitting" sounds more like a fragment bound scenario to me, especially if the sprites are alpha blended.

0
votes
glDrawArrays(GL_QUADS, 0, 4);

You gotta increase your batch size to really see the performance advantage of vertex arrays.

The downside is that you generally have to re-architect your code a bit and be willing to tolerate some "redundant" storage and computation.

Example: (C++, but nothing too fancy beyond the GLM operator overloads for vector math)

#include <GL/glut.h>

#include <vector>
#include <iostream>
using namespace std;

#include <glm/glm.hpp>
#include <glm/gtc/random.hpp>
using namespace glm;

class Sprites
{
public:
    struct State
    {
        State() {}
        State( const vec2& pos, const vec2& vel ) : pos(pos), vel(vel) {}
        vec2 pos;
        vec2 vel;
    };

    struct Vertex
    {
        Vertex() {}
        Vertex( const vec4& color ) : color(color) {}
        vec2 pos;
        vec4 color;
    };

    size_t Size()
    {
        return states.size();
    }

    void PushBack( const State& state, const vec4& color )
    {
        states.push_back( state );
        verts.push_back( Vertex( color ) );
        verts.push_back( Vertex( color ) );
        verts.push_back( Vertex( color ) );
        verts.push_back( Vertex( color ) );
    }

    void Add( unsigned int number )
    {
        const float w = (float)glutGet( GLUT_WINDOW_WIDTH );
        const float h = (float)glutGet( GLUT_WINDOW_HEIGHT );
        for( unsigned int i = 0; i < number; ++i )
        {
            State state( glm::linearRand( vec2(-w,-h), vec2(w,h) ), glm::diskRand( 100.0f ) );
            vec4 color( glm::linearRand( vec4(1,1,1,1) * 0.1f, vec4(1,1,1,1) ) );
            PushBack( state, color );
        }
    }

    void Remove( unsigned int number )
    {
        if( states.size() >= number ) 
            states.resize( states.size() - number );
        if( verts.size() >= number * 4 )
            verts.resize( verts.size() - number * 4 );
    }

    void Step( float dt )
    {
        // run physics
        const float w = (float)glutGet( GLUT_WINDOW_WIDTH );
        const float h = (float)glutGet( GLUT_WINDOW_HEIGHT );
        const vec2 minExts = vec2(-w, -h);
        const vec2 maxExts = vec2(w, h);
        for( int i = 0; i < (int)states.size(); ++i )
        {
            State& state = states[i];

            if( state.pos.x < minExts.x || state.pos.x > maxExts.x )
                state.vel.x = -state.vel.x;
            if( state.pos.y < minExts.y || state.pos.y > maxExts.y )
                state.vel.y = -state.vel.y;

            state.pos += state.vel * dt;
        }

        // update geometry
        const vec2 spriteDims( 32, 32 );
        const vec2 offsets[4] =
        {
            vec2( -1, -1 ) * 0.5f * spriteDims,
            vec2(  1, -1 ) * 0.5f * spriteDims,
            vec2(  1,  1 ) * 0.5f * spriteDims,
            vec2( -1,  1 ) * 0.5f * spriteDims,
        };
        for( int i = 0; i < (int)states.size(); ++i )
        {
            verts[i*4 + 0].pos = states[i].pos + offsets[0];
            verts[i*4 + 1].pos = states[i].pos + offsets[1];
            verts[i*4 + 2].pos = states[i].pos + offsets[2];
            verts[i*4 + 3].pos = states[i].pos + offsets[3];
        }
    }

    void Draw( bool useVertexArrays )
    {
        if( verts.empty() ) return;

        if( useVertexArrays )
        {
            glEnableClientState( GL_VERTEX_ARRAY );
            glVertexPointer( 2, GL_FLOAT, sizeof(Vertex), &verts[0].pos );

            glEnableClientState( GL_COLOR_ARRAY );
            glColorPointer( 4, GL_FLOAT, sizeof(Vertex), &verts[0].color );

            glDrawArrays( GL_QUADS, 0, verts.size() );

            glDisableClientState( GL_VERTEX_ARRAY );
            glDisableClientState( GL_COLOR_ARRAY );
        }
        else
        {
            glBegin( GL_QUADS );
            for( size_t i = 0; i < states.size(); ++i )
            {
                glColor4fv(  &verts[i*4 + 0 ].color.r );
                glVertex2fv( &verts[i*4 + 0 ].pos.x );
                glColor4fv(  &verts[i*4 + 1 ].color.r );
                glVertex2fv( &verts[i*4 + 1 ].pos.x );
                glColor4fv(  &verts[i*4 + 2 ].color.r );
                glVertex2fv( &verts[i*4 + 2 ].pos.x );
                glColor4fv(  &verts[i*4 + 3 ].color.r );
                glVertex2fv( &verts[i*4 + 3 ].pos.x );
            }
            glEnd();
        }
    }

private:
    vector< State > states;
    vector< Vertex > verts;
};

// The single sprite batch of the demo, and the current submission mode
// (true = vertex arrays, false = immediate mode).
Sprites sprites;
bool useVAs = false;

// GLUT keyboard callback:
//   'a' adds 10000 sprites, 'z' removes 10000 sprites,
//   'v' toggles vertex arrays, ESC (27) exits the program.
// The mouse position (x, y) is not used.
void keyboard( unsigned char key, int x, int y )
{
    const unsigned int batch = 10000;

    if( key == 'a' )
    {
        sprites.Add( batch );
    }
    else if( key == 'z' )
    {
        sprites.Remove( batch );
    }
    else if( key == 'v' )
    {
        useVAs = !useVAs;
    }
    else if( key == 27 )
    {
        exit( 1 );
    }
}

// GLUT display/idle callback: logs the sprite count and the frame time,
// steps the simulation by the elapsed time, then redraws every sprite.
void display()
{
    // Elapsed time since the previous call (0 on the very first call).
    static int lastMs = glutGet( GLUT_ELAPSED_TIME );
    const int nowMs = glutGet( GLUT_ELAPSED_TIME );
    const float dt = ( nowMs - lastMs ) / 1000.0f;
    lastMs = nowMs;

    cout << "Sprites: " << sprites.Size() << "; "
         << "dt: " << dt * 1000.0f << "ms "
         << endl;

    sprites.Step( dt );

    glClear( GL_COLOR_BUFFER_BIT );

    // Orthographic projection spanning [-w,w] x [-h,h] of the window size.
    const double halfW = glutGet( GLUT_WINDOW_WIDTH );
    const double halfH = glutGet( GLUT_WINDOW_HEIGHT );
    glMatrixMode( GL_PROJECTION );
    glLoadIdentity();
    glOrtho( -halfW, halfW, -halfH, halfH, -1, 1 );

    glMatrixMode( GL_MODELVIEW );
    glLoadIdentity();

    sprites.Draw( useVAs );

    glutSwapBuffers();
}

// Entry point: opens a 640x480 double-buffered RGBA GLUT window, registers
// the callbacks, seeds the scene with 10000 sprites and enters the main loop.
int main( int argc, char** argv )
{
    const int windowWidth = 640;
    const int windowHeight = 480;
    const unsigned int initialSprites = 10000;

    glutInit( &argc, argv );
    glutInitDisplayMode( GLUT_DOUBLE | GLUT_RGBA );
    glutInitWindowSize( windowWidth, windowHeight );
    glutCreateWindow( "GLUT" );

    // display doubles as the idle callback so frames are rendered continuously
    glutDisplayFunc( display );
    glutIdleFunc( display );
    glutKeyboardFunc( keyboard );

    sprites.Add( initialSprites );

    glutMainLoop();
    return 0;
}