best rendering solution for my particular game

Started by jakethesnake, April 21, 2015, 13:42:29


jakethesnake

So, I'm going to have a top-down 2D game like Diablo 2, Age of Empires, Warcraft, etc.

I've gotten quite far using immediate mode and LWJGL < 3, but now it's time for a change, so I've upgraded to LWJGL 3.0 and intend to use OpenGL 3.2.

I've more or less decided on VBOs and shaders. I'll mostly be rendering textured and non-textured quads, so I'm hoping I can put most things in a single VBO.

Also, I want my VBO to be dynamic, i.e. I'll upload new data every render frame.

Now I've just managed to render two colored quads on my screen, using this technique. My question is whether I'm doing it right.

These are my steps:

1. Create a static index buffer and fill it with indices. Upload it to the GPU.
2. Create a FloatBuffer in which I'll store the vertices.
3. Render the game, which fills the FloatBuffer with values.
4. Upload the FloatBuffer to the GPU and render it all with glDrawElements.

My major concern is how I fill the FloatBuffer with floats in the render() method. Would I gain anything by putting in a big float[] array in one go instead?
Another question is whether I really need such a big index buffer. Shouldn't 6 entries be enough?

Below is my code. Does it look efficient enough for my needs? Do you have any tips?

import org.lwjgl.BufferUtils;

import dataTypes.Color;
import util.ShaderProgram;

import java.nio.FloatBuffer;
import java.nio.ShortBuffer;

import static org.lwjgl.opengl.GL11.*;
import static org.lwjgl.opengl.GL15.*;
import static org.lwjgl.opengl.GL20.*;
import static org.lwjgl.opengl.GL30.*;

public class Renderer2
{
    private util.ShaderProgram shaderProgram;

    private int vaoID;
    private int vboVertID;
    private int eboID;

    FloatBuffer vboBuff;
    
    private final int indicesPerQuad = 6;
    private int nrOfVert = 0;
    private static final int MAX_ELEMENTS = 1000;
    private static final int FLOATS_PER_ELEMENT = 4 * 6; // 4 vertices * (2 position + 4 color) floats
    private static final int BUFFER_SIZE = FLOATS_PER_ELEMENT * MAX_ELEMENTS;
    private static final int ELEMENT_SIZE = 4 * 8 + 4 * 16; // bytes per quad: 8 position floats + 16 color floats, 4 bytes each
    
    public Renderer2(){
      	String VERTEX = "#version 330 core" + "\n"
    			+ "layout(location = 0) in vec2 position;" + "\n"
    			+ "layout(location = 1) in vec4 color;" + "\n"
    			+ "out vec4 vColor;" + "\n"
    			+ "void main(){" + "\n"
    			+ "vColor = color;" + "\n"
    			+ "gl_Position = vec4(position, 0.0, 1.0);" + "\n"
    			+ "}";
    	
    	String FRAGMENT = "#version 330 core" + "\n"
    			+ "in vec4 vColor;" + "\n"
    			+ "out vec4 fragColor;" + "\n"
    			+ "void main(){" + "\n"
    			+ "fragColor = vColor;" + "\n"
    			+ "}";
    	
        shaderProgram = new ShaderProgram();
        shaderProgram.attachVertexShader(VERTEX);
        shaderProgram.attachFragmentShader(FRAGMENT);
        shaderProgram.link();

        vboBuff = BufferUtils.createFloatBuffer(BUFFER_SIZE);

        // Generate and bind a Vertex Array
        vaoID = glGenVertexArrays();
        glBindVertexArray(vaoID);

        short[] indices = new short[MAX_ELEMENTS*6];

        // Two triangles per quad: for vertices v0..v3 the pattern is (v0,v1,v2) and (v1,v2,v3).
        short tmp = 0;

        for (short i = 0; i < indices.length; i+=6){
        	indices[i] = tmp++;
        	indices[i+1] = tmp++;
        	indices[i+2] = tmp--;
        	indices[i+3] = tmp++;
        	indices[i+4] = tmp++;
        	indices[i+5] = tmp++;
        }

        // Create the vertex buffer object (the vertex data itself is uploaded later, in flush())
        vboVertID = glGenBuffers();
        glBindBuffer(GL_ARRAY_BUFFER, vboVertID);

        // Describe the vertex layout: location 0 = vec2 position, location 1 = vec4 color,
        // interleaved with a stride of 24 bytes (6 floats). The locations must match the
        // layout(location = ...) qualifiers in the vertex shader.
        glVertexAttribPointer(0, 2, GL_FLOAT, false, 24, 0);
        glVertexAttribPointer(1, 4, GL_FLOAT, false, 24, 8);

        // Create a ShortBuffer of indices
        ShortBuffer indicesBuffer = BufferUtils.createShortBuffer(indices.length);
        indicesBuffer.put(indices).flip();

        // Create the Element Buffer object
        eboID = glGenBuffers();
        glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, eboID);
        glBufferData(GL_ELEMENT_ARRAY_BUFFER, indicesBuffer, GL_STATIC_DRAW);

        glBindVertexArray(0);
    }
    
    public void bind(){
    	vboBuff.clear();
    	shaderProgram.bind();
    	nrOfVert = 0;
    }
    
    public void render(float x1, float x2, float y1, float y2, Color top, Color bottom){
        
        vboBuff.put(x1).put(y1);
        vboBuff.put(top.r).put(top.g).put(top.b).put(top.a);
        vboBuff.put(x2).put(y1);
        vboBuff.put(top.r).put(top.g).put(top.b).put(top.a);
        vboBuff.put(x1).put(y2);
        vboBuff.put(bottom.r).put(bottom.g).put(bottom.b).put(bottom.a);
        vboBuff.put(x2).put(y2);
        vboBuff.put(bottom.r).put(bottom.g).put(bottom.b).put(bottom.a);

        nrOfVert += indicesPerQuad; // 6 indices queued per quad
    }
    
    public void flush(float delta){
    	
    	vboBuff.flip();
    	
    	glBindVertexArray(vaoID);
    	glBindBuffer(GL_ARRAY_BUFFER, vboVertID);
    	glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, eboID);
    	shaderProgram.bind();
        
        glEnableVertexAttribArray(0);
        glEnableVertexAttribArray(1);
    	
        glBufferData(GL_ARRAY_BUFFER, vboBuff, GL_DYNAMIC_DRAW);

        glDrawElements(GL_TRIANGLES, nrOfVert, GL_UNSIGNED_SHORT, 0); // draw only the indices queued this frame

    	glBindBuffer(GL_ARRAY_BUFFER, 0);
    	glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, 0);
        glBindVertexArray(0);
        ShaderProgram.unbind();
    }

    public void dispose()
    {
        // Dispose the program
        shaderProgram.dispose();

        // Dispose the vertex array
        glBindVertexArray(0);
        glDeleteVertexArrays(vaoID);

        // Dispose the buffer object
        glBindBuffer(GL_ARRAY_BUFFER, 0);
        glDeleteBuffers(vboVertID);

        // Dispose the element buffer object
        glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, 0);
        glDeleteBuffers(eboID);
    }

}
 

Kai

Which technique to use in OpenGL depends 100% on your specific problem to solve and on every dimension of that problem:
- do you want to render just 2 quads or 100.000 quads per frame?
- do you want to texture all quads with the same texture or each with a different one?
- do you want to use the same shader/material for all quads or many separate shaders for each quad?
- do you want to have hundreds or thousands of small point lights in the scene or just 2 lights?
- do you want hard or soft shadows?
- do you want your vertices to be dynamic, that is, not static geometry but geometry that changes dynamically, independent of the camera settings?

Every answer you give to the questions above will decide the particular road you should take in architecting your rendering pipeline and which OpenGL methods to use, and will also likely narrow the possible answers to every other question.
Example:
- you want just 2 quads -> stick with immediate mode rendering
- you want 100.000 quads -> might want to use a VBO with instancing (a small geometry buffer holding the fixed geometry shared by all quads, plus an additional buffer object for per-instance data such as position, color, etc.; see the sketch after this list)
- you want many different textures that exceed the maximum texture binding targets -> you must use shaders with texture arrays (that will however limit you to one format specification for all texture layers in the array) or better use "bindless textures" (that however limits you to very recent cards)
- you want different shaders/materials -> you cannot use instancing anymore (or might use a big uber shader)
- you want many small point lights -> you may want to use deferred shading, which is entirely different from standard forward rendering
- you want hard shadows -> you should use stencil shadows and that is a big topic on its own with various shader tricks and optimizations
- you want dynamic geometry -> might want to use a VBO with streaming usage, OR, if possible, use a shader to do the transformation instead of having the CPU push the whole vertex list to the GPU every frame via VBO uploads
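
As a rough sketch of the instancing idea from the second bullet (attribute locations, buffer names and instanceCount are made up for illustration, not taken from your code):

    // Static geometry: one unit quad, uploaded once.
    int quadVbo = glGenBuffers();
    glBindBuffer(GL_ARRAY_BUFFER, quadVbo);
    FloatBuffer corners = BufferUtils.createFloatBuffer(8);
    corners.put(new float[]{0,0, 1,0, 0,1, 1,1}).flip();
    glBufferData(GL_ARRAY_BUFFER, corners, GL_STATIC_DRAW);
    glVertexAttribPointer(0, 2, GL_FLOAT, false, 0, 0);   // location 0: quad corner
    glEnableVertexAttribArray(0);

    // Per-instance data: one (x, y, r, g, b, a) record per quad, refilled only when quads change.
    int instanceVbo = glGenBuffers();
    glBindBuffer(GL_ARRAY_BUFFER, instanceVbo);
    glVertexAttribPointer(1, 2, GL_FLOAT, false, 24, 0);  // location 1: instance position
    glVertexAttribPointer(2, 4, GL_FLOAT, false, 24, 8);  // location 2: instance color
    glEnableVertexAttribArray(1);
    glEnableVertexAttribArray(2);
    glVertexAttribDivisor(1, 1);                          // GL33: advance once per instance, not per vertex
    glVertexAttribDivisor(2, 1);

    // One call then draws every quad:
    glDrawArraysInstanced(GL_TRIANGLE_STRIP, 0, 4, instanceCount); // GL31

The vertex shader then combines the fixed corner with the per-instance attributes (for example, the corner plus the instance position).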

So, first be very clear and specific on every aspect of your game design, because that will affect your specific render engine capabilities and the design thereof.

But to answer your specific question about filling a VBO:
You can use a float[] array of course. Or just use putFloat or FloatBuffer.put methods directly. I think neither approach will yield any significant performance advantage over the other.
GC pressure, however, will be higher if you allocate a new float[] array each frame, and you might notice lags every now and then.
In general: Try to avoid allocating anything on your hot path (render loop) of your application. If possible, allocate everything beforehand during initialization.
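
For example, something along these lines (names are made up; the scratch array and the FloatBuffer are allocated once at init and reused every frame):

    // Allocated once, outside the render loop - no per-frame garbage.
    float[] scratch = new float[4 * 6];                       // one quad: 4 vertices * (2 pos + 4 color) floats
    FloatBuffer vboBuff = BufferUtils.createFloatBuffer(1000 * 4 * 6);

    void putQuad(float x1, float y1, float x2, float y2, Color top, Color bottom) {
        int i = 0;
        scratch[i++] = x1; scratch[i++] = y1;
        scratch[i++] = top.r; scratch[i++] = top.g; scratch[i++] = top.b; scratch[i++] = top.a;
        scratch[i++] = x2; scratch[i++] = y1;
        scratch[i++] = top.r; scratch[i++] = top.g; scratch[i++] = top.b; scratch[i++] = top.a;
        scratch[i++] = x1; scratch[i++] = y2;
        scratch[i++] = bottom.r; scratch[i++] = bottom.g; scratch[i++] = bottom.b; scratch[i++] = bottom.a;
        scratch[i++] = x2; scratch[i++] = y2;
        scratch[i++] = bottom.r; scratch[i++] = bottom.g; scratch[i++] = bottom.b; scratch[i++] = bottom.a;
        vboBuff.put(scratch, 0, i);                           // one bulk put instead of 24 single put() calls
    }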

And to answer your particular question about how many indices your element buffer needs: for two quads you of course only need 12 indices in your element buffer: 3 indices per triangle times 2 triangles per quad times 2 quads.

jakethesnake

I'm going to continue on my journey toward the perfect renderer for my particular case. I might get some more input, and it might help someone else.

First I'll answer the particular questions put to me:

- do you want to render just 2 quads or 100.000 quads per frame?
100.000!

- do you want to texture all quads with the same texture or each with a different one?
I can use the same texture for all of them, using a sprite atlas.
However, I'm also going to use some kind of normal/height map. Should this be a separate texture, so that I can reuse the same texture coords, or should it go into the same atlas? I know one can bind several textures at once in OpenGL. Which is the optimal way to go here? (See the sketch after this list.)

- do you want to use the same shader/material for all quads or many separate shaders for each quad?
Perhaps 2-3 shaders/render call

- do you want to have hundreds or thousands of small point lights in the scene or just 2 lights?
I'd like support for a dynamic number of lights, maybe 0-100.

- do you want hard or soft shadows?
Soft is pretty.

- do you want your vertices to be dynamic, that is, no static geometry, but somehow dynamically changing not based on the camera settings?
The way I see it, the only solution is for it to be dynamic at this point. I want big, big maps and don't have the space to store all of the vertices on the GPU.
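
What I had in mind for binding the diffuse atlas and a matching normal map at the same time is roughly this (uniform names and texture id variables are just placeholders; glActiveTexture comes from GL13):

    // Tell the samplers which texture units to read from (done once after linking).
    shaderProgram.bind();
    glUniform1i(shaderProgram.getUniformLocation("u_texture"), 0);  // diffuse atlas on unit 0
    glUniform1i(shaderProgram.getUniformLocation("u_normals"), 1);  // normal map on unit 1

    // Bind both textures before drawing.
    glActiveTexture(GL_TEXTURE0);
    glBindTexture(GL_TEXTURE_2D, diffuseTexId);
    glActiveTexture(GL_TEXTURE1);
    glBindTexture(GL_TEXTURE_2D, normalTexId);   // same atlas layout, so the same texCoords work for both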

Now, I've made some changes to my renderer class.

    SpriteRenderer(int elements) throws Exception{
    	
    	MAX_ELEMENTS = elements;
    	BUFFER_SIZE = FLOATS_PER_ELEMENT*MAX_ELEMENTS;
    	
      	String VERTEX = "#version 330 core" + "\n"
    			+ "uniform vec2 screen = vec2(0.004, -0.004);" + "\n" //the screen trans vector
      			+ "layout(location = 0) in vec2 position;" + "\n"
    			+ "layout(location = 1) in vec2 texCoo;" + "\n"
    			+ "vec2 trans = vec2(-1.0,1.0);" + "\n"
    			+ "out vec2 vTexCoo;" + "\n"
    			+ "void main(){" + "\n"
    			+ "vTexCoo = texCoo;" + "\n"
    			+ "gl_Position = vec4((position * screen)+trans, 0.0, 1.0);" + "\n"
    			+ "}";
      	
    	String FRAGMENT = "#version 330 core" + "\n"
    			+ "in vec2 vTexCoo;" + "\n"
    			+ "uniform sampler2D u_texture;" + "\n"
    			+ "out vec4 fragColor;" + "\n"
    			+ "void main(){" + "\n"
    			+ "vec4 texColor = texture2D(u_texture, vTexCoo);" + "\n"
    			+ "fragColor = texColor;" + "\n"
    			+ "}";
    	
        shaderProgram = new ShaderProgram();
        shaderProgram.attachVertexShader(VERTEX);
        shaderProgram.attachFragmentShader(FRAGMENT);
        shaderProgram.link();
        U_SCREEN_ID = shaderProgram.getUniformLocation("screen");
        vboBuff = BufferUtils.createByteBuffer(BUFFER_SIZE*4);

        // vaoID
        vaoID = glGenVertexArrays();
        glBindVertexArray(vaoID);

        //VBO
        vboID = glGenBuffers();
        glBindBuffer(GL_ARRAY_BUFFER, vboID);
        glVertexAttribPointer(0, 2, GL_INT, false, 2*4, 0);
        glVertexAttribPointer(1, 2, GL_FLOAT, false, 2*4, 8*4);
        
        //EBO
        int[] indices = new int[MAX_ELEMENTS*6];
        int tmp = 0;
        
        for (int i = 0; i < indices.length; i+=6){
        	indices[i] = tmp++;
        	indices[i+1] = tmp++;
        	indices[i+2] = tmp--;
        	indices[i+3] = tmp++;
        	indices[i+4] = tmp++;
        	indices[i+5] = tmp++;
        }
        
        IntBuffer indicesBuffer = BufferUtils.createIntBuffer(indices.length);
        indicesBuffer.put(indices).flip();

        eboID = glGenBuffers();
        glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, eboID);
        glBufferData(GL_ELEMENT_ARRAY_BUFFER, indicesBuffer, GL_STATIC_DRAW);

        glBindVertexArray(0);
    }


The crucial thing here is the vertex shader. All I've done so far is map real screen coordinates to "OpenGL ones". I've seen many examples of this, and all of them used 4x4 matrices. I'm using 2D vectors instead, skipping the z-coordinate entirely. I hope this will give me some performance gain, but it might end up problematic when I get to shadows and light.
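
For reference, a minimal sketch of feeding that screen uniform from the window size on the Java side (windowWidth/windowHeight are placeholder names; U_SCREEN_ID is the location queried in the constructor):

    // Maps pixel coords to NDC: x in [0..width] -> [-1..1], y in [0..height] -> [1..-1]
    // (together with the constant trans = vec2(-1.0, 1.0) in the shader).
    shaderProgram.bind();
    glUniform2f(U_SCREEN_ID, 2.0f / windowWidth, -2.0f / windowHeight);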

    void render(float tx1, float tx2, float ty1, float ty2, float x1, float x2, float y1, float y2){
      
    	vboBuff.putInt((int)x1).putInt((int)y2);
    	vboBuff.putInt((int)x2).putInt((int)y2);
        vboBuff.putInt((int)x1).putInt((int)y1);
        vboBuff.putInt((int)x2).putInt((int)y1);
        vboBuff.putFloat(tx1).putFloat(ty2);
        vboBuff.putFloat(tx2).putFloat(ty2);
        vboBuff.putFloat(tx1).putFloat(ty1);
        vboBuff.putFloat(tx2).putFloat(ty1);
    	
    }


All my renderable objects in the game use floats as coordinates. However, floats are filthy and unreliable. I encountered a bug where I got a strobe-like effect on moving objects. After debugging I found that the vertex coordinates were the culprit. I cast them to ints and add them like that instead. Casting might be costly, what do I know? But there isn't much to be done.
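
If the goal is just snapping to whole pixels, rounding instead of truncating toward zero could also work (just a thought, not what the code above does):

    // Snap to the nearest pixel instead of truncating toward zero:
    vboBuff.putInt(Math.round(x1)).putInt(Math.round(y2));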

    void flush(){
    	
    	vboBuff.flip();
    	
    	glBindVertexArray(vaoID);
    	glBindBuffer(GL_ARRAY_BUFFER, vboID);
    	glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, eboID);
    	shaderProgram.bind();
        
        glEnableVertexAttribArray(0);
        glEnableVertexAttribArray(1);
        
        glBufferData(GL_ARRAY_BUFFER, vboBuff, GL_DYNAMIC_DRAW); // uploads from position to limit
        
        glDrawElements(GL_TRIANGLES, vboBuff.limit() / 64 * 6, GL11.GL_UNSIGNED_INT, 0); // 64 bytes per quad, 6 indices per quad
        glBindTexture(GL_TEXTURE_2D, 0);
    	vboBuff.clear();
    	
    }


It doesn't feel good sending that much data over the bus 60 times per second, but I can't see another solution.
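
One thing I might try (just a sketch, not what the code above does) is to orphan the buffer before refilling it, so the driver doesn't have to stall on last frame's copy:

    glBindBuffer(GL_ARRAY_BUFFER, vboID);
    glBufferData(GL_ARRAY_BUFFER, BUFFER_SIZE * 4L, GL_STREAM_DRAW); // orphan: allocate fresh storage, no data
    glBufferSubData(GL_ARRAY_BUFFER, 0, vboBuff);                    // upload this frame's vertices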

BUG: the only problem right now is when I try to render the sprite located in the top-left corner of my sprite atlas. What happens is that the entire screen is filled with the top-left pixel of that sprite - pixel 0,0. If I move the sprite one pixel to the right, it renders fine. Very, very strange, and undoubtedly the work of floats.

Kai

Even though you are using VBO's right now, you are still thinking in terms of immediate mode rendering and just seem to translate your usual glVertex* and glNormal* and glColor* calls into Buffer.putInt's and putFloat's which you then upload each frame to your own VBO.
Even though you can do that, it is not supposed to improve on immediate mode rendering.
Ideally, you would build up your VBO just once, which I believe is quite possible in your simple case of having just a bunch of quads (even if you had 1.000.000 quads), and then do any transformations on your vertices' positions in your vertex shader.
So I would advise you to build up your VBO once, in some kind of "init" method, for all quads in your level, then just issue a draw call sourcing from that VBO when rendering, and only update some uniform matrices or vectors (as you like) in your shader to move the camera over the quads.
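
A minimal sketch of what I mean, building on the vertex shader you already posted (the camera uniform and the Java-side names are just placeholders): the vertices stay put, only a uniform changes per frame.

        // Vertex shader: subtract a camera offset (in pixels) before mapping to NDC.
        String VERTEX = "#version 330 core" + "\n"
                + "uniform vec2 screen;" + "\n"
                + "uniform vec2 camera;" + "\n"   // camera position in the same pixel coordinates as 'position'
                + "layout(location = 0) in vec2 position;" + "\n"
                + "void main(){" + "\n"
                + "gl_Position = vec4(((position - camera) * screen) + vec2(-1.0, 1.0), 0.0, 1.0);" + "\n"
                + "}";

        // Java side, once per frame, instead of re-uploading every vertex:
        glUniform2f(cameraLocation, cameraX, cameraY);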

Cornix

Quote from: Kai on April 24, 2015, 14:03:26
Even though you are using VBO's right now, you are still thinking in terms of immediate mode rendering and just seem to translate your usual glVertex* and glNormal* and glColor* calls into Buffer.putInt's and putFloat's which you then upload each frame to your own VBO.
Even though you can do that, it is not supposed to improve on immediate mode rendering.
It is still an improvement since you use fewer OpenGL calls, but the rest of what you say is valid.

Still, even if you re-upload all of your vertices in each frame you get a huge improvement over immediate mode. This is because now you upload in bulk instead of one-by-one. You also only use 1 draw call instead of 1 draw call per quad.

jakethesnake

So, I've come some way since last time. Now I have lighting and shadows, yay! But I have a few questions regarding my shaders for this.

I've got three textures I'm sampling from: diffuse, normal map, and a "shadow map" that I've drawn to earlier.

     	String VERTEX = "#version 330 core" + "\n"
    			+ "uniform vec2 screen;" + "\n"
      			+ "layout(location = 0) in vec2 position;" + "\n"
    			+ "layout(location = 1) in vec2 texCoo;" + "\n"
    			+ "layout(location = 2) in float opacity;" + "\n"
    			+ "const vec2 trans = vec2(-1.0,1.0);" + "\n"
    			
    			+ "out vec2 vTexCoo;" + "\n"
    			+ "out float vOpacity;" + "\n"
    			+ "out vec2 vShadowCoo;" + "\n"
    			
    			+ "void main(){" + "\n"
    				+ "vTexCoo = texCoo;" + "\n"
    				+ "vOpacity = opacity;" + "\n"
    				+ "vShadowCoo = position*screen*0.5;" + "\n" // this texture has the same dimensions as the screen.
    				+ "gl_Position = vec4((position * screen)+trans, 0.0, 1.0);" + "\n"
    			+ "}";


1. My question regarding the vertex shader: as I've stated before, I'm using a truly dynamic VBO. As of now, I use a ByteBuffer to add floats for all vertex information (I've noticed that ByteBuffers are faster than FloatBuffers). Do you think it's better to send all vertex information as shorts, thus saving half the buffer space? I'm considering this, but it would require that I modify a lot of my engine code, and I'm wondering if it's worth it.

  
    	String FRAGMENT = "#version 330 core" + "\n"
    			+ "in vec2 vTexCoo;" + "\n"
    			+ "in float vOpacity;" + "\n"
    			+ "in vec2 vShadowCoo;" + "\n"
    			+ "uniform sampler2D tex1;" + "\n"
    			+ "uniform sampler2D tex2;" + "\n"
    			+ "uniform sampler2D texShadow;" + "\n"
    			+ "uniform vec3 ambientDir;" + "\n"
    			+ "uniform vec3 ambientColor;" + "\n"
    			+ "uniform vec3[] lightPoss;" + "\n"
    			+ "uniform vec3[] lightColor;" + "\n"
    			+ "uniform int lightCount = 0;" + "\n"
    			+ "out vec4 fragColor;" + "\n"
    			
				+ "vec4 texColor;" + "\n"
				+ "vec3 normal;" + "\n"
				+ "vec4 shadowColor;" + "\n"
				+ "vec3 tempColor;" + "\n"
				+ "vec3 finalColor = vec3(0.0,0.0,0.0);" + "\n"
				+ "vec3 lightDirection;" + "\n"
				+ "float lightDistance;" + "\n"
				+ "float dottis;" + "\n"
				
				+ "void calcLight(vec3 lightP, vec3 lightC){" + "\n"
					+ "lightDirection = vec3(lightP.xy - gl_FragCoord.xy, lightP.z);" + "\n"
					+ "lightDistance = length(lightDirection);" + "\n"
					+ "dottis = dot(normal.xyz, normalize(lightDirection));" + "\n"
					+ "if (dottis > 0) {" + "\n"
						+ "tempColor = lightC.xyz*dottis;" + "\n"
						+ "tempColor.xyz /= pow(lightDistance, 3)*0.00012;" + "\n"
						+ "if (shadowColor.x > 0.0)" + "\n"
							+ "tempColor-= tempColor*shadowColor.x;" + "\n"
						+ "finalColor += texColor.xyz * tempColor.xyz;" + "\n"
					+ "}" + "\n"
				+ "}" + "\n"
    			
    			+ "void main(){" + "\n"
    				
    				+ "texColor = texture2D(tex1, vTexCoo);" + "\n"
    				+ "if (texColor.w == 0){fragColor = vec4(0.0,0.0,0.0,0.0); return;}" + "\n"
    				+ "normal = normalize(texture2D(tex2, vTexCoo)*2.0 -1).xyz;" + "\n"
    				+ "shadowColor = texture2D(texShadow, vShadowCoo);" + "\n"
    				
    				+ "tempColor;" + "\n"
    				+ "finalColor = vec3(0.0,0.0,0.0);" + "\n"
    				
    				//Ambient
    				+ "dottis = dot(normal.xyz, ambientDir);" + "\n"
    				+ "if (dottis > 0)" + "\n"
					+ "finalColor += texColor.xyz * ambientColor*dottis;" + "\n"
    				
					//lights
					+ "if (lightCount > 0){" + "\n"
						+ "calcLight(lightPoss[0], lightColor[0]);" + "\n"
						+ "if (lightCount > 1){" + "\n"
						+ "calcLight(lightPoss[1], lightColor[1]);" + "\n"
						+ "if (lightCount > 2){" + "\n"
						+ "calcLight(lightPoss[2], lightColor[2]);" + "\n"	
						+ "if (lightCount > 3){" + "\n"
						+ "calcLight(lightPoss[3], lightColor[3]);" + "\n"	
						+ "if (lightCount > 4){" + "\n"
						+ "calcLight(lightPoss[4], lightColor[4]);" + "\n"	
						+ "if (lightCount > 5){" + "\n"
						+ "calcLight(lightPoss[5], lightColor[5]);" + "\n"	
						+ "if (lightCount > 6){" + "\n"
						+ "calcLight(lightPoss[6], lightColor[6]);" + "\n"	
						+ "if (lightCount > 7){" + "\n"
						+ "calcLight(lightPoss[7], lightColor[7]);" + "\n"	
						+ "if (lightCount > 8){" + "\n"
						+ "calcLight(lightPoss[8], lightColor[8]);" + "\n"	
						+ "if (lightCount > 9){" + "\n"
						+ "calcLight(lightPoss[9], lightColor[9]);" + "\n"	
						+ "if (lightCount > 10){" + "\n"
						+ "calcLight(lightPoss[10], lightColor[10]);" + "\n"	
						+ "if (lightCount > 11){" + "\n"
						+ "calcLight(lightPoss[11], lightColor[11]);" + "\n"	
						+ "if (lightCount > 12){" + "\n"
						+ "calcLight(lightPoss[12], lightColor[12]);" + "\n"	
						+ "if (lightCount > 13){" + "\n"
						+ "calcLight(lightPoss[13], lightColor[13]);" + "\n"	
						+ "if (lightCount > 14){" + "\n"
						+ "calcLight(lightPoss[14], lightColor[14]);" + "\n"	
						+ "if (lightCount > 15){" + "\n"
						+ "calcLight(lightPoss[15], lightColor[15]);" + "\n"	
						+ "if (lightCount > 16){" + "\n"
						+ "calcLight(lightPoss[16], lightColor[16]);" + "\n"	
						+ "if (lightCount > 17){" + "\n"
						+ "calcLight(lightPoss[17], lightColor[17]);" + "\n"	
						+ "if (lightCount > 18){" + "\n"
						+ "calcLight(lightPoss[18], lightColor[18]);" + "\n"	
						+ "if (lightCount > 19){" + "\n"
						+ "calcLight(lightPoss[19], lightColor[19]);" + "\n"	
					+ "}}}}}}}}}}}}}}}}}}}}" + "\n"
					+ "fragColor = vec4(finalColor, texColor.w*vOpacity);" + "\n"
    			+ "}" + "\n";


2. First of all, do I need this:

"if (texColor.w == 0){fragColor = vec4(0.0,0.0,0.0,0.0); return;}" + "\n"


Or do you think the compiler is smart enough to manage without this?

3. What about the calculation of light distance? Right now I'm doing this in the fragment shader. I could do it in the vertex shader, but I'm thinking the vertex stage would basically do the same number of operations when it interpolates, so I'm not sure there'd be a gain.

4. My biggest concern is whether the pipeline is smart enough not to process vertices and fragments that are obscured by others. Like if I draw one background picture first, then another that covers the first completely, then swap buffers. Will the GPU process the obscured one, then the overlaying one? I'm not using the depth buffer.   

Kai

1. What do you mean with "shorts"? Do you mean signed 16-bit integers?
Or do you mean what GLSL calls "half"s, which are signed 16-bit floating-point numbers?
In the former case: You realize that those are integral/integer numbers.
In the latter case: Only quite recent cards support halfs as vertex attributes.

But that is a very uncommon optimization I might add, unless you are handling many tens of millions of vertices.

2. First of all, your fragment shader looks horrible... :)
Please replace that deeply nested if-statement with a simple for-loop, ranging from zero to lightCount.
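
In the same string-built style as your shader, the whole cascade collapses to:

    				+ "for (int i = 0; i < lightCount; i++) {" + "\n"
    					+ "calcLight(lightPoss[i], lightColor[i]);" + "\n"
    				+ "}" + "\n"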

Regarding the texColor.w == 0 comparison:
The compiler is smart enough to do what exactly here?
You mean magically make zeroes out of the first three RGB channels if texColor.w happens to be 0.0?
No, it does not do this.
But if you use blending with src alpha it does not matter, since then the source color is not being used.

3. Currently you are using gl_FragCoord (which is in window coordinates) presumably as the coordinates of your shaded point. gl_FragCoord is already being interpolated. So there is nothing you can do in the vertex shader.
If, on the other hand, you mean changing your shading model from something Phong-like to Gouraud shading, then yes, that would presumably be faster. But you would then only compute lighting per vertex, and the computed colors would be interpolated.
But currently I don't quite know what you mean by doing lighting in the vertex shader.
Maybe you could lay out your desired shading model a bit more.

4. Vertices are always processed, because they need to be transformed in order to see whether the primitive they generate is actually visible. As for the fragments, the fragment shader will only be executed for fragments that are actually generated within the XY bounds of the viewport. And had you been using the depth buffer, the fragments of an occluded surface would also not be generated. But this only happens reliably if you actually draw from front to back.
But since you don't use the depth buffer (for whatever reason) you don't get that "early-z out" optimization, so every generated fragment within the viewport is computed.

jakethesnake

Thank you!

1. I mean that instead of:

glVertexAttribPointer(0, 2, GL_FLOAT, false, 24, 0);


I can do this:

glVertexAttribPointer(0, 2, GL_SHORT, true, 24, 0);


And reap the benefit that, from what I've gathered, the hardware will make a float in the range 0-1 out of a short in the range 0 to short's max value, basically for free. But I've also gathered that it's not so much how much data you send between CPU and GPU, it's how frequently, so it might not matter.

2. I should have mentioned that for some reason loops, or rather dynamic indexing of arrays (lightPoss[i]), won't compile for me. I've googled and found others with the same problem, but unfortunately no solution. It sucks. I should also have mentioned that I use blending with source alpha. If I were OpenGL, then I'd first of all look at the alpha channel and discard transparent pixels before processing them. But I realize the alpha can be manipulated at a later stage in the fragment shader, so maybe that's a stupid assumption.

3. Yes, I realize this was a stupid question. If a light source is in the middle of a triangle, then things will be ugly. I think I can interpolate the light direction though, enabling me to skip this in the fragment shader:
+ "lightDirection = vec3(lightP.xy - gl_FragCoord.xy, lightP.z);" + "\n"

I don't know if there'd be a gain since it's a simple subtraction operation.

4. First I'd like to ask about the viewport. I've skipped setting the viewport in my current implementation, because I didn't know what it did and it made no visual difference. Will this make the GPU process stuff that's outside of the window?

And second, I am rendering everything from back to front. Is it so that if I enable the depth buffer, I need to send z-values with my VBO for each vertex? Or is there some switch that can do the same thing by simply looking at the order in which fragments are drawn?


Kai

1. But that would make your host program a lot more cumbersome and harder to understand than necessary, as you would have to convert the interval [-1.0..+1.0] to [min_short..max_short] in your program before writing the values to a ByteBuffer or ShortBuffer, just so that the graphics card driver could then do the opposite conversion before feeding the data to the GPU.
And it also requires that your vertex attribute values are in fact always in the interval [-1.0..+1.0] (which is true of normals and colors, but not of positions).

4. Have a look at this wonderful site: http://www.songho.ca/opengl/gl_transform.html
Basically, the viewport defines which portion of the client area of a window the output from OpenGL rendering gets mapped to. For example, whenever you resize your window, you must issue a glViewport call in order for OpenGL to know that the viewport changed.
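
With LWJGL 3 and GLFW that is typically just a framebuffer-size callback (a sketch; the window handle variable is a placeholder, and recent LWJGL 3 builds accept the lambda directly):

    // Keep the GL viewport in sync with the framebuffer whenever the window is resized.
    glfwSetFramebufferSizeCallback(window, (handle, width, height) -> glViewport(0, 0, width, height));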

As for the depth buffer: Generating depth values is done completely automatically by the GPU, regardless of any z-component of your vertices. If you have a quad at z = 0.0 (which is the default if you don't provide a z-component in your vertex attribute yourself) and you have a view transformation that rotates that quad 45 degrees around the Y-axis, then the individual fragments generated for that quad will have different depth values.
The order of the fragments is also not relevant. And the order in which the fragments are being generated for the individual primitives/triangles is not specified and can be random within the same draw call.
That's why OpenGL and the GPU implement the Z-buffer algorithm as the primary rendering technique.
Anything that has already been drawn at a particular fragment/pixel position will not be overwritten by subsequent renderings if the stored depth value at that position is nearer to the viewer. This solves the "visibility problem", also known as hidden surface determination, which every rendering algorithm has to solve.
In general when doing 3D rendering you should be using the depth buffer to solve this problem.
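
Enabling it amounts to a few calls (a sketch; the depth function and clear bits shown are the common defaults, nothing specific to this thread):

    glEnable(GL_DEPTH_TEST);                              // turn the Z-buffer test on
    glDepthFunc(GL_LESS);                                 // keep the fragment nearest to the viewer
    // each frame, clear depth together with color:
    glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);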

Cornix

About using shorts:
My own experience has taught me to always use what the rest of the world is using: vec4 floats.
When I did performance testing with all different kinds of data, I found that vec4 floats were by far the fastest; even though I had to transmit more data, it was still faster.
I would guess this is because the hardware is optimized for floating point operations and 4 component vectors. Trying to do early optimization by playing around with data types will bring you no good, especially if you have no evidence that it actually has a positive effect on the performance.
Just stick to what is standard. Chances are, it's standard for a good reason.