OK,
I knocked up a test program. (See below for code)
It's a single static class, run via main method.
Shader is stored as a String to keep matters simple.
I've run this with VM options: -Xms512m -Xmx512m
And I consistently find that the non-FloatBuffer variant runs 10 times faster than the FloatBuffer variant.
Here are my results:
Running with GL20.glUniform4f(loc, buf.get(0), buf.get(1), buf.get(2), buf.get(3))
Average elapsed in nanosecs:1964.0
Running with GL20.glUniform4(loc, buf)
Average elapsed in nanosecs:19258.0
I let it run for a few thousand iterations to give the hotspot time to compile before summing for an average elapsed time.
I've also tried altering the shader to accept an array of 40 vec4 and the code to pass in a FloatBuffer of 160 floats.
This ran in much the same time as the "GL20.glUniform4(loc, buf)" call above (approx. 19000 nanosecs)
It's also interesting to note that this test code shows only a 10-fold difference in performance, rather than the 50-fold difference I see when this processing runs inside my actual code.
Spasi,
1) Is there any chance you could look through this code and see if anything jumps out at you as odd or problematic?
2) Do you initialise/process your vertex shaders in the same way as this test code or do you do anything significantly different?
3) It would also be interesting to see what kind of results you get when you run this test code - is that possible?
import java.nio.ByteBuffer;
import java.nio.FloatBuffer;
import java.nio.IntBuffer;
import org.lwjgl.BufferUtils;
import org.lwjgl.LWJGLException;
import org.lwjgl.opengl.Display;
import org.lwjgl.opengl.DisplayMode;
import org.lwjgl.opengl.GL20;
/**
 * Micro-benchmark comparing the cost of uploading a single {@code vec4} uniform
 * through {@code GL20.glUniform4(int, FloatBuffer)} against
 * {@code GL20.glUniform4f(int, float, float, float, float)}.
 *
 * <p>The program creates an 800x600 window, compiles and links a one-uniform
 * vertex shader, makes the program current, then times {@link #NUM_ITERATIONS}
 * uniform uploads. The first {@link #WARMUP_ITERATIONS} samples are discarded so
 * that JIT compilation does not skew the reported average.
 *
 * <p>Exit codes: 10 = display creation failed, 102 = shader compilation failed,
 * 103 = program linking failed.
 */
public class Test {
    private static final int GL_FALSE = 0;

    /** Total number of timed uniform uploads, including warm-up. */
    private static final long NUM_ITERATIONS = 1000000L;

    /** Leading iterations whose timings are ignored (gives HotSpot time to compile). */
    private static final long WARMUP_ITERATIONS = 10000L;

    private static IntBuffer pBuf = BufferUtils.createIntBuffer(1);
    private static ByteBuffer fBuf = BufferUtils.createByteBuffer(100);
    private static ByteBuffer source;
    private static int shaderID;
    private static int programID;

    /**
     * Runs the benchmark and prints the average time per uniform upload.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        try {
            Display.setDisplayMode(new DisplayMode(800, 600));
            Display.setFullscreen(false);
            Display.create();
        } catch (LWJGLException e) {
            System.err.println("Unable to create the display: " + e);
            System.exit(10);
        }

        String s = "uniform vec4 testUniform; void main() { gl_Position = gl_ModelViewProjectionMatrix * testUniform * gl_Vertex;}";
        // Size the buffer from the encoded bytes, not the char count, so the two can never disagree.
        byte[] shaderBytes = s.getBytes();
        source = BufferUtils.createByteBuffer(shaderBytes.length);
        source.put(shaderBytes);
        source.flip();

        shaderID = GL20.glCreateShader(GL20.GL_VERTEX_SHADER);
        GL20.glShaderSource(shaderID, source);
        GL20.glCompileShader(shaderID);
        GL20.glGetShader(shaderID, GL20.GL_COMPILE_STATUS, pBuf);
        if (pBuf.get(0) == GL_FALSE) {
            System.err.println("Vertex shader failed to compile.");
            System.exit(102);
        }

        programID = GL20.glCreateProgram();
        GL20.glAttachShader(programID, shaderID);
        GL20.glLinkProgram(programID);
        GL20.glGetProgram(programID, GL20.GL_LINK_STATUS, pBuf);
        if (pBuf.get(0) == GL_FALSE) {
            System.err.println("Shader program failed to link.");
            System.exit(103);
        }

        // glUniform* operates on the *current* program. Without this call every
        // upload below raises GL_INVALID_OPERATION and the benchmark would only
        // measure the driver's error path.
        GL20.glUseProgram(programID);

        int loc = getUniformLocation("testUniform");
        FloatBuffer buf = BufferUtils.createFloatBuffer(4);
        buf.put(1).put(2).put(3).put(4);
        buf.flip();

        long sum = 0L;
        for (int i = 0; i < NUM_ITERATIONS; i++) {
            long timeStart = System.nanoTime();
            // Variant A (scalar arguments) - swap with variant B to compare:
            // GL20.glUniform4f(loc, buf.get(0), buf.get(1), buf.get(2), buf.get(3));
            // Variant B (FloatBuffer argument):
            GL20.glUniform4(loc, buf);
            long timeEnd = System.nanoTime();
            if (i >= WARMUP_ITERATIONS) { // Give hotspot time to compile code
                sum += timeEnd - timeStart;
            }
        }
        GL20.glUseProgram(0);

        // Divide in floating point, and by the number of samples actually summed.
        long measured = NUM_ITERATIONS - WARMUP_ITERATIONS;
        System.out.println("Average elapsed in nanosecs:" + (float) ((double) sum / measured));

        Display.destroy();
    }

    /**
     * Looks up the location of a uniform in the linked program {@code programID}.
     *
     * @param name the uniform's name as declared in the shader source
     * @return the uniform location reported by OpenGL
     * @throws IllegalArgumentException if the name does not fit in the scratch
     *         buffer or the uniform does not exist in the program
     */
    private static int getUniformLocation(String name) {
        int length = name.length();
        // One extra byte is needed for the terminating NUL.
        if (length + 1 > fBuf.capacity()) {
            throw new IllegalArgumentException("The uniform name \"" + name + "\" is too long.");
        }

        fBuf.clear();
        for (int i = 0; i < length; i++) {
            fBuf.put((byte) name.charAt(i));
        }
        fBuf.put((byte) 0); // Must be null-terminated.
        fBuf.flip();

        int location = GL20.glGetUniformLocation(programID, fBuf);
        if (location == -1) {
            throw new IllegalArgumentException("The uniform \"" + name + "\" does not exist in the Shader Program.");
        }
        return location;
    }
}