comparing floats and floatbuffer

Started by chris_c, November 22, 2014, 23:01:53

Previous topic - Next topic

chris_c

I've made a very simple (possibly naive) test of doing matrix math using either a bunch of floats or a floatbuffer

doing 10 million matrix multiplies there was just 300ms difference in total (so close it seems practically identical?)

Since a bunch of floats would need to be turned into a buffer anyway to feed into a shader, I'm leaning towards just using buffer matrices.

I'd very much appreciate people's thoughts, and also... have I made any silly mistakes?

here's the test code I used: - note I did A=A*B as C=A*B would create an additional object and I only need A=A*B
import java.util.Random;
import java.nio.FloatBuffer;

public class mat4test {

    /** Package-private no-argument constructor; it performs no initialisation. */
    mat4test() { }

    public static void main(String[] args) {
        mat4test matTest = new mat4test();
        matTest.run();
    }

    public void run() {
        /*
         * allow jvm start up to settle
         */
        Thread t = Thread.currentThread();
        long s = System.currentTimeMillis();
        long e = s;
        while (e-s < 4000) {    // 4 seconds should be enough to settle down
            e = System.currentTimeMillis();
            t.yield(); 
        }

        /*
         * first test a collection of floats as a matrix
         */

        floatMat m1 = new floatMat();
        floatMat m2 = new floatMat();

        m1.rnd();

        System.out.println("start");
        /* test 10,000,000 multiplies */
        s = System.currentTimeMillis();
        
        for (int n=0;n<10000000;n++) {
            m2.rnd();
            m1.mult(m2);
        }
        
        e = System.currentTimeMillis();
        System.out.println("10,000,000 floatMat multiplies took "+(e-s)+ " ms");

        bufferMat m3 = new bufferMat();
        bufferMat m4 = new bufferMat();

        m3.rnd();

        s = System.currentTimeMillis();
        
        for (int n=0;n<10000000;n++) {
            m4.rnd();
            m3.mult(m4);
        }
        
        e = System.currentTimeMillis();
        System.out.println("10,000,000 bufferMat multiplies took "+(e-s)+ " ms");

        
    }

    public class floatBundle {
        public float m00,m01,m02,m03;
        public float m04,m05,m06,m07;
        public float m08,m09,m10,m11;
        public float m12,m13,m14,m15;
    }


    public class floatMat {

        private floatBundle mat = new floatBundle();
        private floatBundle tmp = new floatBundle();
        private Random rng = new Random();
        
        floatMat() {
            
        }

        void mult(floatMat m2) {
           
            tmp.m00 = mat.m00 * m2.mat.m00 + mat.m04 * m2.mat.m01 + mat.m08 * m2.mat.m02 + mat.m12 * m2.mat.m03;
            tmp.m01 = mat.m01 * m2.mat.m00 + mat.m05 * m2.mat.m01 + mat.m09 * m2.mat.m02 + mat.m13 * m2.mat.m03;
            tmp.m02 = mat.m02 * m2.mat.m00 + mat.m06 * m2.mat.m01 + mat.m10 * m2.mat.m02 + mat.m14 * m2.mat.m03;
            tmp.m03 = mat.m03 * m2.mat.m00 + mat.m07 * m2.mat.m01 + mat.m11 * m2.mat.m02 + mat.m15 * m2.mat.m03;

            tmp.m04 = mat.m00 * m2.mat.m04 + mat.m04 * m2.mat.m05 + mat.m08 * m2.mat.m06 + mat.m12 * m2.mat.m07;
            tmp.m05 = mat.m01 * m2.mat.m04 + mat.m05 * m2.mat.m05 + mat.m09 * m2.mat.m06 + mat.m13 * m2.mat.m07;
            tmp.m06 = mat.m02 * m2.mat.m04 + mat.m06 * m2.mat.m05 + mat.m10 * m2.mat.m06 + mat.m14 * m2.mat.m07;
            tmp.m07 = mat.m03 * m2.mat.m04 + mat.m07 * m2.mat.m05 + mat.m11 * m2.mat.m06 + mat.m15 * m2.mat.m07;

            tmp.m08 = mat.m00 * m2.mat.m08 + mat.m04 * m2.mat.m09 + mat.m08 * m2.mat.m10 + mat.m12 * m2.mat.m11;
            tmp.m09 = mat.m01 * m2.mat.m08 + mat.m05 * m2.mat.m09 + mat.m09 * m2.mat.m10 + mat.m13 * m2.mat.m11;
            tmp.m10 = mat.m02 * m2.mat.m08 + mat.m06 * m2.mat.m09 + mat.m10 * m2.mat.m10 + mat.m14 * m2.mat.m11;
            tmp.m11 = mat.m03 * m2.mat.m08 + mat.m07 * m2.mat.m09 + mat.m11 * m2.mat.m10 + mat.m15 * m2.mat.m11;

            tmp.m12 = mat.m00 * m2.mat.m12 + mat.m04 * m2.mat.m13 + mat.m08 * m2.mat.m14 + mat.m12 * m2.mat.m15;
            tmp.m13 = mat.m01 * m2.mat.m12 + mat.m05 * m2.mat.m13 + mat.m09 * m2.mat.m14 + mat.m13 * m2.mat.m15;
            tmp.m14 = mat.m02 * m2.mat.m12 + mat.m06 * m2.mat.m13 + mat.m10 * m2.mat.m14 + mat.m14 * m2.mat.m15;
            tmp.m15 = mat.m03 * m2.mat.m12 + mat.m07 * m2.mat.m13 + mat.m11 * m2.mat.m14 + mat.m15 * m2.mat.m15;

            mat.m00 = tmp.m00;
            mat.m01 = tmp.m01;
            mat.m02 = tmp.m02;
            mat.m03 = tmp.m03;
            mat.m04 = tmp.m04;
            mat.m05 = tmp.m05;
            mat.m06 = tmp.m06;
            mat.m07 = tmp.m07;
            mat.m08 = tmp.m08;
            mat.m09 = tmp.m09;
            mat.m10 = tmp.m10;
            mat.m11 = tmp.m11;
            mat.m12 = tmp.m12;
            mat.m13 = tmp.m13;
            mat.m14 = tmp.m14;
            mat.m15 = tmp.m15;
        }

        void rnd() {
            mat.m00 = rng.nextFloat();
            mat.m01 = rng.nextFloat();
            mat.m02 = rng.nextFloat();
            mat.m03 = rng.nextFloat();
            mat.m04 = rng.nextFloat();
            mat.m05 = rng.nextFloat();
            mat.m06 = rng.nextFloat();
            mat.m07 = rng.nextFloat();
            mat.m08 = rng.nextFloat();
            mat.m09 = rng.nextFloat();
            mat.m10 = rng.nextFloat();
            mat.m11 = rng.nextFloat();
            mat.m12 = rng.nextFloat();
            mat.m13 = rng.nextFloat();
            mat.m14 = rng.nextFloat();
            mat.m15 = rng.nextFloat();
        }
        
    }

    public class bufferMat {
        FloatBuffer mat = FloatBuffer.allocate(16);
        FloatBuffer tmp = FloatBuffer.allocate(16);
        private Random rng = new Random();

        bufferMat() {

        }

        void mult(bufferMat m2) {
            
         tmp.put( 0, mat.get( 0)*m2.mat.get( 0)+mat.get( 4)*m2.mat.get( 1)+mat.get( 8)*m2.mat.get( 2)+mat.get(12)*m2.mat.get( 3));
         tmp.put( 1, mat.get( 1)*m2.mat.get( 0)+mat.get( 5)*m2.mat.get( 1)+mat.get( 9)*m2.mat.get( 2)+mat.get(13)*m2.mat.get( 3));
         tmp.put( 2, mat.get( 2)*m2.mat.get( 0)+mat.get( 6)*m2.mat.get( 1)+mat.get(10)*m2.mat.get( 2)+mat.get(14)*m2.mat.get( 3));
         tmp.put( 3, mat.get( 3)*m2.mat.get( 0)+mat.get( 7)*m2.mat.get( 1)+mat.get(11)*m2.mat.get( 2)+mat.get(15)*m2.mat.get( 3));

         tmp.put( 4, mat.get( 0)*m2.mat.get( 4)+mat.get( 4)*m2.mat.get( 5)+mat.get( 8)*m2.mat.get( 6)+mat.get(12)*m2.mat.get( 7));
         tmp.put( 5, mat.get( 1)*m2.mat.get( 4)+mat.get( 5)*m2.mat.get( 5)+mat.get( 9)*m2.mat.get( 6)+mat.get(13)*m2.mat.get( 7));
         tmp.put( 6, mat.get( 2)*m2.mat.get( 4)+mat.get( 6)*m2.mat.get( 5)+mat.get(10)*m2.mat.get( 6)+mat.get(14)*m2.mat.get( 7));
         tmp.put( 7, mat.get( 3)*m2.mat.get( 4)+mat.get( 7)*m2.mat.get( 5)+mat.get(11)*m2.mat.get( 6)+mat.get(15)*m2.mat.get( 7));

         tmp.put( 8, mat.get( 0)*m2.mat.get( 8)+mat.get( 4)*m2.mat.get( 9)+mat.get( 8)*m2.mat.get(10)+mat.get(12)*m2.mat.get(11));
         tmp.put( 9, mat.get( 1)*m2.mat.get( 8)+mat.get( 5)*m2.mat.get( 9)+mat.get( 9)*m2.mat.get(10)+mat.get(13)*m2.mat.get(11));
         tmp.put(10, mat.get( 2)*m2.mat.get( 8)+mat.get( 6)*m2.mat.get( 9)+mat.get(10)*m2.mat.get(10)+mat.get(14)*m2.mat.get(11));
         tmp.put(11, mat.get( 3)*m2.mat.get( 8)+mat.get( 7)*m2.mat.get( 9)+mat.get(11)*m2.mat.get(10)+mat.get(15)*m2.mat.get(11));

         tmp.put(12, mat.get( 0)*m2.mat.get(12)+mat.get( 4)*m2.mat.get(13)+mat.get( 8)*m2.mat.get(14)+mat.get(12)*m2.mat.get(15));
         tmp.put(13, mat.get( 1)*m2.mat.get(12)+mat.get( 5)*m2.mat.get(13)+mat.get( 9)*m2.mat.get(14)+mat.get(13)*m2.mat.get(15));
         tmp.put(14, mat.get( 2)*m2.mat.get(12)+mat.get( 6)*m2.mat.get(13)+mat.get(10)*m2.mat.get(14)+mat.get(14)*m2.mat.get(15));
         tmp.put(15, mat.get( 3)*m2.mat.get(12)+mat.get( 7)*m2.mat.get(13)+mat.get(11)*m2.mat.get(14)+mat.get(15)*m2.mat.get(15));

         mat.put(tmp); 
        }

        void rnd() {
            mat.put( 0,rng.nextFloat());
            mat.put( 1,rng.nextFloat());
            mat.put( 2,rng.nextFloat());
            mat.put( 3,rng.nextFloat());
            mat.put( 4,rng.nextFloat());
            mat.put( 5,rng.nextFloat());
            mat.put( 6,rng.nextFloat());
            mat.put( 7,rng.nextFloat());
            mat.put( 8,rng.nextFloat());
            mat.put( 9,rng.nextFloat());
            mat.put(10,rng.nextFloat());
            mat.put(11,rng.nextFloat());
            mat.put(12,rng.nextFloat());
            mat.put(13,rng.nextFloat());
            mat.put(14,rng.nextFloat());
            mat.put(15,rng.nextFloat());
        }
    }
}
 

spasi

If you want to get serious with (micro-)benchmarking, you have to use JMH. Anything else is just telling lies to yourself. Make sure you read and understand the samples first.

Just by glancing at your code, aside from the obvious mistakes (no proper warm-up, etc) that will be taken care of by JMH, I think you'll find that rng.nextFloat() dominates the execution time. You're not really comparing what you want to compare (heap vs off-heap access).

chris_c

I'm not entirely sure it wouldn't be warmed up, but hey, we'll see with JMH. I definitely take your point about the rng, though! (good catch)

I don't have time today to check out JMH but as a quick improvement i replaced the rnd function with (considering how pseudo it is you have to wonder why it takes so long!!!)

        float r=0;
        void rnd() {
            m00 = r;r+=0.0001;
            m01 = r;r+=0.0001;
            m02 = r;r+=0.0001;
            m03 = r;r+=0.0001;
            m04 = r;r+=0.0001;
            m05 = r;r+=0.0001;
            m06 = r;r+=0.0001;
            m07 = r;r+=0.0001;
            m08 = r;r+=0.0001;
            m09 = r;r+=0.0001;
            m10 = r;r+=0.0001;
            m11 = r;r+=0.0001;
            m12 = r;r+=0.0001;
            m13 = r;r+=0.0001;
            m14 = r;r+=0.0001;
            m15 = r;r+=0.0001;
        }


which you could argue is similar to a matrix function and gives the multiply different values to play with...

I will look at JMH, but in the meantime I'm seeing a variation of only 30ms across multiple runs of 10,000,000 iterations - that's about 3ns of variation per iteration. Unless JMH can guarantee no GC and no other alien threads running on its CPU core, I do wonder if there will be that much difference...

I also noticed a slight improvement from replacing the floatBundle class with a bunch of float properties, so thanks to that and your suggestion of ditching the rng, I think I have something reasonably worthwhile to test with JMH - thanks!