LWJGL 3.1.1 - Wiki OpenCL Sum Example - Crashes JVM on clSetKernelArg

Started by officialhopsof, May 23, 2017, 22:07:55

Previous topic - Next topic

officialhopsof

I'm trying to reimplement the OpenCL sum example from the wiki (http://wiki.lwjgl.org/wiki/OpenCL_in_LWJGL.html)
using LWJGL 3.1.1.

On line 77 of my file:
clSetKernelArg(clKernel, 0, aMemory);

the JVM crashes and I get the following error:

Quote
#
# A fatal error has been detected by the Java Runtime Environment:
#
#  EXCEPTION_ACCESS_VIOLATION (0xc0000005) at pc=0x00007ff87cea7923, pid=5528, tid=0x00000000000027d8
#
# JRE version: Java(TM) SE Runtime Environment (8.0_131-b11) (build 1.8.0_131-b11)
# Java VM: Java HotSpot(TM) 64-Bit Server VM (25.131-b11 mixed mode windows-amd64 compressed oops)
# Problematic frame:
# C  [nvopencl.dll+0x387923]
#
# Failed to write core dump. Minidumps are not enabled by default on client versions of Windows
#
# An error report file with more information is saved as:
# hs_err_pid5528.log
#
# If you would like to submit a bug report, please visit:
#   http://bugreport.java.com/bugreport/crash.jsp
# The crash happened outside the Java Virtual Machine in native code.
# See problematic frame for where to report the bug.
#

Here is my implementation. I am sure I am missing something here, but I am unsure what it is. Any help would be very much appreciated! Thanks!

import static org.lwjgl.opencl.CL10.CL_CONTEXT_PLATFORM;
import static org.lwjgl.opencl.CL10.CL_DEVICE_NOT_FOUND;
import static org.lwjgl.opencl.CL10.CL_DEVICE_TYPE_GPU;
import static org.lwjgl.opencl.CL10.clBuildProgram;
import static org.lwjgl.opencl.CL10.clCreateCommandQueue;
import static org.lwjgl.opencl.CL10.clCreateContext;
import static org.lwjgl.opencl.CL10.clCreateKernel;
import static org.lwjgl.opencl.CL10.clEnqueueNDRangeKernel;
import static org.lwjgl.opencl.CL10.clGetDeviceIDs;
import static org.lwjgl.opencl.CL10.clGetPlatformIDs;
import static org.lwjgl.opencl.CL10.clSetKernelArg;
import static org.lwjgl.opencl.InfoUtil.checkCLError;
import static org.lwjgl.system.MemoryStack.stackPush;
import static org.lwjgl.system.MemoryUtil.NULL;
import static org.lwjgl.system.MemoryUtil.memUTF8;

import java.nio.FloatBuffer;
import java.nio.IntBuffer;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

import org.lwjgl.BufferUtils;
import org.lwjgl.PointerBuffer;
import org.lwjgl.opencl.CL;
import org.lwjgl.opencl.CL10;
import org.lwjgl.opencl.CLCapabilities;
import org.lwjgl.opencl.CLContextCallback;
import org.lwjgl.opencl.CLProgramCallback;
import org.lwjgl.system.MemoryStack;

/**
 * Adds two float arrays element-wise on an OpenCL GPU device through LWJGL's {@code CL10}
 * bindings.
 *
 * <p>NOTE(review): this is the listing as originally posted. It crashes the JVM in native code at
 * the first {@code clSetKernelArg} call in {@link #run()}; the review notes below mark that call
 * and the other known problems. The code itself is intentionally left untouched.
 */
public final class SumOpenCL {

  // OpenCL C source of the kernel: result[i] = a[i] + b[i] for every work item with i < size.
  private static final String sumProgramSource =
      "kernel void sum(global const float* a, global const float* b, global float* result, int const size) {"
          + "  const int itemId = get_global_id(0);" + "  if(itemId < size) {"
          + "    result[itemId] = a[itemId] + b[itemId];" + "  }" + "}";

  // Callback handed to clCreateContext; NOTE(review): never freed anywhere in this class.
  private CLContextCallback clContextCB;
  // Context handle; NOTE(review): never released in cleanup().
  private long clContext;
  // One-element buffer that the clCreate* calls write their error code into.
  private IntBuffer errcode_ret;
  private long clKernel;
  private long clDevice;
  // NOTE(review): never assigned or read in this class.
  private CLCapabilities deviceCaps;
  private long clQueue;
  private long sumProgram;
  // Buffer objects bound to the kernel's a, b and result arguments.
  private long aMemory;
  private long bMemory;
  private long clPlatform;
  private CLCapabilities clPlatformCapabilities;
  private long resultMemory;
  // Number of elements in each array, and the number of work items enqueued.
  private static final int size = 100;


  /**
   * Initializes OpenCL, builds the sum program, creates the kernel and its buffers, enqueues the
   * kernel over {@code size} work items, waits for the queue to finish and releases the resources.
   */
  public void run() {
    initializeCL();


    // NOTE(review): errcode_ret is not checked after this call.
    sumProgram = CL10.clCreateProgramWithSource(clContext, sumProgramSource, errcode_ret);

    // NOTE(review): passing a callback makes clBuildProgram asynchronous, so the kernel must not
    // be created before the callback has been invoked. Nothing below waits for it.
    CLProgramCallback buildCallback;
    int errcode =
        clBuildProgram(sumProgram, clDevice, "",
            buildCallback =
                CLProgramCallback.create((program, user_data) -> System.out.println("Building")),
            NULL);
    checkCLError(errcode);

    buildCallback.free();

    // init kernel with constants
    clKernel = clCreateKernel(sumProgram, "sum", errcode_ret);
    checkCLError(errcode_ret);

    createMemory();

    // NOTE(review): this is where the JVM crashes. The arg_value parameter of clSetKernelArg
    // expects a pointer to the argument data, but these calls hand over the handle / int values
    // themselves. The clSetKernelArg1p / clSetKernelArg1i overloads take the value directly.
    clSetKernelArg(clKernel, 0, aMemory);
    clSetKernelArg(clKernel, 1, bMemory);
    clSetKernelArg(clKernel, 2, resultMemory);
    clSetKernelArg(clKernel, 3, size);



    final int dimensions = 1;
    PointerBuffer globalWorkSize = BufferUtils.createPointerBuffer(dimensions); // In here we put
                                                                                // the total number
                                                                                // of work items we
                                                                                // want in each
                                                                                // dimension.
    globalWorkSize.put(0, size); // Size is a variable we defined a while back showing how many
                                 // elements are in our arrays.


    // Run the specified number of work units using our OpenCL program kernel
    // NOTE(review): the returned error code is stored in errcode but never checked.
    errcode = clEnqueueNDRangeKernel(clQueue, clKernel, dimensions, null, globalWorkSize, null,
        null, null);

    CL10.clFinish(clQueue);

    cleanup();
  }

  /**
   * Creates the three buffer objects: {@code aMemory} and {@code bMemory} are initialized from
   * host data, {@code resultMemory} is allocated with room for {@code size} floats.
   */
  private void createMemory() {
    // NOTE(review): the kernel only reads a and b and only writes result, yet a/b are created
    // WRITE_ONLY and result READ_ONLY. Presumably the flags are swapped — confirm against the
    // OpenCL clCreateBuffer documentation.
    // Create OpenCL memory object containing the first buffer's list of numbers
    aMemory = CL10.clCreateBuffer(clContext, CL10.CL_MEM_WRITE_ONLY | CL10.CL_MEM_COPY_HOST_PTR,
        getABuffer(), errcode_ret);
    checkCLError(errcode_ret);

    // Create OpenCL memory object containing the second buffer's list of numbers
    bMemory = CL10.clCreateBuffer(clContext, CL10.CL_MEM_WRITE_ONLY | CL10.CL_MEM_COPY_HOST_PTR,
        getBBuffer(), errcode_ret);
    checkCLError(errcode_ret);

    // Remember the length argument here is in bytes. 4 bytes per float.
    resultMemory = CL10.clCreateBuffer(clContext, CL10.CL_MEM_READ_ONLY, size * 4, errcode_ret);
    checkCLError(errcode_ret);
  }

  /** Returns a rewound buffer holding the values 0, 1, ..., size-1. */
  private FloatBuffer getABuffer() {
    // Create float array from 0 to size-1.
    FloatBuffer aBuff = BufferUtils.createFloatBuffer(size);
    float[] tempData = new float[size];
    for (int i = 0; i < size; i++) {
      tempData[i] = i;
    }
    aBuff.put(tempData);
    // put() advanced the position to the end; rewind so the buffer is read from element 0.
    aBuff.rewind();
    return aBuff;
  }

  /** Returns a rewound buffer holding the values size-1, size-2, ..., 0. */
  private FloatBuffer getBBuffer() {
    // Create float array from size-1 to 0. This means that the result should be size-1 for each
    // element.
    FloatBuffer bBuff = BufferUtils.createFloatBuffer(size);
    float[] tempData = new float[size];
    for (int j = 0, i = size - 1; j < size; j++, i--) {
      tempData[j] = i;
    }
    bBuff.put(tempData);
    // put() advanced the position to the end; rewind so the buffer is read from element 0.
    bBuff.rewind();
    return bBuff;
  }


  /**
   * Releases the command queue, kernel, program and memory objects.
   *
   * <p>NOTE(review): CL.destroy() is called before the memory objects are released; it has to be
   * the last call. The context is not released and clContextCB is not freed here.
   */
  private void cleanup() {
    // Destroy our kernel and program
    CL10.clReleaseCommandQueue(clQueue);
    CL10.clReleaseKernel(clKernel);
    CL10.clReleaseProgram(sumProgram);
    CL.destroy();
    // Destroy our memory objects
    CL10.clReleaseMemObject(aMemory);
    CL10.clReleaseMemObject(bMemory);
    CL10.clReleaseMemObject(resultMemory);
  }

  /**
   * Selects the first OpenCL platform, picks a GPU device on it via
   * {@link #getDevice(long, CLCapabilities, int)}, then creates the context and the command queue.
   *
   * @throws IllegalStateException if no OpenCL platform is reported
   */
  public void initializeCL() {
    errcode_ret = BufferUtils.createIntBuffer(1);
    // Create OpenCL
    // CL.create();
    // Get the first available platform
    try (MemoryStack stack = stackPush()) {
      // First call only queries the number of platforms, second call fetches their IDs.
      IntBuffer pi = stack.mallocInt(1);
      checkCLError(clGetPlatformIDs(null, pi));
      if (pi.get(0) == 0) {
        throw new IllegalStateException("No OpenCL platforms found.");
      }

      PointerBuffer platformIDs = stack.mallocPointer(pi.get(0));
      checkCLError(clGetPlatformIDs(platformIDs, (IntBuffer) null));

      // The "i == 0" condition means the body runs for the first platform only.
      for (int i = 0; i < platformIDs.capacity() && i == 0; i++) {
        long platform = platformIDs.get(i);
        clPlatformCapabilities = CL.createPlatformCapabilities(platform);
        clPlatform = platform;
      }
    }


    // NOTE(review): getDevice may return NULL; the result is used below without a check.
    clDevice = getDevice(clPlatform, clPlatformCapabilities, CL_DEVICE_TYPE_GPU);

    // Create the context
    // Property list is (CL_CONTEXT_PLATFORM, platform, 0); flip() limits the buffer to those
    // three entries even though seven were allocated.
    PointerBuffer ctxProps = BufferUtils.createPointerBuffer(7);
    ctxProps.put(CL_CONTEXT_PLATFORM).put(clPlatform).put(NULL).flip();

    // NOTE(review): errcode_ret is not checked after clCreateContext; the next call overwrites it.
    clContext = clCreateContext(ctxProps,
        clDevice, clContextCB = CLContextCallback.create((errinfo, private_info, cb,
            user_data) -> System.out.printf("cl_context_callback\n\tInfo: %s", memUTF8(errinfo))),
        NULL, errcode_ret);

    // create command queue
    clQueue = clCreateCommandQueue(clContext, clDevice, NULL, errcode_ret);
    checkCLError(errcode_ret);
  }

  /**
   * Returns the first device of the given type on the platform whose capabilities report
   * {@code cl_khr_gl_sharing} or {@code cl_APPLE_gl_sharing}.
   *
   * @param platform platform handle to query
   * @param platformCaps capabilities of {@code platform}, used to create the device capabilities
   * @param deviceType OpenCL device type to query, e.g. {@code CL_DEVICE_TYPE_GPU}
   * @return the device handle, or {@code NULL} if no device passes the filter
   */
  private static long getDevice(long platform, CLCapabilities platformCaps, int deviceType) {
    try (MemoryStack stack = stackPush()) {
      // First call only queries the number of devices, second call fetches their IDs.
      IntBuffer pi = stack.mallocInt(1);
      checkCLError(clGetDeviceIDs(platform, deviceType, null, pi));

      PointerBuffer devices = stack.mallocPointer(pi.get(0));
      checkCLError(clGetDeviceIDs(platform, deviceType, devices, (IntBuffer) null));

      for (int i = 0; i < devices.capacity(); i++) {
        long device = devices.get(i);

        // Skip devices that report neither CL/GL sharing extension.
        // NOTE(review): nothing in this class uses OpenGL interop — confirm the filter is wanted.
        CLCapabilities caps = CL.createDeviceCapabilities(device, platformCaps);
        if (!(caps.cl_khr_gl_sharing || caps.cl_APPLE_gl_sharing)) {
          continue;
        }

        return device;
      }
    }

    return NULL;
  }



  /** Entry point: creates the example and runs it once. */
  public static void main(String... args) {
    SumOpenCL clApp = new SumOpenCL();
    clApp.run();
  }

}

spasi

The arg_value parameter of the clSetKernelArg function expects a pointer to data. That data is copied when clSetKernelArg is called and passed to the OpenCL kernel when executed. The pointer address is not copied or stored anywhere, so you can reuse it after calling clSetKernelArg.

This interface design is fine in C, where you can do this:

int myValue = 1234;
clSetKernelArg(myKernel, argIndex, sizeof(myValue), &myValue);


This is not possible in Java; you'd have to use off-heap memory (i.e. NIO buffers) to store the value, then pass the buffer address to clSetKernelArg. For this reason LWJGL provides clSetKernelArg overloads that can be used for simple kernel argument values. See the clSetKernelArg{1,2,3,4}{b,s,i,l,p,f,d} methods. In your case, the code becomes:

clSetKernelArg1p(clKernel, 0, aMemory);
clSetKernelArg1p(clKernel, 1, bMemory);
clSetKernelArg1p(clKernel, 2, resultMemory);
clSetKernelArg1i(clKernel, 3, size);


You have two more bugs:

- Passing a CLProgramCallback to clBuildProgram makes it asynchronous. You cannot call clCreateKernel until the callback has been invoked. In your case it happens to work correctly because the Nvidia implementation does not seem to support asynchronous kernel compilation, but it would fail on other implementations without external synchronization (e.g. a CountDownLatch).

- The cleanup method calls CL.destroy() before releasing the memory objects. The CL.destroy() call must be moved to the end of the method. Also note that calling CL.destroy() is not strictly required.

officialhopsof

Quote from: spasi on May 24, 2017, 08:09:23
In your case, the code becomes:

clSetKernelArg1p(clKernel, 0, aMemory);
clSetKernelArg1p(clKernel, 1, bMemory);
clSetKernelArg1p(clKernel, 2, resultMemory);
clSetKernelArg1i(clKernel, 3, size);



Thanks for the help! I made the other two fixes you mentioned. However, the clSetKernelArg1p method does not appear to exist, and neither do any of the other overloaded clSetKernelArg methods. I double-checked and I am using LWJGL 3.1.1.

EDIT: Never mind, it was just a missing import, and Eclipse didn't seem to be able to auto-import it. Thanks again!

officialhopsof

For anyone looking for this, here is the wiki's OpenCL sum example using LWJGL 3.1.1:


import static org.lwjgl.opencl.CL10.CL_CONTEXT_PLATFORM;
import static org.lwjgl.opencl.CL10.CL_DEVICE_TYPE_GPU;
import static org.lwjgl.opencl.CL10.clBuildProgram;
import static org.lwjgl.opencl.CL10.clCreateCommandQueue;
import static org.lwjgl.opencl.CL10.clCreateContext;
import static org.lwjgl.opencl.CL10.clCreateKernel;
import static org.lwjgl.opencl.CL10.clEnqueueNDRangeKernel;
import static org.lwjgl.opencl.CL10.clGetDeviceIDs;
import static org.lwjgl.opencl.CL10.clGetPlatformIDs;
import static org.lwjgl.opencl.CL10.clSetKernelArg1i;
import static org.lwjgl.opencl.CL10.clSetKernelArg1p;
import static org.lwjgl.opencl.InfoUtil.checkCLError;
import static org.lwjgl.system.MemoryStack.stackPush;
import static org.lwjgl.system.MemoryUtil.NULL;
import static org.lwjgl.system.MemoryUtil.memUTF8;

import java.nio.FloatBuffer;
import java.nio.IntBuffer;

import org.lwjgl.BufferUtils;
import org.lwjgl.PointerBuffer;
import org.lwjgl.opencl.CL;
import org.lwjgl.opencl.CL10;
import org.lwjgl.opencl.CLCapabilities;
import org.lwjgl.opencl.CLContextCallback;
import org.lwjgl.opencl.CLProgramCallback;
import org.lwjgl.system.MemoryStack;

public final class SumOpenCL {

  private static final String sumProgramSource =
      "kernel void sum(global const float* a, global const float* b, global float* result, int const size) {"
          + "  const int itemId = get_global_id(0);" + "  if(itemId < size) {"
          + "    result[itemId] = a[itemId] + b[itemId];" + "  }" + "}";

  private CLContextCallback clContextCB;
  private long clContext;
  private IntBuffer errcode_ret;
  private long clKernel;
  private long clDevice;
  private long clQueue;
  private long sumProgram;
  private long aMemory;
  private long bMemory;
  private long clPlatform;
  private CLCapabilities clPlatformCapabilities;
  private long resultMemory;
  private static final int size = 100;


  public void run() {
    initializeCL();


    sumProgram = CL10.clCreateProgramWithSource(clContext, sumProgramSource, errcode_ret);

    int errcode = clBuildProgram(sumProgram, clDevice, "", null, NULL);
    checkCLError(errcode);


    // init kernel with constants
    clKernel = clCreateKernel(sumProgram, "sum", errcode_ret);
    checkCLError(errcode_ret);

    createMemory();


    clSetKernelArg1p(clKernel, 0, aMemory);
    clSetKernelArg1p(clKernel, 1, bMemory);
    clSetKernelArg1p(clKernel, 2, resultMemory);
    clSetKernelArg1i(clKernel, 3, size);



    final int dimensions = 1;
    PointerBuffer globalWorkSize = BufferUtils.createPointerBuffer(dimensions); // In here we put
                                                                                // the total number
                                                                                // of work items we
                                                                                // want in each
                                                                                // dimension.
    globalWorkSize.put(0, size); // Size is a variable we defined a while back showing how many
                                 // elements are in our arrays.


    // Run the specified number of work units using our OpenCL program kernel
    errcode = clEnqueueNDRangeKernel(clQueue, clKernel, dimensions, null, globalWorkSize, null,
        null, null);

    CL10.clFinish(clQueue);

    printResults();

    cleanup();
  }

  private void printResults() {
    // This reads the result memory buffer
    FloatBuffer resultBuff = BufferUtils.createFloatBuffer(size);
    // We read the buffer in blocking mode so that when the method returns we know that the result
    // buffer is full
    CL10.clEnqueueReadBuffer(clQueue, resultMemory, CL10.CL_TRUE, 0, resultBuff, null, null);
    // Print the values in the result buffer
    for (int i = 0; i < resultBuff.capacity(); i++) {
      System.out.println("result at " + i + " = " + resultBuff.get(i));
    }
    // This should print out 100 lines of result floats, each being 99.
  }

  private void createMemory() {
    // Create OpenCL memory object containing the first buffer's list of numbers
    aMemory = CL10.clCreateBuffer(clContext, CL10.CL_MEM_WRITE_ONLY | CL10.CL_MEM_COPY_HOST_PTR,
        getABuffer(), errcode_ret);
    checkCLError(errcode_ret);

    // Create OpenCL memory object containing the second buffer's list of numbers
    bMemory = CL10.clCreateBuffer(clContext, CL10.CL_MEM_WRITE_ONLY | CL10.CL_MEM_COPY_HOST_PTR,
        getBBuffer(), errcode_ret);
    checkCLError(errcode_ret);

    // Remember the length argument here is in bytes. 4 bytes per float.
    resultMemory = CL10.clCreateBuffer(clContext, CL10.CL_MEM_READ_ONLY, size * 4, errcode_ret);
    checkCLError(errcode_ret);
  }

  private FloatBuffer getABuffer() {
    // Create float array from 0 to size-1.
    FloatBuffer aBuff = BufferUtils.createFloatBuffer(size);
    float[] tempData = new float[size];
    for (int i = 0; i < size; i++) {
      tempData[i] = i;
      System.out.println("a[" + i + "]=" + i);
    }
    aBuff.put(tempData);
    aBuff.rewind();
    return aBuff;
  }

  private FloatBuffer getBBuffer() {
    // Create float array from size-1 to 0. This means that the result should be size-1 for each
    // element.
    FloatBuffer bBuff = BufferUtils.createFloatBuffer(size);
    float[] tempData = new float[size];
    for (int j = 0, i = size - 1; j < size; j++, i--) {
      tempData[j] = i;
      System.out.println("b[" + j + "]=" + i);
    }
    bBuff.put(tempData);
    bBuff.rewind();
    return bBuff;
  }


  private void cleanup() {
    // Destroy our kernel and program
    CL10.clReleaseCommandQueue(clQueue);
    CL10.clReleaseKernel(clKernel);
    CL10.clReleaseProgram(sumProgram);

    // Destroy our memory objects
    CL10.clReleaseMemObject(aMemory);
    CL10.clReleaseMemObject(bMemory);
    CL10.clReleaseMemObject(resultMemory);

    // Not strictly necessary
    CL.destroy();
  }

  public void initializeCL() {
    errcode_ret = BufferUtils.createIntBuffer(1);

    // Get the first available platform
    try (MemoryStack stack = stackPush()) {
      IntBuffer pi = stack.mallocInt(1);
      checkCLError(clGetPlatformIDs(null, pi));
      if (pi.get(0) == 0) {
        throw new IllegalStateException("No OpenCL platforms found.");
      }

      PointerBuffer platformIDs = stack.mallocPointer(pi.get(0));
      checkCLError(clGetPlatformIDs(platformIDs, (IntBuffer) null));

      for (int i = 0; i < platformIDs.capacity() && i == 0; i++) {
        long platform = platformIDs.get(i);
        clPlatformCapabilities = CL.createPlatformCapabilities(platform);
        clPlatform = platform;
      }
    }


    clDevice = getDevice(clPlatform, clPlatformCapabilities, CL_DEVICE_TYPE_GPU);

    // Create the context
    PointerBuffer ctxProps = BufferUtils.createPointerBuffer(7);
    ctxProps.put(CL_CONTEXT_PLATFORM).put(clPlatform).put(NULL).flip();

    clContext = clCreateContext(ctxProps,
        clDevice, clContextCB = CLContextCallback.create((errinfo, private_info, cb,
            user_data) -> System.out.printf("cl_context_callback\n\tInfo: %s", memUTF8(errinfo))),
        NULL, errcode_ret);

    // create command queue
    clQueue = clCreateCommandQueue(clContext, clDevice, NULL, errcode_ret);
    checkCLError(errcode_ret);
  }

  private static long getDevice(long platform, CLCapabilities platformCaps, int deviceType) {
    try (MemoryStack stack = stackPush()) {
      IntBuffer pi = stack.mallocInt(1);
      checkCLError(clGetDeviceIDs(platform, deviceType, null, pi));

      PointerBuffer devices = stack.mallocPointer(pi.get(0));
      checkCLError(clGetDeviceIDs(platform, deviceType, devices, (IntBuffer) null));

      for (int i = 0; i < devices.capacity(); i++) {
        long device = devices.get(i);

        CLCapabilities caps = CL.createDeviceCapabilities(device, platformCaps);
        if (!(caps.cl_khr_gl_sharing || caps.cl_APPLE_gl_sharing)) {
          continue;
        }

        return device;
      }
    }

    return NULL;
  }



  public static void main(String... args) {
    SumOpenCL clApp = new SumOpenCL ();
    clApp.run();
  }

}