Learning OpenCL programming

This post is to record the steps how I run my first “Hello World” OpenCL C++ program.

  1. To make things easier, I created this, OpenCL.zip, OpenCL library and C/C++ header files.
  2. Create a Visual Studio C++ project
  3. The following code is to add two 2^25 array altogether using GPU:
#include <iostream>
#include <vector>
#include <string>
using namespace std;

#define __CL_ENABLE_EXCEPTIONS
#include <CL\cl.hpp>

// Compute c = a + b.
static const char source[] =
"kernel void add(\n"
"       ulong n,\n"
"       global const float *a,\n"
"       global const float *b,\n"
"       global float *c\n"
"       )\n"
"{\n"
"    size_t i = get_global_id(0);\n"
"    if (i < n) {\n"
"       c[i] = a[i] + b[i];\n"
"    }\n"
"}\n";

int main() {

	const size_t N = 1 << 25;

	try {
		// Get list of OpenCL platforms.
		std::vector platform;
		cl::Platform::get(&platform);

		if (platform.empty()) {
			std::cerr << "OpenCL platforms not found." << std::endl;
			return 1;
		}

		// Get first available GPU device.
		cl::Context context;
		std::vector device;
		for (auto p = platform.begin(); device.empty() && p != platform.end(); p++) {
			std::vector pldev;

			try {
				p->getDevices(CL_DEVICE_TYPE_DEFAULT, &pldev);

				for (auto d = pldev.begin(); device.empty() && d != pldev.end(); d++) {
					if (!d->getInfo()) continue;

					std::string ext = d->getInfo();

					device.push_back(*d);
					context = cl::Context(device);
				}
			}
			catch (...) {
				device.clear();
			}
		}

		if (device.empty()) {
			std::cerr << "GPUs device not found." << std::endl;
			return 1;
		}

		std::cout << device[0].getInfo() << std::endl;

		// Create command queue.
		cl::CommandQueue queue(context, device[0]);

		// Compile OpenCL program for found device.
		cl::Program program(context, cl::Program::Sources(
			1, std::make_pair(source, strlen(source))
			));

		try {
			program.build(device);
		}
		catch (const cl::Error&) {
			std::cerr
				<< "OpenCL compilation error" << std::endl
				<< program.getBuildInfo(device[0])
				<< std::endl;
			return 1;
		}

		cl::Kernel add(program, "add");

		// Prepare input data.
		std::vector a(N, 1);
		std::vector b(N, 2);
		std::vector c(N);

		// Allocate device buffers and transfer input data to device.
		cl::Buffer A(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
			a.size() * sizeof(float), a.data());

		cl::Buffer B(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
			b.size() * sizeof(float), b.data());

		cl::Buffer C(context, CL_MEM_READ_WRITE,
			c.size() * sizeof(float));

		// Set kernel parameters.
		add.setArg(0, static_cast(N));
		add.setArg(1, A);
		add.setArg(2, B);
		add.setArg(3, C);

		// Launch kernel on the compute device.
		queue.enqueueNDRangeKernel(add, cl::NullRange, N, cl::NullRange);

		// Get result back to host.
		queue.enqueueReadBuffer(C, CL_TRUE, 0, c.size() * sizeof(float), c.data());

		// Should get '3' here.
		std::cout << c[42] << std::endl;
	}
	catch (const cl::Error &err) {
		std::cerr
			<< "OpenCL error: "
			<< err.what() << "(" << err.err() << ")"
			<< std::endl;
		return 1;
	}
}