/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. */ /* $Rev$ $Date$ */ #ifndef tuscany_opencl_eval_hpp #define tuscany_opencl_eval_hpp /** * OpenCL kernel evaluation logic. */ #ifdef IS_DARWIN #include #else #include #endif #include "list.hpp" #include "value.hpp" #include "perf.hpp" namespace tuscany { namespace opencl { /** * Convert an OpenCL error code to a string. */ const string clError(const cl_int e) { ostringstream s; s << "error " << e; return str(s); } /** * OpenCL profiling counters. */ #ifdef WANT_MAINTAINER_OPENCL_PROF cl_ulong memtime = 0; cl_ulong kernelqtime = 0; cl_ulong kerneltime = 0; cl_ulong preptime = 0; cl_ulong evaltime = 0; /** * Reset the OpenCL profiling counters. */ const bool resetOpenCLCounters() { memtime = kernelqtime = kerneltime = preptime = evaltime = 0; return true; } /** * Print the OpenCL profiling counters. */ const bool printOpenCLCounters(const long n) { cout << " kernelq " << ((double)kernelqtime / 1000000.0) / (double)n << " ms kernel " << ((double)kerneltime / 1000000.0) / (double)n << " ms memory " << ((double)memtime / 1000000.0) / (double)n << " ms prep " << ((double)preptime / 1000000.0) / (double)n << " ms eval " << ((double)evaltime / 1000000.0) / (double)n << " ms"; return true; } /** * Profile a memory event. */ const failable profileMemEvent(const cl_event evt) { cl_ulong start = 0; const cl_int serr = clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL); if (serr != CL_SUCCESS) return mkfailure("Couldn't profile memory event start: " + clError(serr)); cl_ulong end = 0; const cl_int eerr = clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL); if (eerr != CL_SUCCESS) return mkfailure("Couldn't profile memory event end: " + clError(eerr)); const cl_ulong t = end - start; memtime += t; return t; } /** * Profile a kernel event. */ const failable profileKernelEvent(const cl_event evt) { cl_ulong queued = 0; const cl_int qerr = clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), &queued, NULL); if (qerr != CL_SUCCESS) return mkfailure("Couldn't profile kernel event queue: " + clError(qerr)); cl_ulong start = 0; const cl_int serr = clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL); if (serr != CL_SUCCESS) return mkfailure("Couldn't profile kernel event start: " + clError(serr)); cl_ulong end = 0; const cl_int eerr = clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL); if (eerr != CL_SUCCESS) return mkfailure("Couldn't profile kernel event end: " + clError(eerr)); const cl_ulong q = start - queued; kernelqtime += q; const cl_ulong t = end - start; kerneltime += t; return t; } /** * Profile an array of memory events. */ const failable profileMemEvents(const cl_uint n, const cl_event* const evt) { if (n == 0) return 0; const failable t = profileMemEvent(*evt); if (!hasContent(t)) return t; const failable r = profileMemEvents(n - 1, evt + 1); if (!hasContent(r)) return r; return content(t) + content(r); } #else #define resetOpenCLCounters() #define printOpenCLCounters(n) #endif class OpenCLContext; class OpenCLProgram; class OpenCLKernel; class OpenCLBuffer; /** * Represent an OpenCL context. */ class OpenCLContext { public: #define OPENCL_MAX_DEVICES 64 enum DeviceType { DEFAULT = 0, CPU = 1, GPU = 2 }; OpenCLContext(const OpenCLContext::DeviceType devtype) : dev(OpenCLContext::DEFAULT), ndevs(0), ctx(0) { debug("opencl::OpenCLContext"); for (int i = 0; i < OPENCL_MAX_DEVICES; i++) cq[i] = 0; // Get the available platforms cl_uint nplatforms; cl_platform_id platforms[16]; const cl_int gperr = clGetPlatformIDs(16, platforms, &nplatforms); if(nplatforms == 0 || gperr != CL_SUCCESS) { mkfailure("Couldn't get OpenCL platforms: " + clError(gperr)); return; } for(cl_uint i = 0; i < nplatforms; ++i) { char vendor[256]; const cl_int gverr = clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, sizeof(vendor), vendor, NULL); if(gverr != CL_SUCCESS) { mkfailure("Couldn't get OpenCL platform: " + clError(gverr)); return; } debug(vendor, "opencl::OpenCLContext::vendor"); } // Get the available devices of the requested type if (devtype == OpenCLContext::DEFAULT || devtype == OpenCLContext::GPU) { for(cl_uint i = 0; i < nplatforms; ++i) { const cl_int err = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_GPU, OPENCL_MAX_DEVICES, devid, &ndevs); if (err == CL_SUCCESS) { debug(ndevs, "opencl::OpenCLContext::gcpus"); dev = OpenCLContext::GPU; break; } } } if ((devtype == OpenCLContext::DEFAULT && ndevs == 0) || devtype == OpenCLContext::CPU) { for(cl_uint i = 0; i < nplatforms; ++i) { const cl_int err = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_CPU, OPENCL_MAX_DEVICES, devid, &ndevs); if (err == CL_SUCCESS) { debug(ndevs, "opencl::OpenCLContext::ncpus"); dev = OpenCLContext::CPU; break; } } } if (ndevs == 0) return; // Initialize OpenCL context and command queues cl_int ccerr; ctx = clCreateContext(0, ndevs, devid, NULL, NULL, &ccerr); if(!ctx || ccerr != CL_SUCCESS) { mkfailure("Couldn't create OpenCL context: " + clError(ccerr)); return; } for (cl_uint i = 0; i < ndevs; i++) { cl_int cqerr; #ifdef WANT_MAINTAINER_OPENCL_PROF cq[i] = clCreateCommandQueue(ctx, devid[i], CL_QUEUE_PROFILING_ENABLE, &cqerr); #else cq[i] = clCreateCommandQueue(ctx, devid[i], 0, &cqerr); #endif if (!cq[i] || cqerr != CL_SUCCESS) { mkfailure("Couldn't create OpenCL command queue: " + clError(cqerr)); return; } } } OpenCLContext(const OpenCLContext& c) : dev(c.dev), ndevs(c.ndevs), ctx(c.ctx) { for (cl_uint i = 0; i < ndevs; i++) { devid[i] = c.devid[i]; cq[i] = c.cq[i]; if (cq[i] != 0) { const cl_int rcqerr = clRetainCommandQueue(cq[i]); if (rcqerr != CL_SUCCESS) mkfailure(string("Couldn't retain OpenCL command queue: ") + clError(rcqerr)); } } const cl_int rcerr = clRetainContext(ctx); if (rcerr != CL_SUCCESS) mkfailure(string("Couldn't retain OpenCL context: ") + clError(rcerr)); } ~OpenCLContext() { for (cl_uint i = 0; i < ndevs; i++) { if (cq[i] != 0) clReleaseCommandQueue(cq[i]); } if (ctx != 0) clReleaseContext(ctx); } private: OpenCLContext::DeviceType dev; cl_uint ndevs; cl_device_id devid[OPENCL_MAX_DEVICES]; cl_context ctx; cl_command_queue cq[OPENCL_MAX_DEVICES]; friend const cl_uint devices(const OpenCLContext& cl); friend const cl_command_queue commandq(const OpenCLContext& cl); friend const failable readOnlyBuffer(const size_t size, const void* p, const OpenCLContext& cl, cl_command_queue cq); friend const failable writeOnlyBuffer(const size_t size, const OpenCLContext& cl); friend const failable evalKernel(const failable& kernel, const value& expr, const size_t gwsize, const value::ValueType type, const size_t n, const OpenCLContext& cl); friend const failable readProgram(const string& path, istream& is, const OpenCLContext& cl); }; /** * Return the number of computing devices available in a context. */ const cl_uint devices(const OpenCLContext& cl) { return cl.ndevs; } /** * Return a command queue from a context. */ const cl_command_queue commandq(const OpenCLContext& cl) { return cl.cq[0]; } /** * Represents an OpenCL program. */ class OpenCLProgram { public: OpenCLProgram() : prog(0) { } OpenCLProgram(const cl_program prog) : prog(prog) { } OpenCLProgram(const OpenCLProgram& c) : prog(c.prog) { const cl_int rperr = clRetainProgram(prog); if (rperr != CL_SUCCESS) mkfailure(string("Couldn't retain OpenCL program: ") + clError(rperr)); } ~OpenCLProgram() { if (!prog) return; const cl_int rperr = clReleaseProgram(prog); if (rperr != CL_SUCCESS) mkfailure(string("Couldn't release OpenCL program: ") + clError(rperr)); } private: const cl_program prog; friend const failable createKernel(const value& expr, const OpenCLProgram& clprog); friend const failable evalKernel(const failable& kernel, const value& expr, const size_t gwsize, const value::ValueType type, const size_t n, const OpenCLContext& cl); }; /** * Represents an OpenCL kernel. */ class OpenCLKernel { public: OpenCLKernel() : k(0) { } OpenCLKernel(const cl_kernel k) : k(k) { } OpenCLKernel(const OpenCLKernel& c) : k(c.k) { const cl_int rkerr = clRetainKernel(k); if (rkerr != CL_SUCCESS) mkfailure(string("Couldn't retain OpenCL kernel: ") + clError(rkerr)); } ~OpenCLKernel() { if (!k) return; const cl_int rkerr = clReleaseKernel(k); if (rkerr != CL_SUCCESS) mkfailure(string("Couldn't release OpenCL kernel: ") + clError(rkerr)); } private: const cl_kernel k; friend const failable valueToKernelArg(const cl_uint i, const size_t size, const void* p, const failable& buf, const OpenCLKernel& kernel); friend const failable evalKernel(const failable& kernel, const value& expr, const size_t gwsize, const value::ValueType type, const size_t n, const OpenCLContext& cl); }; /** * Represents an OpenCL buffer. */ class OpenCLBuffer { public: OpenCLBuffer() : mem(0), evt(0) { } OpenCLBuffer(const cl_mem mem, const cl_event evt) : mem(mem), evt(evt) { } OpenCLBuffer(const OpenCLBuffer& c) : mem(c.mem), evt(c.evt) { if (mem != 0) { const cl_int rmerr = clRetainMemObject(mem); if (rmerr != CL_SUCCESS) mkfailure(string("Couldn't retain OpenCL buffer: ") + clError(rmerr)); } if (evt != 0) { const cl_int reerr = clRetainEvent(evt); if (reerr != CL_SUCCESS) mkfailure(string("Couldn't retain OpenCL event: ") + clError(reerr)); } } ~OpenCLBuffer() { if (mem != 0) { const cl_int rmerr = clReleaseMemObject(mem); if (rmerr != CL_SUCCESS) mkfailure(string("Couldn't release OpenCL buffer: ") + clError(rmerr)); } if (evt != 0) { const cl_int reerr = clReleaseEvent(evt); if (reerr != CL_SUCCESS) mkfailure(string("Couldn't release OpenCL event: ") + clError(reerr)); } } private: const cl_mem mem; const cl_event evt; friend const cl_uint writeBufferEvents(const list& buf, cl_event* evt); friend const failable valueToKernelArg(const cl_uint i, const size_t size, const void* p, const failable& buf, const OpenCLKernel& kernel); friend const failable evalKernel(const failable& kernel, const value& expr, const size_t gwsize, const value::ValueType type, const size_t n, const OpenCLContext& cl); }; /** * Return a read-only memory buffer. */ const failable readOnlyBuffer(const size_t size, const void* const p, const OpenCLContext& cl, const cl_command_queue cq) { if (cl.dev == OpenCLContext::CPU) { cl_int err; const cl_mem buf = clCreateBuffer(cl.ctx, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, size, const_cast(p), &err); if (!buf || err != CL_SUCCESS) return mkfailure(string("Couldn't map OpenCL host memory: ") + clError(err)); return OpenCLBuffer(buf, 0); } cl_int berr; const cl_mem buf = clCreateBuffer(cl.ctx, CL_MEM_READ_ONLY, size, NULL, &berr); if (!buf || berr != CL_SUCCESS) return mkfailure(string("Couldn't allocate OpenCL device memory: ") + clError(berr)); cl_event wevt; const cl_int werr = clEnqueueWriteBuffer(cq, buf, CL_FALSE, 0, size, p, 0, NULL, &wevt); if (werr != CL_SUCCESS) { clReleaseMemObject(buf); return mkfailure(string("Couldn't enqueue write to device memory: ") + clError(werr)); } return OpenCLBuffer(buf, wevt); } /** * Fill an array of write events for a given list of buffers. */ const cl_uint writeBufferEvents(const list& buf, cl_event* const evt) { if (isNull(buf)) return 0; const cl_event e = car(buf).evt; if (e == 0) return writeBufferEvents(cdr(buf), evt); *evt = e; return 1 + writeBufferEvents(cdr(buf), evt + 1); } /** * Return a write-only memory buffer. */ const failable writeOnlyBuffer(const size_t size, const OpenCLContext& cl) { if (cl.dev == OpenCLContext::CPU) { cl_int err; const cl_mem buf = clCreateBuffer(cl.ctx, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, size, NULL, &err); if (!buf || err != CL_SUCCESS) return mkfailure(string("Couldn't map OpenCL host memory: ") + clError(err)); return OpenCLBuffer(buf, 0); } cl_int err; const cl_mem buf = clCreateBuffer(cl.ctx, CL_MEM_WRITE_ONLY, size, NULL, &err); if (!buf || err != CL_SUCCESS) return mkfailure(string("Couldn't allocate OpenCL device memory: ") + clError(err)); return OpenCLBuffer(buf, 0); } /** * Convert a value to a kernel arg. */ const failable valueToKernelArg(const cl_uint i, const size_t size, const void* const p, const failable& buf, const OpenCLKernel& kernel) { if (!hasContent(buf)) return buf; if (p != NULL) { const cl_int err = clSetKernelArg(kernel.k, (cl_uint)i, size, p); if (err != CL_SUCCESS) return mkfailure(string("Couldn't set OpenCL simple kernel arg: ") + clError(err)); return buf; } const OpenCLBuffer b = content(buf); const cl_int err = clSetKernelArg(kernel.k, i, sizeof(cl_mem), &b.mem); if (err != CL_SUCCESS) return mkfailure(string("Couldn't set OpenCL buffer kernel arg: ") + clError(err)); return buf; } const failable valueToKernelArg(const value& v, const cl_uint i, const OpenCLKernel& kernel, const OpenCLContext& cl, const cl_command_queue cq) { switch (type(v)) { case value::Symbol: { const string s = string("'") + (string)v; return valueToKernelArg(i, 0, NULL, readOnlyBuffer(length(s) + 1, c_str(s), cl, cq), kernel); } case value::String: { const string s = (string)v; return valueToKernelArg(i, 0, NULL, readOnlyBuffer(length(s) + 1, c_str(s), cl, cq), kernel); } case value::Number: { const cl_float d = (cl_float)((double)v); return valueToKernelArg(i, sizeof(cl_float), &d, OpenCLBuffer(), kernel); } case value::Bool: { const cl_int b = (cl_int)((bool)v); return valueToKernelArg(i, sizeof(cl_int), &b, OpenCLBuffer(), kernel); } default: { return valueToKernelArg(i, sizeof(cl_mem), NULL, readOnlyBuffer(sizeof(value), &v, cl, cq), kernel); } } } /** * Convert a list of values to kernel args. */ const failable> valuesToKernelArgsListHelper(const list& v, const cl_uint i, const OpenCLKernel& kernel, const OpenCLContext& cl, const cl_command_queue cq) { if (isNull(v)) return list(); const failable a = valueToKernelArg(car(v), i, kernel, cl, cq); if (!hasContent(a)) return mkfailure>(a); const failable> al = valuesToKernelArgsListHelper(cdr(v), i + 1, kernel, cl, cq); if (!hasContent(al)) return al; return cons(content(a), content(al)); } const failable> valuesToKernelArgs(const list& v, const OpenCLKernel& kernel, const OpenCLContext& cl, const cl_command_queue cq) { return valuesToKernelArgsListHelper(v, 0, kernel, cl, cq); } /** * Convert a kernel result to a value. */ const value kernelResultToValue(const void* const p, const value::ValueType type) { switch(type) { case value::Symbol: { const char* const s = (const char*)p; const size_t l = strlen(s); if (l != 0 && *s == '\'') return value(s + 1); return value(s); } case value::String: { const char* const s = (const char*)p; const size_t l = strlen(s); if (l != 0 && *s == '\'') return value(s + 1); return value(string(s, l)); } case value::Number: return (double)(*(const cl_float*)p); case value::Bool: return (bool)(*(const cl_int*)p); default: return *(const value*)p; } } /** * Return the value type corresponding to a C99 type name. */ const value::ValueType valueType(const string& t) { if (t == "float") return value::Number; if (t == "int") return value::Bool; if (t == "char") return value::String; return value::Nil; } /** * Return the size of a C99 type corresponding to a value type. */ const size_t valueSize(const value::ValueType type) { switch(type) { case value::Number: return sizeof(cl_float); case value::Bool: return sizeof(cl_int); case value::Symbol: return sizeof(cl_char); case value::String: return sizeof(cl_char); default: return sizeof(value); } } /** * Return the result type of a kernel. */ class OpenCLResultType { public: OpenCLResultType(const value::ValueType type, const size_t n, const size_t size) : type(type), n(n), size(size) {} const value::ValueType type; const size_t n; const size_t size; }; const OpenCLResultType kernelResultType(const string& fn, value::ValueType type, const size_t n) { if (type != value::Nil) return OpenCLResultType(type, n, valueSize(type)); const string s = car(tokenize("_", fn)); const size_t d = find_first_of(s, "0123456789"); if (d == length(s)) { const value::ValueType vt = valueType(s); return OpenCLResultType(vt, 1, valueSize(vt)); } const value::ValueType vt = valueType(substr(s, 0, d)); return OpenCLResultType(vt, atoi(c_str(substr(s, d))), valueSize(vt)); } /** * Create the kernel implementing an expression. */ const failable createKernel(const value& expr, const OpenCLProgram& clprog) { // Create an OpenCL kernel for the requested function const value fn = car(expr); cl_int ckerr; const cl_kernel k = clCreateKernel(clprog.prog, c_str(fn), &ckerr); if (k == NULL || ckerr != CL_SUCCESS) { // The start, stop, and restart functions are optional //if (fn == "start" || fn == "stop") //return value(lvvlambda()); return mkfailure(string("Couldn't find function: ") + (string)car(expr) + " : " + clError(ckerr)); } return OpenCLKernel(k); } /** * Evaluate an expression implemented by a kernel. */ const failable evalKernel(const failable& fkernel, const value& expr, const size_t gwsize, const value::ValueType type, const size_t n, const OpenCLContext& cl) { #ifdef WANT_MAINTAINER_OPENCL_PROF const cl_uint estart = (cl_uint)timens(); const cl_uint pstart = estart; #endif if (!hasContent(fkernel)) return mkfailure(fkernel); const OpenCLKernel kernel = content(fkernel); // Get a command queue for the specified device type const cl_command_queue cq = commandq(cl); // Set the kernel input args const failable> args = valuesToKernelArgs(cdr(expr), kernel, cl, cq); if (!hasContent(args)) { return mkfailure(args); } // Allocate result buffer in device memory const value fn = car(expr); const OpenCLResultType rtype = kernelResultType(fn, type, n); const size_t rsize = rtype.n * rtype.size; const failable rbuf = writeOnlyBuffer(rsize, cl); if (!hasContent(rbuf)) return mkfailure(rbuf); // Set it as a kernel output arg const cl_mem rmem = content(rbuf).mem; const failable rarg = valueToKernelArg((cl_uint)length(cdr(expr)), sizeof(cl_mem), &rmem, rbuf, kernel); if (!hasContent(rarg)) return mkfailure(rarg); // Enqueue the kernel, to be executed after all the writes complete cl_event wevt[32]; const cl_uint nwevt = writeBufferEvents(content(args), wevt); cl_event kevt; const cl_int qerr = clEnqueueNDRangeKernel(cq, kernel.k, 1, NULL, &gwsize, NULL, nwevt, nwevt != 0? wevt : NULL, &kevt); if (qerr != CL_SUCCESS) return mkfailure(string("Couldn't enqueue kernel task: ") + clError(qerr)); // Enqueue result buffer read, to be executed after the kernel completes char res[rsize]; cl_event revt; const cl_int rerr = clEnqueueReadBuffer(cq, rmem, CL_FALSE, 0, rsize, res, 1, &kevt, &revt); if (rerr != CL_SUCCESS) { clReleaseEvent(kevt); return mkfailure(string("Couldn't read from OpenCL device memory: ") + clError(rerr)); } #ifdef WANT_MAINTAINER_OPENCL_PROF const cl_uint pend = (cl_uint)timens(); preptime += (pend - pstart); #endif // Wait for completion const cl_int werr = clWaitForEvents(1, &revt); if (werr != CL_SUCCESS) { clReleaseEvent(revt); clReleaseEvent(kevt); return mkfailure(string("Couldn't wait for kernel completion: ") + clError(werr)); } #ifdef WANT_MAINTAINER_OPENCL_PROF profileMemEvents(nwevt, wevt); profileKernelEvent(kevt); profileMemEvent(revt); #endif // Convert the result to a value const value v = kernelResultToValue(res, rtype.type); // Release OpenCL resources clReleaseEvent(revt); clReleaseEvent(kevt); #ifdef WANT_MAINTAINER_OPENCL_PROF const cl_uint eend = (cl_uint)timens(); evaltime += (eend - estart); #endif return v; } const failable evalKernel(const failable& kernel, const value& expr, const OpenCLContext& cl) { return evalKernel(kernel, expr, 1, value::Nil, 0, cl); } /** * Read an opencl program from an input stream. */ const failable readProgram(const string& path, istream& is, const OpenCLContext& cl) { // Read the program source const list ls = streamList(is); ostringstream os; write(ls, os); const char* cs = c_str(str(os)); // Create the OpenCL program cl_int cperr; const cl_program prog = clCreateProgramWithSource(cl.ctx, 1, (const char **)&cs, NULL, &cperr); if (!prog || cperr != CL_SUCCESS) return mkfailure(string("Couldn't create OpenCL program from source: ") + path + " : " + clError(cperr)); // Built it const cl_int bperr = clBuildProgram(prog, 0, NULL, NULL, NULL, NULL); if(bperr != CL_SUCCESS) { size_t l; char b[2048]; clGetProgramBuildInfo(prog, cl.devid[0], CL_PROGRAM_BUILD_LOG, sizeof(b), b, &l); return mkfailure(string("Couldn't build OpenCL program: ") + path + " : " + clError(bperr) + "\n" + string(b)); } return OpenCLProgram(prog); } /** * Evaluate an expression against an OpenCL program provided as an input stream. */ const failable evalKernel(const value& expr, istream& is, const OpenCLContext& cl) { failable clprog = readProgram("program.cl", is, cl); if (!hasContent(clprog)) return mkfailure(clprog); return evalKernel(createKernel(expr, content(clprog)), expr, 1, value::Nil, 0, cl); } } } #endif /* tuscany_opencl_eval_hpp */