/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. */ /* $Rev$ $Date$ */ /** * Test OpenCL kernel evaluator. */ #include #include "stream.hpp" #include "string.hpp" #include "driver.hpp" #include "parallel.hpp" #include "perf.hpp" namespace tuscany { namespace opencl { const string testFloatsCPU = "kernel void add(const float x, const float y, global float* r) {\n" " float l_x;\n" " float l_y;\n" " l_x = x;\n" " l_y = y;\n" " *r = l_x + l_y;\n" "}\n" "kernel void float_add(const float x, const float y, global float* r) {\n" " add(x, y, r);\n" "}\n"; const string testFloatsGPU = "kernel void add(const float x, const float y, global float* r) {\n" " local float l_x;\n" " local float l_y;\n" " l_x = x;\n" " l_y = y;\n" " barrier(CLK_LOCAL_MEM_FENCE);\n" " *r = l_x + l_y;\n" "}\n" "kernel void float_add(const float x, const float y, global float* r) {\n" " add(x, y, r);\n" "}\n"; const string testBoolsCPU = "kernel void int_or(const int x, const int y, global int* r) {\n" " int l_x;\n" " int l_y;\n" " l_x = x;\n" " l_y = y;\n" " *r = l_x || l_y;\n" "}\n"; const string testBoolsGPU = "kernel void int_or(const int x, const int y, global int* r) {\n" " local int l_x;\n" " local int l_y;\n" " l_x = x;\n" " l_y = y;\n" " barrier(CLK_LOCAL_MEM_FENCE);\n" " *r = l_x || l_y;\n" "}\n"; const string testCharsCPU = "kernel void add(const float xl, global const char* x, const float yl, global const char* y, global char* r) {\n" " const int ixl = (int)xl;\n" " const int iyl = (int)yl;\n" " for (int i = 0; i < ixl; i++)\n" " r[i] = x[i];\n" " for (int i = 0; i < iyl; i++)\n" " r[ixl + i] = y[i];\n" " r[ixl + iyl] = '\\0';\n" "}\n" "kernel void char128_add(const float xl, global const char* x, const float yl, global const char* y, global char* r) {\n" " add(xl, x, yl, y, r);\n" "}\n"; const string testCharsCopyGPU = "kernel void add(const float xl, global const char* x, const float yl, global const char* y, global char* r) {\n" " const int ixl = (int)xl;\n" " const int iyl = (int)yl;\n" " local char l_x[128];\n" " local char l_y[128];\n" " event_t re = async_work_group_copy(l_x, x, ixl, 0);\n" " async_work_group_copy(l_y, y, iyl, re);\n" " wait_group_events(1, &re);\n" " local char l_r[128];\n" " for (int i = 0; i < ixl; i++)\n" " l_r[i] = l_x[i];\n" " for (int i = 0; i < iyl; i++)\n" " l_r[ixl + i] = l_y[i];\n" " l_r[ixl + iyl] = '\\0';\n" " event_t we = async_work_group_copy(r, l_r, ixl + iyl + 1, 0);\n" " wait_group_events(1, &we);\n" "}\n" "kernel void char128(const float xl, global const char* x, const float yl, global const char* y, global char* r) {\n" " add(xl, x, yl, y, r);\n" "}\n"; const string testCharsGPU = "kernel void add(const float xl, global const char* x, const float yl, global const char* y, global char* r) {\n" " const int ixl = (int)xl;\n" " const int iyl = (int)yl;\n" " for (int i = 0; i < ixl; i++)\n" " r[i] = x[i];\n" " for (int i = 0; i < iyl; i++)\n" " r[ixl + i] = y[i];\n" " r[ixl + iyl] = '\\0';\n" "}\n" "kernel void char128(const float xl, global const char* x, const float yl, global const char* y, global char* r) {\n" " add(xl, x, yl, y, r);\n" "}\n"; const string testCharsParallelCPU = "kernel void add(const float xl, global const char* x, const float yl, global const char* y, global char* r) {\n" " const int i = get_global_id(0);\n" " const int ixl = (int)xl;\n" " const int iyl = (int)yl;\n" " r[i] = i < ixl? x[i] : i < ixl + iyl? y[i - ixl] : '\\0';\n" "}\n"; const string testCharsParallelCopyGPU = "kernel void add(const float xl, global const char* x, const float yl, global const char* y, global char* r) {\n" " const int i = get_global_id(0);\n" " const int ixl = (int)xl;\n" " const int iyl = (int)yl;\n" " local char l_x[128];\n" " local char l_y[128];\n" " event_t re = async_work_group_copy(l_x, x, ixl, 0);\n" " async_work_group_copy(l_y, y, iyl, re);\n" " wait_group_events(1, &re);\n" " local char l_r[128];\n" " l_r[i] = i < ixl? l_x[i] : i < ixl + iyl? l_y[i - ixl] : '\\0';\n" " event_t we = async_work_group_copy(r, l_r, ixl + iyl + 1, 0);\n" " wait_group_events(1, &we);\n" "}\n"; const string testCharsParallelGPU = "kernel void add(const float xl, global const char* x, const float yl, global const char* y, global char* r) {\n" " const int i = get_global_id(0);\n" " const int ixl = (int)xl;\n" " const int iyl = (int)yl;\n" " r[i] = i < ixl? x[i] : i < ixl + iyl? y[i - ixl] : '\\0';\n" "}\n"; bool testTaskParallel(const OpenCLContext::DeviceType dev) { gc_scoped_pool pool; OpenCLContext cl(dev); if (!devices(cl) != 0) return true; { istringstream is(dev == OpenCLContext::CPU? testFloatsCPU : testFloatsGPU); failable clprog = readProgram("kernel.cl", is, cl); assert(hasContent(clprog)); const value exp = mklist("float_add", 2, 3); const failable r = evalKernel(createKernel(exp, content(clprog)), exp, cl); assert(hasContent(r)); assert(content(r) == value(5)); } if (true) return true; { istringstream is(dev == OpenCLContext::CPU? testFloatsCPU : testFloatsGPU); failable clprog = readProgram("kernel.cl", is, cl); assert(hasContent(clprog)); const value exp = mklist("add", 2, 3); const failable r = evalKernel(createKernel(exp, content(clprog)), exp, 1, value::Number, 1, cl); assert(hasContent(r)); assert(content(r) == value(5)); } { istringstream is(dev == OpenCLContext::CPU? testBoolsCPU : testBoolsGPU); failable clprog = readProgram("kernel.cl", is, cl); assert(hasContent(clprog)); const value exp = mklist("int_or", true, false); const failable r = evalKernel(createKernel(exp, content(clprog)), exp, cl); assert(hasContent(r)); assert(content(r) == value(true)); const value exp2 = mklist("int_or", false, false); const failable r2 = evalKernel(createKernel(exp2, content(clprog)), exp2, cl); assert(hasContent(r2)); assert(content(r2) == value(false)); } { istringstream is(dev == OpenCLContext::CPU? testCharsCPU : testCharsGPU); failable clprog = readProgram("kernel.cl", is, cl); assert(hasContent(clprog)); const value exp = mklist("char128", 60, string("Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello "), 60, string("World World World World World World World World World World ")); const failable r = evalKernel(createKernel(exp, content(clprog)), exp, cl); assert(hasContent(r)); assert(content(r) == value(string("Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello World World World World World World World World World World "))); } { istringstream is(dev == OpenCLContext::CPU? testCharsCPU : testCharsGPU); failable clprog = readProgram("kernel.cl", is, cl); assert(hasContent(clprog)); const value exp = mklist("add", 60, string("Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello "), 60, string("World World World World World World World World World World ")); const failable r = evalKernel(createKernel(exp, content(clprog)), exp, 1, value::String, 128, cl); assert(hasContent(r)); assert(content(r) == value(string("Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello World World World World World World World World World World "))); } return true; } struct evalTaskParallelLoop { evalTaskParallelLoop(const OpenCLContext& cl, const OpenCLProgram& clprog) : cl(cl), clprog(clprog), exp(mklist("add", 60, string("Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello "), 60, string("World World World World World World World World World World "))) { } const bool operator()() const { const failable r = evalKernel(createKernel(exp, clprog), exp, 1, value::String, 128, cl); assert(hasContent(r)); assert(content(r) == value(string("Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello World World World World World World World World World World "))); return true; } const OpenCLContext& cl; const OpenCLProgram& clprog; const value exp; }; const bool testTaskParallelPerf(const OpenCLContext::DeviceType dev, const bool copy) { gc_scoped_pool pool; OpenCLContext cl(dev); if (!devices(cl) != 0) return true; istringstream is(dev == OpenCLContext::CPU? testCharsCPU : copy? testCharsCopyGPU : testCharsGPU); failable clprog = readProgram("kernel.cl", is, cl); assert(hasContent(clprog)); resetOpenCLCounters(); const lambda el = evalTaskParallelLoop(cl, content(clprog)); cout << "OpenCL task-parallel eval " << (dev == OpenCLContext::CPU? "CPU" : "GPU") << (copy? " copy" : "") << " test " << time(el, 5, 500) << " ms"; printOpenCLCounters(500); cout << endl; return true; } bool testDataParallel(const OpenCLContext::DeviceType dev) { gc_scoped_pool pool; OpenCLContext cl(dev); if (!devices(cl) != 0) return true; { istringstream is(dev == OpenCLContext::CPU? testCharsParallelCPU : testCharsParallelGPU); failable clprog = readProgram("kernel.cl", is, cl); assert(hasContent(clprog)); const value exp = mklist("add", 6, string("Hello "), 5, string("World")); const failable r = evalKernel(createKernel(exp, content(clprog)), exp, 121, value::String, 128, cl); assert(hasContent(r)); assert(content(r) == value(string("Hello World"))); } return true; } struct evalDataParallelLoop { evalDataParallelLoop(const OpenCLContext& cl, const OpenCLProgram& clprog) : cl(cl), clprog(clprog), exp(mklist("add", 60, string("Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello "), 60, string("World World World World World World World World World World "))) { } const bool operator()() const { const failable r = evalKernel(createKernel(exp, clprog), exp, 121, value::String, 128, cl); assert(hasContent(r)); assert(content(r) == value(string("Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello World World World World World World World World World World "))); return true; } const OpenCLContext& cl; const OpenCLProgram& clprog; const value exp; }; const bool testDataParallelPerf(const OpenCLContext::DeviceType dev, const bool copy) { gc_scoped_pool pool; OpenCLContext cl(dev); if (!devices(cl) != 0) return true; istringstream is(dev == OpenCLContext::CPU? testCharsParallelCPU : copy? testCharsParallelCopyGPU : testCharsParallelGPU); failable clprog = readProgram("kernel.cl", is, cl); assert(hasContent(clprog)); resetOpenCLCounters(); const lambda el = evalDataParallelLoop(cl, content(clprog)); cout << "OpenCL data-parallel eval " << (dev == OpenCLContext::CPU? "CPU" : "GPU") << (copy? " copy" : "") << " test " << time(el, 5, 500) << " ms"; printOpenCLCounters(500); cout << endl; return true; } } } int main() { tuscany::cout << "Testing..." << tuscany::endl; tuscany::opencl::testTaskParallel(tuscany::opencl::OpenCLContext::CPU); tuscany::opencl::testTaskParallelPerf(tuscany::opencl::OpenCLContext::CPU, false); tuscany::opencl::testDataParallel(tuscany::opencl::OpenCLContext::CPU); tuscany::opencl::testDataParallelPerf(tuscany::opencl::OpenCLContext::CPU, false); tuscany::opencl::testTaskParallel(tuscany::opencl::OpenCLContext::GPU); tuscany::opencl::testTaskParallelPerf(tuscany::opencl::OpenCLContext::GPU, false); tuscany::opencl::testTaskParallelPerf(tuscany::opencl::OpenCLContext::GPU, true); tuscany::opencl::testDataParallel(tuscany::opencl::OpenCLContext::GPU); tuscany::opencl::testDataParallelPerf(tuscany::opencl::OpenCLContext::GPU, false); tuscany::opencl::testDataParallelPerf(tuscany::opencl::OpenCLContext::GPU, true); tuscany::cout << "OK" << tuscany::endl; return 0; }