Stop leaking all ScPostIt instances.
[LibreOffice.git] / sc / source / core / opencl / opencl_device.cxx
blobde99e0ddc0dc771d04ca50919a22837b1b80d2ac
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8 */
10 #ifdef _WIN32
11 #include <windows.h>
12 #elif defined __MACH__
13 #include <mach/mach_time.h>
14 #else
15 #include <sys/time.h>
16 #endif
17 #include <time.h>
18 #include <math.h>
19 #include <float.h>
20 #include <iostream>
21 #include <sstream>
22 #include <vector>
23 #include <boost/scoped_ptr.hpp>
25 #include "opencl_device.hxx"
28 #define INPUTSIZE 15360
29 #define OUTPUTSIZE 15360
31 #define STRINGIFY(...) #__VA_ARGS__"\n"
32 //#define LOG_PRINTF(x) (std::cout << x << std::endl)
33 #define LOG_PRINTF(x)
35 #define DS_CHECK_STATUS(status, name) \
36 if (CL_SUCCESS != status) \
37 { \
38 LOG_PRINTF("[OCL] Error code is " << status << " at " << name); \
41 namespace sc { namespace OpenCLDevice {
43 bool bIsInited = false;
44 bool bIsDeviceSelected = false;
45 ds_device selectedDevice;
47 struct LibreOfficeDeviceScore
49 double fTime; // small time means faster device
50 bool bNoCLErrors; // were there any opencl errors
53 struct LibreOfficeDeviceEvaluationIO
55 std::vector<double> input0;
56 std::vector<double> input1;
57 std::vector<double> input2;
58 std::vector<double> input3;
59 std::vector<double> output;
60 unsigned long inputSize;
61 unsigned long outputSize;
64 struct timer
66 #ifdef _WIN32
67 LARGE_INTEGER start;
68 #else
69 long long start;
70 #endif
73 const char* source = STRINGIFY(
74 \n#if defined(KHR_DP_EXTENSION)
75 \n#pragma OPENCL EXTENSION cl_khr_fp64 : enable
76 \n#elif defined(AMD_DP_EXTENSION)
77 \n#pragma OPENCL EXTENSION cl_amd_fp64 : enable
78 \n#endif
80 int isNan(fp_t a) { return a != a; }
81 fp_t fsum(fp_t a, fp_t b) { return a + b; }
83 fp_t fAverage(__global fp_t* input)
85 fp_t sum = 0;
86 int count = 0;
87 for (int i = 0; i < INPUTSIZE; i++)
89 if (!isNan(input[i]))
91 sum = fsum(input[i], sum);
92 count += 1;
95 return sum / (fp_t)count;
97 fp_t fMin(__global fp_t* input)
99 fp_t min = MAXFLOAT;
100 for (int i = 0; i < INPUTSIZE; i++)
102 if (!isNan(input[i]))
104 min = fmin(input[i], min);
107 return min;
109 fp_t fSoP(__global fp_t* input0, __global fp_t* input1)
111 fp_t sop = 0.0;
112 for (int i = 0; i < INPUTSIZE; i++)
114 sop += (isNan(input0[i]) ? 0 : input0[i]) * (isNan(input1[i]) ? 0 : input1[i]);
116 return sop;
118 __kernel void DynamicKernel(
119 __global fp_t* result, __global fp_t* input0, __global fp_t* input1, __global fp_t* input2, __global fp_t* input3)
121 int gid0 = get_global_id(0);
122 fp_t tmp0 = fAverage(input0);
123 fp_t tmp1 = fMin(input1) * fSoP(input2, input3);
124 result[gid0] = fsum(tmp0, tmp1);
128 size_t sourceSize[] = { strlen(source) };
130 /*************************************************************************/
131 /* INTERNAL FUNCTIONS */
132 /*************************************************************************/
133 /* Timer functions - start timer */
134 void timerStart(timer* mytimer)
136 #ifdef _WIN32
137 QueryPerformanceCounter(&mytimer->start);
138 #elif defined __MACH__
139 mytimer->start = mach_absolute_time();
140 #else
141 struct timespec s;
142 clock_gettime(CLOCK_MONOTONIC, &s);
143 mytimer->start = (long long)s.tv_sec * (long long)1.0E6 + (long long)s.tv_nsec / (long long)1.0E3;
144 #endif
147 /* Timer functions - get current value */
148 double timerCurrent(timer* mytimer)
150 #ifdef _WIN32
151 LARGE_INTEGER stop, frequency;
152 QueryPerformanceCounter(&stop);
153 QueryPerformanceFrequency(&frequency);
154 double time = ((double)(stop.QuadPart - mytimer->start.QuadPart) / frequency.QuadPart);
155 #elif defined __MACH__
156 static mach_timebase_info_data_t info = { 0, 0 };
157 if (info.numer == 0)
158 mach_timebase_info(&info);
159 long long stop = mach_absolute_time();
160 double time = ((stop - mytimer->start) * (double) info.numer / info.denom) / 1.0E9;
161 #else
162 struct timespec s;
163 long long stop;
164 clock_gettime(CLOCK_MONOTONIC, &s);
165 stop = (long long)s.tv_sec * (long long)1.0E6 + (long long)s.tv_nsec / (long long)1.0E3;
166 double time = ((double)(stop - mytimer->start) / 1.0E6);
167 #endif
168 return time;
171 /* Random number generator */
172 double random(double min, double max)
174 return floor(((double)rand() / ((unsigned int)RAND_MAX + 1)) * (max - min + 1) + min);
177 /* Populate input */
178 void populateInput(LibreOfficeDeviceEvaluationIO* testData)
180 srand((unsigned int)time(NULL));
181 double* input0 = &testData->input0[0];
182 double* input1 = &testData->input1[0];
183 double* input2 = &testData->input2[0];
184 double* input3 = &testData->input3[0];
185 for (unsigned long i = 0; i < testData->inputSize; i++)
187 input0[i] = random(0, i);
188 input1[i] = random(0, i);
189 input2[i] = random(0, i);
190 input3[i] = random(0, i);
193 /* Encode score object as byte string */
194 ds_status serializeScore(ds_device* device, void** serializedScore, unsigned int* serializedScoreSize)
196 *serializedScoreSize = sizeof(LibreOfficeDeviceScore);
197 *serializedScore = (void*)new unsigned char[*serializedScoreSize];
198 memcpy(*serializedScore, device->score, *serializedScoreSize);
199 return DS_SUCCESS;
202 /* Parses byte string and stores in score object */
203 ds_status deserializeScore(ds_device* device, const unsigned char* serializedScore, unsigned int serializedScoreSize)
205 // check that serializedScoreSize == sizeof(LibreOfficeDeviceScore);
206 device->score = new LibreOfficeDeviceScore;
207 memcpy(device->score, serializedScore, serializedScoreSize);
208 return DS_SUCCESS;
211 /* Releases memory held by score */
212 ds_status releaseScore(void* score)
214 if (NULL != score)
216 delete (LibreOfficeDeviceScore*)score;
218 return DS_SUCCESS;
221 /* Evaluate devices */
222 ds_status evaluateScoreForDevice(ds_device* device, void* evalData)
224 if (DS_DEVICE_OPENCL_DEVICE == device->type)
226 /* Evaluating an OpenCL device */
227 LOG_PRINTF("[DS] Device: \"" << device->oclDeviceName << "\" (OpenCL) evaluation...");
228 cl_int clStatus;
229 cl_context clContext;
230 cl_command_queue clQueue;
231 cl_program clProgram;
232 cl_kernel clKernel;
234 /* Check for 64-bit float extensions */
235 size_t aDevExtInfoSize = 0;
236 clStatus = clGetDeviceInfo(device->oclDeviceID, CL_DEVICE_EXTENSIONS, 0, NULL, &aDevExtInfoSize);
237 DS_CHECK_STATUS(clStatus, "evaluateScoreForDevice::clGetDeviceInfo");
239 char* aExtInfo = new char[aDevExtInfoSize];
240 clStatus = clGetDeviceInfo(device->oclDeviceID, CL_DEVICE_EXTENSIONS, sizeof(char) * aDevExtInfoSize, aExtInfo, NULL);
241 DS_CHECK_STATUS(clStatus, "evaluateScoreForDevice::clGetDeviceInfo");
243 bool bKhrFp64Flag = false;
244 bool bAmdFp64Flag = false;
245 const char* buildOption = NULL;
246 std::string tmpStr("-Dfp_t=double -Dfp_t4=double4 -Dfp_t16=double16 -DINPUTSIZE=");
247 std::ostringstream tmpOStrStr;
248 tmpOStrStr << std::dec << INPUTSIZE;
249 tmpStr.append(tmpOStrStr.str());
251 if ((std::string(aExtInfo)).find("cl_khr_fp64") != std::string::npos)
253 bKhrFp64Flag = true;
254 //buildOption = "-D KHR_DP_EXTENSION -Dfp_t=double -Dfp_t4=double4 -Dfp_t16=double16";
255 tmpStr.append(" -DKHR_DP_EXTENSION");
256 buildOption = tmpStr.c_str();
258 else if ((std::string(aExtInfo)).find("cl_amd_fp64") != std::string::npos)
260 bAmdFp64Flag = true;
261 //buildOption = "-D AMD_DP_EXTENSION -Dfp_t=double -Dfp_t4=double4 -Dfp_t16=double16";
262 tmpStr.append(" -DAMD_DP_EXTENSION");
263 buildOption = tmpStr.c_str();
265 delete[] aExtInfo;
267 if (!bKhrFp64Flag && !bAmdFp64Flag)
269 /* No 64-bit float support */
270 device->score = (void*)new LibreOfficeDeviceScore;
271 ((LibreOfficeDeviceScore*)device->score)->fTime = DBL_MAX;
272 ((LibreOfficeDeviceScore*)device->score)->bNoCLErrors = true;
274 else
276 /* 64-bit float support present */
278 /* Create context and command queue */
279 clContext = clCreateContext(NULL, 1, &device->oclDeviceID, NULL, NULL, &clStatus);
280 DS_CHECK_STATUS(clStatus, "evaluateScoreForDevice::clCreateContext");
281 clQueue = clCreateCommandQueue(clContext, device->oclDeviceID, 0, &clStatus);
282 DS_CHECK_STATUS(clStatus, "evaluateScoreForDevice::clCreateCommandQueue");
284 /* Build program */
285 clProgram = clCreateProgramWithSource(clContext, 1, &source, sourceSize, &clStatus);
286 DS_CHECK_STATUS(clStatus, "evaluateScoreForDevice::clCreateProgramWithSource");
287 clStatus = clBuildProgram(clProgram, 1, &device->oclDeviceID, buildOption, NULL, NULL);
288 DS_CHECK_STATUS(clStatus, "evaluateScoreForDevice::clBuildProgram");
289 if (CL_SUCCESS != clStatus)
291 /* Build program failed */
292 size_t length;
293 char* buildLog;
294 clStatus = clGetProgramBuildInfo(clProgram, device->oclDeviceID, CL_PROGRAM_BUILD_LOG, 0, NULL, &length);
295 buildLog = (char*)malloc(length);
296 clStatus = clGetProgramBuildInfo(clProgram, device->oclDeviceID, CL_PROGRAM_BUILD_LOG, length, buildLog, &length);
297 LOG_PRINTF("[OCL] Build Errors" << std::endl << buildLog);
298 free(buildLog);
300 device->score = (void*)new LibreOfficeDeviceScore;
301 ((LibreOfficeDeviceScore*)device->score)->fTime = DBL_MAX;
302 ((LibreOfficeDeviceScore*)device->score)->bNoCLErrors = false;
304 else
306 /* Build program succeeded */
307 timer kernelTime;
308 timerStart(&kernelTime);
310 /* Run kernel */
311 LibreOfficeDeviceEvaluationIO* testData = (LibreOfficeDeviceEvaluationIO*)evalData;
312 clKernel = clCreateKernel(clProgram, "DynamicKernel", &clStatus);
313 DS_CHECK_STATUS(clStatus, "evaluateScoreForDevice::clCreateKernel");
314 cl_mem clResult = clCreateBuffer(clContext, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, sizeof(cl_double) * testData->outputSize, &testData->output[0], &clStatus);
315 DS_CHECK_STATUS(clStatus, "evaluateScoreForDevice::clCreateBuffer::clResult");
316 cl_mem clInput0 = clCreateBuffer(clContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, sizeof(cl_double) * testData->inputSize, &testData->input0[0], &clStatus);
317 DS_CHECK_STATUS(clStatus, "evaluateScoreForDevice::clCreateBuffer::clInput0");
318 cl_mem clInput1 = clCreateBuffer(clContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, sizeof(cl_double) * testData->inputSize, &testData->input1[0], &clStatus);
319 DS_CHECK_STATUS(clStatus, "evaluateScoreForDevice::clCreateBuffer::clInput1");
320 cl_mem clInput2 = clCreateBuffer(clContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, sizeof(cl_double) * testData->inputSize, &testData->input2[0], &clStatus);
321 DS_CHECK_STATUS(clStatus, "evaluateScoreForDevice::clCreateBuffer::clInput2");
322 cl_mem clInput3 = clCreateBuffer(clContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, sizeof(cl_double) * testData->inputSize, &testData->input3[0], &clStatus);
323 DS_CHECK_STATUS(clStatus, "evaluateScoreForDevice::clCreateBuffer::clInput3");
324 clStatus = clSetKernelArg(clKernel, 0, sizeof(cl_mem), (void*)&clResult);
325 DS_CHECK_STATUS(clStatus, "evaluateScoreForDevice::clSetKernelArg::clResult");
326 clStatus = clSetKernelArg(clKernel, 1, sizeof(cl_mem), (void*)&clInput0);
327 DS_CHECK_STATUS(clStatus, "evaluateScoreForDevice::clSetKernelArg::clInput0");
328 clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void*)&clInput1);
329 DS_CHECK_STATUS(clStatus, "evaluateScoreForDevice::clSetKernelArg::clInput1");
330 clStatus = clSetKernelArg(clKernel, 3, sizeof(cl_mem), (void*)&clInput2);
331 DS_CHECK_STATUS(clStatus, "evaluateScoreForDevice::clSetKernelArg::clInput2");
332 clStatus = clSetKernelArg(clKernel, 4, sizeof(cl_mem), (void*)&clInput3);
333 DS_CHECK_STATUS(clStatus, "evaluateScoreForDevice::clSetKernelArg::clInput3");
334 size_t globalWS[1] = { testData->outputSize };
335 size_t localSize[1] = { 64 };
336 clStatus = clEnqueueNDRangeKernel(clQueue, clKernel, 1, 0, globalWS, localSize, 0, NULL, NULL);
337 DS_CHECK_STATUS(clStatus, "evaluateScoreForDevice::clEnqueueNDRangeKernel");
338 clFinish(clQueue);
339 clReleaseMemObject(clInput3);
340 clReleaseMemObject(clInput2);
341 clReleaseMemObject(clInput1);
342 clReleaseMemObject(clInput0);
343 clReleaseMemObject(clResult);
344 clReleaseKernel(clKernel);
346 device->score = (void*)new LibreOfficeDeviceScore;
347 ((LibreOfficeDeviceScore*)device->score)->fTime = timerCurrent(&kernelTime);
348 ((LibreOfficeDeviceScore*)device->score)->bNoCLErrors = true;
351 clReleaseProgram(clProgram);
352 clReleaseCommandQueue(clQueue);
353 clReleaseContext(clContext);
356 else
358 /* Evaluating an Native CPU device */
359 LOG_PRINTF("[DS] Device: \"CPU\" (Native) evaluation...");
360 timer kernelTime;
361 timerStart(&kernelTime);
363 LibreOfficeDeviceEvaluationIO* testData = (LibreOfficeDeviceEvaluationIO*)evalData;
364 for (unsigned long j = 0; j < testData->outputSize; j++)
366 double fAverage = 0.0f;
367 double fMin = DBL_MAX;
368 double fSoP = 0.0f;
369 for (unsigned long i = 0; i < testData->inputSize; i++)
371 fAverage += testData->input0[i];
372 fMin = ((fMin < testData->input1[i]) ? fMin : testData->input1[i]);
373 fSoP += testData->input2[i] * testData->input3[i];
375 fAverage /= testData->inputSize;
376 testData->output[j] = fAverage + (fMin * fSoP);
379 // InterpretTail - the S/W fallback is nothing like as efficient
380 // as any good openCL implementation: no SIMD, tons of branching
381 // in the inner loops etc. Generously characterise it as only 10x
382 // slower than the above.
383 float fInterpretTailFactor = 10.0;
385 device->score = (void*)new LibreOfficeDeviceScore;
386 ((LibreOfficeDeviceScore*)device->score)->fTime = timerCurrent(&kernelTime);
387 ((LibreOfficeDeviceScore*)device->score)->bNoCLErrors = true;
389 ((LibreOfficeDeviceScore*)device->score)->fTime *= fInterpretTailFactor;
391 return DS_SUCCESS;
394 /* Pick best device */
395 ds_status pickBestDevice(ds_profile* profile, int* bestDeviceIdx)
397 double bestScore = DBL_MAX;
398 *bestDeviceIdx = -1;
400 for (unsigned int d = 0; d < profile->numDevices; d++)
402 ds_device device = profile->devices[d];
403 LibreOfficeDeviceScore *pScore = (LibreOfficeDeviceScore*)device.score;
405 double fScore = DBL_MAX;
406 if (pScore)
407 fScore = pScore->fTime;
408 else
410 LOG_PRINTF("Unusual null score");
413 if (DS_DEVICE_OPENCL_DEVICE == device.type)
415 LOG_PRINTF("[DS] Device[" << d << "] " << device.oclDeviceName << " (OpenCL) score is " << fScore);
417 else
419 LOG_PRINTF("[DS] Device[" << d << "] CPU (Native) score is " << fScore);
421 if (fScore < bestScore)
423 bestScore = fScore;
424 *bestDeviceIdx = d;
427 if (DS_DEVICE_OPENCL_DEVICE == profile->devices[*bestDeviceIdx].type)
429 LOG_PRINTF("[DS] Selected Device[" << *bestDeviceIdx << "]: " << profile->devices[*bestDeviceIdx].oclDeviceName << "(OpenCL).");
431 else
433 LOG_PRINTF("[DS] Selected Device[" << *bestDeviceIdx << "]: CPU (Native).");
436 return DS_SUCCESS;
439 /* Return device ID for matching device name */
440 int matchDevice(ds_profile* profile, char* deviceName)
442 int deviceMatch = -1;
443 for (unsigned int d = 0; d < profile->numDevices - 1; d++)
445 if ((std::string(profile->devices[d].oclDeviceName)).find(deviceName) != std::string::npos) deviceMatch = d;
447 if (std::string("NATIVE_CPU").find(deviceName) != std::string::npos) deviceMatch = profile->numDevices - 1;
448 return deviceMatch;
451 /*************************************************************************/
452 /* EXTERNAL FUNCTIONS */
453 /*************************************************************************/
454 ds_device getDeviceSelection(const char* sProfilePath, bool bForceSelection)
456 /* Run only if device is not yet selected */
457 if (!bIsDeviceSelected || bForceSelection)
459 /* Setup */
460 ds_status status;
461 ds_profile* profile = NULL;
462 status = initDSProfile(&profile, "LibreOffice v0.1");
464 if (!profile)
466 // failed to initialize profile.
467 selectedDevice.type = DS_DEVICE_NATIVE_CPU;
468 return selectedDevice;
471 /* Try reading scores from file */
472 std::string tmpStr(sProfilePath);
473 const char* fileName = tmpStr.append("sc_opencl_device_profile.dat").c_str();
474 if (!bForceSelection)
476 status = readProfileFromFile(profile, deserializeScore, fileName);
478 else
480 status = DS_INVALID_PROFILE;
481 LOG_PRINTF("[DS] Performing forced profiling.");
483 if (DS_SUCCESS != status)
485 if (!bForceSelection)
487 LOG_PRINTF("[DS] Profile file not available (" << fileName << "); performing profiling.");
490 /* Populate input data for micro-benchmark */
491 boost::scoped_ptr<LibreOfficeDeviceEvaluationIO> testData(new LibreOfficeDeviceEvaluationIO);
492 testData->inputSize = INPUTSIZE;
493 testData->outputSize = OUTPUTSIZE;
494 testData->input0.resize(testData->inputSize);
495 testData->input1.resize(testData->inputSize);
496 testData->input2.resize(testData->inputSize);
497 testData->input3.resize(testData->inputSize);
498 testData->output.resize(testData->outputSize);
499 populateInput(testData.get());
501 /* Perform evaluations */
502 unsigned int numUpdates;
503 status = profileDevices(profile, DS_EVALUATE_ALL, evaluateScoreForDevice, (void*)testData.get(), &numUpdates);
505 if (DS_SUCCESS == status)
507 /* Write scores to file */
508 status = writeProfileToFile(profile, serializeScore, fileName);
509 if (DS_SUCCESS == status)
511 LOG_PRINTF("[DS] Scores written to file (" << fileName << ").");
513 else
515 LOG_PRINTF("[DS] Error saving scores to file (" << fileName << "); scores not written to file.");
518 else
520 LOG_PRINTF("[DS] Unable to evaluate performance; scores not written to file.");
523 else
525 LOG_PRINTF("[DS] Profile read from file (" << fileName << ").");
528 /* Pick best device */
529 int bestDeviceIdx;
530 pickBestDevice(profile, &bestDeviceIdx);
532 /* Overide if necessary */
533 char* overrideDeviceStr = getenv("SC_OPENCL_DEVICE_OVERRIDE");
534 if (NULL != overrideDeviceStr)
536 int overrideDeviceIdx = matchDevice(profile, overrideDeviceStr);
537 if (-1 != overrideDeviceIdx)
539 LOG_PRINTF("[DS] Overriding Device Selection (SC_OPENCL_DEVICE_OVERRIDE=" << overrideDeviceStr << ").");
540 bestDeviceIdx = overrideDeviceIdx;
541 if (DS_DEVICE_OPENCL_DEVICE == profile->devices[bestDeviceIdx].type)
543 LOG_PRINTF("[DS] Selected Device[" << bestDeviceIdx << "]: " << profile->devices[bestDeviceIdx].oclDeviceName << " (OpenCL).");
545 else
547 LOG_PRINTF("[DS] Selected Device[" << bestDeviceIdx << "]: CPU (Native).");
550 else
552 LOG_PRINTF("[DS] Ignoring invalid SC_OPENCL_DEVICE_OVERRIDE=" << overrideDeviceStr << ").");
556 /* Final device selection */
557 selectedDevice = profile->devices[bestDeviceIdx];
558 bIsDeviceSelected = true;
560 /* Release profile */
561 status = releaseDSProfile(profile, releaseScore);
563 return selectedDevice;
566 bool selectedDeviceIsOpenCL(ds_device device)
568 return (DS_DEVICE_OPENCL_DEVICE == device.type);
571 bool selectedDeviceIsNativeCPU(ds_device device)
573 return (DS_DEVICE_NATIVE_CPU == device.type);
578 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */