GraphicsMagick 的 OpenCL 开发记录（三十四）

文章目录

如何写`ScaleImage()`的硬件加速函数（八）

<2022-05-05 周四>

如何写`ScaleImage()`的硬件加速函数（八）

我觉得 Y 方向的缩放用下面这种`ScaleFilter()`的方法是实现不了的，所以只能把它并入 X 方向的处理中。目前缩小时结果正常，但放大时图片会变亮。

STRINGIFY(
/*
 * ScaleFilter -- horizontal scaling kernel.
 *
 * Each work-group covers up to pixelPerWorkgroup consecutive output columns,
 * starting at get_group_id(0) * pixelPerWorkgroup, for the row selected by
 * get_global_id(1). The columns are processed in chunks of pixelChunkSize.
 *
 * Steps:
 *   1. Copy a span of at most numCachedPixels input pixels (4 components
 *      each) into inputImageCache.
 *   2. For every chunk, each participating work-item accumulates a partial
 *      sum of weighted source pixels into a private float4.
 *   3. The partial sums are added into outputPixelCache, one work-item slot
 *      per barrier round, and the totals are written with WriteAllChannels.
 *
 * Parameters that the body never references: matte_or_cmyk, inputRows,
 * filteredRows, resizeFilterScale, densityCache, gammaCache.
 *
 * NOTE(review): the source row is looked up as y / xFactor, i.e. the
 * horizontal ratio is reused for the vertical direction. That only matches
 * the real row ratio when both axes scale by the same factor -- confirm.
 *
 * NOTE(review): the accumulated weights are never normalised to 1. With
 * xFactor > 1 the same source pixel is added several times and only the
 * clamp at 255.0 bounds the result, which looks like the cause of the
 * brightening seen when enlarging -- confirm.
 */
__kernel __attribute__((reqd_work_group_size(256, 1, 1)))
void ScaleFilter(const __global CLQuantum* inputImage, const unsigned int matte_or_cmyk,
  const unsigned int inputColumns, const unsigned int inputRows,
  __global CLQuantum* filteredImage,
  const unsigned int filteredColumns, const unsigned int filteredRows,
  const float resizeFilterScale,
  __local CLQuantum* inputImageCache, const int numCachedPixels,
  const unsigned int pixelPerWorkgroup, const unsigned int pixelChunkSize,
  __local float4* outputPixelCache, __local float* densityCache, __local float* gammaCache)
{
  // Output columns [groupFirstX, groupLastX) belong to this work-group.
  const unsigned int groupFirstX = get_group_id(0) * pixelPerWorkgroup;
  const unsigned int groupLastX = MagickMin(groupFirstX + pixelPerWorkgroup, filteredColumns);
  const unsigned int groupPixelCount = groupLastX - groupFirstX;

  // Ratio of output width to input width.
  const float xFactor = (float)filteredColumns / inputColumns;

  // Input columns [cacheFirstX, cacheLastX) are staged in local memory.
  const int cacheFirstX = MagickMax((int)((groupFirstX + 0.5f) / xFactor), (int)(0));
  const int cacheLastX = MagickMin((int)(cacheFirstX + numCachedPixels), (int)inputColumns);

  // Stage the input pixels; every pixel occupies 4 CLQuantum elements.
  const unsigned int y = get_global_id(1);
  const unsigned int srcOffset = getPixelIndex(4, inputColumns, cacheFirstX, y / xFactor);
  const unsigned int srcCount = (cacheLastX - cacheFirstX) * 4;
  event_t copyDone = async_work_group_copy(inputImageCache, inputImage + srcOffset, srcCount, 0);
  wait_group_events(1, &copyDone);

  const unsigned int itemID = get_local_id(0);
  const size_t localSize = get_local_size(0);

  const unsigned int chunkCount = (groupPixelCount + pixelChunkSize - 1) / pixelChunkSize;
  for (unsigned int chunk = 0; chunk < chunkCount; chunk++)
  {
    const unsigned int chunkFirstX = groupFirstX + chunk * pixelChunkSize;
    const unsigned int chunkLastX = MagickMin(chunkFirstX + pixelChunkSize, groupLastX);
    const unsigned int chunkPixelCount = chunkLastX - chunkFirstX;

    // Decide which output pixel of the chunk this work-item contributes to.
    const unsigned int numItems = getNumWorkItemsPerPixel(chunkPixelCount, localSize);
    const int pixelIndex = pixelToCompute(itemID, chunkPixelCount, localSize);

    float4 filteredPixel = (float4)0.0f;

    // -1 means this work-item does not take part in the computation.
    if (pixelIndex != -1)
    {
      // x coordinate of the output pixel handled by this work-item
      const int x = chunkFirstX + pixelIndex;

      // Source columns [firstSrc, lastSrc) feed this output pixel.
      const float bisect = (x + 0.5) / xFactor + MagickEpsilon;
      const unsigned int firstSrc = (unsigned int)MagickMax(bisect, 0.0f);
      const unsigned int lastSrc = (unsigned int)MagickMin(bisect + 1, (float)inputColumns);
      const unsigned int stepCount = lastSrc - firstSrc;

      // Share the steps between the work-items of this pixel, rounding up.
      const unsigned int stepsPerItem =
        stepCount / numItems + (stepCount % numItems == 0 ? 0 : 1);
      const unsigned int firstStep = (itemID % numItems) * stepsPerItem;

      if (firstStep < stepCount)
      {
        const unsigned int lastStep = MagickMin(firstStep + stepsPerItem, stepCount);

        // Running state, carried from one source pixel to the next.
        float scaleLeft = xFactor;
        float spanLeft = 1.0;
        float volume = 0.0;

        unsigned int cacheIndex = firstSrc + firstStep - cacheFirstX;
        for (unsigned int step = firstStep; step < lastStep; step++, cacheIndex++)
        {
          // Load the 4 components of the cached source pixel as floats.
          __local CLQuantum* src = inputImageCache + (cacheIndex * 4);
          const float4 cp = (float4)((float)src[0], (float)src[1], (float)src[2], (float)src[3]);

          // Add the source pixel once per whole span that still fits.
          while (scaleLeft >= spanLeft)
          {
            if (volume > 0.0 && volume < 1.0)
            {
              const float factor = 1 / volume;
              filteredPixel.xyz *= factor;
            }

            if (cp.w < 255.0)
            {
              volume += spanLeft;
            }

            filteredPixel += spanLeft * cp;

            if (filteredPixel.x > 255.0) { filteredPixel.x = 255.0; }
            if (filteredPixel.y > 255.0) { filteredPixel.y = 255.0; }
            if (filteredPixel.z > 255.0) { filteredPixel.z = 255.0; }
            if (filteredPixel.w > 255.0) { filteredPixel.w = 255.0; }

            scaleLeft -= spanLeft;
            spanLeft = 1.0;
          }

          // Remaining fraction of the scale.
          if (scaleLeft > 0.0)
          {
            if (volume > 0.0 && volume < 1.0)
            {
              const float factor = 1 / volume;
              filteredPixel.xyz *= factor;
            }

            if (cp.w < 255.0)
            {
              volume += scaleLeft;
            }

            filteredPixel += scaleLeft * cp;
            spanLeft -= scaleLeft;
          }

          // Remaining fraction of the span.
          if (spanLeft > 0.0)
          {
            if (cp.w < 255.0)
            {
              volume += spanLeft;
            }

            filteredPixel += spanLeft * cp;
          }

          if (filteredPixel.x > 255.0) { filteredPixel.x = 255.0; }
          if (filteredPixel.y > 255.0) { filteredPixel.y = 255.0; }
          if (filteredPixel.z > 255.0) { filteredPixel.z = 255.0; }
          if (filteredPixel.w > 255.0) { filteredPixel.w = 255.0; }
        }
      }
    }

    // Clear one accumulator per output pixel of the chunk.
    if (itemID < chunkPixelCount)
    {
      outputPixelCache[itemID] = (float4)0.0f;
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    // Add the partial sums in numItems rounds, one slot per round, with a
    // barrier after each round.
    for (unsigned int turn = 0; turn < numItems; turn++)
    {
      if (pixelIndex != -1 && itemID % numItems == turn)
      {
        outputPixelCache[pixelIndex] += filteredPixel;
      }
      barrier(CLK_LOCAL_MEM_FENCE);
    }

    // The first chunkPixelCount work-items each write one finished pixel.
    if (itemID < chunkPixelCount)
    {
      const float4 result = outputPixelCache[itemID];
      WriteAllChannels(filteredImage, 4, filteredColumns, chunkFirstX + itemID, y, result);
    }
  }
}
)