/******************************************************************** ** Image Component Library (ICL) ** ** ** ** Copyright (C) 2006-2013 CITEC, University of Bielefeld ** ** Neuroinformatics Group ** ** Website: www.iclcv.org and ** ** http://opensource.cit-ec.de/projects/icl ** ** ** ** File : ICLCV/src/ICLCV/CLSurfLibKernels.cpp ** ** Module : ICLCV ** ** Authors: Christof Elbrechter ** ** ** ** ** ** GNU LESSER GENERAL PUBLIC LICENSE ** ** This file may be used under the terms of the GNU Lesser General ** ** Public License version 3.0 as published by the ** ** ** ** Free Software Foundation and appearing in the file LICENSE.LGPL ** ** included in the packaging of this file. Please review the ** ** following information to ensure the license requirements will ** ** be met: http://www.gnu.org/licenses/lgpl-3.0.txt ** ** ** ** The development of this software was supported by the ** ** Excellence Cluster EXC 277 Cognitive Interaction Technology. ** ** The Excellence Cluster EXC 277 is a grant of the Deutsche ** ** Forschungsgemeinschaft (DFG) in the context of the German ** ** Excellence Initiative. ** ** ** ********************************************************************/ /****************************************************************************\ * Copyright (c) 2011, Advanced Micro Devices, Inc. * * All rights reserved. * * * * Redistribution and use in source and binary forms, with or without * * modification, are permitted provided that the following conditions * * are met: * * * * Redistributions of source code must retain the above copyright notice, * * this list of conditions and the following disclaimer. * * * * Redistributions in binary form must reproduce the above copyright notice, * * this list of conditions and the following disclaimer in the documentation * * and/or other materials provided with the distribution. * * * * Neither the name of the copyright holder nor the names of its contributors * * may be used to endorse or promote products derived from this software * * without specific prior written permission. * * * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR * * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * * * If you use the software (in whole or in part), you shall adhere to all * * applicable U.S., European, and other export laws, including but not * * limited to the U.S. Export Administration Regulations ('EAR'), (15 C.F.R. * * Sections 730 through 774), and E.U. Council Regulation (EC) No 1334/2000 * * of 22 June 2000. Further, pursuant to Section 740.6 of the EAR, you * * hereby certify that, except pursuant to a license granted by the United * * States Department of Commerce Bureau of Industry and Security or as * * otherwise permitted pursuant to a License Exception under the U.S. Export * * Administration Regulations ("EAR"), you will not (1) export, re-export or * * release to a national of a country in Country Groups D:1, E:1 or E:2 any * * restricted technology, software, or source code you receive hereunder, * * or (2) export to Country Groups D:1, E:1 or E:2 the direct product of such * * technology or software, if such foreign produced direct product is subject * * to national security controls as identified on the Commerce Control List * *(currently found in Supplement 1 to Part 774 of EAR). For the most current * * Country Group listings, or for additional information about the EAR or * * your obligations under those regulations, please refer to the U.S. Bureau * * of Industry and Security's website at http://www.bis.doc.gov/. * \****************************************************************************/ #include namespace icl{ namespace cv{ namespace clsurf{ const char *utilityKernels = "#ifdef M_PI_F \n" "#define pi M_PI_F \n" "#else \n" "#define pi 3.141592654f \n" "#endif \n" "typedef __global float* det_layer_t; \n" "typedef __global int* lap_t; \n" "//! Calculate the value of the 2d gaussian at x,y \n" "float gaussian(float x, float y, float sig) \n" "{ \n" " return (1.0f/(2.0f*pi*sig*sig)) * exp(-(x*x+y*y)/(2.0f*sig*sig)); \n" "} \n" "//! Calculate the integral of a region \n" "float BoxIntegral( \n" " __global float* data, \n" " int width, int height, int row, int col, \n" " int rows, int cols) \n" "{ \n" " float A = 0.0f; \n" " float B = 0.0f; \n" " float C = 0.0f; \n" " float D = 0.0f; \n" " // The subtraction by one for row/col is because row/col is inclusive. \n" " int r1 = min(row, height) - 1; \n" " int c1 = min(col, width) - 1; \n" " int r2 = min(row + rows, height) - 1; \n" " int c2 = min(col + cols, width) - 1; \n" " if (r1 >= 0 && c1 >= 0) A = data[r1 * width + c1]; \n" " if (r1 >= 0 && c2 >= 0) B = data[r1 * width + c2]; \n" " if (r2 >= 0 && c1 >= 0) C = data[r2 * width + c1]; \n" " if (r2 >= 0 && c2 >= 0) D = data[r2 * width + c2]; \n" " return max(0.0f, A - B - C + D); \n" "} \n" "//! Calculate Haar wavelet responses in X direction \n" "float haarX( \n" " __global float* img, \n" " int width, int height, int row, int column, int s) \n" "{ \n" " return BoxIntegral(img, width, height, row-s/2, column, s, s/2) - \n" " BoxIntegral(img, width, height, row-s/2, column-s/2, s, s/2); \n" "} \n" "//! Calculate Haar wavelet responses in Y direction \n" "float haarY( \n" " __global float* img, \n" " int width, int height, int row, int column, int s) \n" "{ \n" " return BoxIntegral(img, width, height, row, column-s/2, s/2, s) - \n" " BoxIntegral(img, width, height, row-s/2, column-s/2, s/2, s); \n" "} \n"; const char *normalizeDescriptors_kernel = "__kernel void \n" "normalizeDescriptors(__global float* surfDescriptors, \n" " __global float* descLengths) \n" "{ \n" " // Previous kernels have computed 64 descriptors (surfDescriptors) and \n" " // 16 lengths (descLengths) for each interesting point found by SURF. \n" " // This kernel will sum up all of the lengths for each interesting point \n" " // and take the square root. This value will then be used to scale the \n" " // descriptors. \n" " // Note that each work group contains 64 work items (one per descriptor). \n" " // This array is used to cache data that will be accessed multiple times. \n" " // Since it is declared __local, only one instance is created and shared \n" " // by the entire work group \n" " __local float ldescLengths[16]; \n" " // Get the offset for the descriptor that this work group will normalize \n" " int descOffset = get_group_id(0) * 64; \n" " // Get the offset for the descriptor lengths that this work group will \n" " // use to scale the descriptors \n" " int lenOffset = get_group_id(0) * 16; \n" " // Get this work items ID within the work group \n" " int tid = get_local_id(0); \n" " // Only 16 of the work items are needed to cache the lengths \n" " if(tid < 16) { \n" " // Have the first 16 work items cache the lengths for this \n" " // point in local memory \n" " ldescLengths[tid] = descLengths[lenOffset + tid]; \n" " } \n" " barrier(CLK_LOCAL_MEM_FENCE); \n" " // Work items work together to perform a parallel reduction of the \n" " // descriptor lengths (result gets stored in ldescLength[0]) \n" " for(int i = 8; i > 0; i >>= 1) \n" " { \n" " if (tid < i) \n" " { \n" " ldescLengths[tid] += ldescLengths[tid + i]; \n" " \n" " } \n" " barrier(CLK_LOCAL_MEM_FENCE); \n" " \n" " } \n" " barrier(CLK_LOCAL_MEM_FENCE); \n" " // Calculate the normalized length of the descriptors \n" " float lengthOfDescriptor = 1.0f/sqrt(ldescLengths[0]); \n" " // Scale each descriptor \n" " surfDescriptors[descOffset + tid] *= lengthOfDescriptor; \n" "} \n"; const char *nonMaxSuppression_kernel = "#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable \n" " \n" "int getLaplacian(lap_t layer, int c, int r, int width) \n" "{ \n" " int laplacian; \n" " \n" " laplacian = layer[r*width+c]; \n" " return laplacian; \n" "} \n" "float getResponse(det_layer_t layer, int c, int r, int width, int scale) \n" "{ \n" " float val; \n" " \n" " int row = r*scale; \n" " val = layer[r*scale*width+c*scale]; \n" " return val; \n" "} \n" "bool interpolateExtremum( int r, \n" " int c, \n" " __global int* ipt_count, \n" " float2* pos, \n" " float* det_scale, \n" " int* laplacian, \n" " det_layer_t t, \n" " int tWidth, \n" " int tHeight, \n" " int tStep, \n" " det_layer_t m, \n" " lap_t mLaplacian, \n" " int mWidth, \n" " int mHeight, \n" " int mFilter, \n" " det_layer_t b, \n" " int bWidth, \n" " int bHeight, \n" " int bFilter) \n" "{ \n" " // --------------------------------------- \n" " // Step 1: Calculate the 3D derivative \n" " // --------------------------------------- \n" " int mScale = mWidth/tWidth; \n" " int bScale = bWidth/tWidth; \n" " \n" " float dx, dy, ds; \n" " dx = (getResponse(m, c+1, r, mWidth, mScale) - \n" " getResponse(m, c-1, r, mWidth, mScale)) / 2.0f; " " dy = (getResponse(m, c, r+1, mWidth, mScale) - \n" " getResponse(m, c, r-1, mWidth, mScale)) / 2.0f; \n" " ds = (getResponse(t, c, r, tWidth, 1) - \n" " getResponse(b, c, r, bWidth, bScale)) / 2.0f; \n" " \n" " // --------------------------------------- \n" " // Step 2: Calculate the inverse Hessian \n" " // --------------------------------------- \n" " \n" " float v; \n" " \n" " float dxx, dyy, dss, dxy, dxs, dys; \n" " v = getResponse(m, c, r, mWidth, mScale); \n" " dxx = getResponse(m, c+1, r, mWidth, mScale) + \n" " getResponse(m, c-1, r, mWidth, mScale) - 2.0f*v; \n" " dyy = getResponse(m, c, r+1, mWidth, mScale) + \n" " getResponse(m, c, r-1, mWidth, mScale) - 2.0f*v; \n" " dss = getResponse(t, c, r, tWidth, 1) + \n" " getResponse(b, c, r, bWidth, bScale) - 2.0f*v; \n" " dxy = (getResponse(m, c+1, r+1, mWidth, mScale) - \n" " getResponse(m, c-1, r+1, mWidth, mScale) - \n" " getResponse(m, c+1, r-1, mWidth, mScale) + \n" " getResponse(m, c-1, r-1, mWidth, mScale))/4.0f; \n" " dxs = (getResponse(t, c+1, r, tWidth, 1) - \n" " getResponse(t, c-1, r, tWidth, 1) - \n" " getResponse(b, c+1, r, bWidth, bScale) + \n" " getResponse(b, c-1, r, bWidth, bScale))/4.0f; \n" " dys = (getResponse(t, c, r+1, tWidth, 1) - \n" " getResponse(t, c, r-1, tWidth, 1) - \n" " getResponse(b, c, r+1, bWidth, bScale) + \n" " getResponse(b, c, r-1, bWidth, bScale))/4.0f; \n" " float H0 = dxx; \n" " float H1 = dxy; \n" " float H2 = dxs; \n" " float H3 = dxy; \n" " float H4 = dyy; \n" " float H5 = dys; \n" " float H6 = dxs; \n" " float H7 = dys; \n" " float H8 = dss; \n" " // NOTE Although the inputs are the same, the value of determinant (and \n" " // therefore invdet) vary from the CPU version \n" " \n" " float determinant = \n" " H0*(H4*H8-H7*H5) - \n" " H1*(H3*H8-H5*H6) + \n" " H2*(H3*H7-H4*H6); \n" " \n" " float invdet = 1.0f / determinant; \n" " \n" " float invH0 = (H4*H8-H7*H5)*invdet; \n" " float invH1 = -(H3*H8-H5*H6)*invdet; \n" " float invH2 = (H3*H7-H6*H4)*invdet; \n" " float invH3 = -(H1*H8-H2*H7)*invdet; \n" " float invH4 = (H0*H8-H2*H6)*invdet; \n" " float invH5 = -(H0*H7-H6*H1)*invdet; \n" " float invH6 = (H1*H5-H2*H4)*invdet; \n" " float invH7 = -(H0*H5-H3*H2)*invdet; \n" " float invH8 = (H0*H4-H3*H1)*invdet; \n" " \n" " // --------------------------------------- \n" " // Step 3: Multiply derivative and Hessian \n" " // --------------------------------------- \n" " \n" " float xi = 0.0f, xr = 0.0f, xc = 0.0f; \n" " \n" " xc = invH0 * dx * -1.0f; \n" " xc += invH1 * dy * -1.0f; \n" " xc += invH2 * ds * -1.0f; \n" " \n" " xr = invH3 * dx * -1.0f; \n" " xr += invH4 * dy * -1.0f; \n" " xr += invH5 * ds * -1.0f; \n" " \n" " xi = invH6 * dx * -1.0f; \n" " xi += invH7 * dy * -1.0f; \n" " xi += invH8 * ds * -1.0f; \n" " // Check if point is sufficiently close to the actual extremum \n" " if(fabs(xi) < 0.5f && fabs(xr) < 0.5f && fabs(xc) < 0.5f) \n" " { \n" " int filterStep = mFilter - bFilter; \n" " \n" " // Store values related to interest point \n" " (*pos).x = (float)((c + xc)*tStep); \n" " (*pos).y = (float)((r + xr)*tStep); \n" " *det_scale = (float)(0.1333f)*(mFilter + (xi*filterStep)); \n" " \n" " int scale = mWidth/tWidth; \n" " *laplacian = getLaplacian(mLaplacian, c*scale, r*scale, mWidth); \n" " \n" " return true; \n" " } \n" " return false; \n" "} \n" "//! Check whether point really is a maximum \n" "bool isExtremum( det_layer_t t, \n" " int tWidth, \n" " int tHeight, \n" " int tFilter, \n" " int tStep, \n" " det_layer_t m, \n" " int mWidth, \n" " int mHeight, \n" " det_layer_t b, \n" " int bWidth, \n" " int bHeight, \n" " int c, \n" " int r, \n" " float threshold) \n" "{ \n" " int layerBorder = (tFilter+1)/(2*tStep); \n" " // If any accesses would read out-of-bounds, this point is not a maximum \n" " if(r <= layerBorder || r >= tHeight - layerBorder || \n" " c <= layerBorder || c >= tWidth - layerBorder) { \n" " \n" " return false; \n" " } \n" " \n" " int mScale = mWidth/tWidth; \n" " \n" " // Candidate for local maximum \n" " float candidate = getResponse(m, c, r, mWidth, mScale); \n" " \n" " if(candidate < threshold) { \n" " return false; \n" " } \n" " \n" " // If any response in 3x3x3 is greater candidate not maximum \n" " float localMax = getResponse(t, c-1, r-1, tWidth, 1); \n" " localMax = fmax(localMax, getResponse(t, c, r-1, tWidth, 1)); \n" " localMax = fmax(localMax, getResponse(t, c+1, r-1, tWidth, 1)); \n" " localMax = fmax(localMax, getResponse(t, c-1, r, tWidth, 1)); \n" " localMax = fmax(localMax, getResponse(t, c, r, tWidth, 1)); \n" " localMax = fmax(localMax, getResponse(t, c+1, r, tWidth, 1)); \n" " localMax = fmax(localMax, getResponse(t, c-1, r+1, tWidth, 1)); \n" " localMax = fmax(localMax, getResponse(t, c, r+1, tWidth, 1)); \n" " localMax = fmax(localMax, getResponse(t, c+1, r+1, tWidth, 1)); \n" " \n" " int bScale = bWidth/tWidth; \n" " localMax = fmax(localMax, getResponse(b, c-1, r-1, bWidth, bScale)); \n" " localMax = fmax(localMax, getResponse(b, c, r-1, bWidth, bScale)); \n" " localMax = fmax(localMax, getResponse(b, c+1, r-1, bWidth, bScale)); \n" " localMax = fmax(localMax, getResponse(b, c-1, r, bWidth, bScale)); \n" " localMax = fmax(localMax, getResponse(b, c, r, bWidth, bScale)); \n" " localMax = fmax(localMax, getResponse(b, c+1, r, bWidth, bScale)); \n" " localMax = fmax(localMax, getResponse(b, c-1, r+1, bWidth, bScale)); \n" " localMax = fmax(localMax, getResponse(b, c, r+1, bWidth, bScale)); \n" " localMax = fmax(localMax, getResponse(b, c+1, r+1, bWidth, bScale)); \n" " \n" " //int mScale = mWidth/tWidth; \n" " localMax = fmax(localMax, getResponse(m, c-1, r-1, mWidth, mScale)); \n" " localMax = fmax(localMax, getResponse(m, c, r-1, mWidth, mScale)); \n" " localMax = fmax(localMax, getResponse(m, c+1, r-1, mWidth, mScale)); \n" " localMax = fmax(localMax, getResponse(m, c-1, r, mWidth, mScale)); \n" " // This is the candidate pixel \n" " localMax = fmax(localMax, getResponse(m, c+1, r, mWidth, mScale)); \n" " localMax = fmax(localMax, getResponse(m, c-1, r+1, mWidth, mScale)); \n" " localMax = fmax(localMax, getResponse(m, c, r+1, mWidth, mScale)); \n" " localMax = fmax(localMax, getResponse(m, c+1, r+1, mWidth, mScale)); \n" " \n" " // If localMax > candidate, candidate is not the local maxima \n" " if(localMax > candidate) { \n" " return false; \n" " } \n" " \n" " return true; \n" "} \n" " /* We don't know how many Ipoints there will be until after this kernel, \n" " * but we have to have enough space allocated beforehand. Before a work item \n" " * writes to a location, he should make sure that there is space available. \n" " * After the kernel we'll check to make sure ipt_count is less than the \n" " * allocated space, and if not, we'll run it again. \n" " */ \n" "__kernel \n" "void non_max_supression_kernel( \n" " det_layer_t tResponse, \n" " int tWidth, \n" " int tHeight, \n" " int tFilter, \n" " int tStep, \n" " det_layer_t mResponse, \n" " lap_t mLaplacian, \n" " int mWidth, \n" " int mHeight, \n" " int mFilter, \n" " det_layer_t bResponse, \n" " int bWidth, \n" " int bHeight, \n" " int bFilter, \n" " __global int* ipt_count, \n" " __global float2* d_pixPos, \n" " __global float* d_scale, \n" " __global int* d_laplacian, \n" " int maxPoints, \n" " float threshold) \n" "{ \n" " \n" " int r = get_global_id(1); \n" " int c = get_global_id(0); \n" " \n" " float2 pixpos; \n" " float scale; \n" " int laplacian; \n" " \n" " // Check the block extremum is an extremum across boundaries. \n" " if(isExtremum(tResponse, tWidth, tHeight, tFilter, tStep, mResponse, \n" " mWidth, mHeight, bResponse, bWidth, bHeight, c, r, threshold))\n" " { \n" " if(interpolateExtremum(r, c, ipt_count, &pixpos, &scale, \n" " &laplacian, tResponse, tWidth, tHeight, tStep, mResponse, \n" " mLaplacian, mWidth, mHeight, mFilter, bResponse, bWidth, bHeight, \n" " bFilter)) { \n" " int index = atom_add(&ipt_count[0], 1); \n" " if(index < maxPoints) { \n" " d_pixPos[index] = pixpos; \n" " d_scale[index] = scale; \n" " d_laplacian[index] = laplacian; \n" " } \n" " } \n" " } \n" " \n" "} \n"; const char *nearestNeighbor_kernel = "typedef struct{ \n" " int point1; \n" " float x1; \n" " float y1; \n" " int point2; \n" " float x2; \n" " float y2; \n" " float dist; \n" " float orientationDiff; \n" " float scaleDiff; \n" " int match; \n" " bool dup; \n" "} distPoint; \n" "typedef struct{ \n" " float x; \n" " float y; \n" " float scale; \n" " float orientation; \n" " int laplacian; \n" " int clusterIndex; \n" " float descriptor[64]; \n" "} Ipoint; \n" "__kernel void NearestNeighbor(__global Ipoint *d_ipts1, \n" " __global Ipoint *d_ipts2, \n" " __global distPoint *d_distancePoints, \n" " __local float* tempDistPts, \n" " const unsigned int ipts2Size) { \n" " int groupId = get_group_id(0); \n" " int localId = get_local_id(0); \n" " int localSize = get_local_size(0); \n" " __global Ipoint *ipts1 = d_ipts1 + groupId; \n" " __global Ipoint *ipts2; \n" " __global distPoint *dp = d_distancePoints + groupId; \n" " int point2Match = 0; \n" " float minDist = FLT_MAX; \n" " for (unsigned int i2 = 0; i2 < ipts2Size; i2++){ \n" " \n" " // Calculate the sum of squares distance \n" " ipts2 = d_ipts2 + i2; \n" " // TODO This metric may need to be improved. Currently \n" " // we determine matches based on the sum of descriptor \n" " // differences \n" " tempDistPts[localId] = fabs(ipts1->descriptor[localId] - \n" " ipts2->descriptor[localId]); \n" " barrier(CLK_LOCAL_MEM_FENCE); \n" " \n" " // reduce \n" " for(int k=32; k > 0 ;k>>=1) { \n" " if(localId < k) \n" " { \n" " tempDistPts[localId] += tempDistPts[localId + k]; \n" " } \n" " barrier(CLK_LOCAL_MEM_FENCE); \n" " } \n" " \n" " if (localId==0) { \n" " tempDistPts[0] /= 64; // Get the average descriptor difference \n" " if(tempDistPts[0] < minDist) \n" " { \n" " minDist = tempDistPts[0]; \n" " point2Match = i2; \n" " } \n" " } \n" " } \n" " barrier(CLK_LOCAL_MEM_FENCE); \n" " if(localId == 0) \n" " { \n" " ipts2 = d_ipts2+point2Match; \n" " dp->point1 = groupId; \n" " dp->x1 = ipts1->x; \n" " dp->y1 = ipts1->y; \n" " dp->point2 = point2Match; \n" " dp->x2 = ipts2->x; \n" " dp->y2 = ipts2->y; \n" " dp->dist = minDist; \n" " dp->orientationDiff = ipts1->orientation - ipts2->orientation; \n" " dp->scaleDiff = ipts1->scale - ipts2->scale; \n" " } \n" "} \n"; const char *integralImage_kernels = "#define WG_SIZE 64 \n" "#define HALF_WG_SIZE (WG_SIZE/2) \n" " /* This kernel does a prefix scan on any size image. Each work group marches \n" " * down a row \n" " * of the image performing a sub-scan on WG_SIZE elements at a time. \n" " * The value obtained by the thread with the biggest ID is kept as the base \n" " * for the next sub-scan. \n" " */ \n" "__kernel \n" "void scan(__global float *input, \n" " __global float *output, \n" " int rows, \n" " int cols) { \n" " // Each work group is responsible for scanning a row \n" " // If we add additional elements to local memory (half the \n" " // work group size) and fill them with zeros, we don't need \n" " // conditionals in the code to check for out-of-bounds \n" " // accesses \n" " __local float lData[WG_SIZE+HALF_WG_SIZE]; \n" " __local float lData2[WG_SIZE+HALF_WG_SIZE]; \n" " int myGlobalY = get_global_id(1); \n" " int myLocalX = get_local_id(0); \n" " // Initialize out-of-bounds elements with zeros \n" " if(myLocalX < HALF_WG_SIZE) { \n" " lData[myLocalX] = 0.0f; \n" " lData2[myLocalX] = 0.0f; \n" " } \n" " \n" " // Start past out-of-bounds elements \n" " myLocalX += HALF_WG_SIZE; \n" " \n" " // This value needs to be added to the local data. It's the highest \n" " // value from the prefix scan of the previous elements in the row \n" " float prevMaxVal = 0; \n" " // March down a row WG_SIZE elements at a time \n" " int iterations = cols/WG_SIZE; \n" " if(cols % WG_SIZE != 0) \n" " { \n" " // If cols are not an exact multiple of WG_SIZE, then we need to do \n" " // one more iteration \n" " iterations++; \n" " } \n" " \n" " for(int i = 0; i < iterations; i++) { \n" " \n" " int columnOffset = i*WG_SIZE + get_local_id(0); \n" " // Don't do anything if a thread's index is past the end of the row. \n" " // We still need the thread available for memory barriers though. \n" " if(columnOffset < cols) { \n" " // Cache the input data in local memory \n" " lData[myLocalX] = input[myGlobalY*cols + columnOffset]; \n" " } \n" " else { \n" " // Otherwise just store zeros \n" " lData[myLocalX] = 0.0f; \n" " } \n" " \n" " barrier(CLK_LOCAL_MEM_FENCE); \n" " \n" " // 1 \n" " lData2[myLocalX] = lData[myLocalX] + lData[myLocalX-1]; \n" " barrier(CLK_LOCAL_MEM_FENCE); \n" " \n" " // 2 \n" " lData[myLocalX] = lData2[myLocalX] + lData2[myLocalX-2]; \n" " barrier(CLK_LOCAL_MEM_FENCE); \n" " \n" " // 4 \n" " lData2[myLocalX] = lData[myLocalX] + lData[myLocalX-4]; \n" " barrier(CLK_LOCAL_MEM_FENCE); \n" " \n" " // 8 \n" " lData[myLocalX] = lData2[myLocalX] + lData2[myLocalX-8]; \n" " barrier(CLK_LOCAL_MEM_FENCE); \n" " \n" " // 16 \n" " lData2[myLocalX] = lData[myLocalX] + lData[myLocalX-16]; \n" " barrier(CLK_LOCAL_MEM_FENCE); \n" " \n" " // 32 \n" " lData[myLocalX] = lData2[myLocalX] + lData2[myLocalX-32]; \n" " barrier(CLK_LOCAL_MEM_FENCE); \n" " \n" " // Write data out to global memory \n" " if(columnOffset < cols) { \n" " output[myGlobalY*cols + columnOffset] = \n" " lData[myLocalX] + prevMaxVal; \n" " } \n" " \n" " // Copy the value from the highest thread in the group to my local index\n" " prevMaxVal += lData[WG_SIZE+HALF_WG_SIZE-1]; \n" " } \n" "} \n" "__kernel \n" "void scan4(__global float4 *input, \n" " __global float4 *output, \n" " int rows, \n" " int cols) { \n" " // Each work group is responsible for scanning a row \n" " // If we add additional elements to local memory (half the \n" " // work group size) and fill them with zeros, we don't need \n" " // conditionals in the code to check for out-of-bounds \n" " // accesses \n" " __local float4 lData[WG_SIZE+HALF_WG_SIZE]; \n" " __local float4 lData2[WG_SIZE+HALF_WG_SIZE]; \n" " int myGlobalY = get_global_id(1); \n" " int myLocalX = get_local_id(0); \n" " // Initialize out-of-bounds elements with zeros \n" " if(myLocalX < HALF_WG_SIZE) { \n" " lData[myLocalX] = (float4)(0.0f, 0.0f, 0.0f, 0.0f); \n" " lData2[myLocalX] = (float4)(0.0f, 0.0f, 0.0f, 0.0f); \n" " } \n" " \n" " // Start past out-of-bounds elements \n" " myLocalX += HALF_WG_SIZE; \n" " \n" " // This value needs to be added to the local data. It's the highest \n" " // value from the prefix scan of the previous elements in the row \n" " float prevMaxVal = 0; \n" " \n" " // There will actually be only 1/4 of the columns since we've changed \n" " // the data type to float4 \n" " int columns = cols/4; \n" " // March down a row WG_SIZE elements at a time \n" " int iterations = columns/WG_SIZE; \n" " if(columns % WG_SIZE != 0) \n" " { \n" " // If cols are not an exact multiple of WG_SIZE, then we need to do \n" " // one more iteration \n" " iterations++; \n" " } \n" " \n" " for(int i = 0; i < iterations; i++) { \n" " \n" " int columnOffset = i*WG_SIZE + get_local_id(0); \n" " // Don't do anything if a thread's index is past the end of the row. \n" " // We still need the thread available for memory barriers though. \n" " if(columnOffset < columns) { \n" " // Cache the input data in local memory \n" " lData[myLocalX] = input[myGlobalY*columns + columnOffset]; \n" " } \n" " else { \n" " // Otherwise just store zeros \n" " lData[myLocalX] = (float4)(0.0f, 0.0f, 0.0f, 0.0f); \n" " } \n" " \n" " lData[myLocalX].y += lData[myLocalX].x; \n" " lData[myLocalX].z += lData[myLocalX].y; \n" " lData[myLocalX].w += lData[myLocalX].z; \n" " barrier(CLK_LOCAL_MEM_FENCE); \n" " \n" " // 1 \n" " lData2[myLocalX] = lData[myLocalX] + lData[myLocalX-1].w; \n" " barrier(CLK_LOCAL_MEM_FENCE); \n" " \n" " // 2 \n" " lData[myLocalX] = lData2[myLocalX] + lData2[myLocalX-2].w; \n" " barrier(CLK_LOCAL_MEM_FENCE); \n" " \n" " // 4 \n" " lData2[myLocalX] = lData[myLocalX] + lData[myLocalX-4].w; \n" " barrier(CLK_LOCAL_MEM_FENCE); \n" " \n" " // 8 \n" " lData[myLocalX] = lData2[myLocalX] + lData2[myLocalX-8].w; \n" " barrier(CLK_LOCAL_MEM_FENCE); \n" " \n" " // 16 \n" " lData2[myLocalX] = lData[myLocalX] + lData[myLocalX-16].w; \n" " barrier(CLK_LOCAL_MEM_FENCE); \n" " \n" " // 32 \n" " lData[myLocalX] = lData2[myLocalX] + lData2[myLocalX-32].w; \n" " barrier(CLK_LOCAL_MEM_FENCE); \n" " \n" " // Write data out to global memory \n" " if(columnOffset < columns) { \n" " output[myGlobalY*columns + columnOffset] = \n" " lData[myLocalX] + prevMaxVal; \n" " } \n" " \n" " // Copy the value from the highest thread in the group to my local index\n" " prevMaxVal += lData[WG_SIZE+HALF_WG_SIZE-1].w; \n" " } \n" "} \n" "__constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | \n" " CLK_ADDRESS_CLAMP | \n" " CLK_FILTER_NEAREST; \n" " \n" "__kernel \n" "void scanImage(__read_only image2d_t input, \n" " __write_only image2d_t output, \n" " int rows, \n" " int cols) { \n" " // Each work group is responsible for scanning a row \n" " // If we add additional elements to local memory (half the \n" " // work group size) and fill them with zeros, we don't need \n" " // conditionals in the code to check for out-of-bounds \n" " // accesses \n" " __local float lData[WG_SIZE+HALF_WG_SIZE]; \n" " __local float lData2[WG_SIZE+HALF_WG_SIZE]; \n" " int myGlobalY = get_global_id(1); \n" " int myLocalX = get_local_id(0); \n" " // Initialize out-of-bounds elements with zeros \n" " if(myLocalX < HALF_WG_SIZE) { \n" " lData[myLocalX] = 0.0f; \n" " lData2[myLocalX] = 0.0f; \n" " } \n" " \n" " // Start past out-of-bounds elements \n" " myLocalX += HALF_WG_SIZE; \n" " \n" " // This value needs to be added to the local data. It's the highest \n" " // value from the prefix scan of the previous elements in the row \n" " float prevMaxVal = 0; \n" " // March down a row WG_SIZE elements at a time \n" " int iterations = cols/WG_SIZE; \n" " if(cols % WG_SIZE != 0) \n" " { \n" " // If cols are not an exact multiple of WG_SIZE, then we need to do \n" " // one more iteration \n" " iterations++; \n" " } \n" " \n" " for(int i = 0; i < iterations; i++) { \n" " \n" " int columnOffset = i*WG_SIZE + get_local_id(0); \n" " // Don't do anything if a thread's index is past the end of the row. \n" " // we will still need thread available for memory barriers though. \n" " \n" " // Cache the input data in local memory \n" " lData[myLocalX] = read_imagef(input, sampler, \n" " (int2)(columnOffset,myGlobalY)).x; \n" " \n" " barrier(CLK_LOCAL_MEM_FENCE); \n" " \n" " // 1 \n" " lData2[myLocalX] = lData[myLocalX] + lData[myLocalX-1]; \n" " barrier(CLK_LOCAL_MEM_FENCE); \n" " \n" " // 2 \n" " lData[myLocalX] = lData2[myLocalX] + lData2[myLocalX-2]; \n" " barrier(CLK_LOCAL_MEM_FENCE); \n" " \n" " // 4 \n" " lData2[myLocalX] = lData[myLocalX] + lData[myLocalX-4]; \n" " barrier(CLK_LOCAL_MEM_FENCE); \n" " \n" " // 8 \n" " lData[myLocalX] = lData2[myLocalX] + lData2[myLocalX-8]; \n" " barrier(CLK_LOCAL_MEM_FENCE); \n" " \n" " // 16 \n" " lData2[myLocalX] = lData[myLocalX] + lData[myLocalX-16]; \n" " barrier(CLK_LOCAL_MEM_FENCE); \n" " \n" " // 32 \n" " lData[myLocalX] = lData2[myLocalX] + lData2[myLocalX-32]; \n" " barrier(CLK_LOCAL_MEM_FENCE); \n" " \n" " // Write data out to global memory \n" " if(columnOffset < cols) { \n" " write_imagef(output, (int2)(columnOffset, myGlobalY), \n" " (float4)(lData[myLocalX] + prevMaxVal, 0.0f, 0.0f, 0.0f)); \n" " } \n" " \n" " // Copy the value from the highest thread in the group to my local index\n" " prevMaxVal += lData[WG_SIZE+HALF_WG_SIZE-1]; \n" " } \n" "} \n" "__kernel \n" "void transpose(__global float* iImage, \n" " __global float* oImage, \n" " int inRows, \n" " int inCols) { \n" " \n" " __local float tmpBuffer[256]; \n" " // Work groups will perform the transpose \n" " // (i.e., an entire work group will be moved from \n" " // one part of the image to another) \n" " int myWgInX = get_group_id(0); \n" " int myWgInY = get_group_id(1); \n" " // The local index of a thread should not change (threads with adjacent IDs \n" " // within a work group should always perform adjacent memory accesses). \n" " // we will account for the clever indexing of local memory later. \n" " int myLocalX = get_local_id(0); \n" " int myLocalY = get_local_id(1); \n" " \n" " int myGlobalInX = myWgInX*16 + myLocalX; \n" " int myGlobalInY = myWgInY*16 + myLocalY; \n" " // Don't read out of bounds \n" " if(myGlobalInX < inCols && myGlobalInY < inRows) { \n" " \n" " // Cache data to local memory (coalesced reads) \n" " tmpBuffer[myLocalY*16 + myLocalX] = \n" " iImage[myGlobalInY*inCols + myGlobalInX]; \n" " } \n" " barrier(CLK_LOCAL_MEM_FENCE); \n" " \n" " // This avoids some confusion with dimensions, but otherwise aren't needed \n" " int outRows = inCols; \n" " int outCols = inRows; \n" " // Swap work group IDs for their location after the transpose \n" " int myWgOutX = myWgInY; \n" " int myWgOutY = myWgInX; \n" " int myGlobalOutX = myWgOutX*16 + myLocalX; \n" " int myGlobalOutY = myWgOutY*16 + myLocalY; \n" " // When writing back to global memory, we need to swap image dimensions \n" " // for the transposed size), and also need to swap thread X/Y indices \n" " // in local memory (to achieve coalesced memory writes) \n" " // Don't write out of bounds \n" " if(myGlobalOutX >= 0 && myGlobalOutX < outCols && \n" " myGlobalOutY >= 0 && myGlobalOutY < outRows) { \n" " // The read from tmpBuffer is going to conflict, \n" " // but the write should be coalesced \n" " oImage[myGlobalOutY*outCols + myGlobalOutX] = \n" " tmpBuffer[myLocalX*16 + myLocalY]; \n" " } \n" " return; \n" "} \n" "__kernel \n" "void transposeImage(__read_only image2d_t iImage, \n" " __write_only image2d_t oImage, \n" " int inRows, \n" " int inCols) { \n" " \n" " __local float tmpBuffer[256]; \n" " // Work groups will perform the transpose (i.e., an entire work \n" " // group will be moved fromone part of the image to another) \n" " int myWgInX = get_group_id(0); \n" " int myWgInY = get_group_id(1); \n" " // The local index of a thread should not change (threads with adjacent IDs \n" " // within a work group should always perform adjacent memory accesses). \n" " // We will account for the clever indexing of local memory later. \n" " int myLocalX = get_local_id(0); \n" " int myLocalY = get_local_id(1); \n" " \n" " int myGlobalInX = myWgInX*16 + myLocalX; \n" " int myGlobalInY = myWgInY*16 + myLocalY; \n" " // Cache data to local memory (coalesced reads) \n" " tmpBuffer[myLocalY*16 + myLocalX] = read_imagef(iImage, sampler, \n" " (int2)(myGlobalInX, myGlobalInY)).x; \n" " \n" " barrier(CLK_LOCAL_MEM_FENCE); \n" " \n" " // This avoids some confusion with dimensions, but otherwise aren't needed \n" " int outRows = inCols; \n" " int outCols = inRows; \n" " // Swap work group IDs for their location after the transpose \n" " int myWgOutX = myWgInY; \n" " int myWgOutY = myWgInX; \n" " int myGlobalOutX = myWgOutX*16 + myLocalX; \n" " int myGlobalOutY = myWgOutY*16 + myLocalY; \n" " // When writing back to global memory, we need to swap image dimensions \n" " // (for the transposed size), and also need to swap thread X/Y indices \n" " // in local memory (to achieve coalesced memory writes) \n" " // Don't write out of bounds \n" " if(myGlobalOutX >= 0 && myGlobalOutX < outCols && \n" " myGlobalOutY >= 0 && myGlobalOutY < outRows) { \n" " // The read from tmpBuffer is going to conflict, but the write \n" " // should be coalesced \n" " write_imagef(oImage, (int2)(myGlobalOutX, myGlobalOutY), \n" " (float4)(tmpBuffer[myLocalX*16 + myLocalY], 0.0f, 0.0f, 0.0f)); \n" " } \n" "} \n"; const char *hessianDet_kernel = "typedef __global float* int_img_t; \n" "// Compute the hessian determinant \n" "__kernel void \n" "hessian_det( \n" " int_img_t img, // integral image \n" " int width, // integral image width \n" " int height, // integral image height \n" " det_layer_t responses, // hessian determinant \n" " lap_t laplacians, // laplacian values \n" " int layerWidth, \n" " int layerHeight, \n" " int step, // determinant step size \n" " int filter) // determinant filter size \n" "{ \n" " int l, w, b; \n" " float Dxx, Dyy, Dxy, inverse_area; \n" " int idx = get_global_id(0); \n" " int idy = get_global_id(1); \n" " w = filter; // filter size \n" " l = filter/3; // lobe for this filter \n" " b = (filter - 1)/ 2 + 1; // border for this filter \n" " inverse_area = 1.0f/(w * w); // normalization factor \n" " int r = idy * step; \n" " int c = idx * step; \n" " // Have threads accessing out-of-bounds data return immediately \n" " if(r >= height || c >= width) { \n" " return; \n" " } \n" " \n" " Dxx = BoxIntegral(img, width, height, r - l + 1, c - b, 2*l - 1, w) - \n" " BoxIntegral(img, width, height, r - l + 1, c - l / 2, 2*l - 1, l)*3; \n" " Dyy = BoxIntegral(img, width, height, r - b, c - l + 1, w, 2*l - 1) - \n" " BoxIntegral(img, width, height, r - l / 2, c - l + 1, l, 2*l - 1)*3; \n" " Dxy = BoxIntegral(img, width, height, r - l, c + 1, l, l) + \n" " BoxIntegral(img, width, height, r + 1, c - l, l, l) - \n" " BoxIntegral(img, width, height, r - l, c - l, l, l) - \n" " BoxIntegral(img, width, height, r + 1, c + 1, l, l); \n" " // Normalize the filter responses with respect to their size \n" " Dxx *= inverse_area; \n" " Dyy *= inverse_area; \n" " Dxy *= inverse_area; \n" " // Save the determinant of hessian response \n" " float determinant = (Dxx*Dyy - 0.81f*Dxy*Dxy); \n" " int laplacian = (Dxx + Dyy >= 0 ? 1 : 0); \n" " \n" " responses[idy*layerWidth+idx] = determinant; \n" " laplacians[idy*layerWidth+idx] = laplacian; \n" "} \n"; const char *getOrientation_kernels = "#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable \n" "//! Get the angle from the +ve x-axis of the vector given by (X Y) \n" "float getAngle(float X, float Y) \n" "{ \n" " if(X >= 0 && Y >= 0) \n" " return atan(Y/X); \n" " if(X < 0 && Y >= 0) \n" " return pi - atan(-Y/X); \n" " if(X < 0 && Y < 0) \n" " return pi + atan(Y/X); \n" " if(X >= 0 && Y < 0) \n" " return 2*pi - atan(-Y/X); \n" " return 0; \n" "} \n" "//! \n" "__kernel void \n" "getOrientationStep1( \n" " __global float* d_img, \n" " __global float* d_scale, \n" " __global float2* d_pixPos, \n" " __global float* d_gauss25, \n" " __global unsigned int* d_id, \n" " int i_width, \n" " int i_height, \n" " __global float4* res) \n" "{ \n" " // Cache the gaussian data in local memory \n" " __local float l_gauss25[49]; \n" " __local unsigned int l_id[13]; \n" " \n" " int localId = get_local_id(0); \n" " int groupId = get_group_id(0); \n" " \n" " int i = (int)(localId/13) - 6; \n" " int j = (localId%13) - 6; \n" " \n" " // Buffer gauss data in local memory \n" " if(localId < 49) \n" " { \n" " l_gauss25[localId] = d_gauss25[localId]; \n" " } \n" " // Buffer d_ids in local memory \n" " if(localId < 13) \n" " { \n" " l_id[localId] = d_id[localId]; \n" " } \n" " barrier(CLK_LOCAL_MEM_FENCE); \n" " \n" " // NOTE Since this only reads once from global memory, it is \n" " // quicker just to leave the data there. There is a higher \n" " // overhead from placing it in constant memory. \n" " int s = round(d_scale[groupId]); \n" " int r = round(d_pixPos[groupId].y); \n" " int c = round(d_pixPos[groupId].x); \n" " float gauss = 0.f; \n" " float4 rs = {0.0f, 0.0f, FLT_MAX, 1.0f}; \n" " \n" " __local int angleCount[1]; \n" " if(localId == 0) { \n" " angleCount[0] = 0; \n" " } \n" " barrier(CLK_LOCAL_MEM_FENCE); \n" " // calculate haar responses for points within radius of 6*scale \n" " if(i*i + j*j < 36) \n" " { \n" " gauss = l_gauss25[7*l_id[i+6]+l_id[j+6]]; \n" " rs.x = gauss * haarX(d_img, i_width, i_height, r+j*s, c+i*s, 4*s); \n" " rs.y = gauss * haarY(d_img, i_width, i_height, r+j*s, c+i*s, 4*s); \n" " rs.z = getAngle(rs.x, rs.y); \n" " int index = atom_add(&angleCount[0], 1); \n" " \n" " res[groupId * 109 + index] = rs; \n" " } \n" "} \n" "float magnitude(float2 val) { \n" " return (val.x*val.x + val.y*val.y); \n" "} \n" " \n" "//! \n" "__kernel void getOrientationStep2(__global float *d_orientation, \n" " __global float4 *d_res) \n" "{ \n" " // There are 42 threads \n" " int tid = get_local_id(0); \n" " int groupId = get_group_id(0); \n" " // calculate the dominant direction \n" " float ang1= 0.15f * (float)tid; \n" " float ang2 = (ang1+(pi/3.0f) > 2*pi ? ang1-(5.0f*pi/3.0f) : ang1+(pi/3.0f));\n" " __local float4 res[109]; \n" " // Cache data in local memory \n" " for(uint k = tid; k < 109; k += get_local_size(0)) \n" " { \n" " res[k] = d_res[groupId * 109 + k]; \n" " } \n" " barrier(CLK_LOCAL_MEM_FENCE); \n" " \n" " // loop slides pi/3 window around feature point \n" " \n" " __local float2 sum[42]; \n" " sum[tid] = (float2)(0.f, 0.f); \n" " // If tid is 0-34, ang1 < ang2 \n" " if(tid <= 34) { \n" " for(uint k = 0; k < 109; ++k) \n" " { \n" " const float4 rs = res[k]; \n" " // get angle from the x-axis of the sample point \n" " const float ang = rs.z; \n" " // determine whether the point is within the window \n" " int check = ang1 < ang && ang < ang2; \n" " sum[tid].x += rs.x * check; \n" " sum[tid].y += rs.y * check; \n" " } \n" " } \n" " else { \n" " // If tid is 35-41, ang2 < ang1 \n" " for(uint k = 0; k < 109; ++k) \n" " { \n" " const float4 rs = res[k]; \n" " // get angle from the x-axis of the sample point \n" " const float ang = rs.z; \n" " // determine whether the point is within the window \n" " int check = ((ang > 0 && ang < ang2) || (ang > ang1 && ang < 2*pi));\n" " sum[tid].x += rs.x * check; \n" " sum[tid].y += rs.y * check; \n" " } \n" " } \n" " \n" " barrier(CLK_LOCAL_MEM_FENCE); \n" " \n" " // If the vector produced from this window is longer than all \n" " // previous vectors then this forms the new dominant direction \n" " for(int offset = 32; offset > 0; offset >>= 1) \n" " { \n" " if (tid < offset && tid + offset < 42) { \n" " if(magnitude(sum[tid]) < magnitude(sum[tid+offset])) { \n" " \n" " sum[tid] = sum[tid+offset]; \n" " } \n" " } \n" " barrier(CLK_LOCAL_MEM_FENCE); \n" " } \n" " \n" " if (tid == 0) \n" " { \n" " // assign orientation of the dominant response vector \n" " d_orientation[groupId] = getAngle(sum[0].x, sum[0].y); \n" " } \n" "} \n"; const char *createDescriptors_kernel = "#define DES_THREADS 81 \n" "void sumDesc(__local float4* desc, int tha, int length) \n" "{ \n" " // do one loop to get all the > tha 64 values \n" " if(tha < length - 64) \n" " { \n" " desc[tha] += desc[tha + 64]; \n" " } \n" " barrier(CLK_LOCAL_MEM_FENCE); \n" " int stride ; // 64/2 \n" " for (stride = 32; stride>0; stride>>=1) \n" " { \n" " if (tha < stride) \n" " { \n" " desc[tha] += desc[tha + stride]; \n" " } \n" " barrier(CLK_LOCAL_MEM_FENCE); \n" " } \n" "} \n" "__kernel void createDescriptors_kernel( \n" " __global float* intImage, \n" " int width, int height, \n" " __global float* scale, \n" " __global float4* surfDescriptor, \n" " __global float2* pos, \n" " __global float* orientation, \n" " __global float* descLength, \n" " __constant int* mj, \n" " __constant int* mi) \n" "{ \n" " __local float4 desc[DES_THREADS]; \n" " // There are 16 work groups per descriptor. The groups are arranged as \n" " // 16 x NumDescriptors, so each bIdy is a new descriptor. \n" " int bIdx = get_group_id(0); \n" " int bIdy = get_group_id(1); \n" " \n" " // get the x & y indexes and absolute \n" " int thx = get_local_id(0); \n" " int tha = get_local_id(1) * get_local_size(0) + thx; \n" " // init shared memory to zero \n" " desc[tha] =(float4)(0.0f,0.0f,0.0f,0.0f); \n" " // The 16 work groups are logically arranged in a 4x4 square \n" " float cx = 0.5f, cy = 0.5f; //Subregion centers for the 4x4 gaussian weig. \n" " cx += (float)((int)(bIdx/4)); \n" " cy += (float)((int)(bIdx%4)); \n" " \n" " //const int mj[16] = { \n" " // -12, -7, -2, 3, \n" " // -12, -7, -2, 3, \n" " // -12, -7, -2, 3, \n" " // -12, -7, -2, 3}; \n" " //const int mi[16] = { \n" " // -12,-12,-12,-12, \n" " // -7, -7, -7, -7, \n" " // -2, -2, -2, -2, \n" " // 3, 3, 3, 3}; \n" " int j = mj[bIdx]; \n" " int i = mi[bIdx]; \n" " float x = round(pos[bIdy].x); \n" " float y = round(pos[bIdy].y); \n" " float orient = orientation[bIdy]; \n" " float co = cos(orient); \n" " float si = sin(orient); \n" " \n" " float thScale = scale[bIdy]; \n" " float ix = i + 5; \n" " float jx = j + 5; \n" " float xs = round(x + ( -jx*thScale*si + ix*thScale*co)); \n" " float ys = round(y + ( jx*thScale*co + ix*thScale*si)); \n" " \n" " // There are 81 work items per work group, logically \n" " // arranged into 9x9 \n" " int k = i + (tha / 9); \n" " int l = j + (tha % 9); \n" " // Get coords of sample point on the rotated axis \n" " int sample_x = round(x + (-l*thScale*si + k*thScale*co)); \n" " int sample_y = round(y + ( l*thScale*co + k*thScale*si)); \n" " // Get the gaussian weighted x and y responses \n" " float gauss_s1 = gaussian((float)(xs-sample_x), (float)(ys-sample_y), \n" " 2.5f*thScale); \n" " float rx = haarX(intImage, width, height, sample_y, sample_x, \n" " 2*round(thScale)); \n" " float ry = haarY(intImage, width, height, sample_y, sample_x, \n" " 2*round(thScale)); \n" " //Get the gaussian weighted x and y responses on rotated axis \n" " float rrx = gauss_s1*(-rx*si + ry*co); \n" " float rry = gauss_s1*(rx*co + ry*si); \n" " desc[tha].x = rrx; \n" " desc[tha].y = rry; \n" " desc[tha].z = fabs(rrx); \n" " desc[tha].w = fabs(rry); \n" " \n" " barrier(CLK_LOCAL_MEM_FENCE); \n" " // Call summer function (result goes in index 0) \n" " sumDesc(desc, tha, DES_THREADS); \n" " \n" " barrier(CLK_LOCAL_MEM_FENCE); \n" " if(tha == 0) \n" " { \n" " // There are 16 work groups per i-point \n" " int dpos= bIdy * 16 + bIdx; \n" " float gauss_s2 = gaussian(cx-2.0f,cy-2.0f,1.5f); \n" " \n" " desc[0] *= gauss_s2; \n" " \n" " // Store the descriptor \n" " surfDescriptor[dpos] = desc[0]; \n" " // Store the descriptor length \n" " descLength[bIdy * get_num_groups(0) + bIdx] = dot(desc[0], desc[0]); \n" " } \n" "} \n"; } } }