/********************************************************************
**                Image Component Library (ICL)                    **
**                                                                 **
** Copyright (C) 2006-2013 CITEC, University of Bielefeld          **
**                         Neuroinformatics Group                  **
** Website: www.iclcv.org and                                      **
**          http://opensource.cit-ec.de/projects/icl               **
**                                                                 **
** File   : ICLCV/src/ICLCV/CLSurfLibKernels.cpp                   **
** Module : ICLCV                                                  **
** Authors: Christof Elbrechter                                    **
**                                                                 **
**                                                                 **
** GNU LESSER GENERAL PUBLIC LICENSE                               **
** This file may be used under the terms of the GNU Lesser General **
** Public License version 3.0 as published by the                  **
**                                                                 **
** Free Software Foundation and appearing in the file LICENSE.LGPL **
** included in the packaging of this file.  Please review the      **
** following information to ensure the license requirements will   **
** be met: http://www.gnu.org/licenses/lgpl-3.0.txt                **
**                                                                 **
** The development of this software was supported by the           **
** Excellence Cluster EXC 277 Cognitive Interaction Technology.    **
** The Excellence Cluster EXC 277 is a grant of the Deutsche       **
** Forschungsgemeinschaft (DFG) in the context of the German       **
** Excellence Initiative.                                          **
**                                                                 **
********************************************************************/

 /****************************************************************************\ 
 * Copyright (c) 2011, Advanced Micro Devices, Inc.                           *
 * All rights reserved.                                                       *
 *                                                                            *
 * Redistribution and use in source and binary forms, with or without         *
 * modification, are permitted provided that the following conditions         *
 * are met:                                                                   *
 *                                                                            *
 * Redistributions of source code must retain the above copyright notice,     *
 * this list of conditions and the following disclaimer.                      *
 *                                                                            *
 * Redistributions in binary form must reproduce the above copyright notice,  *
 * this list of conditions and the following disclaimer in the documentation  *
 * and/or other materials provided with the distribution.                     *
 *                                                                            *
 * Neither the name of the copyright holder nor the names of its contributors *
 * may be used to endorse or promote products derived from this software      *
 * without specific prior written permission.                                 *
 *                                                                            *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS        *
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED  *
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR *
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR          *
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,      *
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,        *
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR         *
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF     *
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING       *
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS         *
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.               *
 *                                                                            *
 * If you use the software (in whole or in part), you shall adhere to all     *
 * applicable U.S., European, and other export laws, including but not        *
 * limited to the U.S. Export Administration Regulations ('EAR'), (15 C.F.R.  *
 * Sections 730 through 774), and E.U. Council Regulation (EC) No 1334/2000   *
 * of 22 June 2000.  Further, pursuant to Section 740.6 of the EAR, you       *
 * hereby certify that, except pursuant to a license granted by the United    *
 * States Department of Commerce Bureau of Industry and Security or as        *
 * otherwise permitted pursuant to a License Exception under the U.S. Export  *
 * Administration Regulations ("EAR"), you will not (1) export, re-export or  *
 * release to a national of a country in Country Groups D:1, E:1 or E:2 any   *
 * restricted technology, software, or source code you receive hereunder,     *
 * or (2) export to Country Groups D:1, E:1 or E:2 the direct product of such *
 * technology or software, if such foreign produced direct product is subject *
 * to national security controls as identified on the Commerce Control List   *
 *(currently found in Supplement 1 to Part 774 of EAR).  For the most current *
 * Country Group listings, or for additional information about the EAR or     *
 * your obligations under those regulations, please refer to the U.S. Bureau  *
 * of Industry and Security's website at http://www.bis.doc.gov/.             *
 \****************************************************************************/

#include <ICLCV/CLSurfLibKernels.h>

namespace icl{
  namespace cv{
    namespace clsurf{

      const char *utilityKernels =
    	        "#ifdef M_PI_F                                                                   \n"
    	        "#define pi M_PI_F                                                               \n"
    	        "#else                                                                           \n"
    	        "#define pi 3.141592654f                                                         \n"
    	        "#endif                                                                          \n"
    	        "typedef __global float* det_layer_t;                                            \n"
    	        "typedef __global int* lap_t;                                                    \n"
                "//! Calculate the value of the 2d gaussian at x,y                               \n"
    	        "float gaussian(float x, float y, float sig)                                     \n"
    	        "{                                                                               \n"
    	        "    return (1.0f/(2.0f*pi*sig*sig)) * exp(-(x*x+y*y)/(2.0f*sig*sig));           \n"
    	        "}                                                                               \n"
    	        "//! Calculate the integral of a region                                          \n"
    	        "float BoxIntegral(                                                              \n"
    	        "                  __global float* data,                                         \n"
    	        "                  int width, int height, int row, int col,                      \n"
    	        "                  int rows, int cols)                                           \n"
    	        "{                                                                               \n"
    	        "    float A = 0.0f;                                                             \n"
    	        "    float B = 0.0f;                                                             \n"
    	        "    float C = 0.0f;                                                             \n"
    	        "    float D = 0.0f;                                                             \n"
    	        "    // The subtraction by one for row/col is because row/col is inclusive.      \n"
    	        "    int r1 = min(row, height) - 1;                                              \n"
    	        "    int c1 = min(col, width)  - 1;                                              \n"
    	        "    int r2 = min(row + rows, height) - 1;                                       \n"
    	        "    int c2 = min(col + cols, width)  - 1;                                       \n"
    	        "    if (r1 >= 0 && c1 >= 0) A = data[r1 * width + c1];                          \n"
    	        "    if (r1 >= 0 && c2 >= 0) B = data[r1 * width + c2];                          \n"
    	        "    if (r2 >= 0 && c1 >= 0) C = data[r2 * width + c1];                          \n"
    	        "    if (r2 >= 0 && c2 >= 0) D = data[r2 * width + c2];                          \n"
    	        "    return max(0.0f, A - B - C + D);                                            \n"
    	        "}                                                                               \n"
    	        "//! Calculate Haar wavelet responses in X direction                             \n"
    	        "float haarX(                                                                    \n"
    	        "            __global float* img,                                                \n"
    	        "            int width, int height, int row, int column, int s)                  \n"
    	        "{                                                                               \n"
    	        "    return BoxIntegral(img, width, height, row-s/2, column,     s, s/2) -       \n"
    	        "           BoxIntegral(img, width, height, row-s/2, column-s/2, s, s/2);        \n"
    	        "}                                                                               \n"
    	        "//! Calculate Haar wavelet responses in Y direction                             \n"
    	        "float haarY(                                                                    \n"
    	        "            __global float* img,                                                \n"
    	        "            int width, int height, int row, int column, int s)                  \n"
    	        "{                                                                               \n"
    	        "    return BoxIntegral(img, width, height, row,     column-s/2, s/2, s) -       \n"
    	        "           BoxIntegral(img, width, height, row-s/2, column-s/2, s/2, s);        \n"
    	        "}                                                                               \n";

      
      const char *normalizeDescriptors_kernel =  
        "__kernel void                                                                   \n"
        "normalizeDescriptors(__global float* surfDescriptors,                           \n"
        "                     __global float* descLengths)                               \n"
        "{                                                                               \n"
        "    // Previous kernels have computed 64 descriptors (surfDescriptors) and      \n"
        "    // 16 lengths (descLengths) for each interesting point found by SURF.       \n"
        "    // This kernel will sum up all of the lengths for each interesting point    \n"
        "    // and take the square root.  This value will then be used to scale the     \n"
        "    // descriptors.                                                             \n"
        "    // Note that each work group contains 64 work items (one per descriptor).   \n"
        "    // This array is used to cache data that will be accessed multiple times.   \n"
        "    // Since it is declared __local, only one instance is created and shared    \n"
        "    // by the entire work group                                                 \n"
        "    __local float ldescLengths[16];                                             \n"
        "    // Get the offset for the descriptor that this work group will normalize    \n"
        "    int descOffset = get_group_id(0) * 64;                                      \n"
        "    // Get the offset for the descriptor lengths that this work group will      \n"
        "    // use to scale the descriptors                                             \n"
        "    int lenOffset = get_group_id(0) * 16;                                       \n"
        "    // Get this work items ID within the work group                             \n"
        "    int tid = get_local_id(0);                                                  \n"
        "    // Only 16 of the work items are needed to cache the lengths                \n"
        "    if(tid < 16) {                                                              \n"
        "        // Have the first 16 work items cache the lengths for this              \n"
        "        // point in local memory                                                \n"
        "        ldescLengths[tid] = descLengths[lenOffset + tid];                       \n"
        "    }                                                                           \n"
        "    barrier(CLK_LOCAL_MEM_FENCE);                                               \n"
        "    // Work items work together to perform a parallel reduction of the          \n"
        "    // descriptor lengths (result gets stored in ldescLength[0])                \n"
        "    for(int i = 8; i > 0; i >>= 1)                                              \n"
        "    {                                                                           \n"
        "        if (tid < i)                                                            \n"
        "        {                                                                       \n"
        "            ldescLengths[tid] += ldescLengths[tid + i];                         \n"
        "                                                                                \n"
        "        }                                                                       \n"
        "        barrier(CLK_LOCAL_MEM_FENCE);                                           \n"
        "                                                                                \n"
        "    }                                                                           \n"
        "    barrier(CLK_LOCAL_MEM_FENCE);                                               \n"
        "    // Calculate the normalized length of the descriptors                       \n"
        "    float lengthOfDescriptor = 1.0f/sqrt(ldescLengths[0]);                      \n"
        "    // Scale each descriptor                                                    \n"
        "    surfDescriptors[descOffset + tid] *= lengthOfDescriptor;	                 \n"
        "}                                                                               \n";

      
      const char *nonMaxSuppression_kernel = 
        "#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable              \n"
        "                                                                                \n"
        "int getLaplacian(lap_t layer, int c, int r, int width)                          \n"
        "{                                                                               \n"
        "    int laplacian;                                                              \n"
        "                                                                                \n"
        "    laplacian = layer[r*width+c];                                               \n"
        "    return laplacian;                                                           \n"
        "}                                                                               \n"
        "float getResponse(det_layer_t layer, int c, int r, int width, int scale)        \n"
        "{                                                                               \n"
        "    float val;                                                                  \n"
        "                                                                                \n"
        "    int row = r*scale;                                                          \n"
        "    val = layer[r*scale*width+c*scale];                                         \n"
        "    return val;                                                                 \n"
        "}                                                                               \n"
        "bool interpolateExtremum(      int  r,                                          \n"
        "                               int  c, 			                 \n"
        "                   __global    int* ipt_count,                                  \n"
        "                            float2* pos,                                        \n"
        "                             float* det_scale,                                  \n"
        "                               int* laplacian,                                  \n"
        "                       det_layer_t  t,                                          \n"
        "                               int  tWidth,                                     \n"
        "                               int  tHeight,                                    \n"
        "                               int  tStep,                                      \n"
        "                       det_layer_t  m,                                          \n"
        "                             lap_t  mLaplacian,                                 \n"
        "                               int  mWidth,                                     \n"
        "                               int  mHeight,                                    \n"
        "                               int  mFilter,                                    \n"
        "                       det_layer_t  b,                                          \n"
        "                               int  bWidth,                                     \n"
        "                               int  bHeight,                                    \n"
        "                               int  bFilter)                                    \n"
        "{                                                                               \n"
        "    // ---------------------------------------                                  \n"
        "    // Step 1: Calculate the 3D derivative                                      \n"
        "    // ---------------------------------------                                  \n"
        "    int mScale = mWidth/tWidth;                                                 \n"
        "    int bScale = bWidth/tWidth;                                                 \n"
        "                                                                                \n"
        "    float dx, dy, ds;                                                           \n"
        "    dx = (getResponse(m, c+1, r,   mWidth, mScale) -                            \n"
        "          getResponse(m, c-1, r,   mWidth, mScale)) / 2.0f;                     "                    
        "    dy = (getResponse(m, c,   r+1, mWidth, mScale) -                            \n"
        "          getResponse(m, c,   r-1, mWidth, mScale)) / 2.0f;                     \n"
        "    ds = (getResponse(t, c,   r,   tWidth, 1)      -                            \n"
        "          getResponse(b, c,   r,   bWidth, bScale)) / 2.0f;                     \n"
        "                                                                                \n"
        "    // ---------------------------------------                                  \n"
        "    // Step 2: Calculate the inverse Hessian                                    \n"
        "    // ---------------------------------------                                  \n"
        "                                                                                \n"
        "    float v;                                                                    \n"
        "                                                                                \n"
        "    float dxx, dyy, dss, dxy, dxs, dys;                                         \n"
        "    v = getResponse(m, c, r, mWidth, mScale);                                   \n"
        "    dxx =  getResponse(m, c+1, r,   mWidth, mScale) +                           \n"
        "           getResponse(m, c-1, r,   mWidth, mScale) - 2.0f*v;                   \n"
        "    dyy =  getResponse(m, c,   r+1, mWidth, mScale) +                           \n"
        "           getResponse(m, c,   r-1, mWidth, mScale) - 2.0f*v;                   \n"
        "    dss =  getResponse(t, c,   r,   tWidth, 1)      +                           \n"
        "           getResponse(b, c,   r,   bWidth, bScale) - 2.0f*v;                   \n"
        "    dxy = (getResponse(m, c+1, r+1, mWidth, mScale) -                           \n"
        "           getResponse(m, c-1, r+1, mWidth, mScale) -                           \n"
        "           getResponse(m, c+1, r-1, mWidth, mScale) +                           \n"
        "           getResponse(m, c-1, r-1, mWidth, mScale))/4.0f;                      \n"
        "    dxs = (getResponse(t, c+1, r,   tWidth, 1)      -                           \n"
        "           getResponse(t, c-1, r,   tWidth, 1) -                                \n"
        "           getResponse(b, c+1, r,   bWidth, bScale) +                           \n"
        "           getResponse(b, c-1, r,   bWidth, bScale))/4.0f;                      \n"
        "    dys = (getResponse(t, c,   r+1, tWidth, 1)      -                           \n"
        "           getResponse(t, c,   r-1, tWidth, 1) -                                \n"
        "           getResponse(b, c,   r+1, bWidth, bScale) +                           \n"
        "           getResponse(b, c,   r-1, bWidth, bScale))/4.0f;                      \n"
        "    float H0 = dxx;                                                             \n"
        "    float H1 = dxy;                                                             \n"
        "    float H2 = dxs;                                                             \n"
        "    float H3 = dxy;                                                             \n"
        "    float H4 = dyy;                                                             \n"
        "    float H5 = dys;                                                             \n"
        "    float H6 = dxs;                                                             \n"
        "    float H7 = dys;                                                             \n"
        "    float H8 = dss;                                                             \n"
        "    // NOTE Although the inputs are the same, the value of determinant (and     \n"
        "    //      therefore invdet) vary from the CPU version                         \n"
        "                                                                                \n"
        "    float determinant =                                                         \n"
        "         H0*(H4*H8-H7*H5) -                                                     \n"
        "         H1*(H3*H8-H5*H6) +                                                     \n"
        "         H2*(H3*H7-H4*H6);                                                      \n"
        "                                                                                \n"
        "    float invdet = 1.0f / determinant;                                          \n"
        "                                                                                \n"
        "    float invH0 =  (H4*H8-H7*H5)*invdet;                                        \n"
        "    float invH1 = -(H3*H8-H5*H6)*invdet;                                        \n"
        "    float invH2 =  (H3*H7-H6*H4)*invdet;                                        \n"
        "    float invH3 = -(H1*H8-H2*H7)*invdet;                                        \n"
        "    float invH4 =  (H0*H8-H2*H6)*invdet;                                        \n"
        "    float invH5 = -(H0*H7-H6*H1)*invdet;                                        \n"
        "    float invH6 =  (H1*H5-H2*H4)*invdet;                                        \n"
        "    float invH7 = -(H0*H5-H3*H2)*invdet;                                        \n"
        "    float invH8 =  (H0*H4-H3*H1)*invdet;                                        \n"
        "                                                                                \n"
        "    // ---------------------------------------                                  \n"
        "    // Step 3: Multiply derivative and Hessian                                  \n"
        "    // ---------------------------------------                                  \n"
        "                                                                                \n"
        "    float xi = 0.0f, xr = 0.0f, xc = 0.0f;                                      \n"
        "                                                                                \n"
        "    xc =  invH0 * dx * -1.0f;                                                   \n"
        "    xc += invH1 * dy * -1.0f;                                                   \n"
        "    xc += invH2 * ds * -1.0f;                                                   \n"
        "                                                                                \n"
        "    xr =  invH3 * dx * -1.0f;                                                   \n"
        "    xr += invH4 * dy * -1.0f;                                                   \n"
        "    xr += invH5 * ds * -1.0f;                                                   \n"
        "                                                                                \n"
        "    xi =  invH6 * dx * -1.0f;                                                   \n"
        "    xi += invH7 * dy * -1.0f;                                                   \n"
        "    xi += invH8 * ds * -1.0f;                                                   \n"
        "    // Check if point is sufficiently close to the actual extremum              \n"
        "    if(fabs(xi) < 0.5f && fabs(xr) < 0.5f && fabs(xc) < 0.5f)                   \n"
        "    {                                                                           \n"
        "        int filterStep = mFilter - bFilter;                                     \n"
        "                                                                                \n"
        "        // Store values related to interest point                               \n"
        "        (*pos).x = (float)((c + xc)*tStep);                                     \n"
        "        (*pos).y = (float)((r + xr)*tStep);                                     \n"
        "        *det_scale = (float)(0.1333f)*(mFilter + (xi*filterStep));              \n"
        "                                                                                \n"
        "        int scale = mWidth/tWidth;                                              \n"
        "        *laplacian = getLaplacian(mLaplacian, c*scale, r*scale, mWidth);        \n"
        "                                                                                \n"
        "        return true;                                                            \n"
        "    }                                                                           \n"
        "    return false;                                                               \n"
        "}                                                                               \n"
        "//! Check whether point really is a maximum                                     \n"
        "bool isExtremum( det_layer_t  t,                                                \n"
        "                         int  tWidth,                                           \n"
        "                         int  tHeight,                                          \n"
        "                         int  tFilter,                                          \n"
        "                         int  tStep,                                            \n"
        "                 det_layer_t  m,                                                \n"
        "                         int  mWidth,                                           \n"
        "                         int  mHeight,                                          \n"
        "                 det_layer_t  b,                                                \n"
        "                         int  bWidth,                                           \n"
        "                         int  bHeight,                                          \n"
        "                         int  c,                                                \n"
        "                         int  r,                                                \n"
        "                       float  threshold)                                        \n"
        "{                                                                               \n"
        "    int layerBorder = (tFilter+1)/(2*tStep);                                    \n"
        "    // If any accesses would read out-of-bounds, this point is not a maximum    \n"
        "    if(r <= layerBorder || r >= tHeight - layerBorder ||                        \n"
        "       c <= layerBorder || c >= tWidth - layerBorder) {                         \n"
        "                                                                                \n"
        "       return false;                                                            \n"
        "    }                                                                           \n"
        "                                                                                \n"
        "    int mScale = mWidth/tWidth;                                                 \n"
        "                                                                                \n"
        "    // Candidate for local maximum                                              \n"
        "    float candidate = getResponse(m, c, r, mWidth, mScale);                     \n"
        "                                                                                \n"
        "    if(candidate < threshold) {                                                 \n"
        "        return false;                                                           \n"
        "    }                                                                           \n"
        "                                                                                \n"
        "    // If any response in 3x3x3 is greater candidate not maximum                \n"
        "    float localMax =          getResponse(t, c-1, r-1, tWidth, 1);              \n"
        "    localMax = fmax(localMax, getResponse(t, c,   r-1, tWidth, 1));             \n"
        "    localMax = fmax(localMax, getResponse(t, c+1, r-1, tWidth, 1));             \n"
        "    localMax = fmax(localMax, getResponse(t, c-1, r,   tWidth, 1));             \n"
        "    localMax = fmax(localMax, getResponse(t, c,   r,   tWidth, 1));             \n"
        "    localMax = fmax(localMax, getResponse(t, c+1, r,   tWidth, 1));             \n"
        "    localMax = fmax(localMax, getResponse(t, c-1, r+1, tWidth, 1));             \n"
        "    localMax = fmax(localMax, getResponse(t, c,   r+1, tWidth, 1));             \n"
        "    localMax = fmax(localMax, getResponse(t, c+1, r+1, tWidth, 1));             \n"
        "                                                                                \n"
        "    int bScale = bWidth/tWidth;                                                 \n"
        "    localMax = fmax(localMax, getResponse(b, c-1, r-1, bWidth, bScale));        \n"
        "    localMax = fmax(localMax, getResponse(b, c,   r-1, bWidth, bScale));        \n"
        "    localMax = fmax(localMax, getResponse(b, c+1, r-1, bWidth, bScale));        \n"
        "    localMax = fmax(localMax, getResponse(b, c-1, r,   bWidth, bScale));        \n"
        "    localMax = fmax(localMax, getResponse(b, c,   r,   bWidth, bScale));        \n"
        "    localMax = fmax(localMax, getResponse(b, c+1, r,   bWidth, bScale));        \n"
        "    localMax = fmax(localMax, getResponse(b, c-1, r+1, bWidth, bScale));        \n"
        "    localMax = fmax(localMax, getResponse(b, c,   r+1, bWidth, bScale));        \n"
        "    localMax = fmax(localMax, getResponse(b, c+1, r+1, bWidth, bScale));        \n"
        "                                                                                \n"
        "    //int mScale = mWidth/tWidth;                                               \n"
        "    localMax = fmax(localMax, getResponse(m, c-1, r-1, mWidth, mScale));        \n"
        "    localMax = fmax(localMax, getResponse(m, c,   r-1, mWidth, mScale));        \n"
        "    localMax = fmax(localMax, getResponse(m, c+1, r-1, mWidth, mScale));        \n"
        "    localMax = fmax(localMax, getResponse(m, c-1, r,   mWidth, mScale));        \n"
        "    // This is the candidate pixel                                              \n"
        "    localMax = fmax(localMax, getResponse(m, c+1, r,   mWidth, mScale));        \n"
        "    localMax = fmax(localMax, getResponse(m, c-1, r+1, mWidth, mScale));        \n"
        "    localMax = fmax(localMax, getResponse(m, c,   r+1, mWidth, mScale));        \n"
        "    localMax = fmax(localMax, getResponse(m, c+1, r+1, mWidth, mScale));        \n"
        "                                                                                \n"
        "    // If localMax > candidate, candidate is not the local maxima               \n"
        "    if(localMax > candidate) {                                                  \n"
        "       return false;                                                            \n"
        "    }                                                                           \n"
        "                                                                                \n"
        "    return true;                                                                \n"
        "}                                                                               \n"
        " /* We don't know how many Ipoints there will be until after this kernel,       \n"
        " * but we have to have enough space allocated beforehand.  Before a work item   \n"
        " * writes to a location, he should make sure that there is space available.     \n"
        " * After the kernel we'll check to make sure ipt_count is less than the         \n"
        " * allocated space, and if not, we'll run it again.                             \n"
        " */                                                                             \n"
        "__kernel                                                                        \n"
        "void non_max_supression_kernel(                                                 \n"
        "                 det_layer_t  tResponse,                                        \n"
        "                         int  tWidth,                                           \n"
        "                         int  tHeight,                                          \n"
        "                         int  tFilter,                                          \n"
        "                         int  tStep,                                            \n"
        "                 det_layer_t  mResponse,                                        \n"
        "                       lap_t  mLaplacian,                                       \n"
        "                         int  mWidth,                                           \n"
        "                         int  mHeight,                                          \n"
        "                         int  mFilter,                                          \n"
        "                 det_layer_t  bResponse,                                        \n"
        "                         int  bWidth,                                           \n"
        "                         int  bHeight,                                          \n"
        "                         int  bFilter,                                          \n"
        "             __global    int* ipt_count,			                 \n"
        "             __global float2* d_pixPos,                                         \n"
        "             __global  float* d_scale,                                          \n"
        "             __global    int* d_laplacian,                                      \n"
        "                         int  maxPoints,                                        \n"
        "                       float  threshold)                                        \n"
        "{                                                                               \n"
        "                                                                                \n"
        "    int r = get_global_id(1);                                                   \n"
        "    int c = get_global_id(0);                                                   \n"
        "                                                                                \n"
        "    float2 pixpos;                                                              \n"
        "    float scale;                                                                \n"
        "    int laplacian;                                                              \n"
        "                                                                                \n"
        "    // Check the block extremum is an extremum across boundaries.               \n"
        "    if(isExtremum(tResponse, tWidth, tHeight, tFilter, tStep, mResponse,        \n"
        "                  mWidth, mHeight, bResponse, bWidth, bHeight, c, r, threshold))\n"
        "    {                                                                           \n"
        "        if(interpolateExtremum(r, c, ipt_count, &pixpos, &scale,                \n"
        "            &laplacian, tResponse, tWidth, tHeight, tStep, mResponse,           \n"
        "            mLaplacian, mWidth, mHeight, mFilter, bResponse, bWidth, bHeight,   \n"
        "            bFilter)) {      			 			         \n"
        "            int index = atom_add(&ipt_count[0], 1);                             \n"
        "            if(index < maxPoints) {                                             \n"
        "                d_pixPos[index] = pixpos;                                       \n"
        "                d_scale[index] = scale;                                         \n"
        "                d_laplacian[index] = laplacian;                                 \n"
        "            }                                                                   \n"
        "        }                                                                       \n"
        "    }                                                                           \n"
        "                                                                                \n"
        "}                                                                               \n";


      const char *nearestNeighbor_kernel =  
        "typedef struct{                                                                 \n"
        "    int point1;                                                                 \n"
        "    float x1;                                                                   \n"
        "    float y1;                                                                   \n"
        "    int point2;                                                                 \n"
        "    float x2;                                                                   \n"
        "    float y2;                                                                   \n"
        "    float dist;                                                                 \n"
        "    float orientationDiff;                                                      \n"
        "    float scaleDiff;                                                            \n"
        "    int match;                                                                  \n"
        "    bool dup;                                                                   \n"
        "} distPoint;                                                                    \n"
        "typedef struct{                                                                 \n"
        "    float x;                                                                    \n"
        "    float y;                                                                    \n"
        "    float scale;                                                                \n"
        "    float orientation;                                                          \n"
        "    int laplacian;                                                              \n"
        "    int clusterIndex;                                                           \n"
        "    float descriptor[64];                                                       \n"
        "} Ipoint;                                                                       \n"
        "__kernel void NearestNeighbor(__global Ipoint *d_ipts1,                         \n"
        "                              __global Ipoint *d_ipts2,                         \n"
        "                              __global distPoint *d_distancePoints,             \n"
        "                              __local float* tempDistPts,                       \n"
        "                              const unsigned int ipts2Size) {                   \n"
        "    int groupId = get_group_id(0);                                              \n"
        "    int localId = get_local_id(0);                                              \n"
        "    int localSize = get_local_size(0);                                          \n"
        "    __global Ipoint *ipts1 = d_ipts1 + groupId;                                 \n"
        "    __global Ipoint *ipts2;                                                     \n"
        "    __global distPoint *dp = d_distancePoints + groupId;                        \n"
        "    int point2Match = 0;                                                        \n"
        "    float minDist = FLT_MAX;                                                    \n"
        "    for (unsigned int i2 = 0; i2 < ipts2Size; i2++){                            \n"
        "                                                                                \n"
        "        // Calculate the sum of squares distance                                \n"
        "        ipts2 = d_ipts2 + i2;                                                   \n"
        "        // TODO This metric may need to be improved.  Currently                 \n"
        "        //      we determine matches based on the sum of descriptor             \n"
        "        //      differences                                                     \n"
        "        tempDistPts[localId] = fabs(ipts1->descriptor[localId] -                \n"
        "                                    ipts2->descriptor[localId]);                \n"
        "        barrier(CLK_LOCAL_MEM_FENCE);                                           \n"
        "                                                                                \n"
        "        // reduce                                                               \n"
        "        for(int k=32; k > 0 ;k>>=1) {                                           \n"
        "            if(localId < k)                                                     \n"
        "            {                                                                   \n"
        "                tempDistPts[localId] += tempDistPts[localId + k];               \n"
        "            }                                                                   \n"
        "            barrier(CLK_LOCAL_MEM_FENCE);                                       \n"
        "        }                                                                       \n"
        "                                                                                \n"
        "        if (localId==0) {                                                       \n"
        "            tempDistPts[0] /= 64; // Get the average descriptor difference      \n"
        "            if(tempDistPts[0] < minDist)                                        \n"
        "            {                                                                   \n"
        "                minDist = tempDistPts[0];                                       \n"
        "                point2Match = i2;                                               \n"
        "            }                                                                   \n"
        "        }                                                                       \n"
        "    }                                                                           \n"
        "    barrier(CLK_LOCAL_MEM_FENCE);                                               \n"
        "    if(localId == 0)                                                            \n"
        "    {                                                                           \n"
        "        ipts2 = d_ipts2+point2Match;                                            \n"
        "        dp->point1 = groupId;                                                   \n"
        "        dp->x1 = ipts1->x;                                                      \n"
        "        dp->y1 = ipts1->y;                                                      \n"
        "        dp->point2 = point2Match;                                               \n"
        "        dp->x2 = ipts2->x;                                                      \n"
        "        dp->y2 = ipts2->y;                                                      \n"
        "        dp->dist = minDist;                                                     \n"
        "        dp->orientationDiff = ipts1->orientation - ipts2->orientation;          \n"
        "        dp->scaleDiff = ipts1->scale - ipts2->scale;                            \n"
        "    }                                                                           \n"
        "}                                                                               \n";

      const char *integralImage_kernels =  
        "#define WG_SIZE 64                                                              \n"
        "#define HALF_WG_SIZE (WG_SIZE/2)                                                \n"
        " /* This kernel does a prefix scan on any size image.  Each work group marches  \n"
        " * down a row                                                                   \n"
        " * of the image performing a sub-scan on WG_SIZE elements at a time.            \n"
        " * The value obtained by the thread with the biggest ID is kept as the base     \n"
        " * for the next sub-scan.                                                       \n"
        " */                                                                             \n"
        "__kernel                                                                        \n"
        "void scan(__global float *input,                                                \n"
        "          __global float *output,                                               \n"
        "                     int  rows,                                                 \n"
        "                     int  cols) {                                               \n"
        "    // Each work group is responsible for scanning a row                        \n"
        "    // If we add additional elements to local memory (half the                  \n"
        "    // work group size) and fill them with zeros, we don't need                 \n"
        "    // conditionals in the code to check for out-of-bounds                      \n"
        "    // accesses                                                                 \n"
        "    __local float lData[WG_SIZE+HALF_WG_SIZE];                                  \n"
        "    __local float lData2[WG_SIZE+HALF_WG_SIZE];                                 \n"
        "    int myGlobalY = get_global_id(1);                                           \n"
        "    int myLocalX = get_local_id(0);                                             \n"
        "    // Initialize out-of-bounds elements with zeros                             \n"
        "    if(myLocalX < HALF_WG_SIZE) {                                               \n"
        "        lData[myLocalX] = 0.0f;                                                 \n"
        "        lData2[myLocalX] = 0.0f;                                                \n"
        "    }                                                                           \n"
        "                                                                                \n"
        "    // Start past out-of-bounds elements                                        \n"
        "    myLocalX += HALF_WG_SIZE;                                                   \n"
        "                                                                                \n"
        "    // This value needs to be added to the local data.  It's the highest        \n"
        "    // value from the prefix scan of the previous elements in the row           \n"
        "    float prevMaxVal = 0;                                                       \n"
        "    // March down a row WG_SIZE elements at a time                              \n"
        "    int iterations = cols/WG_SIZE;                                              \n"
        "    if(cols % WG_SIZE != 0)                                                     \n"
        "    {                                                                           \n"
        "        // If cols are not an exact multiple of WG_SIZE, then we need to do     \n"
        "        // one more iteration                                                   \n"
        "        iterations++;                                                           \n"
        "    }                                                                           \n"
        "                                                                                \n"
        "    for(int i = 0; i < iterations; i++) {                                       \n"
        "                                                                                \n"
        "        int columnOffset = i*WG_SIZE + get_local_id(0);                         \n"
        "        // Don't do anything if a thread's index is past the end of the row.    \n"
        "        //  We still need the thread available for memory barriers though.      \n"
        "        if(columnOffset < cols) {                                               \n"
        "            // Cache the input data in local memory                             \n"
        "            lData[myLocalX] = input[myGlobalY*cols + columnOffset];             \n"
        "        }                                                                       \n"
        "        else {                                                                  \n"
        "            // Otherwise just store zeros                                       \n"
        "            lData[myLocalX] = 0.0f;                                             \n"
        "        }                                                                       \n"
        "                                                                                \n"
        "        barrier(CLK_LOCAL_MEM_FENCE);                                           \n"
        "                                                                                \n"
        "        // 1                                                                    \n"
        "        lData2[myLocalX] = lData[myLocalX] + lData[myLocalX-1];                 \n"
        "        barrier(CLK_LOCAL_MEM_FENCE);                                           \n"
        "                                                                                \n"
        "        // 2                                                                    \n"
        "        lData[myLocalX] = lData2[myLocalX] + lData2[myLocalX-2];                \n"
        "        barrier(CLK_LOCAL_MEM_FENCE);                                           \n"
        "                                                                                \n"
        "        // 4                                                                    \n"
        "        lData2[myLocalX] = lData[myLocalX] + lData[myLocalX-4];                 \n"
        "        barrier(CLK_LOCAL_MEM_FENCE);                                           \n"
        "                                                                                \n"
        "        // 8                                                                    \n"
        "        lData[myLocalX] = lData2[myLocalX] + lData2[myLocalX-8];                \n"
        "        barrier(CLK_LOCAL_MEM_FENCE);                                           \n"
        "                                                                                \n"
        "        // 16                                                                   \n"
        "        lData2[myLocalX] = lData[myLocalX] + lData[myLocalX-16];                \n"
        "        barrier(CLK_LOCAL_MEM_FENCE);                                           \n"
        "                                                                                \n"
        "        // 32                                                                   \n"
        "        lData[myLocalX] = lData2[myLocalX] + lData2[myLocalX-32];               \n"
        "        barrier(CLK_LOCAL_MEM_FENCE);                                           \n"
        "                                                                                \n"
        "        // Write data out to global memory                                      \n"
        "        if(columnOffset < cols) {                                               \n"
        "            output[myGlobalY*cols + columnOffset] =                             \n"
        "              lData[myLocalX] + prevMaxVal;                                     \n"
        "        }                                                                       \n"
        "                                                                                \n"
        "        // Copy the value from the highest thread in the group to my local index\n"
        "        prevMaxVal += lData[WG_SIZE+HALF_WG_SIZE-1];                            \n"
        "    }                                                                           \n"
        "}                                                                               \n"
        "__kernel                                                                        \n"
        "void scan4(__global float4 *input,                                              \n"
        "           __global float4 *output,                                             \n"
        "                       int  rows,                                               \n"
        "                       int  cols) {                                             \n"
        "    // Each work group is responsible for scanning a row                        \n"
        "    // If we add additional elements to local memory (half the                  \n"
        "    // work group size) and fill them with zeros, we don't need                 \n"
        "    // conditionals in the code to check for out-of-bounds                      \n"
        "    // accesses                                                                 \n"
        "    __local float4 lData[WG_SIZE+HALF_WG_SIZE];                                 \n"
        "    __local float4 lData2[WG_SIZE+HALF_WG_SIZE];                                \n"
        "    int myGlobalY = get_global_id(1);                                           \n"
        "    int myLocalX = get_local_id(0);                                             \n"
        "    // Initialize out-of-bounds elements with zeros                             \n"
        "    if(myLocalX < HALF_WG_SIZE) {                                               \n"
        "        lData[myLocalX] = (float4)(0.0f, 0.0f, 0.0f, 0.0f);                     \n"
        "        lData2[myLocalX] = (float4)(0.0f, 0.0f, 0.0f, 0.0f);                    \n"
        "    }                                                                           \n"
        "                                                                                \n"
        "    // Start past out-of-bounds elements                                        \n"
        "    myLocalX += HALF_WG_SIZE;                                                   \n"
        "                                                                                \n"
        "    // This value needs to be added to the local data.  It's the highest        \n"
        "    // value from the prefix scan of the previous elements in the row           \n"
        "    float prevMaxVal = 0;                                                       \n"
        "                                                                                \n"
        "    // There will actually be only 1/4 of the columns since we've changed       \n"
        "    // the data type to float4                                                  \n"
        "    int columns = cols/4;                                                       \n"
        "    // March down a row WG_SIZE elements at a time                              \n"
        "    int iterations = columns/WG_SIZE;                                           \n"
        "    if(columns % WG_SIZE != 0)                                                  \n"
        "    {                                                                           \n"
        "        // If cols are not an exact multiple of WG_SIZE, then we need to do     \n"
        "        // one more iteration                                                   \n"
        "        iterations++;                                                           \n"
        "    }                                                                           \n"
        "                                                                                \n"
        "    for(int i = 0; i < iterations; i++) {                                       \n"
        "                                                                                \n"
        "        int columnOffset = i*WG_SIZE + get_local_id(0);                         \n"
        "        // Don't do anything if a thread's index is past the end of the row.    \n"
        "        //    We still need the thread available for memory barriers though.    \n"
        "        if(columnOffset < columns) {                                            \n"
        "            // Cache the input data in local memory                             \n"
        "            lData[myLocalX] = input[myGlobalY*columns + columnOffset];          \n"
        "        }                                                                       \n"
        "        else {                                                                  \n"
        "            // Otherwise just store zeros                                       \n"
        "            lData[myLocalX] = (float4)(0.0f, 0.0f, 0.0f, 0.0f);                 \n"
        "        }                                                                       \n"
        "                                                                                \n"
        "        lData[myLocalX].y += lData[myLocalX].x;                                 \n"
        "        lData[myLocalX].z += lData[myLocalX].y;                                 \n"
        "        lData[myLocalX].w += lData[myLocalX].z;                                 \n"
        "        barrier(CLK_LOCAL_MEM_FENCE);                                           \n"
        "                                                                                \n"
        "        // 1                                                                    \n"
        "        lData2[myLocalX] = lData[myLocalX] + lData[myLocalX-1].w;               \n"
        "        barrier(CLK_LOCAL_MEM_FENCE);                                           \n"
        "                                                                                \n"
        "        // 2                                                                    \n"
        "        lData[myLocalX] = lData2[myLocalX] + lData2[myLocalX-2].w;              \n"
        "        barrier(CLK_LOCAL_MEM_FENCE);                                           \n"
        "                                                                                \n"
        "        // 4                                                                    \n"
        "        lData2[myLocalX] = lData[myLocalX] + lData[myLocalX-4].w;               \n"
        "        barrier(CLK_LOCAL_MEM_FENCE);                                           \n"
        "                                                                                \n"
        "        // 8                                                                    \n"
        "        lData[myLocalX] = lData2[myLocalX] + lData2[myLocalX-8].w;              \n"
        "        barrier(CLK_LOCAL_MEM_FENCE);                                           \n"
        "                                                                                \n"
        "        // 16                                                                   \n"
        "        lData2[myLocalX] = lData[myLocalX] + lData[myLocalX-16].w;              \n"
        "        barrier(CLK_LOCAL_MEM_FENCE);                                           \n"
        "                                                                                \n"
        "        // 32                                                                   \n"
        "        lData[myLocalX] = lData2[myLocalX] + lData2[myLocalX-32].w;             \n"
        "        barrier(CLK_LOCAL_MEM_FENCE);                                           \n"
        "                                                                                \n"
        "        // Write data out to global memory                                      \n"
        "        if(columnOffset < columns) {                                            \n"
        "            output[myGlobalY*columns + columnOffset] =                          \n"
        "              lData[myLocalX] + prevMaxVal;                                     \n"
        "        }                                                                       \n"
        "                                                                                \n"
        "        // Copy the value from the highest thread in the group to my local index\n"
        "        prevMaxVal += lData[WG_SIZE+HALF_WG_SIZE-1].w;                          \n"
        "    }                                                                           \n"
        "}                                                                               \n"
        "__constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE |                    \n"
        "                               CLK_ADDRESS_CLAMP           |                    \n"
        "                               CLK_FILTER_NEAREST;                              \n"
        "                                                                                \n"
        "__kernel                                                                        \n"
        "void scanImage(__read_only image2d_t input,                                     \n"
        "               __write_only image2d_t output,                                   \n"
        "                     int  rows,                                                 \n"
        "                     int  cols) {                                               \n"
        "    // Each work group is responsible for scanning a row                        \n"
        "    // If we add additional elements to local memory (half the                  \n"
        "    // work group size) and fill them with zeros, we don't need                 \n"
        "    // conditionals in the code to check for out-of-bounds                      \n"
        "    // accesses                                                                 \n"
        "    __local float lData[WG_SIZE+HALF_WG_SIZE];                                  \n"
        "    __local float lData2[WG_SIZE+HALF_WG_SIZE];                                 \n"
        "    int myGlobalY = get_global_id(1);                                           \n"
        "    int myLocalX = get_local_id(0);                                             \n"
        "    // Initialize out-of-bounds elements with zeros                             \n"
        "    if(myLocalX < HALF_WG_SIZE) {                                               \n"
        "        lData[myLocalX] = 0.0f;                                                 \n"
        "        lData2[myLocalX] = 0.0f;                                                \n"
        "    }                                                                           \n"
        "                                                                                \n"
        "    // Start past out-of-bounds elements                                        \n"
        "    myLocalX += HALF_WG_SIZE;                                                   \n"
        "                                                                                \n"
        "    // This value needs to be added to the local data.  It's the highest        \n"
        "    // value from the prefix scan of the previous elements in the row           \n"
        "    float prevMaxVal = 0;                                                       \n"
        "    // March down a row WG_SIZE elements at a time                              \n"
        "    int iterations = cols/WG_SIZE;                                              \n"
        "    if(cols % WG_SIZE != 0)                                                     \n"
        "    {                                                                           \n"
        "        // If cols are not an exact multiple of WG_SIZE, then we need to do     \n"
        "        // one more iteration                                                   \n"
        "        iterations++;                                                           \n"
        "    }                                                                           \n"
        "                                                                                \n"
        "    for(int i = 0; i < iterations; i++) {                                       \n"
        "                                                                                \n"
        "        int columnOffset = i*WG_SIZE + get_local_id(0);                         \n"
        "        // Don't do anything if a thread's index is past the end of the row.    \n"
        "        // we will still need thread available for memory barriers though.      \n"
        "                                                                                \n"
        "        // Cache the input data in local memory                                 \n"
        "        lData[myLocalX] = read_imagef(input, sampler,                           \n"
        "             (int2)(columnOffset,myGlobalY)).x;                                 \n"
        "                                                                                \n"
        "        barrier(CLK_LOCAL_MEM_FENCE);                                           \n"
        "                                                                                \n"
        "        // 1                                                                    \n"
        "        lData2[myLocalX] = lData[myLocalX] + lData[myLocalX-1];                 \n"
        "        barrier(CLK_LOCAL_MEM_FENCE);                                           \n"
        "                                                                                \n"
        "        // 2                                                                    \n"
        "        lData[myLocalX] = lData2[myLocalX] + lData2[myLocalX-2];                \n"
        "        barrier(CLK_LOCAL_MEM_FENCE);                                           \n"
        "                                                                                \n"
        "        // 4                                                                    \n"
        "        lData2[myLocalX] = lData[myLocalX] + lData[myLocalX-4];                 \n"
        "        barrier(CLK_LOCAL_MEM_FENCE);                                           \n"
        "                                                                                \n"
        "        // 8                                                                    \n"
        "        lData[myLocalX] = lData2[myLocalX] + lData2[myLocalX-8];                \n"
        "        barrier(CLK_LOCAL_MEM_FENCE);                                           \n"
        "                                                                                \n"
        "        // 16                                                                   \n"
        "        lData2[myLocalX] = lData[myLocalX] + lData[myLocalX-16];                \n"
        "        barrier(CLK_LOCAL_MEM_FENCE);                                           \n"
        "                                                                                \n"
        "        // 32                                                                   \n"
        "        lData[myLocalX] = lData2[myLocalX] + lData2[myLocalX-32];               \n"
        "        barrier(CLK_LOCAL_MEM_FENCE);                                           \n"
        "                                                                                \n"
        "        // Write data out to global memory                                      \n"
        "        if(columnOffset < cols) {                                               \n"
        "            write_imagef(output, (int2)(columnOffset, myGlobalY),               \n"
        "                (float4)(lData[myLocalX] + prevMaxVal, 0.0f, 0.0f, 0.0f));      \n"
        "        }                                                                       \n"
        "                                                                                \n"
        "        // Copy the value from the highest thread in the group to my local index\n"
        "        prevMaxVal += lData[WG_SIZE+HALF_WG_SIZE-1];                            \n"
        "    }                                                                           \n"
        "}                                                                               \n"
        "__kernel                                                                        \n"
        "void transpose(__global float* iImage,                                          \n"
        "               __global float* oImage,                                          \n"
        "                        int    inRows,                                          \n"
        "                        int    inCols) {                                        \n"
        "                                                                                \n"
        "    __local float tmpBuffer[256];                                               \n"
        "    // Work groups will perform the transpose                                   \n"
        "    // (i.e., an entire work group will be moved from                           \n"
        "    // one part of the image to another)                                        \n"
        "    int myWgInX = get_group_id(0);                                              \n"
        "    int myWgInY = get_group_id(1);                                              \n"
        "    // The local index of a thread should not change (threads with adjacent IDs \n"
        "    // within a work group should always perform adjacent memory accesses).     \n"
        "    // we will account for the clever indexing of local memory later.           \n"
        "    int myLocalX = get_local_id(0);                                             \n"
        "    int myLocalY = get_local_id(1);                                             \n"
        "                                                                                \n"
        "    int myGlobalInX = myWgInX*16 + myLocalX;                                    \n"
        "    int myGlobalInY = myWgInY*16 + myLocalY;                                    \n"
        "    // Don't read out of bounds                                                 \n"
        "    if(myGlobalInX < inCols && myGlobalInY < inRows) {                          \n"
        "                                                                                \n"
        "        // Cache data to local memory (coalesced reads)                         \n"
        "        tmpBuffer[myLocalY*16 + myLocalX] =                                     \n"
        "             iImage[myGlobalInY*inCols + myGlobalInX];                          \n"
        "    }                                                                           \n"
        "    barrier(CLK_LOCAL_MEM_FENCE);                                               \n"
        "                                                                                \n"
        "    // This avoids some confusion with dimensions, but otherwise aren't needed  \n"
        "    int outRows = inCols;                                                       \n"
        "    int outCols = inRows;                                                       \n"
        "    // Swap work group IDs for their location after the transpose               \n"
        "    int myWgOutX = myWgInY;                                                     \n"
        "    int myWgOutY = myWgInX;                                                     \n"
        "    int myGlobalOutX = myWgOutX*16 + myLocalX;                                  \n"
        "    int myGlobalOutY = myWgOutY*16 + myLocalY;                                  \n"
        "    // When writing back to global memory, we need to swap image dimensions     \n"
        "    // for the transposed size), and also need to swap thread X/Y indices       \n"
        "    // in local memory (to achieve coalesced memory writes)                     \n"
        "    // Don't write out of bounds                                                \n"
        "    if(myGlobalOutX >= 0 && myGlobalOutX < outCols &&                           \n"
        "       myGlobalOutY >= 0 && myGlobalOutY < outRows) {                           \n"
        "       // The read from tmpBuffer is going to conflict,                         \n"
        "       // but the write should be coalesced                                     \n"
        "       oImage[myGlobalOutY*outCols + myGlobalOutX] =                            \n"
        "            tmpBuffer[myLocalX*16 + myLocalY];                                  \n"
        "    }                                                                           \n"
        "    return;                                                                     \n"
        "}                                                                               \n"
        "__kernel                                                                        \n"
        "void transposeImage(__read_only image2d_t iImage,                               \n"
        "                   __write_only image2d_t oImage,                               \n"
        "                                      int inRows,                               \n"
        "                                      int inCols) {                             \n"
        "                                                                                \n"
        "    __local float tmpBuffer[256];                                               \n"
        "    // Work groups will perform the transpose (i.e., an entire work             \n"
        "    // group will be moved fromone part of the image to another)                \n"
        "    int myWgInX = get_group_id(0);                                              \n"
        "    int myWgInY = get_group_id(1);                                              \n"
        "    // The local index of a thread should not change (threads with adjacent IDs \n"
        "    // within a work group should always perform adjacent memory accesses).     \n"
        "    // We will account for the clever indexing of local memory later.           \n"
        "    int myLocalX = get_local_id(0);                                             \n"
        "    int myLocalY = get_local_id(1);                                             \n"
        "                                                                                \n"
        "    int myGlobalInX = myWgInX*16 + myLocalX;                                    \n"
        "    int myGlobalInY = myWgInY*16 + myLocalY;                                    \n"
        "    // Cache data to local memory (coalesced reads)                             \n"
        "    tmpBuffer[myLocalY*16 + myLocalX] = read_imagef(iImage, sampler,            \n"
        "           (int2)(myGlobalInX, myGlobalInY)).x;                                 \n"
        "                                                                                \n"
        "    barrier(CLK_LOCAL_MEM_FENCE);                                               \n"
        "                                                                                \n"
        "    // This avoids some confusion with dimensions, but otherwise aren't needed  \n"
        "    int outRows = inCols;                                                       \n"
        "    int outCols = inRows;                                                       \n"
        "    // Swap work group IDs for their location after the transpose               \n"
        "    int myWgOutX = myWgInY;                                                     \n"
        "    int myWgOutY = myWgInX;                                                     \n"
        "    int myGlobalOutX = myWgOutX*16 + myLocalX;                                  \n"
        "    int myGlobalOutY = myWgOutY*16 + myLocalY;                                  \n"
        "    // When writing back to global memory, we need to swap image dimensions     \n"
        "    // (for the transposed size), and also need to swap thread X/Y indices      \n"
        "    // in local memory (to achieve coalesced memory writes)                     \n"
        "    // Don't write out of bounds                                                \n"
        "    if(myGlobalOutX >= 0 && myGlobalOutX < outCols &&                           \n"
        "       myGlobalOutY >= 0 && myGlobalOutY < outRows) {                           \n"
        "       // The read from tmpBuffer is going to conflict, but the write           \n"
        "       // should be coalesced                                                   \n"
        "       write_imagef(oImage, (int2)(myGlobalOutX, myGlobalOutY),                 \n"
        "         (float4)(tmpBuffer[myLocalX*16 + myLocalY], 0.0f, 0.0f, 0.0f));        \n"
        "    }                                                                           \n"
        "}                                                                               \n";

    
      const char *hessianDet_kernel = 
        "typedef __global float* int_img_t;                                              \n"
        "// Compute the hessian determinant                                              \n"
        "__kernel void                                                                   \n"
        "hessian_det(                                                                    \n"
        "    int_img_t img,              // integral image                               \n"
        "    int width,                  // integral image width                         \n"
        "    int height,                 // integral image height                        \n"
        "    det_layer_t responses,      // hessian determinant                          \n"
        "    lap_t laplacians,           // laplacian values                             \n"
        "    int layerWidth,                                                             \n"
        "    int layerHeight,                                                            \n"
        "    int step,                   // determinant step size                        \n"
        "    int filter)                 // determinant filter size                      \n"
        "{                                                                               \n"
        "    int l, w, b;                                                                \n"
        "    float Dxx, Dyy, Dxy, inverse_area;                                          \n"
        "    int idx = get_global_id(0);                                                 \n"
        "    int idy = get_global_id(1);                                                 \n"
        "    w = filter;                  // filter size                                 \n"
        "    l = filter/3;                // lobe for this filter                        \n"
        "    b = (filter - 1)/ 2 + 1;     // border for this filter                      \n"
        "    inverse_area = 1.0f/(w * w); // normalization factor                        \n"
        "    int r = idy * step;                                                         \n"
        "    int c = idx * step;                                                         \n"
        "    // Have threads accessing out-of-bounds data return immediately             \n"
        "    if(r >= height || c >= width) {                                             \n"
        "        return;                                                                 \n"
        "    }                                                                           \n"
        "                                                                                \n"
        "    Dxx = BoxIntegral(img, width, height, r - l + 1, c - b, 2*l - 1, w) -       \n"
        "          BoxIntegral(img, width, height, r - l + 1, c - l / 2, 2*l - 1, l)*3;  \n"
        "    Dyy = BoxIntegral(img, width, height, r - b, c - l + 1, w, 2*l - 1) -       \n"
        "          BoxIntegral(img, width, height, r - l / 2, c - l + 1, l, 2*l - 1)*3;  \n"
        "    Dxy = BoxIntegral(img, width, height, r - l, c + 1, l, l) +                 \n"
        "          BoxIntegral(img, width, height, r + 1, c - l, l, l) -                 \n"
        "          BoxIntegral(img, width, height, r - l, c - l, l, l) -                 \n"
        "          BoxIntegral(img, width, height, r + 1, c + 1, l, l);                  \n"
        "    // Normalize the filter responses with respect to their size                \n"
        "    Dxx *= inverse_area;                                                        \n"
        "    Dyy *= inverse_area;                                                        \n"
        "    Dxy *= inverse_area;                                                        \n"
        "    // Save the determinant of hessian response                                 \n"
        "    float determinant = (Dxx*Dyy - 0.81f*Dxy*Dxy);                              \n"
        "    int laplacian = (Dxx + Dyy >= 0 ? 1 : 0);                                   \n"
        "                                                                                \n"
        "    responses[idy*layerWidth+idx] = determinant;                                \n"
        "    laplacians[idy*layerWidth+idx] = laplacian;                                 \n"
        "}                                                                               \n";


      const char *getOrientation_kernels =
        "#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable               \n"
        "//! Get the angle from the +ve x-axis of the vector given by (X Y)              \n"
        "float getAngle(float X, float Y)                                                \n"
        "{                                                                               \n"
        "    if(X >= 0 && Y >= 0)                                                        \n"
        "        return atan(Y/X);                                                       \n"
        "    if(X < 0 && Y >= 0)                                                         \n"
        "        return pi - atan(-Y/X);                                                 \n"
        "    if(X < 0 && Y < 0)                                                          \n"
         "        return pi + atan(Y/X);                                                  \n"
        "    if(X >= 0 && Y < 0)                                                         \n"
        "        return 2*pi - atan(-Y/X);                                               \n"
        "    return 0;                                                                   \n"
        "}                                                                               \n"
        "//!                                                                             \n"
        "__kernel void                                                                   \n"
        "getOrientationStep1(                                                            \n"
        "                    __global float* d_img,                                      \n"
        "                    __global float* d_scale,                                    \n"
        "                    __global float2* d_pixPos,                                  \n"
        "                    __global float* d_gauss25,                                  \n"
        "                    __global unsigned int* d_id,                                \n"
        "                    int i_width,                                                \n"
        "                    int i_height,                                               \n"
        "                    __global float4* res)                                       \n"
        "{                                                                               \n"
        "     // Cache the gaussian data in local memory                                 \n"
        "    __local float l_gauss25[49];                                                \n"
        "    __local unsigned int l_id[13];                                              \n"
        "                                                                                \n"
        "    int localId = get_local_id(0);                                              \n"
        "    int groupId = get_group_id(0);                                              \n"
        "                                                                                \n"
        "    int i = (int)(localId/13) - 6;                                              \n"
        "    int j = (localId%13) - 6;                                                   \n"
        "                                                                                \n"
        "    // Buffer gauss data in local memory                                        \n"
        "    if(localId < 49)                                                            \n"
        "    {                                                                           \n"
        "        l_gauss25[localId] = d_gauss25[localId];                                \n"
        "    }                                                                           \n"
        "    // Buffer d_ids in local memory                                             \n"
        "    if(localId < 13)                                                            \n"
        "    {                                                                           \n"
        "        l_id[localId] = d_id[localId];                                          \n"
        "    }                                                                           \n"
        "    barrier(CLK_LOCAL_MEM_FENCE);                                               \n"
        "                                                                                \n"
        "    // NOTE Since this only reads once from global memory, it is                \n"
        "    //      quicker just to leave the data there.  There is a higher            \n"
        "    //      overhead from placing it in constant memory.                        \n"
        "    int s = round(d_scale[groupId]);                                            \n"
        "    int r = round(d_pixPos[groupId].y);                                         \n"
        "    int c = round(d_pixPos[groupId].x);                                         \n"
        "    float gauss = 0.f;                                                          \n"
        "    float4 rs = {0.0f, 0.0f, FLT_MAX, 1.0f};                                    \n"
        "                                                                                \n"
        "    __local int angleCount[1];                                                  \n"
        "    if(localId == 0) {                                                          \n"
        "        angleCount[0] = 0;                                                      \n"
        "    }                                                                           \n"
        "    barrier(CLK_LOCAL_MEM_FENCE);                                               \n"
        "    // calculate haar responses for points within radius of 6*scale             \n"
        "    if(i*i + j*j < 36)                                                          \n"
        "    {                                                                           \n"
        "        gauss = l_gauss25[7*l_id[i+6]+l_id[j+6]];                               \n"
        "        rs.x = gauss * haarX(d_img, i_width, i_height, r+j*s, c+i*s, 4*s);      \n"
        "        rs.y = gauss * haarY(d_img, i_width, i_height, r+j*s, c+i*s, 4*s);      \n"
        "        rs.z = getAngle(rs.x, rs.y);                                            \n"
        "        int index = atom_add(&angleCount[0], 1);                                \n"
        "                                                                                \n"
        "        res[groupId * 109 +  index] = rs;                                       \n"
        "    }                                                                           \n"
        "}                                                                               \n"
        "float magnitude(float2 val) {                                                   \n"
        "    return (val.x*val.x + val.y*val.y);                                         \n"
        "}                                                                               \n"
        "                                                                                \n"
        "//!                                                                             \n"
        "__kernel void getOrientationStep2(__global float *d_orientation,                \n"
        "                                  __global float4 *d_res)                       \n"
        "{                                                                               \n"
        "    // There are 42 threads                                                     \n"
        "    int tid = get_local_id(0);                                                  \n"
        "    int groupId = get_group_id(0);                                              \n"
        "    // calculate the dominant direction                                         \n"
        "    float ang1= 0.15f * (float)tid;                                             \n"
        "    float ang2 = (ang1+(pi/3.0f) > 2*pi ? ang1-(5.0f*pi/3.0f) : ang1+(pi/3.0f));\n"
        "    __local float4 res[109];                                                    \n"
        "    // Cache data in local memory                                               \n"
        "    for(uint k = tid; k < 109; k += get_local_size(0))                          \n"
        "    {                                                                           \n"
        "        res[k] = d_res[groupId * 109 + k];                                      \n"
        "    }                                                                           \n"
        "    barrier(CLK_LOCAL_MEM_FENCE);                                               \n"
        "                                                                                \n"
        "    // loop slides pi/3 window around feature point                             \n"
        "                                                                                \n"
        "    __local float2 sum[42];                                                     \n"
        "    sum[tid] = (float2)(0.f, 0.f);                                              \n"
        "    // If tid is 0-34, ang1 < ang2                                              \n"
        "    if(tid <= 34) {                                                             \n"
        "        for(uint k = 0; k < 109; ++k)                                           \n"
        "        {	                                                                 \n"
        "            const float4 rs = res[k];	                                         \n"
        "            // get angle from the x-axis of the sample point	                 \n"
        "            const float ang = rs.z;                                             \n"
        "            // determine whether the point is within the window                 \n"
        "            int check = ang1 < ang && ang < ang2;                               \n"
        "            sum[tid].x += rs.x * check;                                         \n"
        "            sum[tid].y += rs.y * check;                                         \n"
        "        }                                                                       \n"
        "    }                                                                           \n"
        "    else {                                                                      \n"
        "        // If tid is 35-41, ang2 < ang1                                         \n"
        "        for(uint k = 0; k < 109; ++k)                                           \n"
        "        {	                                                                 \n"
        "            const float4 rs = res[k];	                                         \n"
        "            // get angle from the x-axis of the sample point	                 \n"
        "            const float ang = rs.z;                                             \n"
        "            // determine whether the point is within the window                 \n"
        "            int check = ((ang > 0 && ang < ang2) || (ang > ang1 && ang < 2*pi));\n"
        "            sum[tid].x += rs.x * check;                                         \n"
        "            sum[tid].y += rs.y * check;                                         \n"
        "        }                                                                       \n"
        "    }                                                                           \n"
        "                                                                                \n"
        "    barrier(CLK_LOCAL_MEM_FENCE);                                               \n"
        "                                                                                \n"
        "    // If the vector produced from this window is longer than all               \n"
        "    // previous vectors then this forms the new dominant direction              \n"
        "    for(int offset = 32; offset > 0; offset >>= 1)                              \n"
        "    {                                                                           \n"
        "        if (tid < offset && tid + offset < 42) {                                \n"
        "            if(magnitude(sum[tid]) < magnitude(sum[tid+offset])) {              \n"
        "                                                                                \n"
        "                sum[tid] = sum[tid+offset];                                     \n"
        "            }                                                                   \n"
        "        }                                                                       \n"
        "        barrier(CLK_LOCAL_MEM_FENCE);                                           \n"
        "    }                                                                           \n"
        "                                                                                \n"
        "    if (tid == 0)                                                               \n"
        "    {                                                                           \n"
        "        // assign orientation of the dominant response vector                   \n"
        "        d_orientation[groupId] = getAngle(sum[0].x, sum[0].y);                  \n"
        "    }                                                                           \n"
        "}                                                                               \n";


      const char *createDescriptors_kernel =
        "#define DES_THREADS 81                                                          \n"
        "void sumDesc(__local float4* desc, int tha, int length)                         \n"
        "{                                                                               \n"
        "    // do one loop to get all the > tha 64 values                               \n"
        "    if(tha < length - 64)                                                       \n"
        "    {                                                                           \n"
        "        desc[tha] += desc[tha + 64];	                                         \n"
        "    }                                                                           \n"
        "    barrier(CLK_LOCAL_MEM_FENCE);                                               \n"
        "    int stride ; // 64/2                                                        \n"
        "    for (stride = 32; stride>0; stride>>=1)                                     \n"
        "    {                                                                           \n"
        "        if (tha < stride)	                                                 \n"
        "        {                                                                       \n"
        "            desc[tha] += desc[tha + stride];			                 \n"
        "        }                                                                       \n"
        "        barrier(CLK_LOCAL_MEM_FENCE);                                           \n"
        "    }                                                                           \n"
        "}                                                                               \n"
        "__kernel void createDescriptors_kernel(                                         \n"
        "              __global float* intImage,                                         \n"
        "              int width, int height,                                            \n"
        "              __global float* scale,                                            \n"
        "              __global float4* surfDescriptor,                                  \n"
        "              __global float2* pos,                                             \n"
        "              __global float* orientation,                                      \n"
        "              __global float* descLength,                                       \n"
        "              __constant int* mj,                                               \n"
        "              __constant int* mi)                                               \n"
        "{                                                                               \n"
        "    __local float4 desc[DES_THREADS];                                           \n"
        "    // There are 16 work groups per descriptor.  The groups are arranged as     \n"
        "    // 16 x NumDescriptors, so each bIdy is a new descriptor.                   \n"
        "    int bIdx = get_group_id(0);                                                 \n"
        "    int bIdy = get_group_id(1);                                                 \n"
        "                                                                                \n"
        "    // get the x & y indexes and absolute                                       \n"
        "    int thx = get_local_id(0);                                                  \n"
        "    int tha = get_local_id(1) * get_local_size(0) + thx;                        \n"
        "    // init shared memory to zero                                               \n"
        "    desc[tha] =(float4)(0.0f,0.0f,0.0f,0.0f);                                   \n"
        "    // The 16 work groups are logically arranged in a 4x4 square                \n"
        "    float cx = 0.5f, cy = 0.5f; //Subregion centers for the 4x4 gaussian weig.  \n"
        "    cx += (float)((int)(bIdx/4));                                               \n"
        "    cy += (float)((int)(bIdx%4));                                               \n"
        "                                                                                \n"
        "    //const int mj[16] = {	                                                 \n"
        "    //           -12, -7, -2, 3,                                                \n"
        "    //			 -12, -7, -2, 3,                                         \n"
        "    //			 -12, -7, -2, 3,                                         \n"
        "    //			 -12, -7, -2, 3};                                        \n"
        "    //const int mi[16] = { 	                                                 \n"
        "    //          -12,-12,-12,-12,                                                \n"
        "    //		     -7, -7, -7, -7,                                             \n"
        "    //		     -2, -2, -2, -2,                                             \n"
        "    //		      3,  3,  3,  3};                                            \n"
        "    int j = mj[bIdx];                                                           \n"
        "    int i = mi[bIdx];                                                           \n"
        "    float x = round(pos[bIdy].x);                                               \n"
        "    float y = round(pos[bIdy].y);                                               \n"
        "    float orient = orientation[bIdy];                                           \n"
        "    float co = cos(orient);                                                     \n"
        "    float si = sin(orient);                                                     \n"
        "                                                                                \n"
        "    float thScale = scale[bIdy];                                                \n"
        "    float ix = i + 5;                                                           \n"
        "    float jx = j + 5;                                                           \n"
        "    float xs = round(x + ( -jx*thScale*si + ix*thScale*co));                    \n"
        "    float ys = round(y + ( jx*thScale*co + ix*thScale*si));                     \n"
        "                                                                                \n"
        "    // There are 81 work items per work group, logically                        \n"
        "    // arranged into 9x9                                                        \n"
        "    int k = i + (tha / 9);                                                      \n"
        "    int l = j + (tha % 9);                                                      \n"
        "    // Get coords of sample point on the rotated axis                           \n"
        "    int sample_x = round(x + (-l*thScale*si + k*thScale*co));                   \n"
        "    int sample_y = round(y + ( l*thScale*co + k*thScale*si));                   \n"
        "    // Get the gaussian weighted x and y responses                              \n"
        "    float gauss_s1 = gaussian((float)(xs-sample_x), (float)(ys-sample_y),       \n"
        "                              2.5f*thScale);                                    \n"
        "    float rx = haarX(intImage, width, height, sample_y, sample_x,               \n"
        "                     2*round(thScale));                                         \n"
        "    float ry = haarY(intImage, width, height, sample_y, sample_x,               \n"
        "                     2*round(thScale));                                         \n"
        "    //Get the gaussian weighted x and y responses on rotated axis               \n"
        "    float rrx = gauss_s1*(-rx*si + ry*co);                                      \n"
        "    float rry = gauss_s1*(rx*co + ry*si);                                       \n"
        "    desc[tha].x = rrx;                                                          \n"
        "    desc[tha].y = rry;                                                          \n"
        "    desc[tha].z = fabs(rrx);                                                    \n"
        "    desc[tha].w = fabs(rry);                                                    \n"
        "                                                                                \n"
        "    barrier(CLK_LOCAL_MEM_FENCE);                                               \n"
        "    // Call summer function (result goes in index 0)                            \n"
        "    sumDesc(desc, tha, DES_THREADS);                                            \n"
        "                                                                                \n"
        "    barrier(CLK_LOCAL_MEM_FENCE);                                               \n"
        "    if(tha == 0)                                                                \n"
        "    {                                                                           \n"
        "        // There are 16 work groups per i-point                                 \n"
        "        int dpos= bIdy * 16 + bIdx;                                             \n"
        "        float gauss_s2 = gaussian(cx-2.0f,cy-2.0f,1.5f);                        \n"
        "                                                                                \n"
        "        desc[0] *= gauss_s2;                                                    \n"
        "                                                                                \n"
        "        // Store the descriptor                                                 \n"
        "        surfDescriptor[dpos] = desc[0];                                         \n"
        "        // Store the descriptor length                                          \n"
        "        descLength[bIdy * get_num_groups(0) + bIdx] = dot(desc[0], desc[0]);	 \n"
        "    }                                                                           \n"
        "}                                                                               \n";

    }
  }
}