/******************************************************************** ** Image Component Library (ICL) ** ** ** ** Copyright (C) 2006-2013 CITEC, University of Bielefeld ** ** Neuroinformatics Group ** ** Website: www.iclcv.org and ** ** http://opensource.cit-ec.de/projects/icl ** ** ** ** File : ICLCV/src/ICLCV/CLSurfLib.cpp ** ** Module : ICLCV ** ** Authors: Christof Elbrechter ** ** ** ** ** ** GNU LESSER GENERAL PUBLIC LICENSE ** ** This file may be used under the terms of the GNU Lesser General ** ** Public License version 3.0 as published by the ** ** ** ** Free Software Foundation and appearing in the file LICENSE.LGPL ** ** included in the packaging of this file. Please review the ** ** following information to ensure the license requirements will ** ** be met: http://www.gnu.org/licenses/lgpl-3.0.txt ** ** ** ** The development of this software was supported by the ** ** Excellence Cluster EXC 277 Cognitive Interaction Technology. ** ** The Excellence Cluster EXC 277 is a grant of the Deutsche ** ** Forschungsgemeinschaft (DFG) in the context of the German ** ** Excellence Initiative. ** ** ** ********************************************************************/ /****************************************************************************\ * Copyright (c) 2011, Advanced Micro Devices, Inc. * * All rights reserved. * * * * Redistribution and use in source and binary forms, with or without * * modification, are permitted provided that the following conditions * * are met: * * * * Redistributions of source code must retain the above copyright notice, * * this list of conditions and the following disclaimer. * * * * Redistributions in binary form must reproduce the above copyright notice, * * this list of conditions and the following disclaimer in the documentation * * and/or other materials provided with the distribution. * * * * Neither the name of the copyright holder nor the names of its contributors * * may be used to endorse or promote products derived from this software * * without specific prior written permission. * * * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR * * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * * * If you use the software (in whole or in part), you shall adhere to all * * applicable U.S., European, and other export laws, including but not * * limited to the U.S. Export Administration Regulations ('EAR'), (15 C.F.R. * * Sections 730 through 774), and E.U. Council Regulation (EC) No 1334/2000 * * of 22 June 2000. Further, pursuant to Section 740.6 of the EAR, you * * hereby certify that, except pursuant to a license granted by the United * * States Department of Commerce Bureau of Industry and Security or as * * otherwise permitted pursuant to a License Exception under the U.S. Export * * Administration Regulations ("EAR"), you will not (1) export, re-export or * * release to a national of a country in Country Groups D:1, E:1 or E:2 any * * restricted technology, software, or source code you receive hereunder, * * or (2) export to Country Groups D:1, E:1 or E:2 the direct product of such * * technology or software, if such foreign produced direct product is subject * * to national security controls as identified on the Commerce Control List * *(currently found in Supplement 1 to Part 774 of EAR). For the most current * * Country Group listings, or for additional information about the EAR or * * your obligations under those regulations, please refer to the U.S. Bureau * * of Industry and Security's website at http://www.bis.doc.gov/. * \****************************************************************************/ #include #include #include #include #include "cstdio" #ifdef WIN32 #include #else #include #include #endif #include #define DESC_SIZE 64 #define MAX_ERR_VAL 64 namespace icl { using namespace utils; using namespace math; using namespace core; namespace cv { namespace clsurf { typedef struct { int x; int y; } int2; typedef struct { float x; float y; } float2; typedef struct { float x; float y; float z; float w; } float4; class ResponseLayer; class FastHessian; struct Surf::Data { IpVec m_outputBuffer; icl::core::Img32f m_grayBuffer; // The actual number of ipoints for this image int numIpts; //! The amount of ipoints we have allocated space for int maxIpts; //! A fast hessian object that will be used for detecting ipoints FastHessian* fh; //! The integral image CLBuffer d_intImage; CLBuffer d_tmpIntImage; // orig orientation CLBuffer d_tmpIntImageT1; // transposed CLBuffer d_tmpIntImageT2; // transposed //! Number of surf descriptors CLBuffer d_length; //! Array of Descriptors for each Ipoint CLBuffer d_desc; //! Orientation of each Ipoint an array of float CLBuffer d_orientation; CLBuffer d_gauss25; CLBuffer d_id; CLBuffer d_i; CLBuffer d_j; //! Position data on the host float2* pixPos; //! Scale data on the host float* scale; //! Laplacian data on the host int* laplacian; //! Descriptor data on the host float* desc; //! Orientation data on the host float* orientation; //! Position buffer on the device CLBuffer d_pixPos; //! Scale buffer on the device CLBuffer d_scale; //! Laplacian buffer on the device CLBuffer d_laplacian; //! Res buffer on the device CLBuffer d_res; const static int j[16]; const static int i[16]; const static unsigned int id[13]; const static float gauss25[49]; }; typedef double cl_time; class ResponseLayer { public: ResponseLayer(int width, int height, int step, int filter, CLProgram &program); ~ResponseLayer(); int getWidth(); int getHeight(); int getStep(); int getFilter(); CLBuffer getResponses(); CLBuffer getLaplacian(); private: int width; int height; int step; int filter; CLProgram &program; CLBuffer d_responses; CLBuffer d_laplacian; }; static const int OCTAVES = 5; static const int INTERVALS = 4; static const float THRES = 0.0001f; static const int SAMPLE_STEP = 2; //! FastHessian Calculates array of hessian and co-ordinates of ipoints /*! FastHessian declaration\n Calculates array of hessian and co-ordinates of ipoints */ class FastHessian { public: //! Destructor ~FastHessian(); //! Constructor without image FastHessian(int i_height, int i_width, CLProgram &program, CLKernel &hessian_detKernel, CLKernel &non_max_supressionKernel, const int octaves = OCTAVES, const int intervals = INTERVALS, const int sample_step = SAMPLE_STEP, const float thres = THRES); // TODO Fix this name void selectIpoints(CLBuffer d_laplacian, CLBuffer d_pixPos, CLBuffer d_scale, int maxPoints); // TODO Fix this name void computeHessianDet(CLBuffer d_intImage, int i_width, int i_height); //! Find the image features and write into vector of features int getIpoints(const icl::core::Img32f &image, CLBuffer d_intImage, CLBuffer d_laplacian, CLBuffer d_pixPos, CLBuffer d_scale, int maxIpts); //! Resets the information required for the next frame to compute void reset(); private: void createResponseMap(int octaves, int imgWidth, int imgHeight, int sample_step); CLProgram &program; CLKernel &hessian_detKernel; CLKernel &non_max_supressionKernel; //! Number of Ipoints int num_ipts; //! Number of Octaves int octaves; //! Number of Intervals per octave int intervals; //! Initial sampling step for Ipoint detection int sample_step; //! Threshold value for blob resonses float thres; std::vector responseMap; //! Number of Ipoints on GPU CLBuffer d_ipt_count; }; // Rounds up size to the nearest multiple of multiple unsigned int roundUp(unsigned int value, unsigned int multiple) { unsigned int remainder = value % multiple; // Make the value a multiple of multiple if (remainder != 0) { value += (multiple - remainder); } return value; } ////////////////////////////////////////////////////////////////////////////// // utils functions /////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////// // Wrapper for malloc void* alloc(size_t size) { void* ptr = NULL; ptr = malloc(size); if (ptr == NULL) { perror("malloc"); exit(-1); } return ptr; } //! OpenCl error code list /*! An array of character strings used to give the error corresponding to the error code \n The error code is the index within this array */ const char *cl_errs[MAX_ERR_VAL] = { "CL_SUCCESS", // 0 "CL_DEVICE_NOT_FOUND", //-1 "CL_DEVICE_NOT_AVAILABLE", //-2 "CL_COMPILER_NOT_AVAILABLE", //-3 "CL_MEM_OBJECT_ALLOCATION_FAILURE", //-4 "CL_OUT_OF_RESOURCES", //-5 "CL_OUT_OF_HOST_MEMORY", //-6 "CL_PROFILING_INFO_NOT_AVAILABLE", //-7 "CL_MEM_COPY_OVERLAP", //-8 "CL_IMAGE_FORMAT_MISMATCH", //-9 "CL_IMAGE_FORMAT_NOT_SUPPORTED", //-10 "CL_BUILD_PROGRAM_FAILURE", //-11 "CL_MAP_FAILURE", //-12 "", //-13 "", //-14 "", //-15 "", //-16 "", //-17 "", //-18 "", //-19 "", //-20 "", //-21 "", //-22 "", //-23 "", //-24 "", //-25 "", //-26 "", //-27 "", //-28 "", //-29 "CL_INVALID_VALUE", //-30 "CL_INVALID_DEVICE_TYPE", //-31 "CL_INVALID_PLATFORM", //-32 "CL_INVALID_DEVICE", //-33 "CL_INVALID_CONTEXT", //-34 "CL_INVALID_QUEUE_PROPERTIES", //-35 "CL_INVALID_COMMAND_QUEUE", //-36 "CL_INVALID_HOST_PTR", //-37 "CL_INVALID_MEM_OBJECT", //-38 "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR", //-39 "CL_INVALID_IMAGE_SIZE", //-40 "CL_INVALID_SAMPLER", //-41 "CL_INVALID_BINARY", //-42 "CL_INVALID_BUILD_OPTIONS", //-43 "CL_INVALID_PROGRAM", //-44 "CL_INVALID_PROGRAM_EXECUTABLE", //-45 "CL_INVALID_KERNEL_NAME", //-46 "CL_INVALID_KERNEL_DEFINITION", //-47 "CL_INVALID_KERNEL", //-48 "CL_INVALID_ARG_INDEX", //-49 "CL_INVALID_ARG_VALUE", //-50 "CL_INVALID_ARG_SIZE", //-51 "CL_INVALID_KERNEL_ARGS", //-52 "CL_INVALID_WORK_DIMENSION ", //-53 "CL_INVALID_WORK_GROUP_SIZE", //-54 "CL_INVALID_WORK_ITEM_SIZE", //-55 "CL_INVALID_GLOBAL_OFFSET", //-56 "CL_INVALID_EVENT_WAIT_LIST", //-57 "CL_INVALID_EVENT", //-58 "CL_INVALID_OPERATION", //-59 "CL_INVALID_GL_OBJECT", //-60 "CL_INVALID_BUFFER_SIZE", //-61 "CL_INVALID_MIP_LEVEL", //-62 "CL_INVALID_GLOBAL_WORK_SIZE" }; //-63 ////////////////////////////////////////////////////////////////////////////// // ResponseLayer Hessian class implementation //////////////////////////////// ////////////////////////////////////////////////////////////////////////////// ResponseLayer::ResponseLayer(int width, int height, int step, int filter, CLProgram &program) : program(program) { this->width = width; this->height = height; this->step = step; this->filter = filter; this->d_laplacian = program.createBuffer("rw", sizeof(int) * width * height); this->d_responses = program.createBuffer("rw", sizeof(float) * width * height); } ResponseLayer::~ResponseLayer() { } int ResponseLayer::getWidth() { return this->width; } int ResponseLayer::getHeight() { return this->height; } int ResponseLayer::getStep() { return this->step; } int ResponseLayer::getFilter() { return this->filter; } CLBuffer ResponseLayer::getLaplacian() { return this->d_laplacian; } CLBuffer ResponseLayer::getResponses() { return this->d_responses; } ////////////////////////////////////////////////////////////////////////////// // Fast Hessian class implementation ///////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////// // Based on the octave (row) and interval (column), this lookup table // identifies the appropriate determinant layer const int filter_map[OCTAVES][INTERVALS] = { { 0, 1, 2, 3 }, { 1, 3, 4, 5 }, { 3, 5, 6, 7 }, { 5, 7, 8, 9 }, { 7, 9, 10, 11 } }; //------------------------------------------------------- //! Constructor FastHessian::FastHessian(int i_height, int i_width, CLProgram &program, CLKernel &hessian_detKernel, CLKernel &non_max_supressionKernel, const int octaves, const int intervals, const int sample_step, const float thres) : program(program), hessian_detKernel(hessian_detKernel), non_max_supressionKernel( non_max_supressionKernel) { // Initialise variables with bounds-checked values this->octaves = (octaves > 0 && octaves <= 4 ? octaves : OCTAVES); this->intervals = (intervals > 0 && intervals <= 4 ? intervals : INTERVALS); this->sample_step = ( sample_step > 0 && sample_step <= 6 ? sample_step : SAMPLE_STEP); this->thres = (thres >= 0 ? thres : THRES); this->num_ipts = 0; // TODO implement this as device zero-copy memory this->d_ipt_count = program.createBuffer("rw", sizeof(int), &this->num_ipts); // Create the hessian response map objects this->createResponseMap(octaves, i_width, i_height, sample_step); } //! Destructor FastHessian::~FastHessian() { for (unsigned int i = 0; i < this->responseMap.size(); i++) { delete responseMap.at(i); } } void FastHessian::createResponseMap(int octaves, int imgWidth, int imgHeight, int sample_step) { int w = (imgWidth / sample_step); int h = (imgHeight / sample_step); int s = (sample_step); // Calculate approximated determinant of hessian values if (octaves >= 1) { this->responseMap.push_back(new ResponseLayer(w, h, s, 9, program)); this->responseMap.push_back(new ResponseLayer(w, h, s, 15, program)); this->responseMap.push_back(new ResponseLayer(w, h, s, 21, program)); this->responseMap.push_back(new ResponseLayer(w, h, s, 27, program)); } if (octaves >= 2) { this->responseMap.push_back( new ResponseLayer(w / 2, h / 2, s * 2, 39, program)); this->responseMap.push_back( new ResponseLayer(w / 2, h / 2, s * 2, 51, program)); } if (octaves >= 3) { this->responseMap.push_back( new ResponseLayer(w / 4, h / 4, s * 4, 75, program)); this->responseMap.push_back( new ResponseLayer(w / 4, h / 4, s * 4, 99, program)); } if (octaves >= 4) { this->responseMap.push_back( new ResponseLayer(w / 8, h / 8, s * 8, 147, program)); this->responseMap.push_back( new ResponseLayer(w / 8, h / 8, s * 8, 195, program)); } if (octaves >= 5) { this->responseMap.push_back( new ResponseLayer(w / 16, h / 16, s * 16, 291, program)); this->responseMap.push_back( new ResponseLayer(w / 16, h / 16, s * 16, 387, program)); } } //! Hessian determinant for the image using approximated box filters /*! \param d_intImage Integral Image \param i_width Image Width \param i_height Image Height */ void FastHessian::computeHessianDet(CLBuffer d_intImage, int i_width, int i_height) { // set matrix size and x,y threads per block const int BLOCK_DIM = 16; size_t localWorkSize[2] = { BLOCK_DIM, BLOCK_DIM }; hessian_detKernel.setArgs(d_intImage, i_width, i_height); for (unsigned int i = 0; i < this->responseMap.size(); i++) { CLBuffer responses = this->responseMap.at(i)->getResponses(); CLBuffer laplacian = this->responseMap.at(i)->getLaplacian(); int step = this->responseMap.at(i)->getStep(); int filter = this->responseMap.at(i)->getFilter(); int layerWidth = this->responseMap.at(i)->getWidth(); int layerHeight = this->responseMap.at(i)->getHeight(); hessian_detKernel[3] = responses; hessian_detKernel[4] = laplacian; hessian_detKernel[5] = layerWidth; hessian_detKernel[6] = layerHeight; hessian_detKernel[7] = step; hessian_detKernel[8] = filter; hessian_detKernel.apply(roundUp(layerWidth, localWorkSize[0]), roundUp(layerHeight, localWorkSize[1]), 0, localWorkSize[0], localWorkSize[1]); } } /*! Find the image features and write into vector of features Determine what points are interesting and store them \param img \param d_intImage The integral image pointer on the device \param d_laplacian \param d_pixPos \param d_scale */ int FastHessian::getIpoints(const icl::core::Img32f &image, CLBuffer d_intImage, CLBuffer d_laplacian, CLBuffer d_pixPos, CLBuffer d_scale, int maxIpts) { // Compute the hessian determinants // GPU kernels: init_det and build_det kernels // this->computeHessianDet(d_intImage, img->width, img->height); this->computeHessianDet(d_intImage, image.getWidth(), image.getHeight()); // Determine which points are interesting // GPU kernels: non_max_suppression kernel this->selectIpoints(d_laplacian, d_pixPos, d_scale, maxIpts); // Copy the number of interesting points back to the host d_ipt_count.read(&this->num_ipts, sizeof(int)); // Sanity check if (this->num_ipts < 0) { printf("Invalid number of Ipoints\n"); exit(-1); }; return num_ipts; } /*! //! Calculate the position of ipoints (gpuIpoint::d_pixPos) using non maximal suppression Convert d_m_det which is a array of all the hessians into d_pixPos which is a float2 array of the (x,y) of all ipoint locations \param i_width The width of the image \param i_height The height of the image \param d_laplacian \param d_pixPos \param d_scale */ void FastHessian::selectIpoints(CLBuffer d_laplacian, CLBuffer d_pixPos, CLBuffer d_scale, int maxPoints) { // The search for exterema (the most interesting point in a neighborhood) // is done by non-maximal suppression int BLOCK_W = 16; int BLOCK_H = 16; non_max_supressionKernel[14] = this->d_ipt_count; non_max_supressionKernel[15] = d_pixPos; non_max_supressionKernel[16] = d_scale; non_max_supressionKernel[17] = d_laplacian; non_max_supressionKernel[18] = maxPoints; non_max_supressionKernel[19] = this->thres; // Run the kernel for each octave for (int o = 0; o < octaves; o++) { for (int i = 0; i <= 1; i++) { CLBuffer bResponse = this->responseMap.at(filter_map[o][i])->getResponses(); int bWidth = this->responseMap.at(filter_map[o][i])->getWidth(); int bHeight = this->responseMap.at(filter_map[o][i])->getHeight(); int bFilter = this->responseMap.at(filter_map[o][i])->getFilter(); CLBuffer mResponse = this->responseMap.at(filter_map[o][i + 1])->getResponses(); int mWidth = this->responseMap.at(filter_map[o][i + 1])->getWidth(); int mHeight = this->responseMap.at(filter_map[o][i + 1])->getHeight(); int mFilter = this->responseMap.at(filter_map[o][i + 1])->getFilter(); CLBuffer mLaplacian = this->responseMap.at(filter_map[o][i + 1])->getLaplacian(); CLBuffer tResponse = this->responseMap.at(filter_map[o][i + 2])->getResponses(); int tWidth = this->responseMap.at(filter_map[o][i + 2])->getWidth(); int tHeight = this->responseMap.at(filter_map[o][i + 2])->getHeight(); int tFilter = this->responseMap.at(filter_map[o][i + 2])->getFilter(); int tStep = this->responseMap.at(filter_map[o][i + 2])->getStep(); size_t localWorkSize[2] = { (size_t) BLOCK_W, (size_t) BLOCK_H }; size_t globalWorkSize[2] = { (size_t) roundUp(mWidth, BLOCK_W), (size_t) roundUp(mHeight, BLOCK_H) }; non_max_supressionKernel[0] = tResponse; non_max_supressionKernel[1] = tWidth; non_max_supressionKernel[2] = tHeight; non_max_supressionKernel[3] = tFilter; non_max_supressionKernel[4] = tStep; non_max_supressionKernel[5] = mResponse; non_max_supressionKernel[6] = mLaplacian; non_max_supressionKernel[7] = mWidth; non_max_supressionKernel[8] = mHeight; non_max_supressionKernel[9] = mFilter; non_max_supressionKernel[10] = bResponse; non_max_supressionKernel[11] = bWidth; non_max_supressionKernel[12] = bHeight; non_max_supressionKernel[13] = bFilter; // Call non-max supression kernel non_max_supressionKernel.apply(globalWorkSize[0], globalWorkSize[1], 0, localWorkSize[0], localWorkSize[1]); // TODO Verify that a clFinish is not required (setting an argument // to the loop counter without it may be problematic, but it // really kills performance on AMD parts) //cl_sync(); } } } //! Reset the state of the data void FastHessian::reset() { int numIpts = 0; d_ipt_count.write(&numIpts, sizeof(int)); } ////////////////////////////////////////////////////////////////////////////// // Surf class implementation ///////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////// // TODO Get rid of these arrays (i and j). Have the values computed // dynamically within the kernel const int Surf::Data::j[] = { -12, -7, -2, 3, -12, -7, -2, 3, -12, -7, -2, 3, -12, -7, -2, 3 }; const int Surf::Data::i[] = { -12, -12, -12, -12, -7, -7, -7, -7, -2, -2, -2, -2, 3, 3, 3, 3 }; const unsigned int Surf::Data::id[] = { 6, 5, 4, 3, 2, 1, 0, 1, 2, 3, 4, 5, 6 }; const float Surf::Data::gauss25[] = { 0.02350693969273f, 0.01849121369071f, 0.01239503121241f, 0.00708015417522f, 0.00344628101733f, 0.00142945847484f, 0.00050524879060f, 0.02169964028389f, 0.01706954162243f, 0.01144205592615f, 0.00653580605408f, 0.00318131834134f, 0.00131955648461f, 0.00046640341759f, 0.01706954162243f, 0.01342737701584f, 0.00900063997939f, 0.00514124713667f, 0.00250251364222f, 0.00103799989504f, 0.00036688592278f, 0.01144205592615f, 0.00900063997939f, 0.00603330940534f, 0.00344628101733f, 0.00167748505986f, 0.00069579213743f, 0.00024593098864f, 0.00653580605408f, 0.00514124713667f, 0.00344628101733f, 0.00196854695367f, 0.00095819467066f, 0.00039744277546f, 0.00014047800980f, 0.00318131834134f, 0.00250251364222f, 0.00167748505986f, 0.00095819467066f, 0.00046640341759f, 0.00019345616757f, 0.00006837798818f, 0.00131955648461f, 0.00103799989504f, 0.00069579213743f, 0.00039744277546f, 0.00019345616757f, 0.00008024231247f, 0.00002836202103f }; void Surf::createKernels() { createDescrtptorsKernel = program.createKernel("createDescriptors_kernel"); getOrientationStep1Kernel = program.createKernel("getOrientationStep1"); getOrientationStep2Kernel = program.createKernel("getOrientationStep2"); hessian_detKernel = program.createKernel("hessian_det"); scanKernel = program.createKernel("scan"); scan4Kernel = program.createKernel("scan4"); scanImageKernel = program.createKernel("scanImage"); transposeKernel = program.createKernel("transpose"); transposeImageKernel = program.createKernel("transposeImage"); nearestNeighborKernel = program.createKernel("NearestNeighbor"); non_max_supressionKernel = program.createKernel( "non_max_supression_kernel"); normalizeDescriptorsKernel = program.createKernel("normalizeDescriptors"); } //! Constructor Surf::Surf(int initialPoints, int i_height, int i_width, int octaves, int intervals, int sample_step, float threshold) : m_data(new Data) { stringstream ss; ss << utilityKernels << endl << createDescriptors_kernel << endl << getOrientation_kernels << endl << hessianDet_kernel << endl << integralImage_kernels << endl << nearestNeighbor_kernel << endl << nonMaxSuppression_kernel << endl << normalizeDescriptors_kernel << endl; program = CLProgram("gpu", ss.str()); createKernels(); this->m_data->fh = new FastHessian(i_height, i_width, program, hessian_detKernel, non_max_supressionKernel, octaves, intervals, sample_step, threshold); // Once we know the size of the image, successive frames should stay // the same size, so we can just allocate the space once for the integral // image and intermediate data this->m_data->d_intImage = program.createBuffer("rw", sizeof(float) * i_width * i_height); this->m_data->d_tmpIntImage = program.createBuffer("rw", sizeof(float) * i_width * i_height); // These two are unnecessary for buffers, but required for images, so // we'll use them for buffers as well to keep the code clean this->m_data->d_tmpIntImageT1 = program.createBuffer("rw", sizeof(float) * i_width * i_height); this->m_data->d_tmpIntImageT2 = program.createBuffer("rw", sizeof(float) * i_width * i_height); // Allocate constant data on device this->m_data->d_gauss25 = program.createBuffer("r", sizeof(float) * 49, (void*) Surf::Data::gauss25); this->m_data->d_id = program.createBuffer("r", sizeof(unsigned int) * 13, (void*) Surf::Data::id); this->m_data->d_i = program.createBuffer("r", sizeof(unsigned int) * 16, (void*) Surf::Data::i); this->m_data->d_j = program.createBuffer("r", sizeof(int) * 16, (void*) Surf::Data::j); // Allocate buffers for each of the interesting points. We don't know // how many there are initially, so must allocate more than enough space this->m_data->d_scale = program.createBuffer("rw", initialPoints * sizeof(float)); this->m_data->d_pixPos = program.createBuffer("rw", initialPoints * sizeof(float2)); this->m_data->d_laplacian = program.createBuffer("rw", initialPoints * sizeof(int)); // These buffers used to wait for the number of actual ipts to be known // before being allocated, instead now we'll only allocate them once // so that we can take advantage of optimized data transfers and reallocate // them if there's not enough space available this->m_data->d_length = program.createBuffer("rw", initialPoints * DESC_SIZE * sizeof(float)); this->m_data->d_desc = program.createBuffer("rw", initialPoints * DESC_SIZE * sizeof(float)); this->m_data->d_res = program.createBuffer("rw", initialPoints * 109 * sizeof(float4)); this->m_data->d_orientation = program.createBuffer("rw", initialPoints * sizeof(float)); // Allocate buffers to store the output data (descriptor information) // on the host this->m_data->scale = (float*) alloc(initialPoints * sizeof(float)); this->m_data->pixPos = (float2*) alloc(initialPoints * sizeof(float2)); this->m_data->laplacian = (int*) alloc(initialPoints * sizeof(int)); this->m_data->desc = (float*) alloc( initialPoints * DESC_SIZE * sizeof(float)); this->m_data->orientation = (float*) alloc(initialPoints * sizeof(float)); // This is how much space is available for Ipts this->m_data->maxIpts = initialPoints; this->m_data->m_grayBuffer = icl::core::Img32f(icl::utils::Size(1, 1), icl::core::formatGray); } //! Destructor Surf::~Surf() { free(this->m_data->orientation); free(this->m_data->scale); free(this->m_data->laplacian); free(this->m_data->desc); free(this->m_data->pixPos); delete this->m_data->fh; delete m_data; } //! Computes the integral image of image img. //! Assumes source image to be a 32-bit floating point. /*! Saves integral Image in d_intImage on the GPU \param source Input Image as grabbed by OpenCv */ inline void scale_to_01(float &f) { static const float s = 1.0f / 255.0f; f *= s; } void Surf::computeIntegralImage(const icl::core::Img32f &image) { //! convert the image to single channel 32f // TODO This call takes about 4ms (is there any way to speed it up?) //IplImage *img = getGray(source); //cc(&image,&this->m_data->m_grayBuffer); // set up variables for data access int height = image.getHeight(); //img->height; int width = image.getWidth(); //img->width; float *data = (float*) image.begin(0); //img->imageData; //m_grayBuffer = m_grayBuffer/(1.0f/255.0f); // this->m_data->m_grayBuffer.forEach(scale_to_01); CLKernel scan_kernel; { // Copy the data to the GPU this->m_data->d_intImage.write(data, sizeof(float) * width * height); scan_kernel = scanKernel; } // ----------------------------------------------------------------- // Step 1: Perform integral summation on the rows // ----------------------------------------------------------------- size_t localWorkSize1[2] = { 64, 1 }; size_t globalWorkSize1[2] = { (size_t) 64, (size_t) height }; scan_kernel.setArgs(this->m_data->d_intImage, this->m_data->d_tmpIntImage, height, width); scan_kernel.apply(globalWorkSize1[0], globalWorkSize1[1], 0, localWorkSize1[0], localWorkSize1[1]); // ----------------------------------------------------------------- // Step 2: Transpose // ----------------------------------------------------------------- size_t localWorkSize2[] = { 16, 16 }; size_t globalWorkSize2[] = { roundUp(width, 16), roundUp(height, 16) }; transposeKernel.setArgs(this->m_data->d_tmpIntImage, this->m_data->d_tmpIntImageT1, height, width); transposeKernel.apply(globalWorkSize2[0], globalWorkSize2[1], 0, localWorkSize2[0], localWorkSize2[1]); // ----------------------------------------------------------------- // Step 3: Run integral summation on the rows again (same as columns // integral since we've transposed). // ----------------------------------------------------------------- int heightT = width; int widthT = height; size_t localWorkSize3[2] = { 64, 1 }; size_t globalWorkSize3[2] = { (size_t) 64, (size_t) heightT }; scan_kernel.setArgs(this->m_data->d_tmpIntImageT1, this->m_data->d_tmpIntImageT2, heightT, widthT); scan_kernel.apply(globalWorkSize3[0], globalWorkSize3[1], 0, localWorkSize3[0], localWorkSize3[1]); // ----------------------------------------------------------------- // Step 4: Transpose back // ----------------------------------------------------------------- size_t localWorkSize4[] = { 16, 16 }; size_t globalWorkSize4[] = { roundUp(widthT, 16), roundUp(heightT, 16) }; transposeKernel.setArgs(this->m_data->d_tmpIntImageT2, this->m_data->d_intImage, heightT, widthT); transposeKernel.apply(globalWorkSize4[0], globalWorkSize4[1], 0, localWorkSize4[0], localWorkSize4[1]); // release the gray image //cvReleaseImage(&img); } //! Create the SURF descriptors /*! Calculate orientation for all ipoints using the sliding window technique from OpenSurf \param d_intImage The integral image \param width The width of the image \param height The height of the image */ void Surf::createDescriptors(int i_width, int i_height) { const size_t threadsPerWG = 81; const size_t wgsPerIpt = 16; size_t localWorkSizeSurf64[2] = { threadsPerWG, 1 }; size_t globalWorkSizeSurf64[2] = { (wgsPerIpt * threadsPerWG), (size_t) this->m_data->numIpts }; createDescrtptorsKernel.setArgs(this->m_data->d_intImage, i_width, i_height, this->m_data->d_scale, this->m_data->d_desc, this->m_data->d_pixPos, this->m_data->d_orientation, this->m_data->d_length, this->m_data->d_j, this->m_data->d_i); createDescrtptorsKernel.apply(globalWorkSizeSurf64[0], globalWorkSizeSurf64[1], 0, localWorkSizeSurf64[0], localWorkSizeSurf64[1]); size_t localWorkSizeNorm64[] = { DESC_SIZE }; size_t globallWorkSizeNorm64[] = { (size_t) this->m_data->numIpts * DESC_SIZE }; normalizeDescriptorsKernel.setArgs(this->m_data->d_desc, this->m_data->d_length); normalizeDescriptorsKernel.apply(globallWorkSizeNorm64[0], 0, 0, localWorkSizeNorm64[0]); } //! Calculate orientation for all ipoints /*! Calculate orientation for all ipoints using the sliding window technique from OpenSurf \param i_width The image width \param i_height The image height */ void Surf::getOrientations(int i_width, int i_height) { size_t localWorkSize1[] = { 169 }; size_t globalWorkSize1[] = { (size_t) this->m_data->numIpts * 169 }; /*! Assign the supplied Ipoint an orientation */ getOrientationStep1Kernel.setArgs(this->m_data->d_intImage, this->m_data->d_scale, this->m_data->d_pixPos, this->m_data->d_gauss25, this->m_data->d_id, i_width, i_height, this->m_data->d_res); getOrientationStep1Kernel.apply(globalWorkSize1[0], 0, 0, localWorkSize1[0]); getOrientationStep2Kernel.setArgs(this->m_data->d_orientation, this->m_data->d_res); size_t localWorkSize2[] = { 42 }; size_t globalWorkSize2[] = { (size_t) this->m_data->numIpts * 42 }; getOrientationStep2Kernel.apply(globalWorkSize2[0], 0, 0, localWorkSize2[0]); } //! Allocates the memory objects requried for the ipt descriptor information void Surf::reallocateIptBuffers() { // Release the old memory objects (that were too small) free(this->m_data->orientation); free(this->m_data->scale); free(this->m_data->laplacian); free(this->m_data->desc); free(this->m_data->pixPos); int newSize = this->m_data->maxIpts; // Allocate new memory objects based on the new size this->m_data->d_scale = program.createBuffer("rw", newSize * sizeof(float)); this->m_data->d_pixPos = program.createBuffer("rw", newSize * sizeof(float2)); this->m_data->d_laplacian = program.createBuffer("rw", newSize * sizeof(int)); this->m_data->d_length = program.createBuffer("rw", newSize * DESC_SIZE * sizeof(float)); this->m_data->d_desc = program.createBuffer("rw", newSize * DESC_SIZE * sizeof(float)); this->m_data->d_res = program.createBuffer("rw", newSize * 121 * sizeof(float4)); this->m_data->d_orientation = program.createBuffer("rw", newSize * sizeof(float)); this->m_data->scale = (float*) alloc(newSize * sizeof(float)); this->m_data->pixPos = (float2*) alloc(newSize * sizeof(float2)); this->m_data->laplacian = (int*) alloc(newSize * sizeof(int)); this->m_data->desc = (float*) alloc(newSize * DESC_SIZE * sizeof(float)); this->m_data->orientation = (float*) alloc(newSize * sizeof(float)); } //! This function gets called each time SURF is run on a new frame. It prevents //! having to create and destroy the object each time (lots of OpenCL overhead) void Surf::reset() { this->m_data->fh->reset(); } //! Retreive the descriptors from the GPU /*! Copy data back from the GPU into an IpVec structure on the host */ const IpVec &Surf::retrieveDescriptors() { m_data->m_outputBuffer.resize(m_data->numIpts); if (!m_data->m_outputBuffer.size()) { return m_data->m_outputBuffer; } // Copy back the output data // Copy back Laplacian information this->m_data->d_laplacian.read(this->m_data->laplacian, (this->m_data->numIpts) * sizeof(int), false); // Copy back scale data this->m_data->d_scale.read(this->m_data->scale, this->m_data->numIpts * sizeof(float), false); // Copy back pixel positions this->m_data->d_pixPos.read(this->m_data->pixPos, (this->m_data->numIpts) * sizeof(float2), false); // Copy back descriptors this->m_data->d_desc.read(this->m_data->desc, (this->m_data->numIpts) * DESC_SIZE * sizeof(float), false); // Copy back orientation data this->m_data->d_orientation.read(this->m_data->orientation, (this->m_data->numIpts) * sizeof(float)); // Parse the data into Ipoint structures for (int i = 0; i < (this->m_data->numIpts); i++) { Ipoint &ipt = m_data->m_outputBuffer[i]; ipt.x = this->m_data->pixPos[i].x; ipt.y = this->m_data->pixPos[i].y; ipt.scale = this->m_data->scale[i]; ipt.laplacian = this->m_data->laplacian[i]; ipt.orientation = this->m_data->orientation[i]; memcpy(ipt.descriptor, &this->m_data->desc[i * 64], sizeof(float) * 64); // m_data->m_outputBuffer[i] = ipts->push_back(ipt); } return m_data->m_outputBuffer; //ipts; } //! Function that builds vector of interest points. This is the main SURF function //! that will be called for any type of input. /*! High level driver function for entire OpenSurfOpenCl \param img image to find Ipoints within \param upright Switch for future functionality of upright surf \param fh FastHessian object */ void Surf::run(const icl::core::Img32f &image, bool upright) { if (upright) { // Extract upright (i.e. not rotation invariant) descriptors printf("Upright surf not supported\n"); exit(1); } // Perform the scan sum of the image (populates d_intImage) // GPU kernels: scan (x2), tranpose (x2) this->computeIntegralImage(image); // Determines the points of interest // GPU kernels: init_det, hessian_det (x12), non_max_suppression (x3) // GPU mem transfer: copies back the number of ipoints this->m_data->numIpts = this->m_data->fh->getIpoints(image, this->m_data->d_intImage, this->m_data->d_laplacian, this->m_data->d_pixPos, this->m_data->d_scale, this->m_data->maxIpts); // Verify that there was enough space allocated for the number of // Ipoints found if (this->m_data->numIpts >= this->m_data->maxIpts) { // If not enough space existed, we need to reallocate space and // run the kernels again printf( "Not enough space for Ipoints, reallocating and running again\n"); this->m_data->maxIpts = this->m_data->numIpts * 2; this->reallocateIptBuffers(); // XXX This was breaking sometimes this->m_data->fh->reset(); this->m_data->numIpts = this->m_data->fh->getIpoints(image, this->m_data->d_intImage, this->m_data->d_laplacian, this->m_data->d_pixPos, this->m_data->d_scale, this->m_data->maxIpts); } // printf("There were %d interest points\n", this->m_data->numIpts); // Main SURF-64 loop assigns orientations and gets descriptors if (this->m_data->numIpts == 0) return; // GPU kernel: getOrientation1 (1x), getOrientation2 (1x) this->getOrientations(image.getWidth(), image.getHeight()); // GPU kernel: surf64descriptor (1x), norm64descriptor (1x) this->createDescriptors(image.getWidth(), image.getHeight()); } const IpVec &Surf::detect(const ImgBase *image) { ICLASSERT_THROW(image, ICLException("CLSurfLib::Surf::detect: given image was null")); cc(image, &this->m_data->m_grayBuffer); this->m_data->m_grayBuffer.forEach(scale_to_01); run(m_data->m_grayBuffer, false); const IpVec &ipts = retrieveDescriptors(); reset(); return ipts; } } } }