/******************************************************************** ** Image Component Library (ICL) ** ** ** ** Copyright (C) 2006-2013 CITEC, University of Bielefeld ** ** Neuroinformatics Group ** ** Website: www.iclcv.org and ** ** http://opensource.cit-ec.de/projects/icl ** ** ** ** File : ICLUtils/src/ICLUtils/SSEUtils.h ** ** Module : ICLUtils ** ** Authors: Sergius Gaulik ** ** ** ** ** ** GNU LESSER GENERAL PUBLIC LICENSE ** ** This file may be used under the terms of the GNU Lesser General ** ** Public License version 3.0 as published by the ** ** ** ** Free Software Foundation and appearing in the file LICENSE.LGPL ** ** included in the packaging of this file. Please review the ** ** following information to ensure the license requirements will ** ** be met: http://www.gnu.org/licenses/lgpl-3.0.txt ** ** ** ** The development of this software was supported by the ** ** Excellence Cluster EXC 277 Cognitive Interaction Technology. ** ** The Excellence Cluster EXC 277 is a grant of the Deutsche ** ** Forschungsgemeinschaft (DFG) in the context of the German ** ** Excellence Initiative. ** ** ** ********************************************************************/ #pragma once #include #include namespace icl{ namespace utils{ #ifdef ICL_HAVE_SSE2 // ++ rounding ++ // // possible modes: // _MM_ROUND_NEAREST // _MM_ROUND_DOWN // _MM_ROUND_UP // _MM_ROUND_TOWARD_ZERO static const unsigned int INITIAL_ROUNDING_MODE = _MM_GET_ROUNDING_MODE(); static unsigned int PREVIOUS_ROUNDING_MODE = INITIAL_ROUNDING_MODE; inline void sse_restore_initial_rounding_mode() { _MM_SET_ROUNDING_MODE(INITIAL_ROUNDING_MODE); } inline void sse_restore_previous_rounding_mode() { const unsigned int mode = _MM_GET_ROUNDING_MODE(); _MM_SET_ROUNDING_MODE(PREVIOUS_ROUNDING_MODE); PREVIOUS_ROUNDING_MODE = mode; } inline void sse_set_rounding_mode(const unsigned int mode) { PREVIOUS_ROUNDING_MODE = _MM_GET_ROUNDING_MODE(); _MM_SET_ROUNDING_MODE(mode); } // -- rounding -- // // ++ alignment ++ // template inline int sse_is_16byte_aligned(const T *ptr) { return !(((uintptr_t)ptr) & 15); } template inline int sse_is_not_16byte_aligned(const T *ptr) { return (((uintptr_t)ptr) & 15); } template inline int sse_is_aligned(const T *ptr, const unsigned int bytes) { return !(((uintptr_t)ptr) & (bytes-1)); } template inline int sse_is_not_aligned(const T *ptr, const unsigned int bytes) { return (((uintptr_t)ptr) & (bytes-1)); } // -- alignment -- // // ++ conditions ++ // template inline T sse_if(const T &vIf, const T &v0) { T ret = (v0 & vIf); return ret; } template inline T sse_ifelse(const T &vIf, const T &v0, const T &v1) { T ret = (v0 & vIf); ret += andnot(v1, vIf); return ret; } // -- conditions -- // // ++ for-loops ++ // // the sse_for functions can be implemented compact in only one function // using pointer-to-pointer, but it is slower than the current // implementation of many versions template inline void sse_for(const S *src0, D *dst0, D *dstEnd, void (*subMethod)(const S*, D*), void (*subSSEMethod)(const S*, D*), long step) { D *dstSSEEnd = dstEnd - (step - 1); for (; dst0 inline void sse_for(const S *src0, D *dst0, D *dst1, D *dstEnd, void (*subMethod)(const S*, D*, D*), void (*subSSEMethod)(const S*, D*, D*), long step) { D *dstSSEEnd = dstEnd - (step - 1); for (; dst0 inline void sse_for(const S *src0, D *dst0, D *dst1, D *dst2, D *dstEnd, void (*subMethod)(const S*, D*, D*, D*), void (*subSSEMethod)(const S*, D*, D*, D*), long step) { D *dstSSEEnd = dstEnd - (step - 1); for (; dst0 inline void sse_for(const S *src0, D *dst0, D *dst1, D *dst2, D *dst3, D *dstEnd, void (*subMethod)(const S*, D*, D*, D*, D*), void (*subSSEMethod)(const S*, D*, D*, D*, D*), long step) { D *dstSSEEnd = dstEnd - (step - 1); for (; dst0 inline void sse_for(const S *src0, const S *src1, D *dst0, D *dstEnd, void (*subMethod)(const S*, const S*, D*), void (*subSSEMethod)(const S*, const S*, D*), long step) { D *dstSSEEnd = dstEnd - (step - 1); for (; dst0 inline void sse_for(const S *src0, const S *src1, D *dst0, D *dst1, D *dstEnd, void (*subMethod)(const S*, const S*, D*, D*), void (*subSSEMethod)(const S*, const S*, D*, D*), long step) { D *dstSSEEnd = dstEnd - (step - 1); for (; dst0 inline void sse_for(const S *src0, const S *src1, D *dst0, D *dst1, D *dst2, D *dstEnd, void (*subMethod)(const S*, const S*, D*, D*, D*), void (*subSSEMethod)(const S*, const S*, D*, D*, D*), long step) { D *dstSSEEnd = dstEnd - (step - 1); for (; dst0 inline void sse_for(const S *src0, const S *src1, D *dst0, D *dst1, D *dst2, D* dst3, D *dstEnd, void (*subMethod)(const S*, const S*, D*, D*, D*, D*), void (*subSSEMethod)(const S*, const S*, D*, D*, D*, D*), long step) { D *dstSSEEnd = dstEnd - (step - 1); for (; dst0 inline void sse_for(const S *src0, const S *src1, const S *src2, D *dst0, D *dstEnd, void (*subMethod)(const S*, const S*, const S*, D*), void (*subSSEMethod)(const S*, const S*, const S*, D*), long step) { D *dstSSEEnd = dstEnd - (step - 1); for (; dst0 inline void sse_for(const S *src0, const S *src1, const S *src2, D *dst0, D *dst1, D *dstEnd, void (*subMethod)(const S*, const S*, const S*, D*, D*), void (*subSSEMethod)(const S*, const S*, const S*, D*, D*), long step) { D *dstSSEEnd = dstEnd - (step - 1); for (; dst0 inline void sse_for(const S *src0, const S *src1, const S *src2, D *dst0, D *dst1, D *dst2, D *dstEnd, void (*subMethod)(const S*, const S*, const S*, D*, D*, D*), void (*subSSEMethod)(const S*, const S*, const S*, D*, D*, D*), long step) { D *dstSSEEnd = dstEnd - (step - 1); for (; dst0 inline void sse_for(const S *src0, const S *src1, const S *src2, D *dst0, D *dst1, D *dst2, D* dst3, D *dstEnd, void (*subMethod)(const S*, const S*, const S*, D*, D*, D*, D*), void (*subSSEMethod)(const S*, const S*, const S*, D*, D*, D*, D*), long step) { D *dstSSEEnd = dstEnd - (step - 1); for (; dst0 inline void sse_for(const S *src0, const S *src1, const S *src2, const S *src3, D *dst0, D *dstEnd, void (*subMethod)(const S*, const S*, const S*, const S*, D*), void (*subSSEMethod)(const S*, const S*, const S*, const S*, D*), long step) { D *dstSSEEnd = dstEnd - (step - 1); for (; dst0 inline void sse_for(const S *src0, const S *src1, const S *src2, const S *src3, D *dst0, D *dst1, D *dstEnd, void (*subMethod)(const S*, const S*, const S*, const S*, D*, D*), void (*subSSEMethod)(const S*, const S*, const S*, const S*, D*, D*), long step) { D *dstSSEEnd = dstEnd - (step - 1); for (; dst0 inline void sse_for(const S *src0, const S *src1, const S *src2, const S *src3, D *dst0, D *dst1, D *dst2, D *dstEnd, void (*subMethod)(const S*, const S*, const S*, const S*, D*, D*, D*), void (*subSSEMethod)(const S*, const S*, const S*, const S*, D*, D*, D*), long step) { D *dstSSEEnd = dstEnd - (step - 1); for (; dst0 inline void sse_for(const S *src0, const S *src1, const S *src2, const S *src3, D *dst0, D *dst1, D *dst2, D* dst3, D *dstEnd, void (*subMethod)(const S*, const S*, const S*, const S*, D*, D*, D*, D*), void (*subSSEMethod)(const S*, const S*, const S*, const S*, D*, D*, D*, D*), long step) { D *dstSSEEnd = dstEnd - (step - 1); for (; dst0 inline void sse_for(const S *src0, D *dst0, D *dstEnd, void (*subMethod)(const S*, D*), void (*subSSEMethod)(const S*, D*), long srcStep, long dstStep) { D *dstSSEEnd = dstEnd - (dstStep - 1); long sStep, dStep; if (srcStep < dstStep) { dStep = dstStep / srcStep; sStep = 1; } else { sStep = srcStep / dstStep; dStep = 1; } for (; dst0 inline void sse_for(const S *src0, D *dst0, D *dst1, D *dstEnd, void (*subMethod)(const S*, D*, D*), void (*subSSEMethod)(const S*, D*, D*), long srcStep, long dstStep) { D *dstSSEEnd = dstEnd - (dstStep - 1); long sStep, dStep; if (srcStep < dstStep) { dStep = dstStep / srcStep; sStep = 1; } else { sStep = srcStep / dstStep; dStep = 1; } for (; dst0 inline void sse_for(const S *src0, D *dst0, D *dst1, D *dst2, D *dstEnd, void (*subMethod)(const S*, D*, D*, D*), void (*subSSEMethod)(const S*, D*, D*, D*), long srcStep, long dstStep) { D *dstSSEEnd = dstEnd - (dstStep - 1); long sStep, dStep; if (srcStep < dstStep) { dStep = dstStep / srcStep; sStep = 1; } else { sStep = srcStep / dstStep; dStep = 1; } for (; dst0 inline void sse_for(const S *src0, D *dst0, D *dst1, D *dst2, D *dst3, D *dstEnd, void (*subMethod)(const S*, D*, D*, D*, D*), void (*subSSEMethod)(const S*, D*, D*, D*, D*), long srcStep, long dstStep) { D *dstSSEEnd = dstEnd - (dstStep - 1); long sStep, dStep; if (srcStep < dstStep) { dStep = dstStep / srcStep; sStep = 1; } else { sStep = srcStep / dstStep; dStep = 1; } for (; dst0 inline void sse_for(const S *src0, const S *src1, D *dst0, D *dstEnd, void (*subMethod)(const S*, const S*, D*), void (*subSSEMethod)(const S*, const S*, D*), long srcStep, long dstStep) { D *dstSSEEnd = dstEnd - (dstStep - 1); long sStep, dStep; if (srcStep < dstStep) { dStep = dstStep / srcStep; sStep = 1; } else { sStep = srcStep / dstStep; dStep = 1; } for (; dst0 inline void sse_for(const S *src0, const S *src1, D *dst0, D *dst1, D *dstEnd, void (*subMethod)(const S*, const S*, D*, D*), void (*subSSEMethod)(const S*, const S*, D*, D*), long srcStep, long dstStep) { D *dstSSEEnd = dstEnd - (dstStep - 1); long sStep, dStep; if (srcStep < dstStep) { dStep = dstStep / srcStep; sStep = 1; } else { sStep = srcStep / dstStep; dStep = 1; } for (; dst0 inline void sse_for(const S *src0, const S *src1, D *dst0, D *dst1, D *dst2, D *dstEnd, void (*subMethod)(const S*, const S*, D*, D*, D*), void (*subSSEMethod)(const S*, const S*, D*, D*, D*), long srcStep, long dstStep) { D *dstSSEEnd = dstEnd - (dstStep - 1); long sStep, dStep; if (srcStep < dstStep) { dStep = dstStep / srcStep; sStep = 1; } else { sStep = srcStep / dstStep; dStep = 1; } for (; dst0 inline void sse_for(const S *src0, const S *src1, D *dst0, D *dst1, D *dst2, D* dst3, D *dstEnd, void (*subMethod)(const S*, const S*, D*, D*, D*, D*), void (*subSSEMethod)(const S*, const S*, D*, D*, D*, D*), long srcStep, long dstStep) { D *dstSSEEnd = dstEnd - (dstStep - 1); long sStep, dStep; if (srcStep < dstStep) { dStep = dstStep / srcStep; sStep = 1; } else { sStep = srcStep / dstStep; dStep = 1; } for (; dst0 inline void sse_for(const S *src0, const S *src1, const S *src2, D *dst0, D *dstEnd, void (*subMethod)(const S*, const S*, const S*, D*), void (*subSSEMethod)(const S*, const S*, const S*, D*), long srcStep, long dstStep) { D *dstSSEEnd = dstEnd - (dstStep - 1); long sStep, dStep; if (srcStep < dstStep) { dStep = dstStep / srcStep; sStep = 1; } else { sStep = srcStep / dstStep; dStep = 1; } for (; dst0 inline void sse_for(const S *src0, const S *src1, const S *src2, D *dst0, D *dst1, D *dstEnd, void (*subMethod)(const S*, const S*, const S*, D*, D*), void (*subSSEMethod)(const S*, const S*, const S*, D*, D*), long srcStep, long dstStep) { D *dstSSEEnd = dstEnd - (dstStep - 1); long sStep, dStep; if (srcStep < dstStep) { dStep = dstStep / srcStep; sStep = 1; } else { sStep = srcStep / dstStep; dStep = 1; } for (; dst0 inline void sse_for(const S *src0, const S *src1, const S *src2, D *dst0, D *dst1, D *dst2, D *dstEnd, void (*subMethod)(const S*, const S*, const S*, D*, D*, D*), void (*subSSEMethod)(const S*, const S*, const S*, D*, D*, D*), long srcStep, long dstStep) { D *dstSSEEnd = dstEnd - (dstStep - 1); long sStep, dStep; if (srcStep < dstStep) { dStep = dstStep / srcStep; sStep = 1; } else { sStep = srcStep / dstStep; dStep = 1; } for (; dst0 inline void sse_for(const S *src0, const S *src1, const S *src2, D *dst0, D *dst1, D *dst2, D* dst3, D *dstEnd, void (*subMethod)(const S*, const S*, const S*, D*, D*, D*, D*), void (*subSSEMethod)(const S*, const S*, const S*, D*, D*, D*, D*), long srcStep, long dstStep) { D *dstSSEEnd = dstEnd - (dstStep - 1); long sStep, dStep; if (srcStep < dstStep) { dStep = dstStep / srcStep; sStep = 1; } else { sStep = srcStep / dstStep; dStep = 1; } for (; dst0 inline void sse_for(const S *src0, const S *src1, const S *src2, const S *src3, D *dst0, D *dstEnd, void (*subMethod)(const S*, const S*, const S*, const S*, D*), void (*subSSEMethod)(const S*, const S*, const S*, const S*, D*), long srcStep, long dstStep) { D *dstSSEEnd = dstEnd - (dstStep - 1); long sStep, dStep; if (srcStep < dstStep) { dStep = dstStep / srcStep; sStep = 1; } else { sStep = srcStep / dstStep; dStep = 1; } for (; dst0 inline void sse_for(const S *src0, const S *src1, const S *src2, const S *src3, D *dst0, D *dst1, D *dstEnd, void (*subMethod)(const S*, const S*, const S*, const S*, D*, D*), void (*subSSEMethod)(const S*, const S*, const S*, const S*, D*, D*), long srcStep, long dstStep) { D *dstSSEEnd = dstEnd - (dstStep - 1); long sStep, dStep; if (srcStep < dstStep) { dStep = dstStep / srcStep; sStep = 1; } else { sStep = srcStep / dstStep; dStep = 1; } for (; dst0 inline void sse_for(const S *src0, const S *src1, const S *src2, const S *src3, D *dst0, D *dst1, D *dst2, D *dstEnd, void (*subMethod)(const S*, const S*, const S*, const S*, D*, D*, D*), void (*subSSEMethod)(const S*, const S*, const S*, const S*, D*, D*, D*), long srcStep, long dstStep) { D *dstSSEEnd = dstEnd - (dstStep - 1); long sStep, dStep; if (srcStep < dstStep) { dStep = dstStep / srcStep; sStep = 1; } else { sStep = srcStep / dstStep; dStep = 1; } for (; dst0 inline void sse_for(const S *src0, const S *src1, const S *src2, const S *src3, D *dst0, D *dst1, D *dst2, D* dst3, D *dstEnd, void (*subMethod)(const S*, const S*, const S*, const S*, D*, D*, D*, D*), void (*subSSEMethod)(const S*, const S*, const S*, const S*, D*, D*, D*, D*), long srcStep, long dstStep) { D *dstSSEEnd = dstEnd - (dstStep - 1); long sStep, dStep; if (srcStep < dstStep) { dStep = dstStep / srcStep; sStep = 1; } else { sStep = srcStep / dstStep; dStep = 1; } for (; dst0 inline void sse_for(const S *src0, D *dst0, D *dstEnd, long srcWidth, long dstWidth, long lineWidth, void (*subMethod)(const S*, D*), void (*subSSEMethod)(const S*, D*), long step) { D *dstLEnd = dst0 + lineWidth; D *dstSSEEnd = dstLEnd - (step - 1); long srcOffset = srcWidth - lineWidth; long dstOffset = dstWidth - lineWidth; for (; dst0 inline void sse_for(const S *src0, D *dst0, D *dst1, D *dstEnd, long srcWidth, long dstWidth, long lineWidth, void (*subMethod)(const S*, D*, D*), void (*subSSEMethod)(const S*, D*, D*), long step) { D *dstLEnd = dst0 + lineWidth; D *dstSSEEnd = dstLEnd - (step - 1); long srcOffset = srcWidth - lineWidth; long dstOffset = dstWidth - lineWidth; for (; dst0 inline void sse_for(const S *src0, D *dst0, D *dst1, D *dst2, D *dstEnd, long srcWidth, long dstWidth, long lineWidth, void (*subMethod)(const S*, D*, D*, D*), void (*subSSEMethod)(const S*, D*, D*, D*), long step) { D *dstLEnd = dst0 + lineWidth; D *dstSSEEnd = dstLEnd - (step - 1); long srcOffset = srcWidth - lineWidth; long dstOffset = dstWidth - lineWidth; for (; dst0 inline void sse_for(const S *src0, D *dst0, D *dst1, D *dst2, D *dst3, D *dstEnd, long srcWidth, long dstWidth, long lineWidth, void (*subMethod)(const S*, D*, D*, D*, D*), void (*subSSEMethod)(const S*, D*, D*, D*, D*), long step) { D *dstLEnd = dst0 + lineWidth; D *dstSSEEnd = dstLEnd - (step - 1); long srcOffset = srcWidth - lineWidth; long dstOffset = dstWidth - lineWidth; for (; dst0 inline void sse_for(const S *src0, const S *src1, D *dst0, D *dst1, D *dstEnd, long srcWidth, long dstWidth, long lineWidth, void (*subMethod)(const S*, const S*, D*, D*), void (*subSSEMethod)(const S*, const S*, D*, D*), long step) { D *dstLEnd = dst0 + lineWidth; D *dstSSEEnd = dstLEnd - (step - 1); long srcOffset = srcWidth - lineWidth; long dstOffset = dstWidth - lineWidth; for (; dst0 inline void sse_for(const S *src0, const S *src1, D *dst0, D *dst1, D *dst2, D *dstEnd, long srcWidth, long dstWidth, long lineWidth, void (*subMethod)(const S*, const S*, D*, D*, D*), void (*subSSEMethod)(const S*, const S*, D*, D*, D*), long step) { D *dstLEnd = dst0 + lineWidth; D *dstSSEEnd = dstLEnd - (step - 1); long srcOffset = srcWidth - lineWidth; long dstOffset = dstWidth - lineWidth; for (; dst0 inline void sse_for(const S *src0, const S *src1, D *dst0, D *dst1, D *dst2, D *dst3, D *dstEnd, long srcWidth, long dstWidth, long lineWidth, void (*subMethod)(const S*, const S*, D*, D*, D*, D*), void (*subSSEMethod)(const S*, const S*, D*, D*, D*, D*), long step) { D *dstLEnd = dst0 + lineWidth; D *dstSSEEnd = dstLEnd - (step - 1); long srcOffset = srcWidth - lineWidth; long dstOffset = dstWidth - lineWidth; for (; dst0 inline void sse_for(const S *src0, const S *src1, const S *src2, D *dst0, D *dstEnd, long srcWidth, long dstWidth, long lineWidth, void (*subMethod)(const S*, const S*, const S*, D*), void (*subSSEMethod)(const S*, const S*, const S*, D*), long step) { D *dstLEnd = dst0 + lineWidth; D *dstSSEEnd = dstLEnd - (step - 1); long srcOffset = srcWidth - lineWidth; long dstOffset = dstWidth - lineWidth; for (; dst0 inline void sse_for(const S *src0, const S *src1, const S *src2, D *dst0, D *dst1, D *dstEnd, long srcWidth, long dstWidth, long lineWidth, void (*subMethod)(const S*, const S*, const S*, D*, D*), void (*subSSEMethod)(const S*, const S*, const S*, D*, D*), long step) { D *dstLEnd = dst0 + lineWidth; D *dstSSEEnd = dstLEnd - (step - 1); long srcOffset = srcWidth - lineWidth; long dstOffset = dstWidth - lineWidth; for (; dst0 inline void sse_for(const S *src0, const S *src1, const S *src2, D *dst0, D *dst1, D *dst2, D *dstEnd, long srcWidth, long dstWidth, long lineWidth, void (*subMethod)(const S*, const S*, const S*, D*, D*, D*), void (*subSSEMethod)(const S*, const S*, const S*, D*, D*, D*), long step) { D *dstLEnd = dst0 + lineWidth; D *dstSSEEnd = dstLEnd - (step - 1); long srcOffset = srcWidth - lineWidth; long dstOffset = dstWidth - lineWidth; for (; dst0 inline void sse_for(const S *src0, const S *src1, const S *src2, D *dst0, D *dst1, D *dst2, D* dst3, D *dstEnd, long srcWidth, long dstWidth, long lineWidth, void (*subMethod)(const S*, const S*, const S*, D*, D*, D*, D*), void (*subSSEMethod)(const S*, const S*, const S*, D*, D*, D*, D*), long step) { D *dstLEnd = dst0 + lineWidth; D *dstSSEEnd = dstLEnd - (step - 1); long srcOffset = srcWidth - lineWidth; long dstOffset = dstWidth - lineWidth; for (; dst0 inline void sse_for(const S *src0, const S *src1, const S *src2, const S *src3, D *dst0, D *dstEnd, long srcWidth, long dstWidth, long lineWidth, void (*subMethod)(const S*, const S*, const S*, const S*, D*), void (*subSSEMethod)(const S*, const S*, const S*, const S*, D*), long step) { D *dstLEnd = dst0 + lineWidth; D *dstSSEEnd = dstLEnd - (step - 1); long srcOffset = srcWidth - lineWidth; long dstOffset = dstWidth - lineWidth; for (; dst0 inline void sse_for(const S *src0, const S *src1, const S *src2, const S *src3, D *dst0, D *dst1, D *dstEnd, long srcWidth, long dstWidth, long lineWidth, void (*subMethod)(const S*, const S*, const S*, const S*, D*, D*), void (*subSSEMethod)(const S*, const S*, const S*, const S*, D*, D*), long step) { D *dstLEnd = dst0 + lineWidth; D *dstSSEEnd = dstLEnd - (step - 1); long srcOffset = srcWidth - lineWidth; long dstOffset = dstWidth - lineWidth; for (; dst0 inline void sse_for(const S *src0, const S *src1, const S *src2, const S *src3, D *dst0, D *dst1, D *dst2, D *dstEnd, long srcWidth, long dstWidth, long lineWidth, void (*subMethod)(const S*, const S*, const S*, const S*, D*, D*, D*), void (*subSSEMethod)(const S*, const S*, const S*, const S*, D*, D*, D*), long step) { D *dstLEnd = dst0 + lineWidth; D *dstSSEEnd = dstLEnd - (step - 1); long srcOffset = srcWidth - lineWidth; long dstOffset = dstWidth - lineWidth; for (; dst0 inline void sse_for(const S *src0, const S *src1, const S *src2, const S *src3, D *dst0, D *dst1, D *dst2, D* dst3, D *dstEnd, long srcWidth, long dstWidth, long lineWidth, void (*subMethod)(const S*, const S*, const S*, const S*, D*, D*, D*, D*), void (*subSSEMethod)(const S*, const S*, const S*, const S*, D*, D*, D*, D*), long step) { D *dstLEnd = dst0 + lineWidth; D *dstSSEEnd = dstLEnd - (step - 1); long srcOffset = srcWidth - lineWidth; long dstOffset = dstWidth - lineWidth; for (; dst0 inline void sse_for(const S *src0, const S *src1, const S *src2, const S *src3, const S *src4, D *dst0, D *dstEnd, long srcWidth, long dstWidth, long lineWidth, void (*subMethod)(const S*, const S*, const S*, const S*, const S*, D*), void (*subSSEMethod)(const S*, const S*, const S*, const S*, const S*, D*), long step) { D *dstLEnd = dst0 + lineWidth; D *dstSSEEnd = dstLEnd - (step - 1); long srcOffset = srcWidth - lineWidth; long dstOffset = dstWidth - lineWidth; for (; dst0 inline void sse_for(const S *src0, D *dst0, D *dstEnd, long srcWidth, long dstWidth, long lineWidth, void (*subMethod)(const S*, D*), void (*subSSEMethod)(const S*, D*), long srcStep, long dstStep) { D *dstLEnd = dst0 + lineWidth; D *dstSSEEnd = dstLEnd - (dstStep - 1); long srcOffset = srcWidth - lineWidth; long dstOffset = dstWidth - lineWidth; long sStep, dStep; if (srcStep < dstStep) { dStep = dstStep / srcStep; sStep = 1; } else { sStep = srcStep / dstStep; dStep = 1; } for (; dst0 inline void sse_for(const S *src0, D *dst0, D *dst1, D *dstEnd, long srcWidth, long dstWidth, long lineWidth, void (*subMethod)(const S*, D*, D*), void (*subSSEMethod)(const S*, D*, D*), long srcStep, long dstStep) { D *dstLEnd = dst0 + lineWidth; D *dstSSEEnd = dstLEnd - (dstStep - 1); long srcOffset = srcWidth - lineWidth; long dstOffset = dstWidth - lineWidth; long sStep, dStep; if (srcStep < dstStep) { dStep = dstStep / srcStep; sStep = 1; } else { sStep = srcStep / dstStep; dStep = 1; } for (; dst0 inline void sse_for(const S *src0, D *dst0, D *dst1, D *dst2, D *dstEnd, long srcWidth, long dstWidth, long lineWidth, void (*subMethod)(const S*, D*, D*, D*), void (*subSSEMethod)(const S*, D*, D*, D*), long srcStep, long dstStep) { D *dstLEnd = dst0 + lineWidth; D *dstSSEEnd = dstLEnd - (dstStep - 1); long srcOffset = srcWidth - lineWidth; long dstOffset = dstWidth - lineWidth; long sStep, dStep; if (srcStep < dstStep) { dStep = dstStep / srcStep; sStep = 1; } else { sStep = srcStep / dstStep; dStep = 1; } for (; dst0 inline void sse_for(const S *src0, D *dst0, D *dst1, D *dst2, D *dst3, D *dstEnd, long srcWidth, long dstWidth, long lineWidth, void (*subMethod)(const S*, D*, D*, D*, D*), void (*subSSEMethod)(const S*, D*, D*, D*, D*), long srcStep, long dstStep) { D *dstLEnd = dst0 + lineWidth; D *dstSSEEnd = dstLEnd - (dstStep - 1); long srcOffset = srcWidth - lineWidth; long dstOffset = dstWidth - lineWidth; long sStep, dStep; if (srcStep < dstStep) { dStep = dstStep / srcStep; sStep = 1; } else { sStep = srcStep / dstStep; dStep = 1; } for (; dst0 inline void sse_for(const S *src0, const S *src1, D *dst0, D *dst1, D *dstEnd, long srcWidth, long dstWidth, long lineWidth, void (*subMethod)(const S*, const S*, D*, D*), void (*subSSEMethod)(const S*, const S*, D*, D*), long srcStep, long dstStep) { D *dstLEnd = dst0 + lineWidth; D *dstSSEEnd = dstLEnd - (dstStep - 1); long srcOffset = srcWidth - lineWidth; long dstOffset = dstWidth - lineWidth; long sStep, dStep; if (srcStep < dstStep) { dStep = dstStep / srcStep; sStep = 1; } else { sStep = srcStep / dstStep; dStep = 1; } for (; dst0 inline void sse_for(const S *src0, const S *src1, D *dst0, D *dst1, D *dst2, D *dstEnd, long srcWidth, long dstWidth, long lineWidth, void (*subMethod)(const S*, const S*, D*, D*, D*), void (*subSSEMethod)(const S*, const S*, D*, D*, D*), long srcStep, long dstStep) { D *dstLEnd = dst0 + lineWidth; D *dstSSEEnd = dstLEnd - (dstStep - 1); long srcOffset = srcWidth - lineWidth; long dstOffset = dstWidth - lineWidth; long sStep, dStep; if (srcStep < dstStep) { dStep = dstStep / srcStep; sStep = 1; } else { sStep = srcStep / dstStep; dStep = 1; } for (; dst0 inline void sse_for(const S *src0, const S *src1, D *dst0, D *dst1, D *dst2, D *dst3, D *dstEnd, long srcWidth, long dstWidth, long lineWidth, void (*subMethod)(const S*, const S*, D*, D*, D*, D*), void (*subSSEMethod)(const S*, const S*, D*, D*, D*, D*), long srcStep, long dstStep) { D *dstLEnd = dst0 + lineWidth; D *dstSSEEnd = dstLEnd - (dstStep - 1); long srcOffset = srcWidth - lineWidth; long dstOffset = dstWidth - lineWidth; long sStep, dStep; if (srcStep < dstStep) { dStep = dstStep / srcStep; sStep = 1; } else { sStep = srcStep / dstStep; dStep = 1; } for (; dst0 inline void sse_for(const S *src0, const S *src1, const S *src2, D *dst0, D *dstEnd, long srcWidth, long dstWidth, long lineWidth, void (*subMethod)(const S*, const S*, const S*, D*), void (*subSSEMethod)(const S*, const S*, const S*, D*), long srcStep, long dstStep) { D *dstLEnd = dst0 + lineWidth; D *dstSSEEnd = dstLEnd - (dstStep - 1); long srcOffset = srcWidth - lineWidth; long dstOffset = dstWidth - lineWidth; long sStep, dStep; if (srcStep < dstStep) { dStep = dstStep / srcStep; sStep = 1; } else { sStep = srcStep / dstStep; dStep = 1; } for (; dst0 inline void sse_for(const S *src0, const S *src1, const S *src2, D *dst0, D *dst1, D *dstEnd, long srcWidth, long dstWidth, long lineWidth, void (*subMethod)(const S*, const S*, const S*, D*, D*), void (*subSSEMethod)(const S*, const S*, const S*, D*, D*), long srcStep, long dstStep) { D *dstLEnd = dst0 + lineWidth; D *dstSSEEnd = dstLEnd - (dstStep - 1); long srcOffset = srcWidth - lineWidth; long dstOffset = dstWidth - lineWidth; long sStep, dStep; if (srcStep < dstStep) { dStep = dstStep / srcStep; sStep = 1; } else { sStep = srcStep / dstStep; dStep = 1; } for (; dst0 inline void sse_for(const S *src0, const S *src1, const S *src2, D *dst0, D *dst1, D *dst2, D *dstEnd, long srcWidth, long dstWidth, long lineWidth, void (*subMethod)(const S*, const S*, const S*, D*, D*, D*), void (*subSSEMethod)(const S*, const S*, const S*, D*, D*, D*), long srcStep, long dstStep) { D *dstLEnd = dst0 + lineWidth; D *dstSSEEnd = dstLEnd - (dstStep - 1); long srcOffset = srcWidth - lineWidth; long dstOffset = dstWidth - lineWidth; long sStep, dStep; if (srcStep < dstStep) { dStep = dstStep / srcStep; sStep = 1; } else { sStep = srcStep / dstStep; dStep = 1; } for (; dst0 inline void sse_for(const S *src0, const S *src1, const S *src2, D *dst0, D *dst1, D *dst2, D* dst3, D *dstEnd, long srcWidth, long dstWidth, long lineWidth, void (*subMethod)(const S*, const S*, const S*, D*, D*, D*, D*), void (*subSSEMethod)(const S*, const S*, const S*, D*, D*, D*, D*), long srcStep, long dstStep) { D *dstLEnd = dst0 + lineWidth; D *dstSSEEnd = dstLEnd - (dstStep - 1); long srcOffset = srcWidth - lineWidth; long dstOffset = dstWidth - lineWidth; long sStep, dStep; if (srcStep < dstStep) { dStep = dstStep / srcStep; sStep = 1; } else { sStep = srcStep / dstStep; dStep = 1; } for (; dst0 inline void sse_for(const S *src0, const S *src1, const S *src2, const S *src3, D *dst0, D *dstEnd, long srcWidth, long dstWidth, long lineWidth, void (*subMethod)(const S*, const S*, const S*, const S*, D*), void (*subSSEMethod)(const S*, const S*, const S*, const S*, D*), long srcStep, long dstStep) { D *dstLEnd = dst0 + lineWidth; D *dstSSEEnd = dstLEnd - (dstStep - 1); long srcOffset = srcWidth - lineWidth; long dstOffset = dstWidth - lineWidth; long sStep, dStep; if (srcStep < dstStep) { dStep = dstStep / srcStep; sStep = 1; } else { sStep = srcStep / dstStep; dStep = 1; } for (; dst0 inline void sse_for(const S *src0, const S *src1, const S *src2, const S *src3, D *dst0, D *dst1, D *dstEnd, long srcWidth, long dstWidth, long lineWidth, void (*subMethod)(const S*, const S*, const S*, const S*, D*, D*), void (*subSSEMethod)(const S*, const S*, const S*, const S*, D*, D*), long srcStep, long dstStep) { D *dstLEnd = dst0 + lineWidth; D *dstSSEEnd = dstLEnd - (dstStep - 1); long srcOffset = srcWidth - lineWidth; long dstOffset = dstWidth - lineWidth; long sStep, dStep; if (srcStep < dstStep) { dStep = dstStep / srcStep; sStep = 1; } else { sStep = srcStep / dstStep; dStep = 1; } for (; dst0 inline void sse_for(const S *src0, const S *src1, const S *src2, const S *src3, D *dst0, D *dst1, D *dst2, D *dstEnd, long srcWidth, long dstWidth, long lineWidth, void (*subMethod)(const S*, const S*, const S*, const S*, D*, D*, D*), void (*subSSEMethod)(const S*, const S*, const S*, const S*, D*, D*, D*), long srcStep, long dstStep) { D *dstLEnd = dst0 + lineWidth; D *dstSSEEnd = dstLEnd - (dstStep - 1); long srcOffset = srcWidth - lineWidth; long dstOffset = dstWidth - lineWidth; long sStep, dStep; if (srcStep < dstStep) { dStep = dstStep / srcStep; sStep = 1; } else { sStep = srcStep / dstStep; dStep = 1; } for (; dst0 inline void sse_for(const S *src0, const S *src1, const S *src2, const S *src3, D *dst0, D *dst1, D *dst2, D* dst3, D *dstEnd, long srcWidth, long dstWidth, long lineWidth, void (*subMethod)(const S*, const S*, const S*, const S*, D*, D*, D*, D*), void (*subSSEMethod)(const S*, const S*, const S*, const S*, D*, D*, D*, D*), long srcStep, long dstStep) { D *dstLEnd = dst0 + lineWidth; D *dstSSEEnd = dstLEnd - (dstStep - 1); long srcOffset = srcWidth - lineWidth; long dstOffset = dstWidth - lineWidth; long sStep, dStep; if (srcStep < dstStep) { dStep = dstStep / srcStep; sStep = 1; } else { sStep = srcStep / dstStep; dStep = 1; } for (; dst0 inline void sse_for(const S *src0, const S *src1, const S *src2, const S *src3, const S *src4, D *dst0, D *dstEnd, long srcWidth, long dstWidth, long lineWidth, void (*subMethod)(const S*, const S*, const S*, const S*, const S*, D*), void (*subSSEMethod)(const S*, const S*, const S*, const S*, const S*, D*), long srcStep, long dstStep) { D *dstLEnd = dst0 + lineWidth; D *dstSSEEnd = dstLEnd - (dstStep - 1); long srcOffset = srcWidth - lineWidth; long dstOffset = dstWidth - lineWidth; long sStep, dStep; if (srcStep < dstStep) { dStep = dstStep / srcStep; sStep = 1; } else { sStep = srcStep / dstStep; dStep = 1; } for (; dst0