#if (__SSE2__ || defined _M_X64 || (defined _M_IX86_FP && _M_IX86_FP >= 2)) #include "emmintrin.h" #warning "using sse2" #if (__SSE3__ || (defined _MSC_VER && _MSC_VER >= 1500)) #include "pmmintrin.h" #warning "using sse3" #if (__SSSE3__ || (defined _MSC_VER && _MSC_VER >= 1500)) #include "tmmintrin.h" #warning "using ssse3" #ifdef __SSE4_1__ #include <smmintrin.h> #ifdef __SSE4_2__ #include <nmmintrin.h> #endif #endif #endif #endif #endif #include <ICLQt/Common.h> VBox gui; GenericGrabber g; inline icl8u pix(const Channel8u &c, float x, float y){ int px = (int)x; // floor of x int py = (int)y; // floor of y icl8u p1 = c(px,py); icl8u p2 = c(px+1,py); icl8u p3 = c(px,py+1); icl8u p4 = c(px+1,py+1); // Calculate the weights for each pixel float fx = x - px; float fy = y - py; float fx1 = 1.0f - fx; float fy1 = 1.0f - fy; int w1 = fx1 * fy1 * 256.0f; int w2 = fx * fy1 * 256.0f; int w3 = fx1 * fy * 256.0f; int w4 = fx * fy * 256.0f; // Calculate the weighted sum of pixels (for each color channel) unsigned int out = p1 * w1 + p2 * w2 + p3 * w3 + p4 * w4; return (out >> 8)& 0xff; } inline icl8u pixfix(const Channel8u &c, float x, float y){ const unsigned int shift = 8; // shift can have values 8 to 16 const unsigned int fixed = 1<<shift; unsigned int Fx = (unsigned int) (x * fixed); // convert to Fixed unsigned int Fy = (unsigned int) (y * fixed); // convert to Fixed unsigned int px = (Fx & -fixed)>>shift; unsigned int py = (Fy & -fixed)>>shift; icl8u p1 = c(px,py); icl8u p2 = c(px+1,py); icl8u p3 = c(px,py+1); icl8u p4 = c(px+1,py+1); unsigned int fx = Fx & (fixed-1); unsigned int fy = Fy & (fixed-1); unsigned int fx1 = fixed - fx; unsigned int fy1 = fixed - fy; unsigned int w1 = (fx1 * fy1) >> shift; unsigned int w2 = (fx * fy1) >> shift; unsigned int w3 = (fx1 * fy ) >> shift; unsigned int w4 = (fx * fy ) >> shift; // Calculate the weighted sum of pixels (for each color channel) unsigned int out = (p1 * w1 + p2 * w2 + p3 * w3 + p4 * w4) >> shift; return out & 0xff; } const Img8u &scale(const Img8u &src, const Size &size){ static Img8u dst(size,1); dst.setSize(size); const Channel8u s = src[0]; Channel8u d = dst[0]; const float fx = float(src.getWidth()-1)/float(size.width-1); const float fy = float(src.getHeight()-1)/float(size.height-1); for(int y=0;y<size.height-1;++y){ for(int x=0;x<size.width-1;++x){ d(x,y) = pix(s,x*fx,y*fy); } } return dst; } struct Pre{ unsigned int p,f,f1; }; static inline __m128i mult_sse(const __m128i &a, const __m128i &b) { #ifdef __SSE4_1__ // modern CPU - use SSE 4.1 #warning using sse 4.1 return _mm_mullo_epi32(a, b); #else // old CPU - use SSE 2 __m128i tmp1 = _mm_mul_epu32(a,b); /* mul 2,0*/ __m128i tmp2 = _mm_mul_epu32( _mm_srli_si128(a,4), _mm_srli_si128(b,4)); /* mul 3,1 */ return _mm_unpacklo_epi32(_mm_shuffle_epi32(tmp1, _MM_SHUFFLE (0,0,2,0)), _mm_shuffle_epi32(tmp2, _MM_SHUFFLE (0,0,2,0))); /* shuffle results to [63..0] and pack */ #endif } const Img8u &scale_fix(const Img8u &src, const Size &size){ static Img8u dst(size,1); dst.setSize(size); const Channel8u s = src[0]; Channel8u d = dst[0]; const float fx = float(src.getWidth()-1)/float(size.width-1); const float fy = float(src.getHeight()-1)/float(size.height-1); static const unsigned int shift = 8; // shift can have values 8 to 16 static const unsigned int fixed = 1<<shift; std::vector<Pre> pre_x(size.width), pre_y(size.height); unsigned int F = 0; for(size_t x=0;x<size.width;++x){ Pre &p = pre_x[x]; F = (unsigned int)((x * fx) * fixed); p.p = (F & -fixed) >> shift; p.f = F & (fixed-1); p.f1 = fixed - p.f; } for(size_t y=0;y<size.height;++y){ Pre &p = pre_y[y]; F = (unsigned int)((y*fy) * fixed); p.p = (F & -fixed) >> shift; p.f = F & (fixed-1); p.f1 = fixed - p.f; } const icl8u *data = &s[0]; for(int y=0;y<size.height-1;++y){ const Pre &py = pre_y[y]; //const icl8u *datarow = data + py.p * size.width; for(int x=0;x<size.width-1;++x){ const Pre &px = pre_x[x]; // ?? //const icl8u *p = datarow + px.p; //icl8u p1 = p[0], p2 = p[1], p3 = p[size.width], p4 = p[size.width+1]; icl8u p1 = s(px.p,py.p); icl8u p2 = s(px.p+1,py.p); icl8u p3 = s(px.p,py.p+1); icl8u p4 = s(px.p+1,py.p+1); unsigned int w1 = (px.f1 * py.f1) >> shift; unsigned int w2 = (px.f * py.f1) >> shift; unsigned int w3 = (px.f1 * py.f) >> shift; unsigned int w4 = (px.f * py.f) >> shift; /* sse: A = load left colum, B = load right column C = A*B D = C >> shift E = load p1,p2,p3,p4 D = <D.E> res = get D >> shift ... */ #ifdef OFF__SSE4_1__ __m128i l = _mm_setr_epi32(px.f1, px.f, px.f1, px.f); __m128i r = _mm_setr_epi32(py.f1, py.f1, py.f, py.f); __m128i pw = _mm_srli_si128(mult_sse(l,r),shift); __m128i pv = _mm_setr_epi32(p1,p2,p3,p4); //__m128i pw = _mm_setr_epi32(w1,w2,w3,w4); __m128i m = mult_sse(pv,pw); __m128i vsum = _mm_add_epi32(m, _mm_srli_si128(m, 8)); vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4)); d(x,y) = (_mm_cvtsi128_si32(vsum) >> shift) & 0xff; /* worked, but slower! __m128i pv = _mm_setr_epi32(p1,p2,p3,p4); __m128i pw = _mm_setr_epi32(w1,w2,w3,w4); __m128i m = mult_sse(pv,pw); __m128i vsum = _mm_add_epi32(m, _mm_srli_si128(m, 8)); vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4)); d(x,y) = (_mm_cvtsi128_si32(vsum) >> shift) & 0xff; */ #else //unsigned int out = (p1 * w1 + p2 * w2 + p3 * w3 + p4 * w4) >> shift; //d(x,y) = out & 0xff; d(x,y) = ( (p1 * w1 + p2 * w2 + p3 * w3 + p4 * w4) >> shift) & 0xff; #endif } } return dst; } void init(){ gui << Image().minSize(32,24).handle("image") << ( HBox().maxSize(99,2) << Combo("QVGA,VGA,!1280x960,1920x1080").handle("size") << Label("").handle("time") << Combo("scale,manual,!fixed").handle("mode") ) << Show(); g.init(pa("-i")); g.useDesired(depth8u); g.useDesired(Size(160,120)); g.useDesired(formatGray); } void run(){ const Img8u &image = *g.grab()->as8u(); Size s = parse<Size>(gui["size"].as<std::string>()); std::string mode = gui["mode"]; if(mode == "scale"){ static Img8u dst(s,1); dst.setSize(s); Time t = Time::now(); image.scaledCopy(&dst,interpolateLIN); gui["time"] = str(t.age().toMilliSecondsDouble()) + " ms"; gui["image"] = dst; }else if (mode == "manual"){ Time t = Time::now(); const Img8u &dst = scale(image,s); gui["time"] = str(t.age().toMilliSecondsDouble()) + " ms"; gui["image"] = dst; }else{ Time t = Time::now(); const Img8u &dst = scale_fix(image,s); gui["time"] = str(t.age().toMilliSecondsDouble()) + " ms"; gui["image"] = dst; } } int main(int n, char **ppc){ return ICLApp(n,ppc,"-input|-i(2)",init,run).exec(); }