SSE优化代码如下:
static float IC_Angle(const Mat& image, Point2f pt, const vector<int> & u_max)
{
int m_01 = 0, m_10 = 0;
int m_10_v0 = 0;
__m128i sse_m_01 = _mm_setzero_si128();//y轴求和变量 初值为零
__m128i sse_m_10 = _mm_setzero_si128();//x轴求和变量 初值为零
__m128i sse_m_10_v0 = _mm_setzero_si128();//x轴求和变量 初值为零
const uchar* center = &image.at<uchar> (cvRound(pt.y), cvRound(pt.x));
int nBlockWidth = 4;
//int nRemainder = 2*HALF_PATCH_SIZE%nBlockWidth;
__m128i SSE_LOAD;//加载
__m128i SSE_MUL;//乘积
for (int u = -HALF_PATCH_SIZE; u <= HALF_PATCH_SIZE; )
{
//m_10 += u * center[u];
SSE_LOAD = _mm_loadu_si128((__m128i*)center);
__m128i SSE_u = _mm_set1_epi32(u);//设置所有四个值为同一个值
SSE_MUL= _mm_mul_epu32(SSE_u,SSE_LOAD);
sse_m_10_v0 = _mm_add_epi8(sse_m_10_v0,SSE_MUL);
u += nBlockWidth;
center += nBlockWidth;
}
const uchar *q = (const uchar*)&sse_m_10_v0;
m_10_v0 = q[0] + q[1] + q[2] +q[3];
int step = (int)image.step1();
const uchar* center_plus = &image.at<uchar> (cvRound(pt.y), cvRound(pt.x));
const uchar* center_minus = &image.at<uchar> (cvRound(pt.y), cvRound(pt.x));
for (int v = 1; v <= HALF_PATCH_SIZE; ++v)
{
int v_sum = 0;
int d = u_max[v];
for (int u = -d; u <= d; )
{
__m128i SSE_u = _mm_set1_epi32(u);//设置所有四个值为同一个值
center_plus += u + v*step;
__m128i SSE_val_plus = _mm_loadu_si128((__m128i*)center_plus);
center_minus += u-v*step;
__m128i SSE_val_minus = _mm_loadu_si128((__m128i*)center_minus);
__m128i SSE_plus_sub_minus = _mm_sub_epi8(SSE_val_plus , SSE_val_minus);
__m128i SSE_v_sum = _mm_add_epi8(SSE_v_sum,SSE_plus_sub_minus);
__m128i SSE_plus_add_mins = _mm_add_epi8(SSE_val_minus,SSE_val_plus);
SSE_MUL= _mm_mul_epu32(SSE_u,SSE_plus_add_mins);
sse_m_10 = _mm_add_epi8(sse_m_10,SSE_MUL);
u = u + 4;
}
const uchar *q = (const uchar*)&sse_m_10;
m_10 = q[0] + q[1] + q[2] + q[3] + m_10_v0;
m_01 += v * v_sum;
}
//为了加快速度还使用了fastAtan2()函数,输出为[0,360)角度,精度为0.3°
return fastAtan2((float)m_01, (float)m_10);
}
七月四号更新:
inline __m128i _mm_mullo_epi8(__m128i a, __m128i b)
{
__m128i zero = _mm_setzero_si128();
__m128i Alo = _mm_cvtepu8_epi16(a);
__m128i Ahi = _mm_unpackhi_epi8(a, zero);
__m128i Blo = _mm_cvtepu8_epi16(b);
__m128i Bhi = _mm_unpackhi_epi8(b, zero);
__m128i Clo = _mm_mullo_epi16(Alo, Blo);
__m128i Chi = _mm_mullo_epi16(Ahi, Bhi);
__m128i maskLo = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
__m128i maskHi = _mm_set_epi8(14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
__m128i C = _mm_or_si128(_mm_shuffle_epi8(Clo, maskLo), _mm_shuffle_epi8(Chi, maskHi));
return C;
}
static float IC_Angle(const Mat& image, Point2f pt, const vector<int> & u_max)
{
//printf("main IC_Angle enter.\n");
//START_TIME
int m_01 = 0, m_10 = 0;
int m_10_v0 = 0;
__m128i sse_m_01 = _mm_setzero_si128();//y轴求和变量 初值为零
__m128i sse_m_10 = _mm_setzero_si128();//x轴求和变量 初值为零
__m128i sse_m_10_v0 = _mm_setzero_si128();//x轴求和变量 初值为零
//const float* center = &image.at<float> (cvRound(pt.y), cvRound(pt.x));
const uchar* center = &image.at<uchar> (cvRound(pt.y), cvRound(pt.x));
int nBlockWidth = 16;
//int nRemainder = 2*HALF_PATCH_SIZE%nBlockWidth;
__m128i SSE_LOAD;//加载
__m128i SSE_MUL;//乘积
for (int u = -HALF_PATCH_SIZE; u <= HALF_PATCH_SIZE; )
{
//m_10 += u * center[u];
SSE_LOAD = _mm_loadu_si128((__m128i*)center);
__m128i SSE_u = _mm_set1_epi8(u);//设置所有四个值为同一个值
SSE_MUL= _mm_mullo_epi8(SSE_u,SSE_LOAD);
sse_m_10_v0 = _mm_add_epi8(sse_m_10_v0,SSE_MUL);
u += nBlockWidth;
center += nBlockWidth;
}
const uchar *q = (const uchar*)&sse_m_10_v0;
m_10_v0 = q[0] + q[1] + q[2] +q[3];
int step = (int)image.step1();
const uchar* center_plus = &image.at<uchar> (cvRound(pt.y), cvRound(pt.x));
const uchar* center_minus = &image.at<uchar> (cvRound(pt.y), cvRound(pt.x));
for (int v = 1; v <= HALF_PATCH_SIZE; ++v)
{
int v_sum = 0;
int d = u_max[v];
for (int u = -d; u <= d; )
{
__m128i SSE_u = _mm_set1_epi8(u);//设置所有四个值为同一个值
center_plus += u + v*step;
__m128i SSE_val_plus = _mm_loadu_si128((__m128i*)center_plus);
center_minus += u-v*step;
__m128i SSE_val_minus = _mm_loadu_si128((__m128i*)center_minus);
__m128i SSE_plus_sub_minus = _mm_sub_epi8(SSE_val_plus , SSE_val_minus);
__m128i SSE_v_sum = _mm_add_epi8(SSE_v_sum,SSE_plus_sub_minus);
__m128i SSE_plus_add_mins = _mm_add_epi8(SSE_val_minus,SSE_val_plus);
SSE_MUL= _mm_mullo_epi8(SSE_u,SSE_plus_add_mins);
sse_m_10 = _mm_add_epi8(sse_m_10,SSE_MUL);
u = u + nBlockWidth;
//cout<<"point"<<endl;
}
const uchar *q = (const uchar*)&sse_m_10;
m_10 = q[0] + q[1] + q[2] + q[3] + m_10_v0;
m_01 += v * v_sum;
}
//END_TIME
//printf("main IC_Angle out.\n");
//为了加快速度还使用了fastAtan2()函数,输出为[0,360)角度,精度为0.3°
return fastAtan2((float)m_01, (float)m_10);
}
2.3 Gaussian + ComputeDescriptors 八层金字塔图像 + 计算特征点描述子 耗时:≈30.9ms(十次平均)(error!)