# SSE图像算法优化系列十二：多尺度的图像细节提升。

论文的核心思想类似于Retinex，使用了三个尺度的高斯模糊，再和原图做减法，获得不同程度的细节信息，然后通过一定的组合方式把这些细节信息融合到原图中，从而得到加强原图信息的能力。

```int IM_MultiScaleSharpen(unsigned char *Src, unsigned char *Dest, int Width, int Height, int Stride, int Radius)
{
int Channel = Stride / Width;
if ((Src == NULL) || (Dest == NULL))                                return IM_STATUS_NULLREFRENCE;
if ((Width <= 0) || (Height <= 0))                                    return IM_STATUS_INVALIDPARAMETER;
if ((Channel != 1) && (Channel != 3) && (Channel != 4))                return IM_STATUS_INVALIDPARAMETER;

int Status = IM_STATUS_OK;

unsigned char *B1 = (unsigned char *)malloc(Height * Stride * sizeof(unsigned char));        //    如果最后的大图在这里内存有问题，一种解决办法就是下面的模糊用BoxBlur
unsigned char *B2 = (unsigned char *)malloc(Height * Stride * sizeof(unsigned char));        //    然后不要调用现有的BoxBlur函数，而是直接在本函数实现三种不同半径的模糊
unsigned char *B3 = (unsigned char *)malloc(Height * Stride * sizeof(unsigned char));        //    运算速度（因为有些循环是公共的)会有点点提高，额外的内存占用则可以忽略

if ((B1 == NULL) || (B2 == NULL) || (B3 == NULL))
{
if (B1 != NULL) free(B1);
if (B2 != NULL) free(B2);
if (B3 != NULL) free(B3);
return IM_STATUS_OUTOFMEMORY;
}

Status = IM_ExpBlur(Src, B1, Width, Height, Stride, Radius);
if (Status != IM_STATUS_OK)    goto FreeMemory;
Status = IM_ExpBlur(Src, B2, Width, Height, Stride, Radius * 2);
if (Status != IM_STATUS_OK)    goto FreeMemory;
Status = IM_ExpBlur(Src, B3, Width, Height, Stride, Radius * 4);
if (Status != IM_STATUS_OK)    goto FreeMemory;
for (int Y = 0; Y < Height * Stride; Y++)
{
int DiffB1 = Src[Y] - B1[Y];
int DiffB2 = B1[Y] - B2[Y];
int DiffB3 = B2[Y] - B3[Y];
Dest[Y] = IM_ClampToByte(((4 - 2 * IM_Sign(DiffB1)) * DiffB1 + 2 * DiffB2 + DiffB3) / 4 + Src[Y]);
}

FreeMemory:
free(B1);
free(B2);
free(B3);
return Status;
}```

为避免浮点计算，我们将W1、W2、W3放大四倍，累加玩后在除以4就可以了，整个的书写过程就是按照公式（13）进行的。

```inline int IM_Sign(int X)
{
return (X >> 31) | (unsigned(-X)) >> 31;
}```

处理前

处理后（半径5）

```inline __m128i _mm_sgn_epi16(__m128i v)
{
#ifdef __SSSE3__
v = _mm_sign_epi16(_mm_set1_epi16(1), v); // use PSIGNW on SSSE3 and later
#else
v = _mm_min_epi16(v, _mm_set1_epi16(1));  // use PMINSW/PMAXSW on SSE2/SSE3.
v = _mm_max_epi16(v, _mm_set1_epi16(-1));
//_mm_set1_epi16(1) = _mm_srli_epi16(_mm_cmpeq_epi16(v, v), 15);
//_mm_set1_epi16(-1) = _mm_cmpeq_epi16(v, v);

#endif
return v;
}```

如上所示，当系统只支持SSE2以及其下的版本时，直接用_mm_min_epi16和_mm_max_epi16这样的函数硬实现，这个逻辑也很好理解。

```//　　extern __m128i _mm_sign_epi16 (__m128i a, __m128i b);
//　　Negate packed words in a if corresponding sign in b is less than zero.
//　　Interpreting a, b, and r as arrays of signed 16-bit integers:
for (i = 0; i < 8; i++)
{
if (b[i] < 0)
{
r[i] = -a[i];
}
else if (b[i] == 0)
{
r[i] = 0;
}
else
{
r[i] = a[i];
}
}```

如果参数a传值为1，不就能实现IM_Sign这个效果了吗，真好玩。

```int IM_MultiScaleSharpen(unsigned char *Src, unsigned char *Dest, int Width, int Height, int Stride, int Radius)
{
int Channel = Stride / Width;
if ((Src == NULL) || (Dest == NULL))                                return IM_STATUS_NULLREFRENCE;
if ((Width <= 0) || (Height <= 0))                                    return IM_STATUS_INVALIDPARAMETER;
if ((Channel != 1) && (Channel != 3) && (Channel != 4))                return IM_STATUS_INVALIDPARAMETER;
int Status = IM_STATUS_OK;
unsigned char *B1 = (unsigned char *)malloc(Height * Stride * sizeof(unsigned char));        //    如果最后的大图在这里内存有问题，一种解决办法就是下面的模糊用BoxBlur
unsigned char *B2 = (unsigned char *)malloc(Height * Stride * sizeof(unsigned char));        //    然后不要调用现有的BoxBlur函数，而是直接在本函数实现三种不同半径的模糊
unsigned char *B3 = (unsigned char *)malloc(Height * Stride * sizeof(unsigned char));        //    运算速度（因为有些循环是公共的)会有点点提高，额外的内存占用则可以忽略
if ((B1 == NULL) || (B2 == NULL) || (B3 == NULL))
{
if (B1 != NULL) free(B1);
if (B2 != NULL) free(B2);
if (B3 != NULL) free(B3);
return IM_STATUS_OUTOFMEMORY;
}
Status = IM_ExpBlur(Src, B1, Width, Height, Stride, Radius);
if (Status != IM_STATUS_OK)    goto FreeMemory;
Status = IM_ExpBlur(Src, B2, Width, Height, Stride, Radius * 2);
if (Status != IM_STATUS_OK)    goto FreeMemory;
Status = IM_ExpBlur(Src, B3, Width, Height, Stride, Radius * 4);
if (Status != IM_STATUS_OK)    goto FreeMemory;
int BlockSize = 8, Block = (Height * Stride) / BlockSize;
__m128i Zero = _mm_setzero_si128();
__m128i Four = _mm_set1_epi16(4);
for (int Y = 0; Y < Block * BlockSize; Y += BlockSize)
{
__m128i SrcV = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(Src + Y)), Zero);
__m128i SrcB1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(B1 + Y)), Zero);
__m128i SrcB2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(B2 + Y)), Zero);
__m128i SrcB3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(B3 + Y)), Zero);
__m128i DiffB1 = _mm_sub_epi16(SrcV, SrcB1);
__m128i DiffB2 = _mm_sub_epi16(SrcB1, SrcB2);
__m128i DiffB3 = _mm_sub_epi16(SrcB2, SrcB3);
_mm_storel_epi64((__m128i *)(Dest + Y), _mm_packus_epi16(_mm_add_epi16(SrcV, Offset), Zero));
}
for (int Y = Block * BlockSize; Y < Height * Stride; Y++)
{
int DiffB1 = Src[Y] - B1[Y];
int DiffB2 = B1[Y] - B2[Y];
int DiffB3 = B2[Y] - B3[Y];
Dest[Y] = IM_ClampToByte(((4 - 2 * IM_Sign(DiffB1)) * DiffB1 + 2 * DiffB2 + DiffB3) / 4 + Src[Y]);
}
FreeMemory:
free(B1);
free(B2);
free(B3);
return Status;
}```

基本上就是按照C语言或者公式（13）所示的流程一步一步的编写，只不过把有些乘法变成了移位。

对于1080P的彩色图像，上述改动后处理时间变为了35ms，纯C语言部分的耗时约在11ms左右，同之前的相比速度提高了4倍多，提速还是相当的明显的。

```    int BlockSize = 8, Block = (Height * Stride) / BlockSize;
__m128i Zero = _mm_setzero_si128();
for (int Y = 0; Y < Block * BlockSize; Y += BlockSize)
{
__m128i SrcV = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(Src + Y)), Zero);
__m128i SrcB1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(B1 + Y)), Zero);
__m128i SrcB2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(B2 + Y)), Zero);
__m128i SrcB3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(B3 + Y)), Zero);
__m128i DiffB1 = _mm_sub_epi16(SrcV, SrcB1);
_mm_storel_epi64((__m128i *)(Dest + Y), _mm_packus_epi16(_mm_add_epi16(SrcV, Offset), Zero));
}```

虽然测试表明，速度没有提高多少（主要是计算量太少），但这样写明显合理很多。

107 篇文章56 人订阅

0 条评论

## 相关文章

### 终于等到你——ggplot2树状图

2017年8月份的R语言更新包中，默默地加入了支持ggplot2树状图的新几何对象，从此在R语言中制作树状图，不用再求助于第三方包的辅助了。 该包既有Cran...

45660

### R语言之可视化⑦easyGgplot2散点图目录

ggplot2.stripchart是一个易于使用的函数（来自easyGgplot2包），使用ggplot2绘图系统和R软件生成条带图。 条形图也被称为一维散点...

12910

25740

17030

17050

56150

18320

25850

35220

### HGE系列之管中窥豹(变形网格)

近来实在忙了一些，一直没有时间继续这个HGE系列，但实际上，时间紧之类的说辞深究起来往往都是有些苍白无力的，尤其当你一心想做成某事时，时间总归是可以挤出来的...

10820