/*
* Cloud Wu's JPEG decoder
*
* 2000/3/4 第 1 版
*
* 允许非赢利性质的自由使用, 但如果使用此代码的全部或部分
* 请署上 Cloud Wu (云风)
*
* 商业使用请向作者直接联系
*
* www.codingnow.com
* cloudwu@263.net
*
* MMX 版本 8x8 AAN IDCT
* 本函数摘取修改自 developer.intel.com
*/
// The inline assembly in this file writes its 64-bit memory operands as
// "mmword ptr [...]"; this macro turns that spelling into "dword ptr [...]".
// NOTE(review): presumably the targeted inline assembler does not accept the
// "mmword" keyword itself - confirm before removing this define.
#define mmword dword
// Fallback definition of WORD as a 16-bit unsigned type. The #ifndef only
// detects a WORD *macro*; a WORD typedef from another header is not seen by it.
#ifndef WORD
#define WORD unsigned short
#endif
// 64-bit constants used as memory operands by the MMX code.
// The next two are not referenced in the part of jpeg_idct8x8aan visible in
// this review; their use is presumably further down - confirm.
static __int64 x0005000200010001=0x0005000200010001;
static __int64 x0040000000000000=0x40000000000000;
// pmulhw multipliers: the same 16-bit value repeated in all four word lanes.
// The trailing comment on each line is the decimal value of one lane.
static __int64 x5a825a825a825a82=0x5a825a825a825a82; // 23170
static __int64 x539f539f539f539f=0x539f539f539f539f; // 21407
static __int64 x4546454645464546=0x4546454645464546; // 17734
static __int64 x61f861f861f861f8=0x61f861f861f861f8; // 25080
// Zero-initialised 64-bit slots. Not referenced in the visible part of the
// function; presumably spill storage for MMX registers - confirm.
static __int64 scratch1=0;
static __int64 scratch3=0;
static __int64 scratch5=0;
static __int64 scratch7=0;
// Pre-scale table, 64 WORD entries, written here as 8 lines of 8 values.
// jpeg_idct8x8aan loads it in 8-byte groups (four WORDs) at byte offset 8*k
// and multiplies each group with pmulhw against the four coefficients at the
// same byte offset in the caller's block, so entry i scales coefficient i.
// Every entry is below 32768, so the values are unchanged when read as the
// signed 16-bit operands pmulhw expects.
// NOTE(review): the table is intentionally not symmetric - e.g. preSC[48]
// (17734) is twice preSC[6] (8867), and preSC[56] (18081) is about four times
// preSC[7] (4520). The asm applies extra shifts to some inputs (psraw on V12,
// psllw on V5), which presumably compensates for these factors; check against
// the asm before "correcting" any value.
static WORD preSC[64]={
16384, 22725, 21407, 19266, 16384, 12873, 8867, 4520,
22725, 31521, 29692, 26722, 22725, 17855, 12299, 6270,
21407, 29692, 27969, 25172, 21407, 16819, 11585, 5906,
19266, 26722, 25172, 22654, 19266, 15137, 10426, 5315,
16384, 22725, 21407, 19266, 16384, 12873, 8867, 4520,
12873, 17855, 16819, 15137, 25746, 20228, 13933, 7103,
17734, 24598, 23170, 20853, 17734, 13933, 9597, 4892,
18081, 25080, 23624, 21261, 18081, 14206, 9785, 4988
};
__declspec( naked ) void jpeg_idct8x8aan (short *src_result)
{
__asm {
push ebp
lea ecx, [preSC]
mov ebp, esp
push esi
mov esi, DWORD PTR [ebp+8] // source
//slot
// column 0: even part
// use V4, V12, V0, V8 to produce V22..V25
movq mm0, mmword ptr [ecx+8*12] // maybe the first mul can be done together
// with the dequantization in iHuff module ?
//slot
pmulhw mm0, mmword ptr [esi+8*12] // V12
//slot
movq mm1, mmword ptr [ecx+8*4]
//slot
pmulhw mm1, mmword ptr [esi+8*4] // V4
//slot
movq mm3, mmword ptr [ecx+8*0]
psraw mm0, 1 // t64=t66
pmulhw mm3, mmword ptr [esi+8*0] // V0
//slot
movq mm5, mmword ptr [ecx+8*8] // duplicate V4
movq mm2, mm1 // added 11/1/96
pmulhw mm5, mmword ptr [esi+8*8] // V8
psubsw mm1, mm0 // V16
pmulhw mm1, mmword ptr x5a825a825a825a82 // 23170 ->V18
paddsw mm2, mm0 // V17
movq mm0, mm2 // duplicate V17
psraw mm2, 1 // t75=t82
psraw mm0, 2 // t72
movq mm4, mm3 // duplicate V0
paddsw mm3, mm5 // V19
psubsw mm4, mm5 // V20 //mm5 free
//moved from the block below
movq mm7, mmword ptr [ecx+8*10]
psraw mm3, 1 // t74=t81
movq mm6, mm3 // duplicate t74=t81
psraw mm4, 2 // t77=t79
psubsw mm1, mm0 // V21 // mm0 free
paddsw mm3, mm2 // V22
movq mm5, mm1 // duplicate V21
paddsw mm1, mm4 // V23
movq mmword ptr [esi+8*4], mm3 // V22
psubsw mm4, mm5 // V24 // mm5 free
movq mmword ptr [esi+8*12], mm1 // V23
psubsw mm6, mm2 // V25 // mm2 free
movq mmword ptr [esi+8*0], mm4 // V24
//slot
// keep mm6 alive all along the next block
//movq mmword ptr [esi+8*8], mm6 // V25
// column 0: odd part
// use V2, V6, V10, V14 to produce V31, V39, V40, V41
//moved above
//movq mm7, mmword ptr [ecx+8*10]
pmulhw mm7, mmword ptr [esi+8*10] // V10
//slot
movq mm0, mmword ptr [ecx+8*6]
//slot
pmulhw mm0, mmword ptr [esi+8*6] // V6
//slot
movq mm5, mmword ptr [ecx+8*2]
movq mm3, mm7 // duplicate V10
pmulhw mm5, mmword ptr [esi+8*2] // V2
//slot
movq mm4, mmword ptr [ecx+8*14]
psubsw mm7, mm0 // V26
pmulhw mm4, mmword ptr [esi+8*14] // V14
paddsw mm3, mm0 // V29 // free mm0
movq mm1, mm7 // duplicate V26
psraw mm3, 1 // t91=t94
pmulhw mm7, mmword ptr x539f539f539f539f // V33
psraw mm1, 1 // t96
movq mm0, mm5 // duplicate V2
psraw mm4, 2 // t85=t87
paddsw mm5, mm4 // V27
psubsw mm0, mm4 // V28 // free mm4
movq mm2, mm0 // duplicate V28
psraw mm5, 1 // t90=t93
pmulhw mm0, mmword ptr x4546454645464546 // V35
psraw mm2, 1 // t97
movq mm4, mm5 // duplicate t90=t93
psubsw mm1, mm2 // V32 // free mm2
pmulhw mm1, mmword ptr x61f861f861f861f8 // V36
psllw mm7, 1 // t107
paddsw mm5, mm3 // V31
psubsw mm4, mm3 // V30 // free mm3
pmulhw mm4, mmword ptr x5a825a825a825a82 // V34
nop //slot
psubsw mm0, mm1 // V38
psubsw mm1, mm7 // V37 // free mm7
psllw mm1, 1 // t114
//move from the next block
movq mm3, mm6 // duplicate V25
//move from the next block
movq mm7, mmword ptr [esi+8*4] // V22
psllw mm0, 1 // t110
psubsw mm0, mm5 // V39 (mm5 still needed for next block)
psllw mm4, 2 // t112
//move from the next block
movq mm2, mmword ptr [esi+8*12] // V23
psubsw mm4, mm0 // V40
paddsw mm1, mm4 // V41 // free mm0
//move from the next block
psllw mm2, 1 // t117=t125
// column 0: output butterfly
//move above
//movq mm3, mm6 // duplicate V25
//movq mm7, mmword ptr [esi+8*4] // V22
//movq mm2, mmword ptr [esi+8*12] // V23
//psllw mm2, 1 // t117=t125
psubsw mm6, mm1 // tm6
paddsw mm3, mm1 // tm8 // free mm1
movq mm1, mm7 // duplicate V22
paddsw mm7, mm5 // tm0
movq mmword ptr [esi+8*8], mm3 // tm8 // free mm3
psubsw mm1, mm5 // tm14 // free mm5
movq mmword ptr [esi+8*6], mm6 // tm6 // free mm6
movq mm3, mm2 // duplicate t117=t125
movq mm6, mmword ptr [esi+8*0] // V24
paddsw mm2, mm0 // tm2
movq mmword ptr [esi+8*0], mm7 // tm0 // free mm7
psubsw mm3, mm0 // tm12 // free mm0
movq mmword ptr [esi+8*14], mm1 // tm14 // free mm1
psllw mm6, 1 // t119=t123
movq mmword ptr [esi+8*2], mm2 // tm2 // free mm2
movq mm0, mm6 // duplicate t119=t123
movq mmword ptr [esi+8*12], mm3 // tm12 // free mm3
paddsw mm6, mm4 // tm4
//moved from next block
movq mm1, mmword ptr [ecx+8*5]
psubsw mm0, mm4 // tm10 // free mm4
//moved from next block
pmulhw mm1, mmword ptr [esi+8*5] // V5
//slot
movq mmword ptr [esi+8*4], mm6 // tm4 // free mm6
//slot
movq mmword ptr [esi+8*10], mm0 // tm10 // free mm0
//slot
// column 1: even part
// use V5, V13, V1, V9 to produce V56..V59
//moved to prev block
//movq mm1, mmword ptr [ecx+8*5]
//pmulhw mm1, mmword ptr [esi+8*5] // V5
movq mm7, mmword ptr [ecx+8*13]
psllw mm1, 1 // t128=t130
pmulhw mm7, mmword ptr [esi+8*13] // V13
movq mm2, mm1 // duplicate t128=t130
movq mm3, mmword ptr [ecx+8*1]
//slot
pmulhw mm3, mmword ptr [esi+8*1] // V1
//slot
movq mm5, mmword ptr [ecx+8*9]
psubsw mm1, mm7 // V50
pmulhw mm5, mmword ptr [esi+8*9] // V9
paddsw mm2, mm7 // V51
pmulhw mm1, mmword ptr x5a825a825a825a82 // 23170 ->V52
movq mm6, mm2 // duplicate V51
psraw mm2, 1 // t138=t144
movq mm4, mm3 // duplicate V1
psraw mm6, 2 // t136
paddsw mm3, mm5 // V53
psubsw mm4, mm5 // V54 //mm5 free
movq mm7, mm3 // duplicate V53
//moved from next block
movq mm0, mmword ptr [ecx+8*11]
psraw mm4, 1 // t140=t142
psubsw mm1, mm6 // V55 // mm6 free
paddsw mm3, mm2 // V56
movq mm5, mm4 // duplicate t140=t142
paddsw mm4, mm1 // V57
movq mmword ptr [esi+8*5], mm3 // V56
psubsw mm5, mm1 // V58 // mm1 free
movq mmword ptr [esi+8*13], mm4 // V57
psubsw mm7, mm2 // V59 // mm2 free
movq mmword ptr [esi+8*9], mm5 // V58
//slot
// keep mm7 alive all along the
// [page-navigation text from the web copy removed; the remainder of this function is missing from this copy of the file]