1 /* pngvcrd.c - mixed C/assembler version of utilities to read a PNG file
3 * For Intel x86 CPU and Microsoft Visual C++ compiler
5 * libpng version 1.2.6 - August 15, 2004
6 * For conditions of distribution and use, see copyright notice in png.h
7 * Copyright (c) 1998-2004 Glenn Randers-Pehrson
8 * Copyright (c) 1998, Intel Corporation
10 * Contributed by Nirav Chhatrapati, Intel Corporation, 1998
11 * Interface to libpng contributed by Gilles Vollant, 1999
14 * In png_do_read_interlace() in libpng versions 1.0.3a through 1.0.4d,
15 * a sign error in the post-MMX cleanup code for each pixel_depth resulted
16 * in bad pixels at the beginning of some rows of some images, and also
17 * (due to out-of-range memory reads and writes) caused heap corruption
18 * when compiled with MSVC 6.0. The error was fixed in version 1.0.4e.
20 * [png_read_filter_row_mmx_avg() bpp == 2 bugfix, GRR 20000916]
22 * [runtime MMX configuration, GRR 20010102]
29 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_USE_PNGVCRD)
31 static int mmx_supported
=2;
37 int mmx_supported_local
= 0;
39 push ebx
/*CPUID will trash these */
43 pushfd
/*Save Eflag to stack */
44 pop eax
/*Get Eflag from stack into eax */
45 mov ecx
, eax
/*Make another copy of Eflag in ecx */
46 xor eax
, 0x200000 /*Toggle ID bit in Eflag [i.e. bit(21)] */
47 push eax
/*Save modified Eflag back to stack */
49 popfd
/*Restore modified value back to Eflag reg */
50 pushfd
/*Save Eflag to stack */
51 pop eax
/*Get Eflag from stack */
52 push ecx
/* save original Eflag to stack */
53 popfd
/* restore original Eflag */
54 xor eax
, ecx
/*Compare the new Eflag with the original Eflag */
55 jz NOT_SUPPORTED
/*If the same, CPUID instruction is not supported, */
56 /*skip following instructions and jump to */
57 /*NOT_SUPPORTED label */
59 xor eax
, eax
/*Set eax to zero */
61 _asm _emit
0x0f /*CPUID instruction (two bytes opcode) */
64 cmp eax
, 1 /*make sure eax returned a non-zero value */
65 jl NOT_SUPPORTED
/*If eax is zero, mmx not supported */
67 xor eax
, eax
/*set eax to zero */
68 inc eax
/*Now increment eax to 1. This instruction is */
69 /*faster than the instruction "mov eax, 1" */
71 _asm _emit
0x0f /*CPUID instruction */
74 and edx
, 0x00800000 /*mask out all bits but the mmx flag; 0x00800000 == 1 << 23, i.e. bit(23) when counted from 0 like bit(21) above */
75 cmp edx
, 0 /* 0 = mmx not supported */
76 jz NOT_SUPPORTED
/* non-zero = Yes, mmx IS supported */
78 mov mmx_supported_local
, 1 /*set return value to 1 */
81 mov eax
, mmx_supported_local
/*move return value to eax */
82 pop edx
/*CPUID trashed these */
87 /*mmx_supported_local=0; // test code to force the no-MMX path */
88 /*printf("MMX : %u (1=MMX supported)\n",mmx_supported_local); */
/* Cache the probe result (0 or 1) in the file-scope flag, replacing its
 * initial value 2 that other routines test for, and return the same value. */
90 mmx_supported
= mmx_supported_local
;
91 return mmx_supported_local
;
94 /* Combines the row recently read in with the previous row.
95 This routine takes care of alpha and transparency if requested.
96 This routine also handles the two methods of progressive display
97 of interlaced images, depending on the mask value.
98 The mask value describes which pixels are to be combined with
99 the row. The pattern always repeats every 8 pixels, so just 8
100 bits are needed. A one indicates the pixel is to be combined; a
101 zero indicates the pixel is to be skipped. This is in addition
102 to any alpha or transparency value associated with the pixel. If
103 you want all pixels to be combined, pass 0xff (255) in mask. */
105 /* Use this routine for x86 platform - uses faster MMX routine if machine
109 png_combine_row(png_structp png_ptr
, png_bytep row
, int mask
)
111 #ifdef PNG_USE_LOCAL_ARRAYS
112 const int png_pass_inc
[7] = {8, 8, 4, 4, 2, 2, 1};
115 png_debug(1,"in png_combine_row_asm\n");
117 if (mmx_supported
== 2) {
118 #if !defined(PNG_1_0_X)
119 /* this should have happened in png_init_mmx_flags() already */
120 png_warning(png_ptr
, "asm_flags may not have been initialized");
/* Bulk copy of the newly read row data (png_ptr->row_buf + 1) into row;
 * the byte count is derived from the pixel depth via PNG_ROWBYTES. */
127 png_memcpy(row
, png_ptr
->row_buf
+ 1,
128 (png_size_t
)PNG_ROWBYTES(png_ptr
->row_info
.pixel_depth
,
131 /* GRR: add "else if (mask == 0)" case?
132 * or does png_combine_row() not even get called in that case? */
/* Per-depth handling: dispatch on the number of bits per pixel. */
135 switch (png_ptr
->row_info
.pixel_depth
)
/* 1 bit per pixel: loop over png_ptr->width pixels, taking one bit at a
 * time from the new row (sp starts at row_buf + 1) and merging it into *dp. */
141 int s_inc
, s_start
, s_end
;
146 sp
= png_ptr
->row_buf
+ 1;
149 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
150 if (png_ptr
->transformations
& PNG_PACKSWAP
)
166 for (i
= 0; i
< png_ptr
->width
; i
++)
/* Source bit at position `shift`. */
172 value
= (*sp
>> shift
) & 0x1;
/* (0x7f7f >> (7 - shift)) & 0xff is 0xff with only bit `shift` cleared
 * (shift 7 -> 0x7f, shift 0 -> 0xfe): erase the old destination bit, then
 * OR in the new one. */
173 *dp
&= (png_byte
)((0x7f7f >> (7 - shift
)) & 0xff);
174 *dp
|= (png_byte
)(value
<< shift
);
/* 2 bits per pixel: loop over png_ptr->width pixels, taking one 2-bit
 * field at a time from the new row (sp starts at row_buf + 1) and merging
 * it into *dp. */
198 int s_start
, s_end
, s_inc
;
204 sp
= png_ptr
->row_buf
+ 1;
207 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
208 if (png_ptr
->transformations
& PNG_PACKSWAP
)
224 for (i
= 0; i
< png_ptr
->width
; i
++)
/* Source 2-bit field whose low bit sits at position `shift`. */
228 value
= (*sp
>> shift
) & 0x3;
/* (0x3f3f >> (6 - shift)) & 0xff is 0xff with bits shift and shift+1
 * cleared (shift 6 -> 0x3f, shift 0 -> 0xfc): erase the old destination
 * field, then OR in the new one. */
229 *dp
&= (png_byte
)((0x3f3f >> (6 - shift
)) & 0xff);
230 *dp
|= (png_byte
)(value
<< shift
);
/* 4 bits per pixel: loop over png_ptr->width pixels, taking one nibble at
 * a time from the new row (sp starts at row_buf + 1) and merging it into
 * *dp. */
253 int s_start
, s_end
, s_inc
;
259 sp
= png_ptr
->row_buf
+ 1;
262 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
263 if (png_ptr
->transformations
& PNG_PACKSWAP
)
278 for (i
= 0; i
< png_ptr
->width
; i
++)
/* Source nibble whose low bit sits at position `shift`. */
282 value
= (*sp
>> shift
) & 0xf;
/* (0xf0f >> (4 - shift)) & 0xff is 0x0f for shift 4 and 0xf0 for shift 0:
 * it keeps the other nibble and erases the one being replaced. */
283 *dp
&= (png_byte
)((0xf0f >> (4 - shift
)) & 0xff);
284 *dp
|= (png_byte
)(value
<< shift
);
/* 8 bits per pixel.  Read from its lowest byte upward, mask0 holds
 * 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01: one selector bit for
 * each byte (= pixel) of an 8-byte group. */
311 __int64 mask0
=0x0102040810204080;
313 #if !defined(PNG_1_0_X)
314 if ((png_ptr
->asm_flags
& PNG_ASM_FLAG_MMX_READ_COMBINE_ROW
)
315 /* && mmx_supported */ )
320 srcptr
= png_ptr
->row_buf
+ 1;
324 len
= png_ptr
->width
&~7; /*reduce to multiple of 8 */
325 diff
= png_ptr
->width
& 7; /*leftover pixels: width mod 8 */
329 movd mm7
, unmask
/*load bit pattern */
330 psubb mm6
,mm6
/*zero mm6 */
333 punpckldq mm7
,mm7
/*fill register with 8 masks */
337 pand mm0
,mm7
/*nonzero if keep byte */
338 pcmpeqb mm0
,mm6
/*zero bytes -> 0xff, nonzero bytes -> 0x00 */
340 mov ecx
,len
/*load length of line (pixels) */
341 mov esi
,srcptr
/*load source */
342 mov ebx
,dstptr
/*load dest */
354 add esi
,8 /*inc by 8 bytes processed */
356 sub ecx
,8 /*dec by 8 pixels processed */
366 sal edx
,24 /*make low byte the high byte */
369 sal edx
,1 /*move high bit to CF */
370 jnc skip8
/*if CF = 0 */
383 else /* mmx not supported - use modified C routine */
/* C fallback: i is a byte offset that starts at
 * offset_table[pass] * pixel_bytes and advances by
 * png_pass_inc[pass] * pixel_bytes until it reaches width * pixel_bytes;
 * each iteration copies one pixel of pixel_bytes bytes. */
385 register unsigned int incr1
, initial_val
, final_val
;
386 png_size_t pixel_bytes
;
388 register int disp
= png_pass_inc
[png_ptr
->pass
];
389 int offset_table
[7] = {0, 4, 0, 2, 0, 1, 0};
391 pixel_bytes
= (png_ptr
->row_info
.pixel_depth
>> 3);
392 srcptr
= png_ptr
->row_buf
+ 1 + offset_table
[png_ptr
->pass
]*
394 dstptr
= row
+ offset_table
[png_ptr
->pass
]*pixel_bytes
;
395 initial_val
= offset_table
[png_ptr
->pass
]*pixel_bytes
;
396 final_val
= png_ptr
->width
*pixel_bytes
;
397 incr1
= (disp
)*pixel_bytes
;
398 for (i
= initial_val
; i
< final_val
; i
+= incr1
)
400 png_memcpy(dstptr
, srcptr
, pixel_bytes
);
/* 16 bits per pixel.  Read lowest byte first, mask0 followed by mask1 hold
 * each selector bit 0x80, 0x40, ... 0x01 twice in a row: two bytes per
 * pixel across a 16-byte (8-pixel) group. */
415 __int64 mask1
=0x0101020204040808,
416 mask0
=0x1010202040408080;
418 #if !defined(PNG_1_0_X)
419 if ((png_ptr
->asm_flags
& PNG_ASM_FLAG_MMX_READ_COMBINE_ROW
)
420 /* && mmx_supported */ )
425 srcptr
= png_ptr
->row_buf
+ 1;
/* len: width rounded down to a multiple of 8 pixels; diff: the 0..7
 * pixels left over. */
429 len
= (png_ptr
->width
)&~7;
430 diff
= (png_ptr
->width
)&7;
433 movd mm7
, unmask
/*load bit pattern */
434 psubb mm6
,mm6
/*zero mm6 */
437 punpckldq mm7
,mm7
/*fill register with 8 masks */
448 mov ecx
,len
/*load length of line */
449 mov esi
,srcptr
/*load source */
450 mov ebx
,dstptr
/*load dest */
471 add esi
,16 /*inc by 16 bytes processed */
473 sub ecx
,8 /*dec by 8 pixels processed */
483 sal edx
,24 /*make low byte the high byte */
485 sal edx
,1 /*move high bit to CF */
486 jnc skip16
/*if CF = 0 */
499 else /* mmx not supported - use modified C routine */
/* C fallback: i is a byte offset that starts at
 * offset_table[pass] * pixel_bytes and advances by
 * png_pass_inc[pass] * pixel_bytes until it reaches width * pixel_bytes;
 * each iteration copies one pixel of pixel_bytes bytes. */
501 register unsigned int incr1
, initial_val
, final_val
;
502 png_size_t pixel_bytes
;
504 register int disp
= png_pass_inc
[png_ptr
->pass
];
505 int offset_table
[7] = {0, 4, 0, 2, 0, 1, 0};
507 pixel_bytes
= (png_ptr
->row_info
.pixel_depth
>> 3);
508 srcptr
= png_ptr
->row_buf
+ 1 + offset_table
[png_ptr
->pass
]*
510 dstptr
= row
+ offset_table
[png_ptr
->pass
]*pixel_bytes
;
511 initial_val
= offset_table
[png_ptr
->pass
]*pixel_bytes
;
512 final_val
= png_ptr
->width
*pixel_bytes
;
513 incr1
= (disp
)*pixel_bytes
;
514 for (i
= initial_val
; i
< final_val
; i
+= incr1
)
516 png_memcpy(dstptr
, srcptr
, pixel_bytes
);
/* 24 bits per pixel.  Read lowest byte first, mask0, mask1, mask2 hold each
 * selector bit 0x80, 0x40, ... 0x01 three times in a row: three bytes per
 * pixel across a 24-byte (8-pixel) group. */
532 __int64 mask2
=0x0101010202020404, /*24bpp */
533 mask1
=0x0408080810101020,
534 mask0
=0x2020404040808080;
536 srcptr
= png_ptr
->row_buf
+ 1;
/* len: width rounded down to a multiple of 8 pixels; diff: the 0..7
 * pixels left over. */
540 len
= (png_ptr
->width
)&~7;
541 diff
= (png_ptr
->width
)&7;
543 #if !defined(PNG_1_0_X)
544 if ((png_ptr
->asm_flags
& PNG_ASM_FLAG_MMX_READ_COMBINE_ROW
)
545 /* && mmx_supported */ )
552 movd mm7
, unmask
/*load bit pattern */
553 psubb mm6
,mm6
/*zero mm6 */
556 punpckldq mm7
,mm7
/*fill register with 8 masks */
570 mov ecx
,len
/*load length of line */
571 mov esi
,srcptr
/*load source */
572 mov ebx
,dstptr
/*load dest */
602 add esi
,24 /*inc by 24 bytes processed */
604 sub ecx
,8 /*dec by 8 pixels processed */
614 sal edx
,24 /*make low byte the high byte */
616 sal edx
,1 /*move high bit to CF */
617 jnc skip24
/*if CF = 0 */
634 else /* mmx not supported - use modified C routine */
/* C fallback: i is a byte offset that starts at
 * offset_table[pass] * pixel_bytes and advances by
 * png_pass_inc[pass] * pixel_bytes until it reaches width * pixel_bytes;
 * each iteration copies one pixel of pixel_bytes bytes. */
636 register unsigned int incr1
, initial_val
, final_val
;
637 png_size_t pixel_bytes
;
639 register int disp
= png_pass_inc
[png_ptr
->pass
];
640 int offset_table
[7] = {0, 4, 0, 2, 0, 1, 0};
642 pixel_bytes
= (png_ptr
->row_info
.pixel_depth
>> 3);
643 srcptr
= png_ptr
->row_buf
+ 1 + offset_table
[png_ptr
->pass
]*
645 dstptr
= row
+ offset_table
[png_ptr
->pass
]*pixel_bytes
;
646 initial_val
= offset_table
[png_ptr
->pass
]*pixel_bytes
;
647 final_val
= png_ptr
->width
*pixel_bytes
;
648 incr1
= (disp
)*pixel_bytes
;
649 for (i
= initial_val
; i
< final_val
; i
+= incr1
)
651 png_memcpy(dstptr
, srcptr
, pixel_bytes
);
/* 32 bits per pixel.  Read lowest byte first, mask0 .. mask3 hold each
 * selector bit 0x80, 0x40, ... 0x01 four times in a row: four bytes per
 * pixel across a 32-byte (8-pixel) group. */
667 __int64 mask3
=0x0101010102020202, /*32bpp */
668 mask2
=0x0404040408080808,
669 mask1
=0x1010101020202020,
670 mask0
=0x4040404080808080;
672 srcptr
= png_ptr
->row_buf
+ 1;
/* len: width rounded down to a multiple of 8 pixels; diff: the 0..7
 * pixels left over. */
676 len
= (png_ptr
->width
)&~7;
677 diff
= (png_ptr
->width
)&7;
679 #if !defined(PNG_1_0_X)
680 if ((png_ptr
->asm_flags
& PNG_ASM_FLAG_MMX_READ_COMBINE_ROW
)
681 /* && mmx_supported */ )
688 movd mm7
, unmask
/*load bit pattern */
689 psubb mm6
,mm6
/*zero mm6 */
692 punpckldq mm7
,mm7
/*fill register with 8 masks */
709 mov ecx
,len
/*load length of line */
710 mov esi
,srcptr
/*load source */
711 mov ebx
,dstptr
/*load dest */
749 add esi
,32 /*inc by 32 bytes processed */
751 sub ecx
,8 /*dec by 8 pixels processed */
761 sal edx
,24 /*make low byte the high byte */
763 sal edx
,1 /*move high bit to CF */
764 jnc skip32
/*if CF = 0 */
778 else /* mmx _not supported - Use modified C routine */
/* C fallback: i is a byte offset that starts at
 * offset_table[pass] * pixel_bytes and advances by
 * png_pass_inc[pass] * pixel_bytes until it reaches width * pixel_bytes;
 * each iteration copies one pixel of pixel_bytes bytes. */
780 register unsigned int incr1
, initial_val
, final_val
;
781 png_size_t pixel_bytes
;
783 register int disp
= png_pass_inc
[png_ptr
->pass
];
784 int offset_table
[7] = {0, 4, 0, 2, 0, 1, 0};
786 pixel_bytes
= (png_ptr
->row_info
.pixel_depth
>> 3);
787 srcptr
= png_ptr
->row_buf
+ 1 + offset_table
[png_ptr
->pass
]*
789 dstptr
= row
+ offset_table
[png_ptr
->pass
]*pixel_bytes
;
790 initial_val
= offset_table
[png_ptr
->pass
]*pixel_bytes
;
791 final_val
= png_ptr
->width
*pixel_bytes
;
792 incr1
= (disp
)*pixel_bytes
;
793 for (i
= initial_val
; i
< final_val
; i
+= incr1
)
795 png_memcpy(dstptr
, srcptr
, pixel_bytes
);
/* 48 bits per pixel.  Read lowest byte first, mask0 .. mask5 hold each
 * selector bit 0x80, 0x40, ... 0x01 six times in a row: six bytes per
 * pixel across a 48-byte (8-pixel) group. */
811 __int64 mask5
=0x0101010101010202,
812 mask4
=0x0202020204040404,
813 mask3
=0x0404080808080808,
814 mask2
=0x1010101010102020,
815 mask1
=0x2020202040404040,
816 mask0
=0x4040808080808080;
818 #if !defined(PNG_1_0_X)
819 if ((png_ptr
->asm_flags
& PNG_ASM_FLAG_MMX_READ_COMBINE_ROW
)
820 /* && mmx_supported */ )
825 srcptr
= png_ptr
->row_buf
+ 1;
/* len: width rounded down to a multiple of 8 pixels; diff: the 0..7
 * pixels left over. */
829 len
= (png_ptr
->width
)&~7;
830 diff
= (png_ptr
->width
)&7;
833 movd mm7
, unmask
/*load bit pattern */
834 psubb mm6
,mm6
/*zero mm6 */
837 punpckldq mm7
,mm7
/*fill register with 8 masks */
860 mov ecx
,len
/*load length of line */
861 mov esi
,srcptr
/*load source */
862 mov ebx
,dstptr
/*load dest */
910 add esi
,48 /*inc by 48 bytes processed */
912 sub ecx
,8 /*dec by 8 pixels processed */
922 sal edx
,24 /*make low byte the high byte */
925 sal edx
,1 /*move high bit to CF */
926 jnc skip48
/*if CF = 0 */
940 else /* mmx _not supported - Use modified C routine */
/* C fallback: i is a byte offset that starts at
 * offset_table[pass] * pixel_bytes and advances by
 * png_pass_inc[pass] * pixel_bytes until it reaches width * pixel_bytes;
 * each iteration copies one pixel of pixel_bytes bytes. */
942 register unsigned int incr1
, initial_val
, final_val
;
943 png_size_t pixel_bytes
;
945 register int disp
= png_pass_inc
[png_ptr
->pass
];
946 int offset_table
[7] = {0, 4, 0, 2, 0, 1, 0};
948 pixel_bytes
= (png_ptr
->row_info
.pixel_depth
>> 3);
949 srcptr
= png_ptr
->row_buf
+ 1 + offset_table
[png_ptr
->pass
]*
951 dstptr
= row
+ offset_table
[png_ptr
->pass
]*pixel_bytes
;
952 initial_val
= offset_table
[png_ptr
->pass
]*pixel_bytes
;
953 final_val
= png_ptr
->width
*pixel_bytes
;
954 incr1
= (disp
)*pixel_bytes
;
955 for (i
= initial_val
; i
< final_val
; i
+= incr1
)
957 png_memcpy(dstptr
, srcptr
, pixel_bytes
);
/* Plain C copy with the pixel size computed at run time
 * (pixel_bytes = pixel_depth >> 3).  i is a byte offset that starts at
 * offset_table[pass] * pixel_bytes and advances by
 * png_pass_inc[pass] * pixel_bytes until it reaches width * pixel_bytes;
 * each iteration copies one pixel from sptr to dp. */
970 png_size_t pixel_bytes
;
971 int offset_table
[7] = {0, 4, 0, 2, 0, 1, 0};
973 register int disp
= png_pass_inc
[png_ptr
->pass
]; /* pixel step for this pass (scaled to bytes in incr1) */
974 register unsigned int incr1
, initial_val
, final_val
;
976 pixel_bytes
= (png_ptr
->row_info
.pixel_depth
>> 3);
977 sptr
= png_ptr
->row_buf
+ 1 + offset_table
[png_ptr
->pass
]*
979 dp
= row
+ offset_table
[png_ptr
->pass
]*pixel_bytes
;
980 initial_val
= offset_table
[png_ptr
->pass
]*pixel_bytes
;
981 final_val
= png_ptr
->width
*pixel_bytes
;
982 incr1
= (disp
)*pixel_bytes
;
983 for (i
= initial_val
; i
< final_val
; i
+= incr1
)
985 png_memcpy(dp
, sptr
, pixel_bytes
);
991 } /* end switch (png_ptr->row_info.pixel_depth) */
992 } /* end if (non-trivial mask) */
994 } /* end png_combine_row() */
997 #if defined(PNG_READ_INTERLACING_SUPPORTED)
/* Widens the current row in place (both read and write pointers are derived
 * from png_ptr->row_buf + 1): the row_info->width source pixels become
 * final_width = width * png_pass_inc[pass] pixels. */
1000 png_do_read_interlace(png_structp png_ptr
)
1002 png_row_infop row_info
= &(png_ptr
->row_info
);
1003 png_bytep row
= png_ptr
->row_buf
+ 1;
1004 int pass
= png_ptr
->pass
;
1005 png_uint_32 transformations
= png_ptr
->transformations
;
1006 #ifdef PNG_USE_LOCAL_ARRAYS
1007 const int png_pass_inc
[7] = {8, 8, 4, 4, 2, 2, 1};
1010 png_debug(1,"in png_do_read_interlace\n");
/* mmx_supported == 2 is its initial value, i.e. no probe result has been
 * stored yet. */
1012 if (mmx_supported
== 2) {
1013 #if !defined(PNG_1_0_X)
1014 /* this should have happened in png_init_mmx_flags() already */
1015 png_warning(png_ptr
, "asm_flags may not have been initialized");
1020 if (row
!= NULL
&& row_info
!= NULL
)
1022 png_uint_32 final_width
;
/* Width of the row after every pixel has been repeated for this pass. */
1024 final_width
= row_info
->width
* png_pass_inc
[pass
];
1026 switch (row_info
->pixel_depth
)
/* 1 bit per pixel (8 pixels per byte).  sp and dp start at the byte that
 * holds the last source pixel and the last destination pixel; the outer
 * loop counts i down from width, and each source bit v is written
 * png_pass_inc[pass] times. */
1032 int s_start
, s_end
, s_inc
;
1037 sp
= row
+ (png_size_t
)((row_info
->width
- 1) >> 3);
1038 dp
= row
+ (png_size_t
)((final_width
- 1) >> 3);
1039 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
1040 if (transformations
& PNG_PACKSWAP
)
/* (n + 7) & 7 equals (n - 1) & 7: the index of the last pixel within its
 * byte.  This pair uses that index directly as the bit position; the pair
 * further down uses 7 minus it. */
1042 sshift
= (int)((row_info
->width
+ 7) & 7);
1043 dshift
= (int)((final_width
+ 7) & 7);
1051 sshift
= 7 - (int)((row_info
->width
+ 7) & 7);
1052 dshift
= 7 - (int)((final_width
+ 7) & 7);
1058 for (i
= row_info
->width
; i
; i
--)
1060 v
= (png_byte
)((*sp
>> sshift
) & 0x1);
1061 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
/* (0x7f7f >> (7 - dshift)) & 0xff is 0xff with only bit dshift cleared:
 * erase the destination bit, then OR in v. */
1063 *dp
&= (png_byte
)((0x7f7f >> (7 - dshift
)) & 0xff);
1064 *dp
|= (png_byte
)(v
<< dshift
);
1065 if (dshift
== s_end
)
1073 if (sshift
== s_end
)
/* 2 bits per pixel (4 pixels per byte).  sp and dp start at the byte that
 * holds the last source pixel and the last destination pixel; the outer
 * loop counts i down from width, and each 2-bit value v is written
 * png_pass_inc[pass] times. */
1088 int s_start
, s_end
, s_inc
;
1091 sp
= row
+ (png_size_t
)((row_info
->width
- 1) >> 2);
1092 dp
= row
+ (png_size_t
)((final_width
- 1) >> 2);
1093 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
1094 if (transformations
& PNG_PACKSWAP
)
/* (n + 3) & 3 equals (n - 1) & 3: the index of the last pixel within its
 * byte; << 1 turns it into a bit position (2 bits per pixel).  This pair
 * uses the index directly; the pair further down uses 3 minus it. */
1096 sshift
= (png_size_t
)(((row_info
->width
+ 3) & 3) << 1);
1097 dshift
= (png_size_t
)(((final_width
+ 3) & 3) << 1);
1105 sshift
= (png_size_t
)((3 - ((row_info
->width
+ 3) & 3)) << 1);
1106 dshift
= (png_size_t
)((3 - ((final_width
+ 3) & 3)) << 1);
1112 for (i
= row_info
->width
; i
; i
--)
1117 v
= (png_byte
)((*sp
>> sshift
) & 0x3);
1118 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
/* (0x3f3f >> (6 - dshift)) & 0xff is 0xff with bits dshift and dshift+1
 * cleared: erase the destination field, then OR in v. */
1120 *dp
&= (png_byte
)((0x3f3f >> (6 - dshift
)) & 0xff);
1121 *dp
|= (png_byte
)(v
<< dshift
);
1122 if (dshift
== s_end
)
1130 if (sshift
== s_end
)
/* 4 bits per pixel (2 pixels per byte).  sp and dp start at the byte that
 * holds the last source pixel and the last destination pixel; the outer
 * loop counts i down from width, and each nibble v is written
 * png_pass_inc[pass] times. */
1145 int s_start
, s_end
, s_inc
;
1148 sp
= row
+ (png_size_t
)((row_info
->width
- 1) >> 1);
1149 dp
= row
+ (png_size_t
)((final_width
- 1) >> 1);
1150 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
1151 if (transformations
& PNG_PACKSWAP
)
/* (n + 1) & 1 equals (n - 1) & 1: the index of the last pixel within its
 * byte; << 2 turns it into a bit position (4 bits per pixel).  This pair
 * uses the index directly; the pair further down uses 1 minus it. */
1153 sshift
= (png_size_t
)(((row_info
->width
+ 1) & 1) << 2);
1154 dshift
= (png_size_t
)(((final_width
+ 1) & 1) << 2);
1162 sshift
= (png_size_t
)((1 - ((row_info
->width
+ 1) & 1)) << 2);
1163 dshift
= (png_size_t
)((1 - ((final_width
+ 1) & 1)) << 2);
1169 for (i
= row_info
->width
; i
; i
--)
1174 v
= (png_byte
)((*sp
>> sshift
) & 0xf);
1175 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
/* (0xf0f >> (4 - dshift)) & 0xff is 0x0f for dshift 4 and 0xf0 for
 * dshift 0: keep the other nibble, erase the one being written. */
1177 *dp
&= (png_byte
)((0xf0f >> (4 - dshift
)) & 0xff);
1178 *dp
|= (png_byte
)(v
<< dshift
);
1179 if (dshift
== s_end
)
1187 if (sshift
== s_end
)
1198 default: /* This is the place where the routine is modified */
/* const4 keeps the low three bytes of a quadword and const6 the low byte;
 * both are used as pand operands in the 3-bytes-per-pixel assembly. */
1200 __int64 const4
= 0x0000000000FFFFFF;
1201 /* __int64 const5 = 0x000000FFFFFF0000; // unused... */
1202 __int64 const6
= 0x00000000000000FF;
1205 png_size_t pixel_bytes
;
1206 int width
= row_info
->width
;
1208 pixel_bytes
= (row_info
->pixel_depth
>> 3);
/* Start at the last source pixel and the last destination pixel of the
 * row. */
1210 sptr
= row
+ (width
- 1) * pixel_bytes
;
1211 dp
= row
+ (final_width
- 1) * pixel_bytes
;
1212 /* New code by Nirav Chhatrapati - Intel Corporation */
1213 /* sign fix by GRR */
1214 /* NOTE: there is NO MMX code for 48-bit and 64-bit images */
1216 // use MMX routine if machine supports it
1217 #if !defined(PNG_1_0_X)
1218 if ((png_ptr
->asm_flags
& PNG_ASM_FLAG_MMX_READ_INTERLACE
)
1219 /* && mmx_supported */ )
/* 3 bytes per pixel.  In the register diagrams after ';' the bytes v2 v1 v0
 * are one source pixel; const4 strips everything else from the dword loaded
 * from [esi], and the shifts/ORs build registers that repeat that pixel. */
1224 if (pixel_bytes
== 3)
1226 if (((pass
== 0) || (pass
== 1)) && width
)
1233 sub edi
, 21 /* (png_pass_inc[pass] - 1)*pixel_bytes == (8 - 1)*3 */
1235 movd mm0
, [esi
] ; X X X X X v2 v1 v0
1236 pand mm0
, const4
; 0 0 0 0 0 v2 v1 v0
1237 movq mm1
, mm0
; 0 0 0 0 0 v2 v1 v0
1238 psllq mm0
, 16 ; 0 0 0 v2 v1 v0
0 0
1239 movq mm2
, mm0
; 0 0 0 v2 v1 v0
0 0
1240 psllq mm0
, 24 ; v2 v1 v0
0 0 0 0 0
1241 psrlq mm1
, 8 ; 0 0 0 0 0 0 v2 v1
1242 por mm0
, mm2
; v2 v1 v0 v2 v1 v0
0 0
1243 por mm0
, mm1
; v2 v1 v0 v2 v1 v0 v2 v1
1244 movq mm3
, mm0
; v2 v1 v0 v2 v1 v0 v2 v1
1245 psllq mm0
, 16 ; v0 v2 v1 v0 v2 v1
0 0
1246 movq mm4
, mm3
; v2 v1 v0 v2 v1 v0 v2 v1
1247 punpckhdq mm3
, mm0
; v0 v2 v1 v0 v2 v1 v0 v2
1249 psrlq mm0
, 32 ; 0 0 0 0 v0 v2 v1 v0
1251 punpckldq mm0
, mm4
; v1 v0 v2 v1 v0 v2 v1 v0
1261 else if (((pass
== 2) || (pass
== 3)) && width
)
1268 sub edi
, 9 /* (png_pass_inc[pass] - 1)*pixel_bytes == (4 - 1)*3 */
1270 movd mm0
, [esi
] ; X X X X X v2 v1 v0
1271 pand mm0
, const4
; 0 0 0 0 0 v2 v1 v0
1272 movq mm1
, mm0
; 0 0 0 0 0 v2 v1 v0
1273 psllq mm0
, 16 ; 0 0 0 v2 v1 v0
0 0
1274 movq mm2
, mm0
; 0 0 0 v2 v1 v0
0 0
1275 psllq mm0
, 24 ; v2 v1 v0
0 0 0 0 0
1276 psrlq mm1
, 8 ; 0 0 0 0 0 0 v2 v1
1277 por mm0
, mm2
; v2 v1 v0 v2 v1 v0
0 0
1278 por mm0
, mm1
; v2 v1 v0 v2 v1 v0 v2 v1
/* The 8-byte store at [edi+4] plus the 4-byte store at [edi] cover 12
 * bytes: four copies of the 3-byte pixel. */
1279 movq
[edi
+4], mm0
; move to memory
1280 psrlq mm0
, 16 ; 0 0 v2 v1 v0 v2 v1 v0
1281 movd
[edi
], mm0
; move to memory
1289 else if (width
) /* && ((pass == 4) || (pass == 5)) */
/* width_mmx: even pixel count minus 8; the pixels removed from it stay in
 * `width` and are handled by the C loop at the end of this branch. */
1291 int width_mmx
= ((width
>> 1) << 1) - 8;
1294 width
-= width_mmx
; /* 8 or 9 pix, 24 or 27 bytes */
1305 movq mm0
, [esi
] ; X X v2 v1 v0 v5 v4 v3
1306 movq mm7
, mm0
; X X v2 v1 v0 v5 v4 v3
1307 movq mm6
, mm0
; X X v2 v1 v0 v5 v4 v3
1308 psllq mm0
, 24 ; v1 v0 v5 v4 v3
0 0 0
1309 pand mm7
, const4
; 0 0 0 0 0 v5 v4 v3
1310 psrlq mm6
, 24 ; 0 0 0 X X v2 v1 v0
1311 por mm0
, mm7
; v1 v0 v5 v4 v3 v5 v4 v3
1312 movq mm5
, mm6
; 0 0 0 X X v2 v1 v0
1313 psllq mm6
, 8 ; 0 0 X X v2 v1 v0
0
1314 movq
[edi
], mm0
; move quad to memory
1315 psrlq mm5
, 16 ; 0 0 0 0 0 X X v2
1316 pand mm5
, const6
; 0 0 0 0 0 0 0 v2
1317 por mm6
, mm5
; 0 0 X X v2 v1 v0 v2
1318 movd
[edi
+8], mm6
; move
double to memory
1327 sptr
-= width_mmx
*3;
/* C loop for the remaining `width` pixels: read one 3-byte pixel into v and
 * write it png_pass_inc[pass] times. */
1329 for (i
= width
; i
; i
--)
1334 png_memcpy(v
, sptr
, 3);
1335 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
1337 png_memcpy(dp
, v
, 3);
1343 } /* end of pixel_bytes == 3 */
/* 1 byte per pixel.  In the register diagrams after ';' each vN is one
 * source pixel; the unpack instructions duplicate bytes so that every pixel
 * appears 8, 4 or 2 times before the registers are stored. */
1345 else if (pixel_bytes
== 1)
1347 if (((pass
== 0) || (pass
== 1)) && width
)
/* width rounded down to a multiple of 4 pixels. */
1349 int width_mmx
= ((width
>> 2) << 2);
1361 movd mm0
, [esi
] ; X X X X v0 v1 v2 v3
1362 movq mm1
, mm0
; X X X X v0 v1 v2 v3
1363 punpcklbw mm0
, mm0
; v0 v0 v1 v1 v2 v2 v3 v3
1364 movq mm2
, mm0
; v0 v0 v1 v1 v2 v2 v3 v3
1365 punpcklwd mm0
, mm0
; v2 v2 v2 v2 v3 v3 v3 v3
1366 movq mm3
, mm0
; v2 v2 v2 v2 v3 v3 v3 v3
1367 punpckldq mm0
, mm0
; v3 v3 v3 v3 v3 v3 v3 v3
1368 punpckhdq mm3
, mm3
; v2 v2 v2 v2 v2 v2 v2 v2
1369 movq
[edi
], mm0
; move to memory v3
1370 punpckhwd mm2
, mm2
; v0 v0 v0 v0 v1 v1 v1 v1
1371 movq
[edi
+8], mm3
; move to memory v2
1372 movq mm4
, mm2
; v0 v0 v0 v0 v1 v1 v1 v1
1373 punpckldq mm2
, mm2
; v1 v1 v1 v1 v1 v1 v1 v1
1374 punpckhdq mm4
, mm4
; v0 v0 v0 v0 v0 v0 v0 v0
1375 movq
[edi
+16], mm2
; move to memory v1
1376 movq
[edi
+24], mm4
; move to memory v0
1387 for (i
= width
; i
; i
--)
1391 /* I simplified this part in version 1.0.4e
1392 * here and in several other instances where
1393 * pixel_bytes == 1 -- GR-P
1398 * png_memcpy(v, sptr, pixel_bytes);
1399 * for (j = 0; j < png_pass_inc[pass]; j++)
1401 * png_memcpy(dp, v, pixel_bytes);
1402 * dp -= pixel_bytes;
1404 * sptr -= pixel_bytes;
1406 * Replacement code is in the next three lines:
1409 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
1414 else if (((pass
== 2) || (pass
== 3)) && width
)
1416 int width_mmx
= ((width
>> 2) << 2);
1428 movd mm0
, [esi
] ; X X X X v0 v1 v2 v3
1429 punpcklbw mm0
, mm0
; v0 v0 v1 v1 v2 v2 v3 v3
1430 movq mm1
, mm0
; v0 v0 v1 v1 v2 v2 v3 v3
1431 punpcklwd mm0
, mm0
; v2 v2 v2 v2 v3 v3 v3 v3
1432 punpckhwd mm1
, mm1
; v0 v0 v0 v0 v1 v1 v1 v1
1433 movq
[edi
], mm0
; move to memory v2
and v3
1435 movq
[edi
+8], mm1
; move to memory v1
and v0
1445 for (i
= width
; i
; i
--)
1449 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
1456 else if (width
) /* && ((pass == 4) || (pass == 5))) */
/* width rounded down to a multiple of 8 pixels. */
1458 int width_mmx
= ((width
>> 3) << 3);
1470 movq mm0
, [esi
] ; v0 v1 v2 v3 v4 v5 v6 v7
1471 movq mm1
, mm0
; v0 v1 v2 v3 v4 v5 v6 v7
1472 punpcklbw mm0
, mm0
; v4 v4 v5 v5 v6 v6 v7 v7
1473 /*movq mm1, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3 */
1474 punpckhbw mm1
, mm1
;v0 v0 v1 v1 v2 v2 v3 v3
1475 movq
[edi
+8], mm1
; move to memory v0 v1 v2
and v3
1477 movq
[edi
], mm0
; move to memory v4 v5 v6
and v7
1488 for (i
= width
; i
; i
--)
1492 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
1499 } /* end of pixel_bytes == 1 */
/* 2 bytes per pixel.  Each branch loads two pixels (v1 v0 and v3 v2) from
 * [esi], replicates them with unpack instructions, and then finishes with a
 * C loop over the `width` pixels left at that point. */
1501 else if (pixel_bytes
== 2)
1503 if (((pass
== 0) || (pass
== 1)) && width
)
/* width rounded down to an even number of pixels. */
1505 int width_mmx
= ((width
>> 1) << 1);
1517 movd mm0
, [esi
] ; X X X X v1 v0 v3 v2
1518 punpcklwd mm0
, mm0
; v1 v0 v1 v0 v3 v2 v3 v2
1519 movq mm1
, mm0
; v1 v0 v1 v0 v3 v2 v3 v2
1520 punpckldq mm0
, mm0
; v3 v2 v3 v2 v3 v2 v3 v2
1521 punpckhdq mm1
, mm1
; v1 v0 v1 v0 v1 v0 v1 v0
1524 movq
[edi
+ 16], mm1
1525 movq
[edi
+ 24], mm1
1534 sptr
-= (width_mmx
*2 - 2); /* sign fixed */
1535 dp
-= (width_mmx
*16 - 2); /* sign fixed; 16 == 8 copies * 2 bytes */
1536 for (i
= width
; i
; i
--)
1541 png_memcpy(v
, sptr
, 2);
1542 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
1545 png_memcpy(dp
, v
, 2);
1549 else if (((pass
== 2) || (pass
== 3)) && width
)
1551 int width_mmx
= ((width
>> 1) << 1) ;
1563 movd mm0
, [esi
] ; X X X X v1 v0 v3 v2
1564 punpcklwd mm0
, mm0
; v1 v0 v1 v0 v3 v2 v3 v2
1565 movq mm1
, mm0
; v1 v0 v1 v0 v3 v2 v3 v2
1566 punpckldq mm0
, mm0
; v3 v2 v3 v2 v3 v2 v3 v2
1567 punpckhdq mm1
, mm1
; v1 v0 v1 v0 v1 v0 v1 v0
1579 sptr
-= (width_mmx
*2 - 2); /* sign fixed */
1580 dp
-= (width_mmx
*8 - 2); /* sign fixed; 8 == 4 copies * 2 bytes */
1581 for (i
= width
; i
; i
--)
1586 png_memcpy(v
, sptr
, 2);
1587 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
1590 png_memcpy(dp
, v
, 2);
1594 else if (width
) /* pass == 4 or 5 */
1596 int width_mmx
= ((width
>> 1) << 1) ;
1608 movd mm0
, [esi
] ; X X X X v1 v0 v3 v2
1609 punpcklwd mm0
, mm0
; v1 v0 v1 v0 v3 v2 v3 v2
1619 sptr
-= (width_mmx
*2 - 2); /* sign fixed */
1620 dp
-= (width_mmx
*4 - 2); /* sign fixed; 4 == 2 copies * 2 bytes */
1621 for (i
= width
; i
; i
--)
1626 png_memcpy(v
, sptr
, 2);
1627 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
1630 png_memcpy(dp
, v
, 2);
1634 } /* end of pixel_bytes == 2 */
/* 4 bytes per pixel.  Each branch loads two pixels (v3..v0 and v7..v4) from
 * [esi], duplicates each across a full register with punpckldq/punpckhdq,
 * and then finishes with a C loop over the `width` pixels left at that
 * point. */
1636 else if (pixel_bytes
== 4)
1638 if (((pass
== 0) || (pass
== 1)) && width
)
/* width rounded down to an even number of pixels. */
1640 int width_mmx
= ((width
>> 1) << 1) ;
1652 movq mm0
, [esi
] ; v3 v2 v1 v0 v7 v6 v5 v4
1653 movq mm1
, mm0
; v3 v2 v1 v0 v7 v6 v5 v4
1654 punpckldq mm0
, mm0
; v7 v6 v5 v4 v7 v6 v5 v4
1655 punpckhdq mm1
, mm1
; v3 v2 v1 v0 v3 v2 v1 v0
1658 movq
[edi
+ 16], mm0
1659 movq
[edi
+ 24], mm0
1661 movq
[edi
+ 40], mm1
1664 movq
[edi
+ 56], mm1
1672 sptr
-= (width_mmx
*4 - 4); /* sign fixed */
1673 dp
-= (width_mmx
*32 - 4); /* sign fixed; 32 == 8 copies * 4 bytes */
1674 for (i
= width
; i
; i
--)
1679 png_memcpy(v
, sptr
, 4);
1680 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
1683 png_memcpy(dp
, v
, 4);
1687 else if (((pass
== 2) || (pass
== 3)) && width
)
1689 int width_mmx
= ((width
>> 1) << 1) ;
1701 movq mm0
, [esi
] ; v3 v2 v1 v0 v7 v6 v5 v4
1702 movq mm1
, mm0
; v3 v2 v1 v0 v7 v6 v5 v4
1703 punpckldq mm0
, mm0
; v7 v6 v5 v4 v7 v6 v5 v4
1704 punpckhdq mm1
, mm1
; v3 v2 v1 v0 v3 v2 v1 v0
1708 movq
[edi
+ 24], mm1
1717 sptr
-= (width_mmx
*4 - 4); /* sign fixed */
1718 dp
-= (width_mmx
*16 - 4); /* sign fixed; 16 == 4 copies * 4 bytes */
1719 for (i
= width
; i
; i
--)
1724 png_memcpy(v
, sptr
, 4);
1725 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
1728 png_memcpy(dp
, v
, 4);
1732 else if (width
) /* pass == 4 or 5 */
1734 int width_mmx
= ((width
>> 1) << 1) ;
1746 movq mm0
, [esi
] ; v3 v2 v1 v0 v7 v6 v5 v4
1747 movq mm1
, mm0
; v3 v2 v1 v0 v7 v6 v5 v4
1748 punpckldq mm0
, mm0
; v7 v6 v5 v4 v7 v6 v5 v4
1749 punpckhdq mm1
, mm1
; v3 v2 v1 v0 v3 v2 v1 v0
1760 sptr
-= (width_mmx
*4 - 4); /* sign fixed */
1761 dp
-= (width_mmx
*8 - 4); /* sign fixed; 8 == 2 copies * 4 bytes */
1762 for (i
= width
; i
; i
--)
1767 png_memcpy(v
, sptr
, 4);
1768 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
1771 png_memcpy(dp
, v
, 4);
1776 } /* end of pixel_bytes == 4 */
/* 6 bytes per pixel, still inside the MMX-enabled branch but written in
 * plain C: read one pixel into v and write it png_pass_inc[pass] times. */
1778 else if (pixel_bytes
== 6)
1780 for (i
= width
; i
; i
--)
1784 png_memcpy(v
, sptr
, 6);
1785 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
1787 png_memcpy(dp
, v
, 6);
1792 } /* end of pixel_bytes == 6 */
/* Same replication loop with the pixel size taken from pixel_bytes at run
 * time instead of a constant. */
1796 for (i
= width
; i
; i
--)
1800 png_memcpy(v
, sptr
, pixel_bytes
);
1801 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
1803 png_memcpy(dp
, v
, pixel_bytes
);
1809 } /* end of mmx_supported */
1811 else /* MMX not supported: use modified C code - takes advantage
1812 * of inlining of memcpy for a constant */
/* One branch per pixel size (1, 3, 2, 4, 6 bytes, then any other size).
 * Every branch counts i down from width; the multi-byte branches read one
 * pixel into v, write it png_pass_inc[pass] times, and step sptr back by
 * pixel_bytes. */
1814 if (pixel_bytes
== 1)
1816 for (i
= width
; i
; i
--)
1819 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
1824 else if (pixel_bytes
== 3)
1826 for (i
= width
; i
; i
--)
1830 png_memcpy(v
, sptr
, pixel_bytes
);
1831 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
1833 png_memcpy(dp
, v
, pixel_bytes
);
1836 sptr
-= pixel_bytes
;
1839 else if (pixel_bytes
== 2)
1841 for (i
= width
; i
; i
--)
1845 png_memcpy(v
, sptr
, pixel_bytes
);
1846 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
1848 png_memcpy(dp
, v
, pixel_bytes
);
1851 sptr
-= pixel_bytes
;
1854 else if (pixel_bytes
== 4)
1856 for (i
= width
; i
; i
--)
1860 png_memcpy(v
, sptr
, pixel_bytes
);
1861 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
1863 png_memcpy(dp
, v
, pixel_bytes
);
1866 sptr
-= pixel_bytes
;
1869 else if (pixel_bytes
== 6)
1871 for (i
= width
; i
; i
--)
1875 png_memcpy(v
, sptr
, pixel_bytes
);
1876 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
1878 png_memcpy(dp
, v
, pixel_bytes
);
1881 sptr
-= pixel_bytes
;
1886 for (i
= width
; i
; i
--)
1890 png_memcpy(v
, sptr
, pixel_bytes
);
1891 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
1893 png_memcpy(dp
, v
, pixel_bytes
);
1896 sptr
-= pixel_bytes
;
1900 } /* end of MMX not supported */
1903 } /* end switch (row_info->pixel_depth) */
/* Record the expanded geometry: the row is now final_width pixels wide and
 * its byte count is recomputed from the pixel depth and that width. */
1905 row_info
->width
= final_width
;
1907 row_info
->rowbytes
= PNG_ROWBYTES(row_info
->pixel_depth
,final_width
);
1912 #endif /* PNG_READ_INTERLACING_SUPPORTED */
1915 /* These variables are utilized in the functions below. They are declared */
1916 /* globally here to ensure alignment on 8-byte boundaries. */
1921 } LBCarryMask
= {0x0101010101010101},
1922 HBClearMask
= {0x7f7f7f7f7f7f7f7f},
1923 ActiveMask
, ActiveMask2
, ActiveMaskEnd
, ShiftBpp
, ShiftRem
;
1926 /* Optimized code for PNG Average filter decoder */
1928 png_read_filter_row_mmx_avg(png_row_infop row_info
, png_bytep row
1929 , png_bytep prev_row
)
1932 png_uint_32 FullLength
;
1933 png_uint_32 MMXLength
;
1934 /*png_uint_32 len; */
1937 bpp
= (row_info
->pixel_depth
+ 7) >> 3; /* Get # bytes per pixel */
1938 FullLength
= row_info
->rowbytes
; /* # of bytes to filter */
1940 /* Init address pointers and offset */
1941 mov edi
, row
/* edi ==> Avg(x) */
1942 xor ebx
, ebx
/* ebx ==> x */
1944 mov esi
, prev_row
/* esi ==> Prior(x) */
1945 sub edx
, bpp
/* edx ==> Raw(x-bpp) */
1948 /* Compute the Raw value for the first bpp bytes */
1949 /* Raw(x) = Avg(x) + (Prior(x)/2) */
1951 mov al
, [esi
+ ebx
] /* Load al with Prior(x) */
1953 shr al
, 1 /* divide by 2 */
1954 add al
, [edi
+ebx
-1] /* Add Avg(x); -1 to offset inc ebx */
1956 mov
[edi
+ebx
-1], al
/* Write back Raw(x); */
1957 /* mov does not affect flags; -1 to offset inc ebx */
1959 /* get # of bytes to alignment */
1960 mov diff
, edi
/* take start of row */
1961 add diff
, ebx
/* add bpp */
1962 add diff
, 0xf /* add 7 + 8 to incr past alignment boundary */
1963 and diff
, 0xfffffff8 /* mask to alignment boundary */
1964 sub diff
, edi
/* subtract from start ==> value ebx at alignment */
1967 /* Compute the Raw value for the bytes upto the alignment boundary */
1968 /* Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2) */
1972 mov cl
, [esi
+ ebx
] /* load cl with Prior(x) */
1973 mov al
, [edx
+ ebx
] /* load al with Raw(x-bpp) */
1976 shr ax
, 1 /* divide by 2 */
1977 add al
, [edi
+ebx
-1] /* Add Avg(x); -1 to offset inc ebx */
1978 cmp ebx
, diff
/* Check if at alignment boundary */
1979 mov
[edi
+ebx
-1], al
/* Write back Raw(x); */
1980 /* mov does not affect flags; -1 to offset inc ebx */
1981 jb davglp1
/* Repeat until at alignment boundary */
1985 sub eax
, ebx
/* subtract alignment fix */
1986 and eax
, 0x00000007 /* calc bytes over mult of 8 */
1987 sub ecx
, eax
/* drop over bytes from original length */
1989 } /* end _asm block */
1990 /* Now do the math for the rest of the row */
1995 ActiveMask
.use
= 0x0000000000ffffff;
1996 ShiftBpp
.use
= 24; /* == 3 * 8 */
1997 ShiftRem
.use
= 40; /* == 64 - 24 */
1999 /* Re-init address pointers and offset */
2000 movq mm7
, ActiveMask
2001 mov ebx
, diff
/* ebx ==> x = offset to alignment boundary */
2002 movq mm5
, LBCarryMask
2003 mov edi
, row
/* edi ==> Avg(x) */
2004 movq mm4
, HBClearMask
2005 mov esi
, prev_row
/* esi ==> Prior(x) */
2006 /* PRIME the pump (load the first Raw(x-bpp) data set */
2007 movq mm2
, [edi
+ ebx
- 8] /* Load previous aligned 8 bytes */
2008 /* (we correct position in loop below) */
2010 movq mm0
, [edi
+ ebx
] /* Load mm0 with Avg(x) */
2011 /* Add (Prev_row/2) to Average */
2013 psrlq mm2
, ShiftRem
/* Correct position Raw(x-bpp) data */
2014 movq mm1
, [esi
+ ebx
] /* Load mm1 with Prior(x) */
2016 pand mm3
, mm1
/* get lsb for each prev_row byte */
2017 psrlq mm1
, 1 /* divide prev_row bytes by 2 */
2018 pand mm1
, mm4
/* clear invalid bit 7 of each byte */
2019 paddb mm0
, mm1
/* add (Prev_row/2) to Avg for each byte */
2020 /* Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry */
2021 movq mm1
, mm3
/* now use mm1 for getting LBCarrys */
2022 pand mm1
, mm2
/* get LBCarrys for each byte where both */
2023 /* lsb's were == 1 (Only valid for active group) */
2024 psrlq mm2
, 1 /* divide raw bytes by 2 */
2025 pand mm2
, mm4
/* clear invalid bit 7 of each byte */
2026 paddb mm2
, mm1
/* add LBCarrys to (Raw(x-bpp)/2) for each byte */
2027 pand mm2
, mm6
/* Leave only Active Group 1 bytes to add to Avg */
2028 paddb mm0
, mm2
/* add (Raw/2) + LBCarrys to Avg for each Active */
2030 /* Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry */
2031 psllq mm6
, ShiftBpp
/* shift the mm6 mask to cover bytes 3-5 */
2032 movq mm2
, mm0
/* mov updated Raws to mm2 */
2033 psllq mm2
, ShiftBpp
/* shift data to position correctly */
2034 movq mm1
, mm3
/* now use mm1 for getting LBCarrys */
2035 pand mm1
, mm2
/* get LBCarrys for each byte where both */
2036 /* lsb's were == 1 (Only valid for active group) */
2037 psrlq mm2
, 1 /* divide raw bytes by 2 */
2038 pand mm2
, mm4
/* clear invalid bit 7 of each byte */
2039 paddb mm2
, mm1
/* add LBCarrys to (Raw(x-bpp)/2) for each byte */
2040 pand mm2
, mm6
/* Leave only Active Group 2 bytes to add to Avg */
2041 paddb mm0
, mm2
/* add (Raw/2) + LBCarrys to Avg for each Active */
2044 /* Add 3rd active group (Raw(x-bpp)/2) to Average with LBCarry */
2045 psllq mm6
, ShiftBpp
/* shift the mm6 mask to cover the last two */
2047 movq mm2
, mm0
/* mov updated Raws to mm2 */
2048 psllq mm2
, ShiftBpp
/* shift data to position correctly */
2049 /* Data only needs to be shifted once here to */
2050 /* get the correct x-bpp offset. */
2051 movq mm1
, mm3
/* now use mm1 for getting LBCarrys */
2052 pand mm1
, mm2
/* get LBCarrys for each byte where both */
2053 /* lsb's were == 1 (Only valid for active group) */
2054 psrlq mm2
, 1 /* divide raw bytes by 2 */
2055 pand mm2
, mm4
/* clear invalid bit 7 of each byte */
2056 paddb mm2
, mm1
/* add LBCarrys to (Raw(x-bpp)/2) for each byte */
2057 pand mm2
, mm6
/* Leave only Active Group 3 bytes to add to Avg */
2059 paddb mm0
, mm2
/* add (Raw/2) + LBCarrys to Avg for each Active */
2062 /* Now ready to write back to memory */
2063 movq
[edi
+ ebx
- 8], mm0
2064 /* Move updated Raw(x) to use as Raw(x-bpp) for next loop */
2066 movq mm2
, mm0
/* mov updated Raw(x) to mm2 */
2068 } /* end _asm block */
2077 ActiveMask
.use
= 0xffffffffffffffff; /* use shift below to clear */
2078 /* appropriate inactive bytes */
2079 ShiftBpp
.use
= bpp
<< 3;
2080 ShiftRem
.use
= 64 - ShiftBpp
.use
;
2082 movq mm4
, HBClearMask
2083 /* Re-init address pointers and offset */
2084 mov ebx
, diff
/* ebx ==> x = offset to alignment boundary */
2085 /* Load ActiveMask and clear all bytes except for 1st active group */
2086 movq mm7
, ActiveMask
2087 mov edi
, row
/* edi ==> Avg(x) */
2089 mov esi
, prev_row
/* esi ==> Prior(x) */
2091 movq mm5
, LBCarryMask
2092 psllq mm6
, ShiftBpp
/* Create mask for 2nd active group */
2093 /* PRIME the pump (load the first Raw(x-bpp) data set */
2094 movq mm2
, [edi
+ ebx
- 8] /* Load previous aligned 8 bytes */
2095 /* (we correct position in loop below) */
2097 movq mm0
, [edi
+ ebx
]
2098 psrlq mm2
, ShiftRem
/* shift data to position correctly */
2099 movq mm1
, [esi
+ ebx
]
2100 /* Add (Prev_row/2) to Average */
2102 pand mm3
, mm1
/* get lsb for each prev_row byte */
2103 psrlq mm1
, 1 /* divide prev_row bytes by 2 */
2104 pand mm1
, mm4
/* clear invalid bit 7 of each byte */
2105 paddb mm0
, mm1
/* add (Prev_row/2) to Avg for each byte */
2106 /* Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry */
2107 movq mm1
, mm3
/* now use mm1 for getting LBCarrys */
2108 pand mm1
, mm2
/* get LBCarrys for each byte where both */
2109 /* lsb's were == 1 (Only valid for active group) */
2110 psrlq mm2
, 1 /* divide raw bytes by 2 */
2111 pand mm2
, mm4
/* clear invalid bit 7 of each byte */
2112 paddb mm2
, mm1
/* add LBCarrys to (Raw(x-bpp)/2) for each byte */
2113 pand mm2
, mm7
/* Leave only Active Group 1 bytes to add to Avg */
2114 paddb mm0
, mm2
/* add (Raw/2) + LBCarrys to Avg for each Active */
2116 /* Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry */
2117 movq mm2
, mm0
/* mov updated Raws to mm2 */
2118 psllq mm2
, ShiftBpp
/* shift data to position correctly */
2120 movq mm1
, mm3
/* now use mm1 for getting LBCarrys */
2121 pand mm1
, mm2
/* get LBCarrys for each byte where both */
2122 /* lsb's were == 1 (Only valid for active group) */
2123 psrlq mm2
, 1 /* divide raw bytes by 2 */
2124 pand mm2
, mm4
/* clear invalid bit 7 of each byte */
2125 paddb mm2
, mm1
/* add LBCarrys to (Raw(x-bpp)/2) for each byte */
2126 pand mm2
, mm6
/* Leave only Active Group 2 bytes to add to Avg */
2127 paddb mm0
, mm2
/* add (Raw/2) + LBCarrys to Avg for each Active */
2130 /* Now ready to write back to memory */
2131 movq
[edi
+ ebx
- 8], mm0
2132 /* Prep Raw(x-bpp) for next loop */
2133 movq mm2
, mm0
/* mov updated Raws to mm2 */
2135 } /* end _asm block */
2140 ActiveMask
.use
= 0x000000000000ffff;
2141 ShiftBpp
.use
= 16; /* == 2 * 8 [BUGFIX] */
2142 ShiftRem
.use
= 48; /* == 64 - 16 [BUGFIX] */
2144 /* Load ActiveMask */
2145 movq mm7
, ActiveMask
2146 /* Re-init address pointers and offset */
2147 mov ebx
, diff
/* ebx ==> x = offset to alignment boundary */
2148 movq mm5
, LBCarryMask
2149 mov edi
, row
/* edi ==> Avg(x) */
2150 movq mm4
, HBClearMask
2151 mov esi
, prev_row
/* esi ==> Prior(x) */
2152 /* PRIME the pump (load the first Raw(x-bpp) data set */
2153 movq mm2
, [edi
+ ebx
- 8] /* Load previous aligned 8 bytes */
2154 /* (we correct position in loop below) */
2156 movq mm0
, [edi
+ ebx
]
2157 psrlq mm2
, ShiftRem
/* shift data to position correctly [BUGFIX] */
2158 movq mm1
, [esi
+ ebx
]
2159 /* Add (Prev_row/2) to Average */
2161 pand mm3
, mm1
/* get lsb for each prev_row byte */
2162 psrlq mm1
, 1 /* divide prev_row bytes by 2 */
2163 pand mm1
, mm4
/* clear invalid bit 7 of each byte */
2165 paddb mm0
, mm1
/* add (Prev_row/2) to Avg for each byte */
2166 /* Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry */
2167 movq mm1
, mm3
/* now use mm1 for getting LBCarrys */
2168 pand mm1
, mm2
/* get LBCarrys for each byte where both */
2169 /* lsb's were == 1 (Only valid for active group) */
2170 psrlq mm2
, 1 /* divide raw bytes by 2 */
2171 pand mm2
, mm4
/* clear invalid bit 7 of each byte */
2172 paddb mm2
, mm1
/* add LBCarrys to (Raw(x-bpp)/2) for each byte */
2173 pand mm2
, mm6
/* Leave only Active Group 1 bytes to add to Avg */
2174 paddb mm0
, mm2
/* add (Raw/2) + LBCarrys to Avg for each Active byte */
2175 /* Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry */
2176 psllq mm6
, ShiftBpp
/* shift the mm6 mask to cover bytes 2 & 3 */
2177 movq mm2
, mm0
/* mov updated Raws to mm2 */
2178 psllq mm2
, ShiftBpp
/* shift data to position correctly */
2179 movq mm1
, mm3
/* now use mm1 for getting LBCarrys */
2180 pand mm1
, mm2
/* get LBCarrys for each byte where both */
2181 /* lsb's were == 1 (Only valid for active group) */
2182 psrlq mm2
, 1 /* divide raw bytes by 2 */
2183 pand mm2
, mm4
/* clear invalid bit 7 of each byte */
2184 paddb mm2
, mm1
/* add LBCarrys to (Raw(x-bpp)/2) for each byte */
2185 pand mm2
, mm6
/* Leave only Active Group 2 bytes to add to Avg */
2186 paddb mm0
, mm2
/* add (Raw/2) + LBCarrys to Avg for each Active byte */
2188 /* Add 3rd active group (Raw(x-bpp)/2) to Average with LBCarry */
2189 psllq mm6
, ShiftBpp
/* shift the mm6 mask to cover bytes 4 & 5 */
2190 movq mm2
, mm0
/* mov updated Raws to mm2 */
2191 psllq mm2
, ShiftBpp
/* shift data to position correctly */
2192 /* Data only needs to be shifted once here to */
2193 /* get the correct x-bpp offset. */
2194 movq mm1
, mm3
/* now use mm1 for getting LBCarrys */
2195 pand mm1
, mm2
/* get LBCarrys for each byte where both */
2196 /* lsb's were == 1 (Only valid for active group) */
2197 psrlq mm2
, 1 /* divide raw bytes by 2 */
2198 pand mm2
, mm4
/* clear invalid bit 7 of each byte */
2199 paddb mm2
, mm1
/* add LBCarrys to (Raw(x-bpp)/2) for each byte */
2200 pand mm2
, mm6
/* Leave only Active Group 3 bytes to add to Avg */
2201 paddb mm0
, mm2
/* add (Raw/2) + LBCarrys to Avg for each Active byte */
2203 /* Add 4th active group (Raw(x-bpp)/2) to Average with LBCarry */
2204 psllq mm6
, ShiftBpp
/* shift the mm6 mask to cover bytes 6 & 7 */
2205 movq mm2
, mm0
/* mov updated Raws to mm2 */
2206 psllq mm2
, ShiftBpp
/* shift data to position correctly */
2207 /* Data only needs to be shifted once here to */
2208 /* get the correct x-bpp offset. */
2210 movq mm1
, mm3
/* now use mm1 for getting LBCarrys */
2211 pand mm1
, mm2
/* get LBCarrys for each byte where both */
2212 /* lsb's were == 1 (Only valid for active group) */
2213 psrlq mm2
, 1 /* divide raw bytes by 2 */
2214 pand mm2
, mm4
/* clear invalid bit 7 of each byte */
2215 paddb mm2
, mm1
/* add LBCarrys to (Raw(x-bpp)/2) for each byte */
2216 pand mm2
, mm6
/* Leave only Active Group 4 bytes to add to Avg */
2217 paddb mm0
, mm2
/* add (Raw/2) + LBCarrys to Avg for each Active byte */
2220 /* Now ready to write back to memory */
2221 movq
[edi
+ ebx
- 8], mm0
2222 /* Prep Raw(x-bpp) for next loop */
2223 movq mm2
, mm0
/* mov updated Raws to mm2 */
2225 } /* end _asm block */
2229 case 1: /* bpp == 1 */
2232 /* Re-init address pointers and offset */
2233 mov ebx
, diff
/* ebx ==> x = offset to alignment boundary */
2234 mov edi
, row
/* edi ==> Avg(x) */
2235 cmp ebx
, FullLength
/* Test if offset at end of array */
2237 /* Do Avg decode for remaining bytes */
2238 mov esi
, prev_row
/* esi ==> Prior(x) */
2240 xor ecx
, ecx
/* zero ecx before using cl & cx in loop below */
2241 sub edx
, bpp
/* edx ==> Raw(x-bpp) */
2243 /* Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2) */
2245 mov cl
, [esi
+ ebx
] /* load cl with Prior(x) */
2246 mov al
, [edx
+ ebx
] /* load al with Raw(x-bpp) */
2249 shr ax
, 1 /* divide by 2 */
2250 add al
, [edi
+ebx
-1] /* Add Avg(x); -1 to offset inc ebx */
2251 cmp ebx
, FullLength
/* Check if at end of array */
2252 mov
[edi
+ebx
-1], al
/* Write back Raw(x); */
2253 /* mov does not affect flags; -1 to offset inc ebx */
2256 } /* end _asm block */
2260 case 8: /* bpp == 8 */
2263 /* Re-init address pointers and offset */
2264 mov ebx
, diff
/* ebx ==> x = offset to alignment boundary */
2265 movq mm5
, LBCarryMask
2266 mov edi
, row
/* edi ==> Avg(x) */
2267 movq mm4
, HBClearMask
2268 mov esi
, prev_row
/* esi ==> Prior(x) */
2269 /* PRIME the pump (load the first Raw(x-bpp) data set */
2270 movq mm2
, [edi
+ ebx
- 8] /* Load previous aligned 8 bytes */
2271 /* (NO NEED to correct position in loop below) */
2273 movq mm0
, [edi
+ ebx
]
2275 movq mm1
, [esi
+ ebx
]
2277 pand mm3
, mm1
/* get lsb for each prev_row byte */
2278 psrlq mm1
, 1 /* divide prev_row bytes by 2 */
2279 pand mm3
, mm2
/* get LBCarrys for each byte where both */
2280 /* lsb's were == 1 */
2281 psrlq mm2
, 1 /* divide raw bytes by 2 */
2282 pand mm1
, mm4
/* clear invalid bit 7 of each byte */
2283 paddb mm0
, mm3
/* add LBCarrys to Avg for each byte */
2284 pand mm2
, mm4
/* clear invalid bit 7 of each byte */
2285 paddb mm0
, mm1
/* add (Prev_row/2) to Avg for each byte */
2286 paddb mm0
, mm2
/* add (Raw/2) to Avg for each byte */
2288 movq
[edi
+ ebx
- 8], mm0
2289 movq mm2
, mm0
/* reuse as Raw(x-bpp) */
2291 } /* end _asm block */
2294 default: /* bpp greater than 8 */
2297 movq mm5
, LBCarryMask
2298 /* Re-init address pointers and offset */
2299 mov ebx
, diff
/* ebx ==> x = offset to alignment boundary */
2300 mov edi
, row
/* edi ==> Avg(x) */
2301 movq mm4
, HBClearMask
2303 mov esi
, prev_row
/* esi ==> Prior(x) */
2304 sub edx
, bpp
/* edx ==> Raw(x-bpp) */
2306 movq mm0
, [edi
+ ebx
]
2308 movq mm1
, [esi
+ ebx
]
2309 pand mm3
, mm1
/* get lsb for each prev_row byte */
2310 movq mm2
, [edx
+ ebx
]
2311 psrlq mm1
, 1 /* divide prev_row bytes by 2 */
2312 pand mm3
, mm2
/* get LBCarrys for each byte where both */
2313 /* lsb's were == 1 */
2314 psrlq mm2
, 1 /* divide raw bytes by 2 */
2315 pand mm1
, mm4
/* clear invalid bit 7 of each byte */
2316 paddb mm0
, mm3
/* add LBCarrys to Avg for each byte */
2317 pand mm2
, mm4
/* clear invalid bit 7 of each byte */
2318 paddb mm0
, mm1
/* add (Prev_row/2) to Avg for each byte */
2320 paddb mm0
, mm2
/* add (Raw/2) to Avg for each byte */
2322 movq
[edi
+ ebx
- 8], mm0
2324 } /* end _asm block */
2327 } /* end switch ( bpp ) */
2330 /* MMX acceleration complete now do clean-up */
2331 /* Check if any remaining bytes left to decode */
2332 mov ebx
, MMXLength
/* ebx ==> x = offset bytes remaining after MMX */
2333 mov edi
, row
/* edi ==> Avg(x) */
2334 cmp ebx
, FullLength
/* Test if offset at end of array */
2336 /* Do Avg decode for remaining bytes */
2337 mov esi
, prev_row
/* esi ==> Prior(x) */
2339 xor ecx
, ecx
/* zero ecx before using cl & cx in loop below */
2340 sub edx
, bpp
/* edx ==> Raw(x-bpp) */
2342 /* Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2) */
2344 mov cl
, [esi
+ ebx
] /* load cl with Prior(x) */
2345 mov al
, [edx
+ ebx
] /* load al with Raw(x-bpp) */
2348 shr ax
, 1 /* divide by 2 */
2349 add al
, [edi
+ebx
-1] /* Add Avg(x); -1 to offset inc ebx */
2350 cmp ebx
, FullLength
/* Check if at end of array */
2351 mov
[edi
+ebx
-1], al
/* Write back Raw(x); */
2352 /* mov does not affect flags; -1 to offset inc ebx */
2355 emms
/* End MMX instructions; prep for possible FP instrs. */
2356 } /* end _asm block */
2359 /* Optimized code for PNG Paeth filter decoder */
2361 png_read_filter_row_mmx_paeth(png_row_infop row_info
, png_bytep row
,
2364 png_uint_32 FullLength
;
2365 png_uint_32 MMXLength
;
2366 /*png_uint_32 len; */
2370 int patemp
, pbtemp
, pctemp
;
2372 bpp
= (row_info
->pixel_depth
+ 7) >> 3; /* Get # bytes per pixel */
2373 FullLength
= row_info
->rowbytes
; /* # of bytes to filter */
2376 xor ebx
, ebx
/* ebx ==> x offset */
2378 xor edx
, edx
/* edx ==> x-bpp offset */
2382 /* Compute the Raw value for the first bpp bytes */
2383 /* Note: the formula works out to be always */
2384 /* Raw(x) = Paeth(x) + Prior(x) where x < bpp */
2390 mov
[edi
+ ebx
- 1], al
2392 /* get # of bytes to alignment */
2393 mov diff
, edi
/* take start of row */
2394 add diff
, ebx
/* add bpp */
2396 add diff
, 0xf /* add 7 + 8 to incr past alignment boundary */
2397 and diff
, 0xfffffff8 /* mask to alignment boundary */
2398 sub diff
, edi
/* subtract from start ==> value ebx at alignment */
2403 /* pav = p - a = (a + b - c) - a = b - c */
2404 mov al
, [esi
+ ebx
] /* load Prior(x) into al */
2405 mov cl
, [esi
+ edx
] /* load Prior(x-bpp) into cl */
2406 sub eax
, ecx
/* subtract Prior(x-bpp) */
2407 mov patemp
, eax
/* Save pav for later use */
2409 /* pbv = p - b = (a + b - c) - b = a - c */
2410 mov al
, [edi
+ edx
] /* load Raw(x-bpp) into al */
2411 sub eax
, ecx
/* subtract Prior(x-bpp) */
2413 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
2414 add eax
, patemp
/* pcv = pav + pbv */
2416 test eax
, 0x80000000
2418 neg eax
/* reverse sign of neg values */
2420 mov pctemp
, eax
/* save pc for later use */
2422 test ecx
, 0x80000000
2424 neg ecx
/* reverse sign of neg values */
2426 mov pbtemp
, ecx
/* save pb for later use */
2429 test eax
, 0x80000000
2431 neg eax
/* reverse sign of neg values */
2433 mov patemp
, eax
/* save pa for later use */
2434 /* test if pa <= pb */
2437 /* pa > pb; now test if pb <= pc */
2440 /* pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp) */
2441 mov cl
, [esi
+ edx
] /* load Prior(x-bpp) into cl */
2444 /* pb <= pc; Raw(x) = Paeth(x) + Prior(x) */
2445 mov cl
, [esi
+ ebx
] /* load Prior(x) into cl */
2448 /* pa <= pb; now test if pa <= pc */
2451 /* pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp) */
2452 mov cl
, [esi
+ edx
] /* load Prior(x-bpp) into cl */
2455 /* pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp) */
2456 mov cl
, [edi
+ edx
] /* load Raw(x-bpp) into cl */
2460 /* Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256 */
2461 add
[edi
+ ebx
- 1], cl
2467 sub eax
, ebx
/* subtract alignment fix */
2468 and eax
, 0x00000007 /* calc bytes over mult of 8 */
2469 sub ecx
, eax
/* drop over bytes from original length */
2471 } /* end _asm block */
2472 /* Now do the math for the rest of the row */
2477 ActiveMask
.use
= 0x0000000000ffffff;
2478 ActiveMaskEnd
.use
= 0xffff000000000000;
2479 ShiftBpp
.use
= 24; /* == bpp(3) * 8 */
2480 ShiftRem
.use
= 40; /* == 64 - 24 */
2487 /* PRIME the pump (load the first Raw(x-bpp) data set */
2488 movq mm1
, [edi
+ebx
-8]
2490 psrlq mm1
, ShiftRem
/* shift last 3 bytes to 1st 3 bytes */
2491 movq mm2
, [esi
+ ebx
] /* load b=Prior(x) */
2492 punpcklbw mm1
, mm0
/* Unpack Low bytes of a */
2493 movq mm3
, [esi
+ebx
-8] /* Prep c=Prior(x-bpp) bytes */
2494 punpcklbw mm2
, mm0
/* Unpack Low bytes of b */
2495 psrlq mm3
, ShiftRem
/* shift last 3 bytes to 1st 3 bytes */
2496 /* pav = p - a = (a + b - c) - a = b - c */
2498 punpcklbw mm3
, mm0
/* Unpack Low bytes of c */
2499 /* pbv = p - b = (a + b - c) - b = a - c */
2503 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
2507 /* pa = abs(p-a) = abs(pav) */
2508 /* pb = abs(p-b) = abs(pbv) */
2509 /* pc = abs(p-c) = abs(pcv) */
2510 pcmpgtw mm0
, mm4
/* Create mask pav bytes < 0 */
2512 pand mm0
, mm4
/* Only pav bytes < 0 in mm0 */
2513 pcmpgtw mm7
, mm5
/* Create mask pbv bytes < 0 */
2515 pand mm7
, mm5
/* Only pbv bytes < 0 in mm7 */
2519 pcmpgtw mm0
, mm6
/* Create mask pcv bytes < 0 */
2520 pand mm0
, mm6
/* Only pcv bytes < 0 in mm0 */
2526 pcmpgtw mm7
, mm5
/* pa > pb? */
2528 /* use mm7 mask to merge pa & pb */
2530 /* use mm0 mask copy to merge a & b */
2536 /* test ((pa <= pb)? pa:pb) <= pc */
2537 pcmpgtw mm7
, mm6
/* pab > pc? */
2544 movq mm3
, [esi
+ ebx
] /* load c=Prior(x-bpp) */
2545 pand mm7
, ActiveMask
2546 movq mm2
, mm3
/* load b=Prior(x) step 1 */
2547 paddb mm7
, [edi
+ ebx
] /* add Paeth predictor with Raw(x) */
2548 punpcklbw mm3
, mm0
/* Unpack Low bytes of c */
2549 movq
[edi
+ ebx
], mm7
/* write back updated value */
2550 movq mm1
, mm7
/* Now mm1 will be used as Raw(x-bpp) */
2551 /* Now do Paeth for 2nd set of bytes (3-5) */
2552 psrlq mm2
, ShiftBpp
/* load b=Prior(x) step 2 */
2553 punpcklbw mm1
, mm0
/* Unpack Low bytes of a */
2555 punpcklbw mm2
, mm0
/* Unpack Low bytes of b */
2556 /* pbv = p - b = (a + b - c) - b = a - c */
2558 /* pav = p - a = (a + b - c) - a = b - c */
2562 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = */
2563 /* pav + pbv = pbv + pav */
2567 /* pa = abs(p-a) = abs(pav) */
2568 /* pb = abs(p-b) = abs(pbv) */
2569 /* pc = abs(p-c) = abs(pcv) */
2570 pcmpgtw mm0
, mm5
/* Create mask pbv bytes < 0 */
2571 pcmpgtw mm7
, mm4
/* Create mask pav bytes < 0 */
2572 pand mm0
, mm5
/* Only pbv bytes < 0 in mm0 */
2573 pand mm7
, mm4
/* Only pav bytes < 0 in mm7 */
2579 pcmpgtw mm0
, mm6
/* Create mask pcv bytes < 0 */
2580 pand mm0
, mm6
/* Only pcv bytes < 0 in mm0 */
2585 pcmpgtw mm7
, mm5
/* pa > pb? */
2587 /* use mm7 mask to merge pa & pb */
2589 /* use mm0 mask copy to merge a & b */
2595 /* test ((pa <= pb)? pa:pb) <= pc */
2596 pcmpgtw mm7
, mm6
/* pab > pc? */
2597 movq mm2
, [esi
+ ebx
] /* load b=Prior(x) */
2604 movq mm3
, mm2
/* load c=Prior(x-bpp) step 1 */
2605 pand mm7
, ActiveMask
2606 punpckhbw mm2
, mm0
/* Unpack High bytes of b */
2607 psllq mm7
, ShiftBpp
/* Shift bytes to 2nd group of 3 bytes */
2608 /* pav = p - a = (a + b - c) - a = b - c */
2610 paddb mm7
, [edi
+ ebx
] /* add Paeth predictor with Raw(x) */
2611 psllq mm3
, ShiftBpp
/* load c=Prior(x-bpp) step 2 */
2612 movq
[edi
+ ebx
], mm7
/* write back updated value */
2614 punpckhbw mm3
, mm0
/* Unpack High bytes of c */
2615 psllq mm1
, ShiftBpp
/* Shift bytes */
2616 /* Now mm1 will be used as Raw(x-bpp) */
2617 /* Now do Paeth for 3rd, and final, set of bytes (6-7) */
2619 punpckhbw mm1
, mm0
/* Unpack High bytes of a */
2621 /* pbv = p - b = (a + b - c) - b = a - c */
2623 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
2629 /* pa = abs(p-a) = abs(pav) */
2630 /* pb = abs(p-b) = abs(pbv) */
2631 /* pc = abs(p-c) = abs(pcv) */
2632 pcmpgtw mm0
, mm4
/* Create mask pav bytes < 0 */
2633 pcmpgtw mm7
, mm5
/* Create mask pbv bytes < 0 */
2634 pand mm0
, mm4
/* Only pav bytes < 0 in mm0 */
2635 pand mm7
, mm5
/* Only pbv bytes < 0 in mm7 */
2641 pcmpgtw mm0
, mm6
/* Create mask pcv bytes < 0 */
2642 pand mm0
, mm6
/* Only pcv bytes < 0 in mm0 */
2647 pcmpgtw mm7
, mm5
/* pa > pb? */
2649 /* use mm0 mask copy to merge a & b */
2651 /* use mm7 mask to merge pa & pb */
2657 /* test ((pa <= pb)? pa:pb) <= pc */
2658 pcmpgtw mm7
, mm6
/* pab > pc? */
2664 /* Step ebx to next set of 8 bytes and repeat loop til done */
2666 pand mm1
, ActiveMaskEnd
2667 paddb mm1
, [edi
+ ebx
- 8] /* add Paeth predictor with Raw(x) */
2670 pxor mm0
, mm0
/* pxor does not affect flags */
2671 movq
[edi
+ ebx
- 8], mm1
/* write back updated value */
2672 /* mm1 will be used as Raw(x-bpp) next loop */
2673 /* mm3 ready to be used as Prior(x-bpp) next loop */
2675 } /* end _asm block */
2683 ActiveMask
.use
= 0x00000000ffffffff;
2684 ActiveMask2
.use
= 0xffffffff00000000;
2685 ShiftBpp
.use
= bpp
<< 3; /* == bpp * 8 */
2686 ShiftRem
.use
= 64 - ShiftBpp
.use
;
2692 /* PRIME the pump (load the first Raw(x-bpp) data set */
2693 movq mm1
, [edi
+ebx
-8]
2696 /* Must shift to position Raw(x-bpp) data */
2698 /* Do first set of 4 bytes */
2699 movq mm3
, [esi
+ebx
-8] /* read c=Prior(x-bpp) bytes */
2700 punpcklbw mm1
, mm0
/* Unpack Low bytes of a */
2701 movq mm2
, [esi
+ ebx
] /* load b=Prior(x) */
2702 punpcklbw mm2
, mm0
/* Unpack Low bytes of b */
2703 /* Must shift to position Prior(x-bpp) data */
2705 /* pav = p - a = (a + b - c) - a = b - c */
2707 punpcklbw mm3
, mm0
/* Unpack Low bytes of c */
2708 /* pbv = p - b = (a + b - c) - b = a - c */
2712 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
2715 /* pa = abs(p-a) = abs(pav) */
2716 /* pb = abs(p-b) = abs(pbv) */
2717 /* pc = abs(p-c) = abs(pcv) */
2718 pcmpgtw mm0
, mm4
/* Create mask pav bytes < 0 */
2720 pand mm0
, mm4
/* Only pav bytes < 0 in mm0 */
2721 pcmpgtw mm7
, mm5
/* Create mask pbv bytes < 0 */
2723 pand mm7
, mm5
/* Only pbv bytes < 0 in mm7 */
2727 pcmpgtw mm0
, mm6
/* Create mask pcv bytes < 0 */
2728 pand mm0
, mm6
/* Only pcv bytes < 0 in mm0 */
2734 pcmpgtw mm7
, mm5
/* pa > pb? */
2736 /* use mm7 mask to merge pa & pb */
2738 /* use mm0 mask copy to merge a & b */
2744 /* test ((pa <= pb)? pa:pb) <= pc */
2745 pcmpgtw mm7
, mm6
/* pab > pc? */
2752 movq mm3
, [esi
+ ebx
- 8] /* load c=Prior(x-bpp) */
2753 pand mm7
, ActiveMask
2755 movq mm2
, [esi
+ ebx
] /* load b=Prior(x) step 1 */
2756 paddb mm7
, [edi
+ ebx
] /* add Paeth predictor with Raw(x) */
2758 movq
[edi
+ ebx
], mm7
/* write back updated value */
2759 movq mm1
, [edi
+ebx
-8]
2765 punpckhbw mm3
, mm0
/* Unpack High bytes of c */
2767 /* Do second set of 4 bytes */
2768 punpckhbw mm2
, mm0
/* Unpack High bytes of b */
2769 punpckhbw mm1
, mm0
/* Unpack High bytes of a */
2770 /* pav = p - a = (a + b - c) - a = b - c */
2772 /* pbv = p - b = (a + b - c) - b = a - c */
2776 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
2779 /* pa = abs(p-a) = abs(pav) */
2780 /* pb = abs(p-b) = abs(pbv) */
2781 /* pc = abs(p-c) = abs(pcv) */
2782 pcmpgtw mm0
, mm4
/* Create mask pav bytes < 0 */
2784 pand mm0
, mm4
/* Only pav bytes < 0 in mm0 */
2785 pcmpgtw mm7
, mm5
/* Create mask pbv bytes < 0 */
2787 pand mm7
, mm5
/* Only pbv bytes < 0 in mm7 */
2791 pcmpgtw mm0
, mm6
/* Create mask pcv bytes < 0 */
2792 pand mm0
, mm6
/* Only pcv bytes < 0 in mm0 */
2798 pcmpgtw mm7
, mm5
/* pa > pb? */
2800 /* use mm7 mask to merge pa & pb */
2802 /* use mm0 mask copy to merge a & b */
2808 /* test ((pa <= pb)? pa:pb) <= pc */
2809 pcmpgtw mm7
, mm6
/* pab > pc? */
2816 /* Step ebx to next set of 8 bytes and repeat loop til done */
2819 paddb mm1
, [edi
+ ebx
- 8] /* add Paeth predictor with Raw(x) */
2821 movq
[edi
+ ebx
- 8], mm1
/* write back updated value */
2822 /* mm1 will be used as Raw(x-bpp) next loop */
2824 } /* end _asm block */
2830 ActiveMask
.use
= 0x00000000ffffffff;
2836 /* PRIME the pump (load the first Raw(x-bpp) data set */
2837 movq mm1
, [edi
+ebx
-8] /* Only time should need to read */
2838 /* a=Raw(x-bpp) bytes */
2840 /* Do first set of 4 bytes */
2841 movq mm3
, [esi
+ebx
-8] /* read c=Prior(x-bpp) bytes */
2842 punpckhbw mm1
, mm0
/* Unpack High bytes of a */
2843 movq mm2
, [esi
+ ebx
] /* load b=Prior(x) */
2844 punpcklbw mm2
, mm0
/* Unpack Low bytes of b */
2845 /* pav = p - a = (a + b - c) - a = b - c */
2847 punpckhbw mm3
, mm0
/* Unpack High bytes of c */
2848 /* pbv = p - b = (a + b - c) - b = a - c */
2852 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
2855 /* pa = abs(p-a) = abs(pav) */
2856 /* pb = abs(p-b) = abs(pbv) */
2857 /* pc = abs(p-c) = abs(pcv) */
2858 pcmpgtw mm0
, mm4
/* Create mask pav bytes < 0 */
2860 pand mm0
, mm4
/* Only pav bytes < 0 in mm0 */
2861 pcmpgtw mm7
, mm5
/* Create mask pbv bytes < 0 */
2863 pand mm7
, mm5
/* Only pbv bytes < 0 in mm7 */
2867 pcmpgtw mm0
, mm6
/* Create mask pcv bytes < 0 */
2868 pand mm0
, mm6
/* Only pcv bytes < 0 in mm0 */
2874 pcmpgtw mm7
, mm5
/* pa > pb? */
2876 /* use mm7 mask to merge pa & pb */
2878 /* use mm0 mask copy to merge a & b */
2884 /* test ((pa <= pb)? pa:pb) <= pc */
2885 pcmpgtw mm7
, mm6
/* pab > pc? */
2892 movq mm3
, [esi
+ ebx
] /* load c=Prior(x-bpp) */
2893 pand mm7
, ActiveMask
2894 movq mm2
, mm3
/* load b=Prior(x) step 1 */
2895 paddb mm7
, [edi
+ ebx
] /* add Paeth predictor with Raw(x) */
2896 punpcklbw mm3
, mm0
/* Unpack Low bytes of c */
2897 movq
[edi
+ ebx
], mm7
/* write back updated value */
2898 movq mm1
, mm7
/* Now mm1 will be used as Raw(x-bpp) */
2899 /* Do second set of 4 bytes */
2900 punpckhbw mm2
, mm0
/* Unpack High bytes of b */
2901 punpcklbw mm1
, mm0
/* Unpack Low bytes of a */
2902 /* pav = p - a = (a + b - c) - a = b - c */
2904 /* pbv = p - b = (a + b - c) - b = a - c */
2908 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
2911 /* pa = abs(p-a) = abs(pav) */
2912 /* pb = abs(p-b) = abs(pbv) */
2913 /* pc = abs(p-c) = abs(pcv) */
2914 pcmpgtw mm0
, mm4
/* Create mask pav bytes < 0 */
2916 pand mm0
, mm4
/* Only pav bytes < 0 in mm0 */
2917 pcmpgtw mm7
, mm5
/* Create mask pbv bytes < 0 */
2919 pand mm7
, mm5
/* Only pbv bytes < 0 in mm7 */
2923 pcmpgtw mm0
, mm6
/* Create mask pcv bytes < 0 */
2924 pand mm0
, mm6
/* Only pcv bytes < 0 in mm0 */
2930 pcmpgtw mm7
, mm5
/* pa > pb? */
2932 /* use mm7 mask to merge pa & pb */
2934 /* use mm0 mask copy to merge a & b */
2940 /* test ((pa <= pb)? pa:pb) <= pc */
2941 pcmpgtw mm7
, mm6
/* pab > pc? */
2948 /* Step ebx to next set of 8 bytes and repeat loop til done */
2951 paddb mm1
, [edi
+ ebx
- 8] /* add Paeth predictor with Raw(x) */
2953 movq
[edi
+ ebx
- 8], mm1
/* write back updated value */
2954 /* mm1 will be used as Raw(x-bpp) next loop */
2956 } /* end _asm block */
2959 case 8: /* bpp == 8 */
2961 ActiveMask
.use
= 0x00000000ffffffff;
2967 /* PRIME the pump (load the first Raw(x-bpp) data set */
2968 movq mm1
, [edi
+ebx
-8] /* Only time should need to read */
2969 /* a=Raw(x-bpp) bytes */
2971 /* Do first set of 4 bytes */
2972 movq mm3
, [esi
+ebx
-8] /* read c=Prior(x-bpp) bytes */
2973 punpcklbw mm1
, mm0
/* Unpack Low bytes of a */
2974 movq mm2
, [esi
+ ebx
] /* load b=Prior(x) */
2975 punpcklbw mm2
, mm0
/* Unpack Low bytes of b */
2976 /* pav = p - a = (a + b - c) - a = b - c */
2978 punpcklbw mm3
, mm0
/* Unpack Low bytes of c */
2979 /* pbv = p - b = (a + b - c) - b = a - c */
2983 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
2986 /* pa = abs(p-a) = abs(pav) */
2987 /* pb = abs(p-b) = abs(pbv) */
2988 /* pc = abs(p-c) = abs(pcv) */
2989 pcmpgtw mm0
, mm4
/* Create mask pav bytes < 0 */
2991 pand mm0
, mm4
/* Only pav bytes < 0 in mm0 */
2992 pcmpgtw mm7
, mm5
/* Create mask pbv bytes < 0 */
2994 pand mm7
, mm5
/* Only pbv bytes < 0 in mm7 */
2998 pcmpgtw mm0
, mm6
/* Create mask pcv bytes < 0 */
2999 pand mm0
, mm6
/* Only pcv bytes < 0 in mm0 */
3005 pcmpgtw mm7
, mm5
/* pa > pb? */
3007 /* use mm7 mask to merge pa & pb */
3009 /* use mm0 mask copy to merge a & b */
3015 /* test ((pa <= pb)? pa:pb) <= pc */
3016 pcmpgtw mm7
, mm6
/* pab > pc? */
3023 movq mm3
, [esi
+ebx
-8] /* read c=Prior(x-bpp) bytes */
3024 pand mm7
, ActiveMask
3025 movq mm2
, [esi
+ ebx
] /* load b=Prior(x) */
3026 paddb mm7
, [edi
+ ebx
] /* add Paeth predictor with Raw(x) */
3027 punpckhbw mm3
, mm0
/* Unpack High bytes of c */
3028 movq
[edi
+ ebx
], mm7
/* write back updated value */
3029 movq mm1
, [edi
+ebx
-8] /* read a=Raw(x-bpp) bytes */
3031 /* Do second set of 4 bytes */
3032 punpckhbw mm2
, mm0
/* Unpack High bytes of b */
3033 punpckhbw mm1
, mm0
/* Unpack High bytes of a */
3034 /* pav = p - a = (a + b - c) - a = b - c */
3036 /* pbv = p - b = (a + b - c) - b = a - c */
3040 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
3043 /* pa = abs(p-a) = abs(pav) */
3044 /* pb = abs(p-b) = abs(pbv) */
3045 /* pc = abs(p-c) = abs(pcv) */
3046 pcmpgtw mm0
, mm4
/* Create mask pav bytes < 0 */
3048 pand mm0
, mm4
/* Only pav bytes < 0 in mm0 */
3049 pcmpgtw mm7
, mm5
/* Create mask pbv bytes < 0 */
3051 pand mm7
, mm5
/* Only pbv bytes < 0 in mm7 */
3055 pcmpgtw mm0
, mm6
/* Create mask pcv bytes < 0 */
3056 pand mm0
, mm6
/* Only pcv bytes < 0 in mm0 */
3062 pcmpgtw mm7
, mm5
/* pa > pb? */
3064 /* use mm7 mask to merge pa & pb */
3066 /* use mm0 mask copy to merge a & b */
3072 /* test ((pa <= pb)? pa:pb) <= pc */
3073 pcmpgtw mm7
, mm6
/* pab > pc? */
3080 /* Step ebx to next set of 8 bytes and repeat loop til done */
3083 paddb mm1
, [edi
+ ebx
- 8] /* add Paeth predictor with Raw(x) */
3085 movq
[edi
+ ebx
- 8], mm1
/* write back updated value */
3086 /* mm1 will be used as Raw(x-bpp) next loop */
3088 } /* end _asm block */
3092 case 1: /* bpp = 1 */
3093 case 2: /* bpp = 2 */
3094 default: /* bpp > 8 */
3102 /* Do Paeth decode for remaining bytes */
3104 xor ecx
, ecx
/* zero ecx before using cl & cx in loop below */
3105 sub edx
, bpp
/* Set edx = ebx - bpp */
3108 /* pav = p - a = (a + b - c) - a = b - c */
3109 mov al
, [esi
+ ebx
] /* load Prior(x) into al */
3110 mov cl
, [esi
+ edx
] /* load Prior(x-bpp) into cl */
3111 sub eax
, ecx
/* subtract Prior(x-bpp) */
3112 mov patemp
, eax
/* Save pav for later use */
3114 /* pbv = p - b = (a + b - c) - b = a - c */
3115 mov al
, [edi
+ edx
] /* load Raw(x-bpp) into al */
3116 sub eax
, ecx
/* subtract Prior(x-bpp) */
3118 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
3119 add eax
, patemp
/* pcv = pav + pbv */
3121 test eax
, 0x80000000
3123 neg eax
/* reverse sign of neg values */
3125 mov pctemp
, eax
/* save pc for later use */
3127 test ecx
, 0x80000000
3129 neg ecx
/* reverse sign of neg values */
3131 mov pbtemp
, ecx
/* save pb for later use */
3134 test eax
, 0x80000000
3136 neg eax
/* reverse sign of neg values */
3138 mov patemp
, eax
/* save pa for later use */
3139 /* test if pa <= pb */
3142 /* pa > pb; now test if pb <= pc */
3145 /* pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp) */
3146 mov cl
, [esi
+ edx
] /* load Prior(x-bpp) into cl */
3149 /* pb <= pc; Raw(x) = Paeth(x) + Prior(x) */
3150 mov cl
, [esi
+ ebx
] /* load Prior(x) into cl */
3153 /* pa <= pb; now test if pa <= pc */
3156 /* pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp) */
3157 mov cl
, [esi
+ edx
] /* load Prior(x-bpp) into cl */
3160 /* pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp) */
3161 mov cl
, [edi
+ edx
] /* load Raw(x-bpp) into cl */
3165 /* Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256 */
3166 add
[edi
+ ebx
- 1], cl
3170 } /* end _asm block */
3172 return; /* No need to go further with this one */
3173 } /* end switch ( bpp ) */
3176 /* MMX acceleration complete now do clean-up */
3177 /* Check if any remaining bytes left to decode */
3183 /* Do Paeth decode for remaining bytes */
3185 xor ecx
, ecx
/* zero ecx before using cl & cx in loop below */
3186 sub edx
, bpp
/* Set edx = ebx - bpp */
3189 /* pav = p - a = (a + b - c) - a = b - c */
3190 mov al
, [esi
+ ebx
] /* load Prior(x) into al */
3191 mov cl
, [esi
+ edx
] /* load Prior(x-bpp) into cl */
3192 sub eax
, ecx
/* subtract Prior(x-bpp) */
3193 mov patemp
, eax
/* Save pav for later use */
3195 /* pbv = p - b = (a + b - c) - b = a - c */
3196 mov al
, [edi
+ edx
] /* load Raw(x-bpp) into al */
3197 sub eax
, ecx
/* subtract Prior(x-bpp) */
3199 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
3200 add eax
, patemp
/* pcv = pav + pbv */
3202 test eax
, 0x80000000
3204 neg eax
/* reverse sign of neg values */
3206 mov pctemp
, eax
/* save pc for later use */
3208 test ecx
, 0x80000000
3210 neg ecx
/* reverse sign of neg values */
3212 mov pbtemp
, ecx
/* save pb for later use */
3215 test eax
, 0x80000000
3217 neg eax
/* reverse sign of neg values */
3219 mov patemp
, eax
/* save pa for later use */
3220 /* test if pa <= pb */
3223 /* pa > pb; now test if pb <= pc */
3226 /* pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp) */
3227 mov cl
, [esi
+ edx
] /* load Prior(x-bpp) into cl */
3230 /* pb <= pc; Raw(x) = Paeth(x) + Prior(x) */
3231 mov cl
, [esi
+ ebx
] /* load Prior(x) into cl */
3234 /* pa <= pb; now test if pa <= pc */
3237 /* pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp) */
3238 mov cl
, [esi
+ edx
] /* load Prior(x-bpp) into cl */
3241 /* pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp) */
3242 mov cl
, [edi
+ edx
] /* load Raw(x-bpp) into cl */
3246 /* Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256 */
3247 add
[edi
+ ebx
- 1], cl
3251 emms
/* End MMX instructions; prep for possible FP instrs. */
3252 } /* end _asm block */
3255 /* Optimized code for PNG Sub filter decoder */
3257 png_read_filter_row_mmx_sub(png_row_infop row_info
, png_bytep row
)
3261 png_uint_32 FullLength
;
3262 png_uint_32 MMXLength
;
3265 bpp
= (row_info
->pixel_depth
+ 7) >> 3; /* Get # bytes per pixel */
3266 FullLength
= row_info
->rowbytes
- bpp
; /* # of bytes to filter */
3269 mov esi
, edi
/* lp = row */
3270 add edi
, bpp
/* rp = row + bpp */
3272 /* get # of bytes to alignment */
3273 mov diff
, edi
/* take start of row */
3274 add diff
, 0xf /* add 7 + 8 to incr past */
3275 /* alignment boundary */
3277 and diff
, 0xfffffff8 /* mask to alignment boundary */
3278 sub diff
, edi
/* subtract from start ==> value */
3279 /* ebx at alignment */
3291 sub edx
, ebx
/* subtract alignment fix */
3292 and edx
, 0x00000007 /* calc bytes over mult of 8 */
3293 sub ecx
, edx
/* drop over bytes from length */
3295 } /* end _asm block */
3297 /* Now do the math for the rest of the row */
3302 ActiveMask
.use
= 0x0000ffffff000000;
3303 ShiftBpp
.use
= 24; /* == 3 * 8 */
3304 ShiftRem
.use
= 40; /* == 64 - 24 */
3307 movq mm7
, ActiveMask
/* Load ActiveMask for 2nd active byte group */
3308 mov esi
, edi
/* lp = row */
3309 add edi
, bpp
/* rp = row + bpp */
3312 psllq mm6
, ShiftBpp
/* Move mask in mm6 to cover 3rd active */
3314 /* PRIME the pump (load the first Raw(x-bpp) data set */
3315 movq mm1
, [edi
+ebx
-8]
3317 psrlq mm1
, ShiftRem
/* Shift data for adding 1st bpp bytes */
3318 /* no need for mask; shift clears inactive bytes */
3319 /* Add 1st active group */
3322 /* Add 2nd active group */
3323 movq mm1
, mm0
/* mov updated Raws to mm1 */
3324 psllq mm1
, ShiftBpp
/* shift data to position correctly */
3325 pand mm1
, mm7
/* mask to use only 2nd active group */
3327 /* Add 3rd active group */
3328 movq mm1
, mm0
/* mov updated Raws to mm1 */
3329 psllq mm1
, ShiftBpp
/* shift data to position correctly */
3330 pand mm1
, mm6
/* mask to use only 3rd active group */
3334 movq
[edi
+ebx
-8], mm0
/* Write updated Raws back to array */
3335 /* Prep for doing 1st add at top of loop */
3338 } /* end _asm block */
3344 /* Placed here just in case this is a duplicate of the */
3345 /* non-MMX code for the SUB filter in png_read_filter_row below */
3349 /* png_uint_32 i; */
3350 /* bpp = (row_info->pixel_depth + 7) >> 3; */
3351 /* for (i = (png_uint_32)bpp, rp = row + bpp, lp = row; */
3352 /* i < row_info->rowbytes; i++, rp++, lp++) */
3354 /* *rp = (png_byte)(((int)(*rp) + (int)(*lp)) & 0xff); */
3361 mov esi
, edi
/* lp = row */
3363 add edi
, bpp
/* rp = row + bpp */
3371 } /* end _asm block */
3380 ShiftBpp
.use
= bpp
<< 3;
3381 ShiftRem
.use
= 64 - ShiftBpp
.use
;
3385 mov esi
, edi
/* lp = row */
3386 add edi
, bpp
/* rp = row + bpp */
3387 /* PRIME the pump (load the first Raw(x-bpp) data set */
3388 movq mm1
, [edi
+ebx
-8]
3390 psrlq mm1
, ShiftRem
/* Shift data for adding 1st bpp bytes */
3391 /* no need for mask; shift clears inactive bytes */
3394 /* Add 2nd active group */
3395 movq mm1
, mm0
/* mov updated Raws to mm1 */
3396 psllq mm1
, ShiftBpp
/* shift data to position correctly */
3397 /* there is no need for any mask */
3398 /* since shift clears inactive bits/bytes */
3402 movq
[edi
+ebx
-8], mm0
3403 movq mm1
, mm0
/* Prep for doing 1st add at top of loop */
3405 } /* end _asm block */
3411 ActiveMask
.use
= 0x00000000ffff0000;
3412 ShiftBpp
.use
= 16; /* == 2 * 8 */
3413 ShiftRem
.use
= 48; /* == 64 - 16 */
3415 movq mm7
, ActiveMask
/* Load ActiveMask for 2nd active byte group */
3419 psllq mm6
, ShiftBpp
/* Move mask in mm6 to cover 3rd active */
3421 mov esi
, edi
/* lp = row */
3423 add edi
, bpp
/* rp = row + bpp */
3424 psllq mm5
, ShiftBpp
/* Move mask in mm5 to cover 4th active */
3426 /* PRIME the pump (load the first Raw(x-bpp) data set */
3427 movq mm1
, [edi
+ebx
-8]
3429 /* Add 1st active group */
3430 psrlq mm1
, ShiftRem
/* Shift data for adding 1st bpp bytes */
3431 /* no need for mask; shift clears inactive */
3435 /* Add 2nd active group */
3436 movq mm1
, mm0
/* mov updated Raws to mm1 */
3437 psllq mm1
, ShiftBpp
/* shift data to position correctly */
3438 pand mm1
, mm7
/* mask to use only 2nd active group */
3440 /* Add 3rd active group */
3441 movq mm1
, mm0
/* mov updated Raws to mm1 */
3442 psllq mm1
, ShiftBpp
/* shift data to position correctly */
3443 pand mm1
, mm6
/* mask to use only 3rd active group */
3445 /* Add 4th active group */
3446 movq mm1
, mm0
/* mov updated Raws to mm1 */
3447 psllq mm1
, ShiftBpp
/* shift data to position correctly */
3448 pand mm1
, mm5
/* mask to use only 4th active group */
3452 movq
[edi
+ebx
-8], mm0
/* Write updated Raws back to array */
3453 movq mm1
, mm0
/* Prep for doing 1st add at top of loop */
3455 } /* end _asm block */
3463 mov esi
, edi
/* lp = row */
3464 add edi
, bpp
/* rp = row + bpp */
3466 movq mm7
, [edi
+ebx
-8] /* PRIME the pump (load the first */
3467 /* Raw(x-bpp) data set */
3468 and ecx
, 0x0000003f /* calc bytes over mult of 64 */
3470 movq mm0
, [edi
+ebx
] /* Load Sub(x) for 1st 8 bytes */
3472 movq mm1
, [edi
+ebx
+8] /* Load Sub(x) for 2nd 8 bytes */
3473 movq
[edi
+ebx
], mm0
/* Write Raw(x) for 1st 8 bytes */
3474 /* Now mm0 will be used as Raw(x-bpp) for */
3475 /* the 2nd group of 8 bytes. This will be */
3476 /* repeated for each group of 8 bytes with */
3477 /* the 8th group being used as the Raw(x-bpp) */
3478 /* for the 1st group of the next loop. */
3480 movq mm2
, [edi
+ebx
+16] /* Load Sub(x) for 3rd 8 bytes */
3481 movq
[edi
+ebx
+8], mm1
/* Write Raw(x) for 2nd 8 bytes */
3483 movq mm3
, [edi
+ebx
+24] /* Load Sub(x) for 4th 8 bytes */
3484 movq
[edi
+ebx
+16], mm2
/* Write Raw(x) for 3rd 8 bytes */
3486 movq mm4
, [edi
+ebx
+32] /* Load Sub(x) for 5th 8 bytes */
3487 movq
[edi
+ebx
+24], mm3
/* Write Raw(x) for 4th 8 bytes */
3489 movq mm5
, [edi
+ebx
+40] /* Load Sub(x) for 6th 8 bytes */
3490 movq
[edi
+ebx
+32], mm4
/* Write Raw(x) for 5th 8 bytes */
3492 movq mm6
, [edi
+ebx
+48] /* Load Sub(x) for 7th 8 bytes */
3493 movq
[edi
+ebx
+40], mm5
/* Write Raw(x) for 6th 8 bytes */
3495 movq mm7
, [edi
+ebx
+56] /* Load Sub(x) for 8th 8 bytes */
3496 movq
[edi
+ebx
+48], mm6
/* Write Raw(x) for 7th 8 bytes */
3500 movq
[edi
+ebx
-8], mm7
/* Write Raw(x) for 8th 8 bytes */
3509 movq
[edi
+ebx
-8], mm0
/* use -8 to offset early add to ebx */
3510 movq mm7
, mm0
/* Move calculated Raw(x) data to mm1 to */
3511 /* be the new Raw(x-bpp) for the next loop */
3514 } /* end _asm block */
3518 default: /* bpp greater than 8 bytes */
3523 mov esi
, edi
/* lp = row */
3524 add edi
, bpp
/* rp = row + bpp */
3531 movq
[edi
+ebx
-8], mm0
/* mov does not affect flags; -8 to offset */
3534 } /* end _asm block */
3538 } /* end switch ( bpp ) */
3545 mov esi
, edi
/* lp = row */
3547 add edi
, bpp
/* rp = row + bpp */
3555 emms
/* End MMX instructions; prep for possible FP instrs. */
3556 } /* end _asm block */
3559 /* Optimized code for PNG Up filter decoder */
3561 png_read_filter_row_mmx_up(png_row_infop row_info
, png_bytep row
,
3565 len
= row_info
->rowbytes
; /* # of bytes to filter */
3568 /* get # of bytes to alignment */
3583 mov
[edi
+ ebx
-1], al
/* mov does not affect flags; -1 to offset inc ebx */
3588 sub edx
, ebx
/* subtract alignment fix */
3589 and edx
, 0x0000003f /* calc bytes over mult of 64 */
3590 sub ecx
, edx
/* drop over bytes from length */
3591 /* Unrolled loop - use all MMX registers and interleave to reduce */
3592 /* number of branch instructions (loops) and reduce partial stalls */
3596 movq mm3
, [esi
+ebx
+8]
3598 movq mm2
, [edi
+ebx
+8]
3601 movq mm5
, [esi
+ebx
+16]
3602 movq
[edi
+ebx
+8], mm2
3603 movq mm4
, [edi
+ebx
+16]
3604 movq mm7
, [esi
+ebx
+24]
3606 movq mm6
, [edi
+ebx
+24]
3607 movq
[edi
+ebx
+16], mm4
3609 movq mm1
, [esi
+ebx
+32]
3610 movq
[edi
+ebx
+24], mm6
3611 movq mm0
, [edi
+ebx
+32]
3612 movq mm3
, [esi
+ebx
+40]
3614 movq mm2
, [edi
+ebx
+40]
3615 movq
[edi
+ebx
+32], mm0
3617 movq mm5
, [esi
+ebx
+48]
3618 movq
[edi
+ebx
+40], mm2
3619 movq mm4
, [edi
+ebx
+48]
3620 movq mm7
, [esi
+ebx
+56]
3622 movq mm6
, [edi
+ebx
+56]
3623 movq
[edi
+ebx
+48], mm4
3627 movq
[edi
+ebx
-8], mm6
/* (+56)movq does not affect flags; */
3628 /* -8 to offset add ebx */
3631 cmp edx
, 0 /* Test for bytes over mult of 64 */
3635 /* 2 lines added by lcreeve@netins.net */
3636 /* (mail 11 Jul 98 in png-implement list) */
3637 cmp edx
, 8 /*test for less than 8 bytes */
3642 and edx
, 0x00000007 /* calc bytes over mult of 8 */
3643 sub ecx
, edx
/* drop over bytes from length */
3645 /* Loop using MMX registers mm0 & mm1 to update 8 bytes simultaneously */
3652 movq
[edi
+ebx
-8], mm0
/* movq does not affect flags; -8 to offset add ebx */
3654 cmp edx
, 0 /* Test for bytes over mult of 8 */
3658 add ecx
, edx
/* move over byte count into counter */
3659 /* Loop using x86 registers to update remaining bytes */
3665 mov
[edi
+ ebx
-1], al
/* mov does not affect flags; -1 to offset inc ebx */
3668 /* Conversion of filtered row completed */
3669 emms
/* End MMX instructions; prep for possible FP instrs. */
3670 } /* end _asm block */
3674 /* Optimized png_read_filter_row routines */
3676 png_read_filter_row(png_structp png_ptr
, png_row_infop row_info
, png_bytep
3677 row
, png_bytep prev_row
, int filter
)
3683 if (mmx_supported
== 2) {
3684 #if !defined(PNG_1_0_X)
3685 /* this should have happened in png_init_mmx_flags() already */
3686 png_warning(png_ptr
, "asm_flags may not have been initialized");
3692 png_debug(1, "in png_read_filter_row\n");
3695 case 0: sprintf(filnm
, "none");
3697 #if !defined(PNG_1_0_X)
3698 case 1: sprintf(filnm
, "sub-%s",
3699 (png_ptr
->asm_flags
& PNG_ASM_FLAG_MMX_READ_FILTER_SUB
)? "MMX" : "x86");
3701 case 2: sprintf(filnm
, "up-%s",
3702 (png_ptr
->asm_flags
& PNG_ASM_FLAG_MMX_READ_FILTER_UP
)? "MMX" : "x86");
3704 case 3: sprintf(filnm
, "avg-%s",
3705 (png_ptr
->asm_flags
& PNG_ASM_FLAG_MMX_READ_FILTER_AVG
)? "MMX" : "x86");
3707 case 4: sprintf(filnm
, "Paeth-%s",
3708 (png_ptr
->asm_flags
& PNG_ASM_FLAG_MMX_READ_FILTER_PAETH
)? "MMX":"x86");
3711 case 1: sprintf(filnm
, "sub");
3713 case 2: sprintf(filnm
, "up");
3715 case 3: sprintf(filnm
, "avg");
3717 case 4: sprintf(filnm
, "Paeth");
3720 default: sprintf(filnm
, "unknw");
3723 png_debug2(0,"row=%5d, %s, ", png_ptr
->row_number
, filnm
);
3724 png_debug2(0, "pd=%2d, b=%d, ", (int)row_info
->pixel_depth
,
3725 (int)((row_info
->pixel_depth
+ 7) >> 3));
3726 png_debug1(0,"len=%8d, ", row_info
->rowbytes
);
3727 #endif /* PNG_DEBUG */
3731 case PNG_FILTER_VALUE_NONE
:
3734 case PNG_FILTER_VALUE_SUB
:
3736 #if !defined(PNG_1_0_X)
3737 if ((png_ptr
->asm_flags
& PNG_ASM_FLAG_MMX_READ_FILTER_SUB
) &&
3738 (row_info
->pixel_depth
>= png_ptr
->mmx_bitdepth_threshold
) &&
3739 (row_info
->rowbytes
>= png_ptr
->mmx_rowbytes_threshold
))
3744 png_read_filter_row_mmx_sub(row_info
, row
);
3749 png_uint_32 istop
= row_info
->rowbytes
;
3750 png_uint_32 bpp
= (row_info
->pixel_depth
+ 7) >> 3;
3751 png_bytep rp
= row
+ bpp
;
3754 for (i
= bpp
; i
< istop
; i
++)
3756 *rp
= (png_byte
)(((int)(*rp
) + (int)(*lp
++)) & 0xff);
3763 case PNG_FILTER_VALUE_UP
:
3765 #if !defined(PNG_1_0_X)
3766 if ((png_ptr
->asm_flags
& PNG_ASM_FLAG_MMX_READ_FILTER_UP
) &&
3767 (row_info
->pixel_depth
>= png_ptr
->mmx_bitdepth_threshold
) &&
3768 (row_info
->rowbytes
>= png_ptr
->mmx_rowbytes_threshold
))
3773 png_read_filter_row_mmx_up(row_info
, row
, prev_row
);
3778 png_uint_32 istop
= row_info
->rowbytes
;
3780 png_bytep pp
= prev_row
;
3782 for (i
= 0; i
< istop
; ++i
)
3784 *rp
= (png_byte
)(((int)(*rp
) + (int)(*pp
++)) & 0xff);
3791 case PNG_FILTER_VALUE_AVG
:
3793 #if !defined(PNG_1_0_X)
3794 if ((png_ptr
->asm_flags
& PNG_ASM_FLAG_MMX_READ_FILTER_AVG
) &&
3795 (row_info
->pixel_depth
>= png_ptr
->mmx_bitdepth_threshold
) &&
3796 (row_info
->rowbytes
>= png_ptr
->mmx_rowbytes_threshold
))
3801 png_read_filter_row_mmx_avg(row_info
, row
, prev_row
);
3807 png_bytep pp
= prev_row
;
3809 png_uint_32 bpp
= (row_info
->pixel_depth
+ 7) >> 3;
3810 png_uint_32 istop
= row_info
->rowbytes
- bpp
;
3812 for (i
= 0; i
< bpp
; i
++)
3814 *rp
= (png_byte
)(((int)(*rp
) +
3815 ((int)(*pp
++) >> 1)) & 0xff);
3819 for (i
= 0; i
< istop
; i
++)
3821 *rp
= (png_byte
)(((int)(*rp
) +
3822 ((int)(*pp
++ + *lp
++) >> 1)) & 0xff);
3829 case PNG_FILTER_VALUE_PAETH
:
3831 #if !defined(PNG_1_0_X)
3832 if ((png_ptr
->asm_flags
& PNG_ASM_FLAG_MMX_READ_FILTER_PAETH
) &&
3833 (row_info
->pixel_depth
>= png_ptr
->mmx_bitdepth_threshold
) &&
3834 (row_info
->rowbytes
>= png_ptr
->mmx_rowbytes_threshold
))
3839 png_read_filter_row_mmx_paeth(row_info
, row
, prev_row
);
3845 png_bytep pp
= prev_row
;
3847 png_bytep cp
= prev_row
;
3848 png_uint_32 bpp
= (row_info
->pixel_depth
+ 7) >> 3;
3849 png_uint_32 istop
=row_info
->rowbytes
- bpp
;
3851 for (i
= 0; i
< bpp
; i
++)
3853 *rp
= (png_byte
)(((int)(*rp
) + (int)(*pp
++)) & 0xff);
3857 for (i
= 0; i
< istop
; i
++) /* use leftover rp,pp */
3859 int a
, b
, c
, pa
, pb
, pc
, p
;
3873 pa
= p
< 0 ? -p
: p
;
3874 pb
= pc
< 0 ? -pc
: pc
;
3875 pc
= (p
+ pc
) < 0 ? -(p
+ pc
) : p
+ pc
;
3879 if (pa <= pb && pa <= pc)
3887 p
= (pa
<= pb
&& pa
<=pc
) ? a
: (pb
<= pc
) ? b
: c
;
3889 *rp
= (png_byte
)(((int)(*rp
) + p
) & 0xff);
3897 png_warning(png_ptr
, "Ignoring bad row filter type");
3903 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED && PNG_USE_PNGVCRD */