/* pngvcrd.c - mixed C/assembler version of utilities to read a PNG file
 *
 * For Intel x86 CPU and Microsoft Visual C++ compiler
 *
 * libpng version 1.2.5rc3 - September 18, 2002
 * For conditions of distribution and use, see copyright notice in png.h
 * Copyright (c) 1998-2002 Glenn Randers-Pehrson
 * Copyright (c) 1998, Intel Corporation
 *
 * Contributed by Nirav Chhatrapati, Intel Corporation, 1998
 * Interface to libpng contributed by Gilles Vollant, 1999
 *
 * In png_do_read_interlace() in libpng versions 1.0.3a through 1.0.4d,
 * a sign error in the post-MMX cleanup code for each pixel_depth resulted
 * in bad pixels at the beginning of some rows of some images, and also
 * (due to out-of-range memory reads and writes) caused heap corruption
 * when compiled with MSVC 6.0.  The error was fixed in version 1.0.4e.
 *
 * [png_read_filter_row_mmx_avg() bpp == 2 bugfix, GRR 20000916]
 *
 * [runtime MMX configuration, GRR 20010102]
 */

#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_USE_PNGVCRD)
static int mmx_supported = 2;

   int mmx_supported_local = 0;
        push ebx          /* CPUID will trash these */

        pushfd            /* Save Eflag to stack */
        pop eax           /* Get Eflag from stack into eax */
        mov ecx, eax      /* Make another copy of Eflag in ecx */
        xor eax, 0x200000 /* Toggle ID bit in Eflag [i.e. bit(21)] */
        push eax          /* Save modified Eflag back to stack */

        popfd             /* Restore modified value back to Eflag reg */
        pushfd            /* Save Eflag to stack */
        pop eax           /* Get Eflag from stack */
        push ecx          /* Save original Eflag to stack */
        popfd             /* Restore original Eflag */
        xor eax, ecx      /* Compare the new Eflag with the original Eflag */
        jz NOT_SUPPORTED  /* If the same, CPUID instruction is not supported; */
                          /* skip following instructions and jump to */
                          /* NOT_SUPPORTED label */

        xor eax, eax      /* Set eax to zero */

        _asm _emit 0x0f   /* CPUID instruction (two-byte opcode) */

        cmp eax, 1        /* make sure eax returned a non-zero value */
        jl NOT_SUPPORTED  /* If eax is zero, mmx not supported */

        xor eax, eax      /* set eax to zero */
        inc eax           /* Now increment eax to 1.  This instruction is */
                          /* faster than the instruction "mov eax, 1" */

        _asm _emit 0x0f   /* CPUID instruction */

        and edx, 0x00800000  /* mask out all bits but the mmx bit (bit 23) */
        cmp edx, 0           /* 0 = mmx not supported */
        jz NOT_SUPPORTED     /* non-zero = yes, mmx IS supported */

        mov mmx_supported_local, 1  /* set return value to 1 */

        mov eax, mmx_supported_local  /* move return value to eax */
        pop edx           /* CPUID trashed these */

    /* mmx_supported_local = 0;  // test code to force "don't support MMX" */
    /* printf("MMX : %u (1=MMX supported)\n", mmx_supported_local); */

    mmx_supported = mmx_supported_local;
    return mmx_supported_local;
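
/* Added sketch (an assumption, not code from this file): a caller would
 * typically cache the result of the detection above before choosing a
 * code path, e.g.
 *
 *    if (mmx_supported == 2)      // detection has not run yet
 *       png_mmx_support();        // sets mmx_supported to 0 or 1
 *    if (mmx_supported)
 *       { ... use the MMX routines ... }
 *    else
 *       { ... use the plain C routines ... }
 *
 * In libpng itself this gating is done through png_ptr->asm_flags, which
 * png_init_mmx_flags() is expected to initialize.
 */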
/* Combines the row recently read in with the previous row.
   This routine takes care of alpha and transparency if requested.
   This routine also handles the two methods of progressive display
   of interlaced images, depending on the mask value.
   The mask value describes which pixels are to be combined with
   the row.  The pattern always repeats every 8 pixels, so just 8
   bits are needed.  A one indicates the pixel is to be combined; a
   zero indicates the pixel is to be skipped.  This is in addition
   to any alpha or transparency value associated with the pixel.  If
   you want all pixels to be combined, pass 0xff (255) in mask. */
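
/* Added sketch (hypothetical helper, not part of this file): the masking
 * rule described above, written in plain C for the byte-aligned case
 * (pixel_depth == 8).  The real routine below also handles sub-byte
 * depths and multi-byte pixels, and uses MMX where available.
 *
 *    static void combine_row_bytes(png_bytep dp, png_bytep sp,
 *       png_uint_32 width, int mask)
 *    {
 *       png_uint_32 i;
 *       for (i = 0; i < width; i++)
 *       {
 *          // bit 7 of the mask governs the first pixel of each group of 8
 *          if (mask & (0x80 >> (int)(i & 7)))
 *             dp[i] = sp[i];   // one-bit: combine (copy) this pixel
 *                              // zero-bit: skip, leaving dp[i] unchanged
 *       }
 *    }
 */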
/* Use this routine for the x86 platform - it uses a faster MMX routine
   if the machine supports MMX. */

png_combine_row(png_structp png_ptr, png_bytep row, int mask)
#ifdef PNG_USE_LOCAL_ARRAYS
   const int png_pass_inc[7] = {8, 8, 4, 4, 2, 2, 1};

   png_debug(1, "in png_combine_row_asm\n");

   if (mmx_supported == 2) {
       /* this should have happened in png_init_mmx_flags() already */
       png_warning(png_ptr, "asm_flags may not have been initialized");

      png_memcpy(row, png_ptr->row_buf + 1,
         (png_size_t)((png_ptr->width * png_ptr->row_info.pixel_depth + 7) >> 3));

   /* GRR:  add "else if (mask == 0)" case?
    *       or does png_combine_row() not even get called in that case? */
      switch (png_ptr->row_info.pixel_depth)
            int s_inc, s_start, s_end;

            sp = png_ptr->row_buf + 1;
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
            if (png_ptr->transformations & PNG_PACKSWAP)

            for (i = 0; i < png_ptr->width; i++)
                  value = (*sp >> shift) & 0x1;
                  *dp &= (png_byte)((0x7f7f >> (7 - shift)) & 0xff);
                  *dp |= (png_byte)(value << shift);
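
            /* Added note (worked example, not original commentary): the
             * expression (0x7f7f >> (7 - shift)) & 0xff builds a byte mask
             * with a single 0 at bit position 'shift', so the AND clears
             * just the destination bit and the OR writes the new value
             * into it.  E.g. shift == 7 gives 0x7f (clear bit 7);
             * shift == 0 gives 0xfe (clear bit 0).
             */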
            int s_start, s_end, s_inc;

            sp = png_ptr->row_buf + 1;

#if defined(PNG_READ_PACKSWAP_SUPPORTED)
            if (png_ptr->transformations & PNG_PACKSWAP)

            for (i = 0; i < png_ptr->width; i++)
                  value = (*sp >> shift) & 0x3;
                  *dp &= (png_byte)((0x3f3f >> (6 - shift)) & 0xff);
                  *dp |= (png_byte)(value << shift);

            int s_start, s_end, s_inc;

            sp = png_ptr->row_buf + 1;

#if defined(PNG_READ_PACKSWAP_SUPPORTED)
            if (png_ptr->transformations & PNG_PACKSWAP)

            for (i = 0; i < png_ptr->width; i++)
                  value = (*sp >> shift) & 0xf;
                  *dp &= (png_byte)((0xf0f >> (4 - shift)) & 0xff);
                  *dp |= (png_byte)(value << shift);
            __int64 mask0 = 0x0102040810204080;

            if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
                /* && mmx_supported */ )
               srcptr = png_ptr->row_buf + 1;

               len  = png_ptr->width & ~7;  /* reduce to multiple of 8 */
               diff = png_ptr->width & 7;   /* amount lost */

                  movd mm7, unmask      /* load bit pattern */
                  psubb mm6, mm6        /* zero mm6 */
                  punpckldq mm7, mm7    /* fill register with 8 masks */

                  pand mm0, mm7         /* nonzero if keep byte */
                  pcmpeqb mm0, mm6      /* zeros->1s, vice versa */

                  mov ecx, len          /* load length of line (pixels) */
                  mov esi, srcptr       /* load source */
                  mov ebx, dstptr       /* load dest */

                  add esi, 8            /* inc by 8 bytes processed */
                  sub ecx, 8            /* dec by 8 pixels processed */

                  sal edx, 24           /* make low byte the high byte */
                  sal edx, 1            /* move high bit to CF */
                  jnc skip8             /* if CF = 0 */
            else /* mmx not supported - use modified C routine */
               register unsigned int incr1, initial_val, final_val;
               png_size_t pixel_bytes;

               register int disp = png_pass_inc[png_ptr->pass];
               int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};

               pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
               srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
                  pixel_bytes;
               dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
               initial_val = offset_table[png_ptr->pass]*pixel_bytes;
               final_val = png_ptr->width*pixel_bytes;
               incr1 = (disp)*pixel_bytes;
               for (i = initial_val; i < final_val; i += incr1)
                  png_memcpy(dstptr, srcptr, pixel_bytes);
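
               /* Added note (an inference from the code above, not
                * original commentary): offset_table[pass] is the column
                * of the first pixel belonging to the current interlace
                * pass, and disp (= png_pass_inc[pass]) is the column
                * stride between that pass's pixels, so the loop copies
                * every disp-th pixel starting at the pass's first column.
                */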
            __int64 mask1 = 0x0101020204040808,
                    mask0 = 0x1010202040408080;

            if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
                /* && mmx_supported */ )
               srcptr = png_ptr->row_buf + 1;

               len  = (png_ptr->width) & ~7;
               diff = (png_ptr->width) & 7;

                  movd mm7, unmask      /* load bit pattern */
                  psubb mm6, mm6        /* zero mm6 */
                  punpckldq mm7, mm7    /* fill register with 8 masks */

                  mov ecx, len          /* load length of line */
                  mov esi, srcptr       /* load source */
                  mov ebx, dstptr       /* load dest */

                  add esi, 16           /* inc by 16 bytes processed */
                  sub ecx, 8            /* dec by 8 pixels processed */

                  sal edx, 24           /* make low byte the high byte */
                  sal edx, 1            /* move high bit to CF */
                  jnc skip16            /* if CF = 0 */

            else /* mmx not supported - use modified C routine */
               register unsigned int incr1, initial_val, final_val;
               png_size_t pixel_bytes;

               register int disp = png_pass_inc[png_ptr->pass];
               int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};

               pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
               srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
                  pixel_bytes;
               dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
               initial_val = offset_table[png_ptr->pass]*pixel_bytes;
               final_val = png_ptr->width*pixel_bytes;
               incr1 = (disp)*pixel_bytes;
               for (i = initial_val; i < final_val; i += incr1)
                  png_memcpy(dstptr, srcptr, pixel_bytes);
            __int64 mask2 = 0x0101010202020404,  /* 24bpp */
                    mask1 = 0x0408080810101020,
                    mask0 = 0x2020404040808080;

            srcptr = png_ptr->row_buf + 1;

            len  = (png_ptr->width) & ~7;
            diff = (png_ptr->width) & 7;

            if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
                /* && mmx_supported */ )

                  movd mm7, unmask      /* load bit pattern */
                  psubb mm6, mm6        /* zero mm6 */
                  punpckldq mm7, mm7    /* fill register with 8 masks */

                  mov ecx, len          /* load length of line */
                  mov esi, srcptr       /* load source */
                  mov ebx, dstptr       /* load dest */

                  add esi, 24           /* inc by 24 bytes processed */
                  sub ecx, 8            /* dec by 8 pixels processed */

                  sal edx, 24           /* make low byte the high byte */
                  sal edx, 1            /* move high bit to CF */
                  jnc skip24            /* if CF = 0 */

            else /* mmx not supported - use modified C routine */
               register unsigned int incr1, initial_val, final_val;
               png_size_t pixel_bytes;

               register int disp = png_pass_inc[png_ptr->pass];
               int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};

               pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
               srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
                  pixel_bytes;
               dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
               initial_val = offset_table[png_ptr->pass]*pixel_bytes;
               final_val = png_ptr->width*pixel_bytes;
               incr1 = (disp)*pixel_bytes;
               for (i = initial_val; i < final_val; i += incr1)
                  png_memcpy(dstptr, srcptr, pixel_bytes);
            __int64 mask3 = 0x0101010102020202,  /* 32bpp */
                    mask2 = 0x0404040408080808,
                    mask1 = 0x1010101020202020,
                    mask0 = 0x4040404080808080;

            srcptr = png_ptr->row_buf + 1;

            len  = (png_ptr->width) & ~7;
            diff = (png_ptr->width) & 7;

            if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
                /* && mmx_supported */ )

                  movd mm7, unmask      /* load bit pattern */
                  psubb mm6, mm6        /* zero mm6 */
                  punpckldq mm7, mm7    /* fill register with 8 masks */

                  mov ecx, len          /* load length of line */
                  mov esi, srcptr       /* load source */
                  mov ebx, dstptr       /* load dest */

                  add esi, 32           /* inc by 32 bytes processed */
                  sub ecx, 8            /* dec by 8 pixels processed */

                  sal edx, 24           /* make low byte the high byte */
                  sal edx, 1            /* move high bit to CF */
                  jnc skip32            /* if CF = 0 */

            else /* mmx not supported - use modified C routine */
               register unsigned int incr1, initial_val, final_val;
               png_size_t pixel_bytes;

               register int disp = png_pass_inc[png_ptr->pass];
               int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};

               pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
               srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
                  pixel_bytes;
               dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
               initial_val = offset_table[png_ptr->pass]*pixel_bytes;
               final_val = png_ptr->width*pixel_bytes;
               incr1 = (disp)*pixel_bytes;
               for (i = initial_val; i < final_val; i += incr1)
                  png_memcpy(dstptr, srcptr, pixel_bytes);
            __int64 mask5 = 0x0101010101010202,
                    mask4 = 0x0202020204040404,
                    mask3 = 0x0404080808080808,
                    mask2 = 0x1010101010102020,
                    mask1 = 0x2020202040404040,
                    mask0 = 0x4040808080808080;

            if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
                /* && mmx_supported */ )
               srcptr = png_ptr->row_buf + 1;

               len  = (png_ptr->width) & ~7;
               diff = (png_ptr->width) & 7;

                  movd mm7, unmask      /* load bit pattern */
                  psubb mm6, mm6        /* zero mm6 */
                  punpckldq mm7, mm7    /* fill register with 8 masks */

                  mov ecx, len          /* load length of line */
                  mov esi, srcptr       /* load source */
                  mov ebx, dstptr       /* load dest */

                  add esi, 48           /* inc by 48 bytes processed */
                  sub ecx, 8            /* dec by 8 pixels processed */

                  sal edx, 24           /* make low byte the high byte */
                  sal edx, 1            /* move high bit to CF */
                  jnc skip48            /* if CF = 0 */

            else /* mmx not supported - use modified C routine */
               register unsigned int incr1, initial_val, final_val;
               png_size_t pixel_bytes;

               register int disp = png_pass_inc[png_ptr->pass];
               int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};

               pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
               srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
                  pixel_bytes;
               dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
               initial_val = offset_table[png_ptr->pass]*pixel_bytes;
               final_val = png_ptr->width*pixel_bytes;
               incr1 = (disp)*pixel_bytes;
               for (i = initial_val; i < final_val; i += incr1)
                  png_memcpy(dstptr, srcptr, pixel_bytes);
            png_size_t pixel_bytes;
            int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};

            register int disp = png_pass_inc[png_ptr->pass];  /* get the offset */
            register unsigned int incr1, initial_val, final_val;

            pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
            sptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
               pixel_bytes;
            dp = row + offset_table[png_ptr->pass]*pixel_bytes;
            initial_val = offset_table[png_ptr->pass]*pixel_bytes;
            final_val = png_ptr->width*pixel_bytes;
            incr1 = (disp)*pixel_bytes;
            for (i = initial_val; i < final_val; i += incr1)
               png_memcpy(dp, sptr, pixel_bytes);

      } /* end switch (png_ptr->row_info.pixel_depth) */
   } /* end if (non-trivial mask) */

} /* end png_combine_row() */
#if defined(PNG_READ_INTERLACING_SUPPORTED)

png_do_read_interlace(png_structp png_ptr)
   png_row_infop row_info = &(png_ptr->row_info);
   png_bytep row = png_ptr->row_buf + 1;
   int pass = png_ptr->pass;
   png_uint_32 transformations = png_ptr->transformations;
#ifdef PNG_USE_LOCAL_ARRAYS
   const int png_pass_inc[7] = {8, 8, 4, 4, 2, 2, 1};

   png_debug(1, "in png_do_read_interlace\n");

   if (mmx_supported == 2) {
       /* this should have happened in png_init_mmx_flags() already */
       png_warning(png_ptr, "asm_flags may not have been initialized");
   if (row != NULL && row_info != NULL)
      png_uint_32 final_width;

      final_width = row_info->width * png_pass_inc[pass];
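
      /* Added note (illustrative, not original commentary): expanding an
       * interlace pass amounts to pixel replication - each decoded pixel
       * is spread over png_pass_inc[pass] output pixels.  A plain-C
       * sketch of the byte-pixel case, walking right to left so the
       * expansion can run in place, mirroring the C fallback further
       * below:
       *
       *    png_bytep sp = row + (png_size_t)row_info->width - 1;
       *    png_bytep dp2 = row + (png_size_t)final_width - 1;
       *    for (i = row_info->width; i; i--)
       *    {
       *       int j;
       *       for (j = 0; j < png_pass_inc[pass]; j++)
       *          *dp2-- = *sp;   // replicate the pixel
       *       sp--;
       *    }
       */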
      switch (row_info->pixel_depth)
            int s_start, s_end, s_inc;

            sp = row + (png_size_t)((row_info->width - 1) >> 3);
            dp = row + (png_size_t)((final_width - 1) >> 3);
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
            if (transformations & PNG_PACKSWAP)
               sshift = (int)((row_info->width + 7) & 7);
               dshift = (int)((final_width + 7) & 7);

               sshift = 7 - (int)((row_info->width + 7) & 7);
               dshift = 7 - (int)((final_width + 7) & 7);

            for (i = row_info->width; i; i--)
               v = (png_byte)((*sp >> sshift) & 0x1);
               for (j = 0; j < png_pass_inc[pass]; j++)
                  *dp &= (png_byte)((0x7f7f >> (7 - dshift)) & 0xff);
                  *dp |= (png_byte)(v << dshift);
                  if (dshift == s_end)
               if (sshift == s_end)
            int s_start, s_end, s_inc;

            sp = row + (png_size_t)((row_info->width - 1) >> 2);
            dp = row + (png_size_t)((final_width - 1) >> 2);
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
            if (transformations & PNG_PACKSWAP)
               sshift = (png_size_t)(((row_info->width + 3) & 3) << 1);
               dshift = (png_size_t)(((final_width + 3) & 3) << 1);

               sshift = (png_size_t)((3 - ((row_info->width + 3) & 3)) << 1);
               dshift = (png_size_t)((3 - ((final_width + 3) & 3)) << 1);

            for (i = row_info->width; i; i--)
               v = (png_byte)((*sp >> sshift) & 0x3);
               for (j = 0; j < png_pass_inc[pass]; j++)
                  *dp &= (png_byte)((0x3f3f >> (6 - dshift)) & 0xff);
                  *dp |= (png_byte)(v << dshift);
                  if (dshift == s_end)
               if (sshift == s_end)
            int s_start, s_end, s_inc;

            sp = row + (png_size_t)((row_info->width - 1) >> 1);
            dp = row + (png_size_t)((final_width - 1) >> 1);
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
            if (transformations & PNG_PACKSWAP)
               sshift = (png_size_t)(((row_info->width + 1) & 1) << 2);
               dshift = (png_size_t)(((final_width + 1) & 1) << 2);

               sshift = (png_size_t)((1 - ((row_info->width + 1) & 1)) << 2);
               dshift = (png_size_t)((1 - ((final_width + 1) & 1)) << 2);

            for (i = row_info->width; i; i--)
               v = (png_byte)((*sp >> sshift) & 0xf);
               for (j = 0; j < png_pass_inc[pass]; j++)
                  *dp &= (png_byte)((0xf0f >> (4 - dshift)) & 0xff);
                  *dp |= (png_byte)(v << dshift);
                  if (dshift == s_end)
               if (sshift == s_end)
         default: /* This is the place where the routine is modified */
               __int64 const4 = 0x0000000000FFFFFF;
               /* __int64 const5 = 0x000000FFFFFF0000;  // unused... */
               __int64 const6 = 0x00000000000000FF;

            png_size_t pixel_bytes;
            int width = row_info->width;

            pixel_bytes = (row_info->pixel_depth >> 3);

            sptr = row + (width - 1) * pixel_bytes;
            dp = row + (final_width - 1) * pixel_bytes;
            /* New code by Nirav Chhatrapati - Intel Corporation */
            /* sign fix by GRR */
            /* NOTE:  there is NO MMX code for 48-bit and 64-bit images */

            /* use MMX routine if machine supports it */
            if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_INTERLACE)
                /* && mmx_supported */ )
               if (pixel_bytes == 3)
                  if (((pass == 0) || (pass == 1)) && width)
                        sub edi, 21   /* (png_pass_inc[pass] - 1)*pixel_bytes */

                        movd mm0, [esi]     ; X X X X X v2 v1 v0
                        pand mm0, const4    ; 0 0 0 0 0 v2 v1 v0
                        movq mm1, mm0       ; 0 0 0 0 0 v2 v1 v0
                        psllq mm0, 16       ; 0 0 0 v2 v1 v0 0 0
                        movq mm2, mm0       ; 0 0 0 v2 v1 v0 0 0
                        psllq mm0, 24       ; v2 v1 v0 0 0 0 0 0
                        psrlq mm1, 8        ; 0 0 0 0 0 0 v2 v1
                        por mm0, mm2        ; v2 v1 v0 v2 v1 v0 0 0
                        por mm0, mm1        ; v2 v1 v0 v2 v1 v0 v2 v1
                        movq mm3, mm0       ; v2 v1 v0 v2 v1 v0 v2 v1
                        psllq mm0, 16       ; v0 v2 v1 v0 v2 v1 0 0
                        movq mm4, mm3       ; v2 v1 v0 v2 v1 v0 v2 v1
                        punpckhdq mm3, mm0  ; v0 v2 v1 v0 v2 v1 v0 v2
                        psrlq mm0, 32       ; 0 0 0 0 v0 v2 v1 v0
                        punpckldq mm0, mm4  ; v1 v0 v2 v1 v0 v2 v1 v0
                  else if (((pass == 2) || (pass == 3)) && width)
                        sub edi, 9   /* (png_pass_inc[pass] - 1)*pixel_bytes */

                        movd mm0, [esi]     ; X X X X X v2 v1 v0
                        pand mm0, const4    ; 0 0 0 0 0 v2 v1 v0
                        movq mm1, mm0       ; 0 0 0 0 0 v2 v1 v0
                        psllq mm0, 16       ; 0 0 0 v2 v1 v0 0 0
                        movq mm2, mm0       ; 0 0 0 v2 v1 v0 0 0
                        psllq mm0, 24       ; v2 v1 v0 0 0 0 0 0
                        psrlq mm1, 8        ; 0 0 0 0 0 0 v2 v1
                        por mm0, mm2        ; v2 v1 v0 v2 v1 v0 0 0
                        por mm0, mm1        ; v2 v1 v0 v2 v1 v0 v2 v1
                        movq [edi+4], mm0   ; move to memory
                        psrlq mm0, 16       ; 0 0 v2 v1 v0 v2 v1 v0
                        movd [edi], mm0     ; move to memory
                  else if (width)  /* && ((pass == 4) || (pass == 5)) */
                     int width_mmx = ((width >> 1) << 1) - 8;

                     width -= width_mmx;   /* 8 or 9 pix, 24 or 27 bytes */

                        movq mm0, [esi]     ; X X v2 v1 v0 v5 v4 v3
                        movq mm7, mm0       ; X X v2 v1 v0 v5 v4 v3
                        movq mm6, mm0       ; X X v2 v1 v0 v5 v4 v3
                        psllq mm0, 24       ; v1 v0 v5 v4 v3 0 0 0
                        pand mm7, const4    ; 0 0 0 0 0 v5 v4 v3
                        psrlq mm6, 24       ; 0 0 0 X X v2 v1 v0
                        por mm0, mm7        ; v1 v0 v5 v4 v3 v5 v4 v3
                        movq mm5, mm6       ; 0 0 0 X X v2 v1 v0
                        psllq mm6, 8        ; 0 0 X X v2 v1 v0 0
                        movq [edi], mm0     ; move quad to memory
                        psrlq mm5, 16       ; 0 0 0 0 0 X X v2
                        pand mm5, const6    ; 0 0 0 0 0 0 0 v2
                        por mm6, mm5        ; 0 0 X X v2 v1 v0 v2
                        movd [edi+8], mm6   ; move double to memory

                     sptr -= width_mmx*3;

                     for (i = width; i; i--)
                        png_memcpy(v, sptr, 3);
                        for (j = 0; j < png_pass_inc[pass]; j++)
                           png_memcpy(dp, v, 3);

                  } /* end of pixel_bytes == 3 */
                  else if (pixel_bytes == 1)
                     if (((pass == 0) || (pass == 1)) && width)
                        int width_mmx = ((width >> 2) << 2);

                           movd mm0, [esi]      ; X X X X v0 v1 v2 v3
                           movq mm1, mm0        ; X X X X v0 v1 v2 v3
                           punpcklbw mm0, mm0   ; v0 v0 v1 v1 v2 v2 v3 v3
                           movq mm2, mm0        ; v0 v0 v1 v1 v2 v2 v3 v3
                           punpcklwd mm0, mm0   ; v2 v2 v2 v2 v3 v3 v3 v3
                           movq mm3, mm0        ; v2 v2 v2 v2 v3 v3 v3 v3
                           punpckldq mm0, mm0   ; v3 v3 v3 v3 v3 v3 v3 v3
                           punpckhdq mm3, mm3   ; v2 v2 v2 v2 v2 v2 v2 v2
                           movq [edi], mm0      ; move to memory v3
                           punpckhwd mm2, mm2   ; v0 v0 v0 v0 v1 v1 v1 v1
                           movq [edi+8], mm3    ; move to memory v2
                           movq mm4, mm2        ; v0 v0 v0 v0 v1 v1 v1 v1
                           punpckldq mm2, mm2   ; v1 v1 v1 v1 v1 v1 v1 v1
                           punpckhdq mm4, mm4   ; v0 v0 v0 v0 v0 v0 v0 v0
                           movq [edi+16], mm2   ; move to memory v1
                           movq [edi+24], mm4   ; move to memory v0

                        for (i = width; i; i--)
                           /* I simplified this part in version 1.0.4e
                            * here and in several other instances where
                            * pixel_bytes == 1 -- GR-P
                            *
                            * png_memcpy(v, sptr, pixel_bytes);
                            * for (j = 0; j < png_pass_inc[pass]; j++)
                            *    png_memcpy(dp, v, pixel_bytes);
                            *    dp -= pixel_bytes;
                            * sptr -= pixel_bytes;
                            *
                            * Replacement code is in the next three lines:
                            */
                           for (j = 0; j < png_pass_inc[pass]; j++)

                     else if (((pass == 2) || (pass == 3)) && width)
                        int width_mmx = ((width >> 2) << 2);

                           movd mm0, [esi]      ; X X X X v0 v1 v2 v3
                           punpcklbw mm0, mm0   ; v0 v0 v1 v1 v2 v2 v3 v3
                           movq mm1, mm0        ; v0 v0 v1 v1 v2 v2 v3 v3
                           punpcklwd mm0, mm0   ; v2 v2 v2 v2 v3 v3 v3 v3
                           punpckhwd mm1, mm1   ; v0 v0 v0 v0 v1 v1 v1 v1
                           movq [edi], mm0      ; move to memory v2 and v3
                           movq [edi+8], mm1    ; move to memory v1 and v0

                        for (i = width; i; i--)
                           for (j = 0; j < png_pass_inc[pass]; j++)

                     else if (width)  /* && ((pass == 4) || (pass == 5)) */
                        int width_mmx = ((width >> 3) << 3);

                           movq mm0, [esi]      ; v0 v1 v2 v3 v4 v5 v6 v7
                           movq mm1, mm0        ; v0 v1 v2 v3 v4 v5 v6 v7
                           punpcklbw mm0, mm0   ; v4 v4 v5 v5 v6 v6 v7 v7
                           /* movq mm1, mm0     ; v0 v0 v1 v1 v2 v2 v3 v3 */
                           punpckhbw mm1, mm1   ; v0 v0 v1 v1 v2 v2 v3 v3
                           movq [edi+8], mm1    ; move to memory v0 v1 v2 and v3
                           movq [edi], mm0      ; move to memory v4 v5 v6 and v7

                        for (i = width; i; i--)
                           for (j = 0; j < png_pass_inc[pass]; j++)

                  } /* end of pixel_bytes == 1 */
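
                  /* Added note (illustrative, not original commentary):
                   * the MMX blocks above replicate bytes by repeatedly
                   * unpacking a register with itself.  For example:
                   *
                   *    punpcklbw mm0, mm0   ; each low byte doubled
                   *    punpcklwd mm0, mm0   ; each low 16-bit pair doubled
                   *    punpckldq mm0, mm0   ; low dword doubled -> 8 copies
                   *
                   * which yields eight copies of one source byte, i.e. the
                   * png_pass_inc[pass] == 8 replication done without loops.
                   */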
                  else if (pixel_bytes == 2)
                     if (((pass == 0) || (pass == 1)) && width)
                        int width_mmx = ((width >> 1) << 1);

                           movd mm0, [esi]      ; X X X X v1 v0 v3 v2
                           punpcklwd mm0, mm0   ; v1 v0 v1 v0 v3 v2 v3 v2
                           movq mm1, mm0        ; v1 v0 v1 v0 v3 v2 v3 v2
                           punpckldq mm0, mm0   ; v3 v2 v3 v2 v3 v2 v3 v2
                           punpckhdq mm1, mm1   ; v1 v0 v1 v0 v1 v0 v1 v0

                           movq [edi + 16], mm1
                           movq [edi + 24], mm1

                        sptr -= (width_mmx*2 - 2);   /* sign fixed */
                        dp -= (width_mmx*16 - 2);    /* sign fixed */
                        for (i = width; i; i--)
                           png_memcpy(v, sptr, 2);
                           for (j = 0; j < png_pass_inc[pass]; j++)
                              png_memcpy(dp, v, 2);

                     else if (((pass == 2) || (pass == 3)) && width)
                        int width_mmx = ((width >> 1) << 1);

                           movd mm0, [esi]      ; X X X X v1 v0 v3 v2
                           punpcklwd mm0, mm0   ; v1 v0 v1 v0 v3 v2 v3 v2
                           movq mm1, mm0        ; v1 v0 v1 v0 v3 v2 v3 v2
                           punpckldq mm0, mm0   ; v3 v2 v3 v2 v3 v2 v3 v2
                           punpckhdq mm1, mm1   ; v1 v0 v1 v0 v1 v0 v1 v0

                        sptr -= (width_mmx*2 - 2);   /* sign fixed */
                        dp -= (width_mmx*8 - 2);     /* sign fixed */
                        for (i = width; i; i--)
                           png_memcpy(v, sptr, 2);
                           for (j = 0; j < png_pass_inc[pass]; j++)
                              png_memcpy(dp, v, 2);

                     else if (width)  /* pass == 4 or 5 */
                        int width_mmx = ((width >> 1) << 1);

                           movd mm0, [esi]      ; X X X X v1 v0 v3 v2
                           punpcklwd mm0, mm0   ; v1 v0 v1 v0 v3 v2 v3 v2

                        sptr -= (width_mmx*2 - 2);   /* sign fixed */
                        dp -= (width_mmx*4 - 2);     /* sign fixed */
                        for (i = width; i; i--)
                           png_memcpy(v, sptr, 2);
                           for (j = 0; j < png_pass_inc[pass]; j++)
                              png_memcpy(dp, v, 2);

                  } /* end of pixel_bytes == 2 */
                  else if (pixel_bytes == 4)
                     if (((pass == 0) || (pass == 1)) && width)
                        int width_mmx = ((width >> 1) << 1);

                           movq mm0, [esi]      ; v3 v2 v1 v0 v7 v6 v5 v4
                           movq mm1, mm0        ; v3 v2 v1 v0 v7 v6 v5 v4
                           punpckldq mm0, mm0   ; v7 v6 v5 v4 v7 v6 v5 v4
                           punpckhdq mm1, mm1   ; v3 v2 v1 v0 v3 v2 v1 v0

                           movq [edi + 16], mm0
                           movq [edi + 24], mm0

                           movq [edi + 40], mm1

                           movq [edi + 56], mm1

                        sptr -= (width_mmx*4 - 4);   /* sign fixed */
                        dp -= (width_mmx*32 - 4);    /* sign fixed */
                        for (i = width; i; i--)
                           png_memcpy(v, sptr, 4);
                           for (j = 0; j < png_pass_inc[pass]; j++)
                              png_memcpy(dp, v, 4);

                     else if (((pass == 2) || (pass == 3)) && width)
                        int width_mmx = ((width >> 1) << 1);

                           movq mm0, [esi]      ; v3 v2 v1 v0 v7 v6 v5 v4
                           movq mm1, mm0        ; v3 v2 v1 v0 v7 v6 v5 v4
                           punpckldq mm0, mm0   ; v7 v6 v5 v4 v7 v6 v5 v4
                           punpckhdq mm1, mm1   ; v3 v2 v1 v0 v3 v2 v1 v0

                           movq [edi + 24], mm1

                        sptr -= (width_mmx*4 - 4);   /* sign fixed */
                        dp -= (width_mmx*16 - 4);    /* sign fixed */
                        for (i = width; i; i--)
                           png_memcpy(v, sptr, 4);
                           for (j = 0; j < png_pass_inc[pass]; j++)
                              png_memcpy(dp, v, 4);

                     else if (width)  /* pass == 4 or 5 */
                        int width_mmx = ((width >> 1) << 1);

                           movq mm0, [esi]      ; v3 v2 v1 v0 v7 v6 v5 v4
                           movq mm1, mm0        ; v3 v2 v1 v0 v7 v6 v5 v4
                           punpckldq mm0, mm0   ; v7 v6 v5 v4 v7 v6 v5 v4
                           punpckhdq mm1, mm1   ; v3 v2 v1 v0 v3 v2 v1 v0

                        sptr -= (width_mmx*4 - 4);   /* sign fixed */
                        dp -= (width_mmx*8 - 4);     /* sign fixed */
                        for (i = width; i; i--)
                           png_memcpy(v, sptr, 4);
                           for (j = 0; j < png_pass_inc[pass]; j++)
                              png_memcpy(dp, v, 4);

                  } /* end of pixel_bytes == 4 */
                  else if (pixel_bytes == 6)
                     for (i = width; i; i--)
                        png_memcpy(v, sptr, 6);
                        for (j = 0; j < png_pass_inc[pass]; j++)
                           png_memcpy(dp, v, 6);

                  } /* end of pixel_bytes == 6 */
                     for (i = width; i; i--)
                        png_memcpy(v, sptr, pixel_bytes);
                        for (j = 0; j < png_pass_inc[pass]; j++)
                           png_memcpy(dp, v, pixel_bytes);

               } /* end of mmx_supported */
               else /* MMX not supported:  use modified C code - takes advantage
                     * of inlining of memcpy for a constant */
                  if (pixel_bytes == 1)
                     for (i = width; i; i--)
                        for (j = 0; j < png_pass_inc[pass]; j++)

                  else if (pixel_bytes == 3)
                     for (i = width; i; i--)
                        png_memcpy(v, sptr, pixel_bytes);
                        for (j = 0; j < png_pass_inc[pass]; j++)
                           png_memcpy(dp, v, pixel_bytes);
                        sptr -= pixel_bytes;

                  else if (pixel_bytes == 2)
                     for (i = width; i; i--)
                        png_memcpy(v, sptr, pixel_bytes);
                        for (j = 0; j < png_pass_inc[pass]; j++)
                           png_memcpy(dp, v, pixel_bytes);
                        sptr -= pixel_bytes;

                  else if (pixel_bytes == 4)
                     for (i = width; i; i--)
                        png_memcpy(v, sptr, pixel_bytes);
                        for (j = 0; j < png_pass_inc[pass]; j++)
                           png_memcpy(dp, v, pixel_bytes);
                        sptr -= pixel_bytes;

                  else if (pixel_bytes == 6)
                     for (i = width; i; i--)
                        png_memcpy(v, sptr, pixel_bytes);
                        for (j = 0; j < png_pass_inc[pass]; j++)
                           png_memcpy(dp, v, pixel_bytes);
                        sptr -= pixel_bytes;

                     for (i = width; i; i--)
                        png_memcpy(v, sptr, pixel_bytes);
                        for (j = 0; j < png_pass_inc[pass]; j++)
                           png_memcpy(dp, v, pixel_bytes);
                        sptr -= pixel_bytes;

               } /* end of MMX not supported */
      } /* end switch (row_info->pixel_depth) */

      row_info->width = final_width;
      row_info->rowbytes = ((final_width *
         (png_uint_32)row_info->pixel_depth + 7) >> 3);

#endif /* PNG_READ_INTERLACING_SUPPORTED */
/* These variables are utilized in the functions below.  They are declared */
/* globally here to ensure alignment on 8-byte boundaries. */

} LBCarryMask = {0x0101010101010101},
  HBClearMask = {0x7f7f7f7f7f7f7f7f},
  ActiveMask, ActiveMask2, ActiveMaskEnd, ShiftBpp, ShiftRem;
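
/* Added sketch (an assumption - the opening of the declaration above is
 * not shown in this excerpt): a typical way to get 8-byte-aligned 64-bit
 * globals with MSVC is a union whose .use member holds the value, e.g.
 *
 *    union uAll
 *    {
 *       __int64 use;
 *       double  align;   // force 8-byte alignment
 *    } LBCarryMask = ...;
 *
 * which matches the ".use" accesses such as "ActiveMask.use = ..." below.
 */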
/* Optimized code for PNG Average filter decoder */

png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row,
   png_bytep prev_row)
   png_uint_32 FullLength;
   png_uint_32 MMXLength;
   /* png_uint_32 len; */

   bpp = (row_info->pixel_depth + 7) >> 3;  /* Get # bytes per pixel */
   FullLength = row_info->rowbytes;         /* # of bytes to filter */

      /* Init address pointers and offset */
      mov edi, row          /* edi ==> Avg(x) */
      xor ebx, ebx          /* ebx ==> x */
      mov esi, prev_row     /* esi ==> Prior(x) */
      sub edx, bpp          /* edx ==> Raw(x-bpp) */

      /* Compute the Raw value for the first bpp bytes */
      /* Raw(x) = Avg(x) + (Prior(x)/2) */
      mov al, [esi + ebx]   /* Load al with Prior(x) */
      shr al, 1             /* divide by 2 */
      add al, [edi+ebx-1]   /* Add Avg(x); -1 to offset inc ebx */
      mov [edi+ebx-1], al   /* Write back Raw(x); */
                            /* mov does not affect flags; -1 to offset inc ebx */

      /* get # of bytes to alignment */
      mov diff, edi         /* take start of row */
      add diff, ebx         /* add bpp */
      add diff, 0xf         /* add 7 + 8 to incr past alignment boundary */
      and diff, 0xfffffff8  /* mask to alignment boundary */
      sub diff, edi         /* subtract from start ==> value ebx at alignment */

      /* Compute the Raw value for the bytes up to the alignment boundary */
      /* Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2) */
davglp1:
      mov cl, [esi + ebx]   /* load cl with Prior(x) */
      mov al, [edx + ebx]   /* load al with Raw(x-bpp) */
      shr ax, 1             /* divide by 2 */
      add al, [edi+ebx-1]   /* Add Avg(x); -1 to offset inc ebx */
      cmp ebx, diff         /* Check if at alignment boundary */
      mov [edi+ebx-1], al   /* Write back Raw(x); */
                            /* mov does not affect flags; -1 to offset inc ebx */
      jb davglp1            /* Repeat until at alignment boundary */

      sub eax, ebx          /* subtract alignment fix */
      and eax, 0x00000007   /* calc bytes over mult of 8 */
      sub ecx, eax          /* drop over bytes from original length */
   } /* end _asm block */
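
   /* Added sketch (illustrative, not part of this file): the scalar logic
    * the assembler above implements, as in the portable C decoder:
    *
    *    // first bpp bytes: no Raw(x-bpp), so Raw(x) = Avg(x) + Prior(x)/2
    *    for (x = 0; x < bpp; x++)
    *       row[x] = (png_byte)(row[x] + (prev_row[x] >> 1));
    *    // remaining bytes: Raw(x) = Avg(x) + (Raw(x-bpp) + Prior(x))/2
    *    for (; x < FullLength; x++)
    *       row[x] = (png_byte)(row[x] +
    *          ((int)(row[x - bpp] + prev_row[x]) >> 1));
    */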
   /* Now do the math for the rest of the row */
         ActiveMask.use = 0x0000000000ffffff;
         ShiftBpp.use = 24;   /* == 3 * 8 */
         ShiftRem.use = 40;   /* == 64 - 24 */

            /* Re-init address pointers and offset */
            movq mm7, ActiveMask
            mov ebx, diff        /* ebx ==> x = offset to alignment boundary */
            movq mm5, LBCarryMask
            mov edi, row         /* edi ==> Avg(x) */
            movq mm4, HBClearMask
            mov esi, prev_row    /* esi ==> Prior(x) */
            /* PRIME the pump (load the first Raw(x-bpp) data set */
            movq mm2, [edi + ebx - 8]  /* Load previous aligned 8 bytes */
                                       /* (we correct position in loop below) */
            movq mm0, [edi + ebx]      /* Load mm0 with Avg(x) */
            /* Add (Prev_row/2) to Average */
            psrlq mm2, ShiftRem        /* Correct position Raw(x-bpp) data */
            movq mm1, [esi + ebx]      /* Load mm1 with Prior(x) */
            pand mm3, mm1        /* get lsb for each prev_row byte */
            psrlq mm1, 1         /* divide prev_row bytes by 2 */
            pand mm1, mm4        /* clear invalid bit 7 of each byte */
            paddb mm0, mm1       /* add (Prev_row/2) to Avg for each byte */
            /* Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry */
            movq mm1, mm3        /* now use mm1 for getting LBCarrys */
            pand mm1, mm2        /* get LBCarrys for each byte where both */
                                 /* lsb's were == 1 (Only valid for active group) */
            psrlq mm2, 1         /* divide raw bytes by 2 */
            pand mm2, mm4        /* clear invalid bit 7 of each byte */
            paddb mm2, mm1       /* add LBCarrys to (Raw(x-bpp)/2) for each byte */
            pand mm2, mm6        /* Leave only Active Group 1 bytes to add to Avg */
            paddb mm0, mm2       /* add (Raw/2) + LBCarrys to Avg for each Active byte */

            /* Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry */
            psllq mm6, ShiftBpp  /* shift the mm6 mask to cover bytes 3-5 */
            movq mm2, mm0        /* mov updated Raws to mm2 */
            psllq mm2, ShiftBpp  /* shift data to position correctly */
            movq mm1, mm3        /* now use mm1 for getting LBCarrys */
            pand mm1, mm2        /* get LBCarrys for each byte where both */
                                 /* lsb's were == 1 (Only valid for active group) */
            psrlq mm2, 1         /* divide raw bytes by 2 */
            pand mm2, mm4        /* clear invalid bit 7 of each byte */
            paddb mm2, mm1       /* add LBCarrys to (Raw(x-bpp)/2) for each byte */
            pand mm2, mm6        /* Leave only Active Group 2 bytes to add to Avg */
            paddb mm0, mm2       /* add (Raw/2) + LBCarrys to Avg for each Active byte */

            /* Add 3rd active group (Raw(x-bpp)/2) to Average with LBCarry */
            psllq mm6, ShiftBpp  /* shift the mm6 mask to cover the last two bytes */
            movq mm2, mm0        /* mov updated Raws to mm2 */
            psllq mm2, ShiftBpp  /* shift data to position correctly */
                                 /* Data only needs to be shifted once here to */
                                 /* get the correct x-bpp offset. */
            movq mm1, mm3        /* now use mm1 for getting LBCarrys */
            pand mm1, mm2        /* get LBCarrys for each byte where both */
                                 /* lsb's were == 1 (Only valid for active group) */
            psrlq mm2, 1         /* divide raw bytes by 2 */
            pand mm2, mm4        /* clear invalid bit 7 of each byte */
            paddb mm2, mm1       /* add LBCarrys to (Raw(x-bpp)/2) for each byte */
            pand mm2, mm6        /* Leave only Active Group 3 bytes to add to Avg */

            paddb mm0, mm2       /* add (Raw/2) + LBCarrys to Avg for each Active byte */

            /* Now ready to write back to memory */
            movq [edi + ebx - 8], mm0
            /* Move updated Raw(x) to use as Raw(x-bpp) for next loop */
            movq mm2, mm0        /* mov updated Raw(x) to mm2 */
         } /* end _asm block */
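
         /* Added note (worked identity, not original commentary): the
          * LBCarry sequence implements a per-byte average without 9-bit
          * overflow, using
          *
          *    (a + b) / 2  ==  (a >> 1) + (b >> 1) + (a & b & 1)
          *
          * The psrlq-by-1 plus HBClearMask performs the per-byte shifts,
          * and the (a & b & 1) term - set only where both low bits were
          * 1 - is what the LBCarryMask/pand steps recover and add back.
          */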
         ActiveMask.use = 0xffffffffffffffff;  /* use shift below to clear */
                                               /* appropriate inactive bytes */
         ShiftBpp.use = bpp << 3;
         ShiftRem.use = 64 - ShiftBpp.use;

            movq mm4, HBClearMask
            /* Re-init address pointers and offset */
            mov ebx, diff        /* ebx ==> x = offset to alignment boundary */
            /* Load ActiveMask and clear all bytes except for 1st active group */
            movq mm7, ActiveMask
            mov edi, row         /* edi ==> Avg(x) */
            mov esi, prev_row    /* esi ==> Prior(x) */
            movq mm5, LBCarryMask
            psllq mm6, ShiftBpp  /* Create mask for 2nd active group */
            /* PRIME the pump (load the first Raw(x-bpp) data set */
            movq mm2, [edi + ebx - 8]  /* Load previous aligned 8 bytes */
                                       /* (we correct position in loop below) */
            movq mm0, [edi + ebx]
            psrlq mm2, ShiftRem        /* shift data to position correctly */
            movq mm1, [esi + ebx]
            /* Add (Prev_row/2) to Average */
            pand mm3, mm1        /* get lsb for each prev_row byte */
            psrlq mm1, 1         /* divide prev_row bytes by 2 */
            pand mm1, mm4        /* clear invalid bit 7 of each byte */
            paddb mm0, mm1       /* add (Prev_row/2) to Avg for each byte */
            /* Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry */
            movq mm1, mm3        /* now use mm1 for getting LBCarrys */
            pand mm1, mm2        /* get LBCarrys for each byte where both */
                                 /* lsb's were == 1 (Only valid for active group) */
            psrlq mm2, 1         /* divide raw bytes by 2 */
            pand mm2, mm4        /* clear invalid bit 7 of each byte */
            paddb mm2, mm1       /* add LBCarrys to (Raw(x-bpp)/2) for each byte */
            pand mm2, mm7        /* Leave only Active Group 1 bytes to add to Avg */
            paddb mm0, mm2       /* add (Raw/2) + LBCarrys to Avg for each Active byte */

            /* Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry */
            movq mm2, mm0        /* mov updated Raws to mm2 */
            psllq mm2, ShiftBpp  /* shift data to position correctly */
            movq mm1, mm3        /* now use mm1 for getting LBCarrys */
            pand mm1, mm2        /* get LBCarrys for each byte where both */
                                 /* lsb's were == 1 (Only valid for active group) */
            psrlq mm2, 1         /* divide raw bytes by 2 */
            pand mm2, mm4        /* clear invalid bit 7 of each byte */
            paddb mm2, mm1       /* add LBCarrys to (Raw(x-bpp)/2) for each byte */
            pand mm2, mm6        /* Leave only Active Group 2 bytes to add to Avg */
            paddb mm0, mm2       /* add (Raw/2) + LBCarrys to Avg for each Active byte */

            /* Now ready to write back to memory */
            movq [edi + ebx - 8], mm0
            /* Prep Raw(x-bpp) for next loop */
            movq mm2, mm0        /* mov updated Raws to mm2 */
         } /* end _asm block */
         ActiveMask.use = 0x000000000000ffff;
         ShiftBpp.use = 16;   /* == 2 * 8 [BUGFIX] */
         ShiftRem.use = 48;   /* == 64 - 16 [BUGFIX] */

            /* Load ActiveMask */
            movq mm7, ActiveMask
            /* Re-init address pointers and offset */
            mov ebx, diff        /* ebx ==> x = offset to alignment boundary */
            movq mm5, LBCarryMask
            mov edi, row         /* edi ==> Avg(x) */
            movq mm4, HBClearMask
            mov esi, prev_row    /* esi ==> Prior(x) */
            /* PRIME the pump (load the first Raw(x-bpp) data set */
            movq mm2, [edi + ebx - 8]  /* Load previous aligned 8 bytes */
                                       /* (we correct position in loop below) */
            movq mm0, [edi + ebx]
            psrlq mm2, ShiftRem        /* shift data to position correctly [BUGFIX] */
            movq mm1, [esi + ebx]
            /* Add (Prev_row/2) to Average */
            pand mm3, mm1        /* get lsb for each prev_row byte */
            psrlq mm1, 1         /* divide prev_row bytes by 2 */
            pand mm1, mm4        /* clear invalid bit 7 of each byte */

            paddb mm0, mm1       /* add (Prev_row/2) to Avg for each byte */
            /* Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry */
            movq mm1, mm3        /* now use mm1 for getting LBCarrys */
            pand mm1, mm2        /* get LBCarrys for each byte where both */
                                 /* lsb's were == 1 (Only valid for active group) */
            psrlq mm2, 1         /* divide raw bytes by 2 */
            pand mm2, mm4        /* clear invalid bit 7 of each byte */
            paddb mm2, mm1       /* add LBCarrys to (Raw(x-bpp)/2) for each byte */
            pand mm2, mm6        /* Leave only Active Group 1 bytes to add to Avg */
            paddb mm0, mm2       /* add (Raw/2) + LBCarrys to Avg for each Active byte */
            /* Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry */
            psllq mm6, ShiftBpp  /* shift the mm6 mask to cover bytes 2 & 3 */
            movq mm2, mm0        /* mov updated Raws to mm2 */
            psllq mm2, ShiftBpp  /* shift data to position correctly */
            movq mm1, mm3        /* now use mm1 for getting LBCarrys */
            pand mm1, mm2        /* get LBCarrys for each byte where both */
                                 /* lsb's were == 1 (Only valid for active group) */
            psrlq mm2, 1         /* divide raw bytes by 2 */
            pand mm2, mm4        /* clear invalid bit 7 of each byte */
            paddb mm2, mm1       /* add LBCarrys to (Raw(x-bpp)/2) for each byte */
            pand mm2, mm6        /* Leave only Active Group 2 bytes to add to Avg */
            paddb mm0, mm2       /* add (Raw/2) + LBCarrys to Avg for each Active byte */

            /* Add 3rd active group (Raw(x-bpp)/2) to Average with LBCarry */
            psllq mm6, ShiftBpp  /* shift the mm6 mask to cover bytes 4 & 5 */
            movq mm2, mm0        /* mov updated Raws to mm2 */
            psllq mm2, ShiftBpp  /* shift data to position correctly */
                                 /* Data only needs to be shifted once here to */
                                 /* get the correct x-bpp offset. */
            movq mm1, mm3        /* now use mm1 for getting LBCarrys */
            pand mm1, mm2        /* get LBCarrys for each byte where both */
                                 /* lsb's were == 1 (Only valid for active group) */
            psrlq mm2, 1         /* divide raw bytes by 2 */
            pand mm2, mm4        /* clear invalid bit 7 of each byte */
            paddb mm2, mm1       /* add LBCarrys to (Raw(x-bpp)/2) for each byte */
            pand mm2, mm6        /* Leave only Active Group 3 bytes to add to Avg */
            paddb mm0, mm2       /* add (Raw/2) + LBCarrys to Avg for each Active byte */

            /* Add 4th active group (Raw(x-bpp)/2) to Average with LBCarry */
            psllq mm6, ShiftBpp  /* shift the mm6 mask to cover bytes 6 & 7 */
            movq mm2, mm0        /* mov updated Raws to mm2 */
            psllq mm2, ShiftBpp  /* shift data to position correctly */
                                 /* Data only needs to be shifted once here to */
                                 /* get the correct x-bpp offset. */
            movq mm1, mm3        /* now use mm1 for getting LBCarrys */
            pand mm1, mm2        /* get LBCarrys for each byte where both */
                                 /* lsb's were == 1 (Only valid for active group) */
            psrlq mm2, 1         /* divide raw bytes by 2 */
            pand mm2, mm4        /* clear invalid bit 7 of each byte */
            paddb mm2, mm1       /* add LBCarrys to (Raw(x-bpp)/2) for each byte */
            pand mm2, mm6        /* Leave only Active Group 4 bytes to add to Avg */
            paddb mm0, mm2       /* add (Raw/2) + LBCarrys to Avg for each Active byte */

            /* Now ready to write back to memory */
            movq [edi + ebx - 8], mm0
            /* Prep Raw(x-bpp) for next loop */
            movq mm2, mm0        /* mov updated Raws to mm2 */
         } /* end _asm block */
      case 1:  /* bpp == 1 */
            /* Re-init address pointers and offset */
            mov ebx, diff        /* ebx ==> x = offset to alignment boundary */
            mov edi, row         /* edi ==> Avg(x) */
            cmp ebx, FullLength  /* Test if offset at end of array */
            /* Do Avg decode for remaining bytes */
            mov esi, prev_row    /* esi ==> Prior(x) */
            xor ecx, ecx         /* zero ecx before using cl & cx in loop below */
            sub edx, bpp         /* edx ==> Raw(x-bpp) */
            /* Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2) */
            mov cl, [esi + ebx]  /* load cl with Prior(x) */
            mov al, [edx + ebx]  /* load al with Raw(x-bpp) */
            shr ax, 1            /* divide by 2 */
            add al, [edi+ebx-1]  /* Add Avg(x); -1 to offset inc ebx */
            cmp ebx, FullLength  /* Check if at end of array */
            mov [edi+ebx-1], al  /* Write back Raw(x); */
                                 /* mov does not affect flags; -1 to offset inc ebx */
         } /* end _asm block */
      case 8:  /* bpp == 8 */
            /* Re-init address pointers and offset */
            mov ebx, diff        /* ebx ==> x = offset to alignment boundary */
            movq mm5, LBCarryMask
            mov edi, row         /* edi ==> Avg(x) */
            movq mm4, HBClearMask
            mov esi, prev_row    /* esi ==> Prior(x) */
            /* PRIME the pump (load the first Raw(x-bpp) data set */
            movq mm2, [edi + ebx - 8]  /* Load previous aligned 8 bytes */
                                       /* (NO NEED to correct position in loop below) */
            movq mm0, [edi + ebx]
            movq mm1, [esi + ebx]
            pand mm3, mm1        /* get lsb for each prev_row byte */
            psrlq mm1, 1         /* divide prev_row bytes by 2 */
            pand mm3, mm2        /* get LBCarrys for each byte where both */
                                 /* lsb's were == 1 */
            psrlq mm2, 1         /* divide raw bytes by 2 */
            pand mm1, mm4        /* clear invalid bit 7 of each byte */
            paddb mm0, mm3       /* add LBCarrys to Avg for each byte */
            pand mm2, mm4        /* clear invalid bit 7 of each byte */
            paddb mm0, mm1       /* add (Prev_row/2) to Avg for each byte */
            paddb mm0, mm2       /* add (Raw/2) to Avg for each byte */
            movq [edi + ebx - 8], mm0
            movq mm2, mm0        /* reuse as Raw(x-bpp) */
         } /* end _asm block */
      default:  /* bpp greater than 8 */
            movq mm5, LBCarryMask
            /* Re-init address pointers and offset */
            mov ebx, diff        /* ebx ==> x = offset to alignment boundary */
            mov edi, row         /* edi ==> Avg(x) */
            movq mm4, HBClearMask
            mov esi, prev_row    /* esi ==> Prior(x) */
            sub edx, bpp         /* edx ==> Raw(x-bpp) */
            movq mm0, [edi + ebx]
            movq mm1, [esi + ebx]
            pand mm3, mm1        /* get lsb for each prev_row byte */
            movq mm2, [edx + ebx]
            psrlq mm1, 1         /* divide prev_row bytes by 2 */
            pand mm3, mm2        /* get LBCarrys for each byte where both */
                                 /* lsb's were == 1 */
            psrlq mm2, 1         /* divide raw bytes by 2 */
            pand mm1, mm4        /* clear invalid bit 7 of each byte */
            paddb mm0, mm3       /* add LBCarrys to Avg for each byte */
            pand mm2, mm4        /* clear invalid bit 7 of each byte */
            paddb mm0, mm1       /* add (Prev_row/2) to Avg for each byte */
            paddb mm0, mm2       /* add (Raw/2) to Avg for each byte */
            movq [edi + ebx - 8], mm0
         } /* end _asm block */
   } /* end switch ( bpp ) */
      /* MMX acceleration complete; now do clean-up */
      /* Check if any remaining bytes left to decode */
      mov ebx, MMXLength   /* ebx ==> x = offset bytes remaining after MMX */
      mov edi, row         /* edi ==> Avg(x) */
      cmp ebx, FullLength  /* Test if offset at end of array */
      /* Do Avg decode for remaining bytes */
      mov esi, prev_row    /* esi ==> Prior(x) */
      xor ecx, ecx         /* zero ecx before using cl & cx in loop below */
      sub edx, bpp         /* edx ==> Raw(x-bpp) */
      /* Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2) */
      mov cl, [esi + ebx]  /* load cl with Prior(x) */
      mov al, [edx + ebx]  /* load al with Raw(x-bpp) */
      shr ax, 1            /* divide by 2 */
      add al, [edi+ebx-1]  /* Add Avg(x); -1 to offset inc ebx */
      cmp ebx, FullLength  /* Check if at end of array */
      mov [edi+ebx-1], al  /* Write back Raw(x); */
                           /* mov does not affect flags; -1 to offset inc ebx */
      emms                 /* End MMX instructions; prep for possible FP instrs. */
   } /* end _asm block */
/* Optimized code for PNG Paeth filter decoder */

png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
   png_bytep prev_row)
   png_uint_32 FullLength;
   png_uint_32 MMXLength;
   /* png_uint_32 len; */

   int patemp, pbtemp, pctemp;

   bpp = (row_info->pixel_depth + 7) >> 3;  /* Get # bytes per pixel */
   FullLength = row_info->rowbytes;         /* # of bytes to filter */

      xor ebx, ebx         /* ebx ==> x offset */
      xor edx, edx         /* edx ==> x-bpp offset */

      /* Compute the Raw value for the first bpp bytes */
      /* Note: the formula works out to be always */
      /* Paeth(x) = Raw(x) + Prior(x) where x < bpp */
      mov [edi + ebx - 1], al

      /* get # of bytes to alignment */
      mov diff, edi        /* take start of row */
      add diff, ebx        /* add bpp */
      add diff, 0xf        /* add 7 + 8 to incr past alignment boundary */
      and diff, 0xfffffff8 /* mask to alignment boundary */
      sub diff, edi        /* subtract from start ==> value ebx at alignment */

      /* pav = p - a = (a + b - c) - a = b - c */
      mov al, [esi + ebx]  /* load Prior(x) into al */
      mov cl, [esi + edx]  /* load Prior(x-bpp) into cl */
      sub eax, ecx         /* subtract Prior(x-bpp) */
      mov patemp, eax      /* Save pav for later use */

      /* pbv = p - b = (a + b - c) - b = a - c */
      mov al, [edi + edx]  /* load Raw(x-bpp) into al */
      sub eax, ecx         /* subtract Prior(x-bpp) */

      /* pcv = p - c = (a + b - c) - c = (a - c) + (b - c) = pav + pbv */
      add eax, patemp      /* pcv = pav + pbv */

      test eax, 0x80000000
      neg eax              /* reverse sign of neg values */
      mov pctemp, eax      /* save pc for later use */

      test ecx, 0x80000000
      neg ecx              /* reverse sign of neg values */
      mov pbtemp, ecx      /* save pb for later use */

      test eax, 0x80000000
      neg eax              /* reverse sign of neg values */
      mov patemp, eax      /* save pa for later use */
      /* test if pa <= pb */

      /* pa > pb; now test if pb <= pc */

      /* pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp) */
      mov cl, [esi + edx]  /* load Prior(x-bpp) into cl */

      /* pb <= pc; Raw(x) = Paeth(x) + Prior(x) */
      mov cl, [esi + ebx]  /* load Prior(x) into cl */

      /* pa <= pb; now test if pa <= pc */

      /* pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp) */
      mov cl, [esi + edx]  /* load Prior(x-bpp) into cl */

      /* pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp) */
      mov cl, [edi + edx]  /* load Raw(x-bpp) into cl */

      /* Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256 */
      add [edi + ebx - 1], cl

      sub eax, ebx         /* subtract alignment fix */
      and eax, 0x00000007  /* calc bytes over mult of 8 */
      sub ecx, eax         /* drop over bytes from original length */
   } /* end _asm block */
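
   /* Added sketch (illustrative, not part of this file): the scalar Paeth
    * predictor the code above and the MMX code below compute, as defined
    * by the PNG specification:
    *
    *    static int paeth_predictor(int a, int b, int c)
    *    {
    *       // a = Raw(x-bpp), b = Prior(x), c = Prior(x-bpp)
    *       int p  = a + b - c;
    *       int pa = abs(p - a);   // == abs(b - c)
    *       int pb = abs(p - b);   // == abs(a - c)
    *       int pc = abs(p - c);
    *       if (pa <= pb && pa <= pc)
    *          return a;
    *       else if (pb <= pc)
    *          return b;
    *       else
    *          return c;
    *    }
    *
    * Raw(x) = (Paeth(x) + paeth_predictor(a, b, c)) & 0xff, matching the
    * branch structure of the setup code above.
    */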
   /* Now do the math for the rest of the row */
         ActiveMask.use = 0x0000000000ffffff;
         ActiveMaskEnd.use = 0xffff000000000000;
         ShiftBpp.use = 24;   /* == bpp(3) * 8 */
         ShiftRem.use = 40;   /* == 64 - 24 */

            /* PRIME the pump (load the first Raw(x-bpp) data set */
            movq mm1, [edi+ebx-8]
            psrlq mm1, ShiftRem  /* shift last 3 bytes to 1st 3 bytes */
            movq mm2, [esi + ebx]   /* load b=Prior(x) */
            punpcklbw mm1, mm0   /* Unpack Low bytes of a */
            movq mm3, [esi+ebx-8]   /* Prep c=Prior(x-bpp) bytes */
            punpcklbw mm2, mm0   /* Unpack Low bytes of b */
            psrlq mm3, ShiftRem  /* shift last 3 bytes to 1st 3 bytes */
            /* pav = p - a = (a + b - c) - a = b - c */
            punpcklbw mm3, mm0   /* Unpack Low bytes of c */
            /* pbv = p - b = (a + b - c) - b = a - c */

            /* pcv = p - c = (a + b - c) - c = (a - c) + (b - c) = pav + pbv */

            /* pa = abs(p-a) = abs(pav) */
            /* pb = abs(p-b) = abs(pbv) */
            /* pc = abs(p-c) = abs(pcv) */
            pcmpgtw mm0, mm4     /* Create mask pav bytes < 0 */
            pand mm0, mm4        /* Only pav bytes < 0 in mm0 */
            pcmpgtw mm7, mm5     /* Create mask pbv bytes < 0 */
            pand mm7, mm5        /* Only pbv bytes < 0 in mm7 */

            pcmpgtw mm0, mm6     /* Create mask pcv bytes < 0 */
            pand mm0, mm6        /* Only pcv bytes < 0 in mm0 */

            pcmpgtw mm7, mm5     /* pa > pb? */
            /* use mm7 mask to merge pa & pb */
            /* use mm0 mask copy to merge a & b */

            /* test ((pa <= pb)? pa:pb) <= pc */
            pcmpgtw mm7, mm6     /* pab > pc? */

            movq mm3, [esi + ebx]   /* load c=Prior(x-bpp) */
            pand mm7, ActiveMask
            movq mm2, mm3        /* load b=Prior(x) step 1 */
            paddb mm7, [edi + ebx]  /* add Paeth predictor with Raw(x) */
            punpcklbw mm3, mm0   /* Unpack Low bytes of c */
            movq [edi + ebx], mm7   /* write back updated value */
            movq mm1, mm7        /* Now mm1 will be used as Raw(x-bpp) */
            /* Now do Paeth for 2nd set of bytes (3-5) */
            psrlq mm2, ShiftBpp  /* load b=Prior(x) step 2 */
            punpcklbw mm1, mm0   /* Unpack Low bytes of a */
            punpcklbw mm2, mm0   /* Unpack Low bytes of b */
            /* pbv = p - b = (a + b - c) - b = a - c */
            /* pav = p - a = (a + b - c) - a = b - c */

            /* pcv = p - c = (a + b - c) - c = (a - c) + (b - c) = */
            /*       pav + pbv = pbv + pav */

            /* pa = abs(p-a) = abs(pav) */
            /* pb = abs(p-b) = abs(pbv) */
            /* pc = abs(p-c) = abs(pcv) */
            pcmpgtw mm0, mm5     /* Create mask pbv bytes < 0 */
            pcmpgtw mm7, mm4     /* Create mask pav bytes < 0 */
            pand mm0, mm5        /* Only pbv bytes < 0 in mm0 */
            pand mm7, mm4        /* Only pav bytes < 0 in mm7 */

            pcmpgtw mm0, mm6     /* Create mask pcv bytes < 0 */
            pand mm0, mm6        /* Only pcv bytes < 0 in mm0 */

            pcmpgtw mm7, mm5     /* pa > pb? */
            /* use mm7 mask to merge pa & pb */
            /* use mm0 mask copy to merge a & b */

            /* test ((pa <= pb)? pa:pb) <= pc */
            pcmpgtw mm7, mm6     /* pab > pc? */
            movq mm2, [esi + ebx]   /* load b=Prior(x) */

            movq mm3, mm2        /* load c=Prior(x-bpp) step 1 */
            pand mm7, ActiveMask
            punpckhbw mm2, mm0   /* Unpack High bytes of b */
            psllq mm7, ShiftBpp  /* Shift bytes to 2nd group of 3 bytes */
            /* pav = p - a = (a + b - c) - a = b - c */
            paddb mm7, [edi + ebx]  /* add Paeth predictor with Raw(x) */
            psllq mm3, ShiftBpp  /* load c=Prior(x-bpp) step 2 */
            movq [edi + ebx], mm7   /* write back updated value */

            punpckhbw mm3, mm0   /* Unpack High bytes of c */
            psllq mm1, ShiftBpp  /* Shift bytes */
            /* Now mm1 will be used as Raw(x-bpp) */
            /* Now do Paeth for 3rd, and final, set of bytes (6-7) */
            punpckhbw mm1, mm0   /* Unpack High bytes of a */

            /* pbv = p - b = (a + b - c) - b = a - c */
            /* pcv = p - c = (a + b - c) - c = (a - c) + (b - c) = pav + pbv */

            /* pa = abs(p-a) = abs(pav) */
            /* pb = abs(p-b) = abs(pbv) */
            /* pc = abs(p-c) = abs(pcv) */
            pcmpgtw mm0, mm4     /* Create mask pav bytes < 0 */
            pcmpgtw mm7, mm5     /* Create mask pbv bytes < 0 */
            pand mm0, mm4        /* Only pav bytes < 0 in mm0 */
            pand mm7, mm5        /* Only pbv bytes < 0 in mm7 */

            pcmpgtw mm0, mm6     /* Create mask pcv bytes < 0 */
            pand mm0, mm6        /* Only pcv bytes < 0 in mm0 */

            pcmpgtw mm7, mm5     /* pa > pb? */
            /* use mm0 mask copy to merge a & b */
            /* use mm7 mask to merge pa & pb */

            /* test ((pa <= pb)? pa:pb) <= pc */
            pcmpgtw mm7, mm6     /* pab > pc? */

            /* Step ebx to next set of 8 bytes and repeat loop til done */

            pand mm1, ActiveMaskEnd
            paddb mm1, [edi + ebx - 8]  /* add Paeth predictor with Raw(x) */

            pxor mm0, mm0        /* pxor does not affect flags */
            movq [edi + ebx - 8], mm1   /* write back updated value */
            /* mm1 will be used as Raw(x-bpp) next loop */
            /* mm3 ready to be used as Prior(x-bpp) next loop */
         } /* end _asm block */
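
         /* Added note (illustrative, not original commentary): the
          * pcmpgtw/pand sequences above implement the Paeth compares
          * without branches.  pcmpgtw sets each word to all-ones where
          * the compare is true, and a select is then composed as
          *
          *    result = (mask & value_if_true) | (~mask & value_if_false)
          *
          * which is how "use mm7 mask to merge pa & pb" and "use mm0
          * mask copy to merge a & b" pick the smaller distance and its
          * corresponding predictor in parallel across four words.
          */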
         ActiveMask.use  = 0x00000000ffffffff;
         ActiveMask2.use = 0xffffffff00000000;
         ShiftBpp.use = bpp << 3;   /* == bpp * 8 */
         ShiftRem.use = 64 - ShiftBpp.use;

            /* PRIME the pump (load the first Raw(x-bpp) data set */
            movq mm1, [edi+ebx-8]
            /* Must shift to position Raw(x-bpp) data */
            /* Do first set of 4 bytes */
            movq mm3, [esi+ebx-8]   /* read c=Prior(x-bpp) bytes */
            punpcklbw mm1, mm0   /* Unpack Low bytes of a */
            movq mm2, [esi + ebx]   /* load b=Prior(x) */
            punpcklbw mm2, mm0   /* Unpack Low bytes of b */
            /* Must shift to position Prior(x-bpp) data */
            /* pav = p - a = (a + b - c) - a = b - c */
            punpcklbw mm3, mm0   /* Unpack Low bytes of c */
            /* pbv = p - b = (a + b - c) - b = a - c */

            /* pcv = p - c = (a + b - c) - c = (a - c) + (b - c) = pav + pbv */

            /* pa = abs(p-a) = abs(pav) */
            /* pb = abs(p-b) = abs(pbv) */
            /* pc = abs(p-c) = abs(pcv) */
            pcmpgtw mm0, mm4     /* Create mask pav bytes < 0 */
            pand mm0, mm4        /* Only pav bytes < 0 in mm0 */
            pcmpgtw mm7, mm5     /* Create mask pbv bytes < 0 */
            pand mm7, mm5        /* Only pbv bytes < 0 in mm7 */

            pcmpgtw mm0, mm6     /* Create mask pcv bytes < 0 */
            pand mm0, mm6        /* Only pcv bytes < 0 in mm0 */

            pcmpgtw mm7, mm5     /* pa > pb? */
            /* use mm7 mask to merge pa & pb */
            /* use mm0 mask copy to merge a & b */

            /* test ((pa <= pb)? pa:pb) <= pc */
            pcmpgtw mm7, mm6     /* pab > pc? */

            movq mm3, [esi + ebx - 8]   /* load c=Prior(x-bpp) */
            pand mm7, ActiveMask
            movq mm2, [esi + ebx]   /* load b=Prior(x) step 1 */
            paddb mm7, [edi + ebx]  /* add Paeth predictor with Raw(x) */

            movq [edi + ebx], mm7   /* write back updated value */
            movq mm1, [edi+ebx-8]

            punpckhbw mm3, mm0   /* Unpack High bytes of c */

            /* Do second set of 4 bytes */
            punpckhbw mm2, mm0   /* Unpack High bytes of b */
            punpckhbw mm1, mm0   /* Unpack High bytes of a */
            /* pav = p - a = (a + b - c) - a = b - c */
            /* pbv = p - b = (a + b - c) - b = a - c */

            /* pcv = p - c = (a + b - c) - c = (a - c) + (b - c) = pav + pbv */

            /* pa = abs(p-a) = abs(pav) */
            /* pb = abs(p-b) = abs(pbv) */
            /* pc = abs(p-c) = abs(pcv) */
            pcmpgtw mm0, mm4     /* Create mask pav bytes < 0 */
            pand mm0, mm4        /* Only pav bytes < 0 in mm0 */
            pcmpgtw mm7, mm5     /* Create mask pbv bytes < 0 */
            pand mm7, mm5        /* Only pbv bytes < 0 in mm7 */

            pcmpgtw mm0, mm6     /* Create mask pcv bytes < 0 */
            pand mm0, mm6        /* Only pcv bytes < 0 in mm0 */

            pcmpgtw mm7, mm5     /* pa > pb? */
            /* use mm7 mask to merge pa & pb */
            /* use mm0 mask copy to merge a & b */

            /* test ((pa <= pb)? pa:pb) <= pc */
            pcmpgtw mm7, mm6     /* pab > pc? */

            /* Step ebx to next set of 8 bytes and repeat loop til done */

            paddb mm1, [edi + ebx - 8]  /* add Paeth predictor with Raw(x) */

            movq [edi + ebx - 8], mm1   /* write back updated value */
            /* mm1 will be used as Raw(x-bpp) next loop */
         } /* end _asm block */
2801 ActiveMask
.use
= 0x00000000ffffffff;
/* PRIME the pump (load the first Raw(x-bpp) data set) */
movq mm1, [edi+ebx-8]       /* Only time should need to read */
                            /* a=Raw(x-bpp) bytes */
/* Do first set of 4 bytes */
movq mm3, [esi+ebx-8]       /* read c=Prior(x-bpp) bytes */
punpckhbw mm1, mm0          /* Unpack High bytes of a */
movq mm2, [esi + ebx]       /* load b=Prior(x) */
punpcklbw mm2, mm0          /* Unpack Low bytes of b */
/* pav = p - a = (a + b - c) - a = b - c */
punpckhbw mm3, mm0          /* Unpack High bytes of c */
/* pbv = p - b = (a + b - c) - b = a - c */
/* pcv = p - c = (a + b - c) - c = (a - c) + (b - c) = pav + pbv */
/* pa = abs(p-a) = abs(pav) */
/* pb = abs(p-b) = abs(pbv) */
/* pc = abs(p-c) = abs(pcv) */
pcmpgtw mm0, mm4            /* Create mask pav bytes < 0 */
pand mm0, mm4               /* Only pav bytes < 0 in mm0 */
pcmpgtw mm7, mm5            /* Create mask pbv bytes < 0 */
pand mm7, mm5               /* Only pbv bytes < 0 in mm7 */
pcmpgtw mm0, mm6            /* Create mask pcv bytes < 0 */
pand mm0, mm6               /* Only pcv bytes < 0 in mm0 */
pcmpgtw mm7, mm5            /* pa > pb? */
/* use mm7 mask to merge pa & pb */
/* use mm0 mask copy to merge a & b */
/* test ((pa <= pb)? pa:pb) <= pc */
pcmpgtw mm7, mm6            /* pab > pc? */
movq mm3, [esi + ebx]       /* load c=Prior(x-bpp) */
pand mm7, ActiveMask
movq mm2, mm3               /* load b=Prior(x) step 1 */
paddb mm7, [edi + ebx]      /* add Paeth predictor with Raw(x) */
punpcklbw mm3, mm0          /* Unpack Low bytes of c */
movq [edi + ebx], mm7       /* write back updated value */
movq mm1, mm7               /* Now mm1 will be used as Raw(x-bpp) */
/* Do second set of 4 bytes */
punpckhbw mm2, mm0          /* Unpack High bytes of b */
punpcklbw mm1, mm0          /* Unpack Low bytes of a */
/* pav = p - a = (a + b - c) - a = b - c */
/* pbv = p - b = (a + b - c) - b = a - c */
/* pcv = p - c = (a + b - c) - c = (a - c) + (b - c) = pav + pbv */
/* pa = abs(p-a) = abs(pav) */
/* pb = abs(p-b) = abs(pbv) */
/* pc = abs(p-c) = abs(pcv) */
pcmpgtw mm0, mm4            /* Create mask pav bytes < 0 */
pand mm0, mm4               /* Only pav bytes < 0 in mm0 */
pcmpgtw mm7, mm5            /* Create mask pbv bytes < 0 */
pand mm7, mm5               /* Only pbv bytes < 0 in mm7 */
pcmpgtw mm0, mm6            /* Create mask pcv bytes < 0 */
pand mm0, mm6               /* Only pcv bytes < 0 in mm0 */
pcmpgtw mm7, mm5            /* pa > pb? */
/* use mm7 mask to merge pa & pb */
/* use mm0 mask copy to merge a & b */
/* test ((pa <= pb)? pa:pb) <= pc */
pcmpgtw mm7, mm6            /* pab > pc? */
/* Step ebx to next set of 8 bytes and repeat loop til done */
paddb mm1, [edi + ebx - 8]  /* add Paeth predictor with Raw(x) */
movq [edi + ebx - 8], mm1   /* write back updated value */
/* mm1 will be used as Raw(x-bpp) next loop */
} /* end _asm block */
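/* The pcmpgtw/pand pairs above, together with the psubw steps omitted  */
/* from this listing, form a per-word absolute value.  A C model of one */
/* 16-bit lane, assuming the usual compare/mask/subtract-twice pairing  */
/* (names are illustrative only):                                       */
/* static short abs_word(short v)                                       */
/* {                                                                    */
/*    short mask = (short)((0 > v) ? -1 : 0);  -- pcmpgtw against zero  */
/*    short neg  = (short)(mask & v);          -- pand keeps neg lanes  */
/*    return (short)(v - neg - neg);           -- two psubw: v-2v = -v  */
/* }                                                                    */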
case 8: /* bpp == 8 */
ActiveMask.use = 0x00000000ffffffff;
/* PRIME the pump (load the first Raw(x-bpp) data set) */
movq mm1, [edi+ebx-8]       /* Only time should need to read */
                            /* a=Raw(x-bpp) bytes */
/* Do first set of 4 bytes */
movq mm3, [esi+ebx-8]       /* read c=Prior(x-bpp) bytes */
punpcklbw mm1, mm0          /* Unpack Low bytes of a */
movq mm2, [esi + ebx]       /* load b=Prior(x) */
punpcklbw mm2, mm0          /* Unpack Low bytes of b */
/* pav = p - a = (a + b - c) - a = b - c */
punpcklbw mm3, mm0          /* Unpack Low bytes of c */
/* pbv = p - b = (a + b - c) - b = a - c */
/* pcv = p - c = (a + b - c) - c = (a - c) + (b - c) = pav + pbv */
/* pa = abs(p-a) = abs(pav) */
/* pb = abs(p-b) = abs(pbv) */
/* pc = abs(p-c) = abs(pcv) */
pcmpgtw mm0, mm4            /* Create mask pav bytes < 0 */
pand mm0, mm4               /* Only pav bytes < 0 in mm0 */
pcmpgtw mm7, mm5            /* Create mask pbv bytes < 0 */
pand mm7, mm5               /* Only pbv bytes < 0 in mm7 */
pcmpgtw mm0, mm6            /* Create mask pcv bytes < 0 */
pand mm0, mm6               /* Only pcv bytes < 0 in mm0 */
pcmpgtw mm7, mm5            /* pa > pb? */
/* use mm7 mask to merge pa & pb */
/* use mm0 mask copy to merge a & b */
/* test ((pa <= pb)? pa:pb) <= pc */
pcmpgtw mm7, mm6            /* pab > pc? */
movq mm3, [esi+ebx-8]       /* read c=Prior(x-bpp) bytes */
pand mm7, ActiveMask
movq mm2, [esi + ebx]       /* load b=Prior(x) */
paddb mm7, [edi + ebx]      /* add Paeth predictor with Raw(x) */
punpckhbw mm3, mm0          /* Unpack High bytes of c */
movq [edi + ebx], mm7       /* write back updated value */
movq mm1, [edi+ebx-8]       /* read a=Raw(x-bpp) bytes */
/* Do second set of 4 bytes */
punpckhbw mm2, mm0          /* Unpack High bytes of b */
punpckhbw mm1, mm0          /* Unpack High bytes of a */
/* pav = p - a = (a + b - c) - a = b - c */
/* pbv = p - b = (a + b - c) - b = a - c */
/* pcv = p - c = (a + b - c) - c = (a - c) + (b - c) = pav + pbv */
/* pa = abs(p-a) = abs(pav) */
/* pb = abs(p-b) = abs(pbv) */
/* pc = abs(p-c) = abs(pcv) */
pcmpgtw mm0, mm4            /* Create mask pav bytes < 0 */
pand mm0, mm4               /* Only pav bytes < 0 in mm0 */
pcmpgtw mm7, mm5            /* Create mask pbv bytes < 0 */
pand mm7, mm5               /* Only pbv bytes < 0 in mm7 */
pcmpgtw mm0, mm6            /* Create mask pcv bytes < 0 */
pand mm0, mm6               /* Only pcv bytes < 0 in mm0 */
pcmpgtw mm7, mm5            /* pa > pb? */
/* use mm7 mask to merge pa & pb */
/* use mm0 mask copy to merge a & b */
/* test ((pa <= pb)? pa:pb) <= pc */
pcmpgtw mm7, mm6            /* pab > pc? */
/* Step ebx to next set of 8 bytes and repeat loop til done */
paddb mm1, [edi + ebx - 8]  /* add Paeth predictor with Raw(x) */
movq [edi + ebx - 8], mm1   /* write back updated value */
/* mm1 will be used as Raw(x-bpp) next loop */
} /* end _asm block */
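/* Per byte, every bpp variant above computes the same recurrence; a    */
/* sketch using the paeth_predictor() helper sketched earlier ("prev"   */
/* stands for the prior row; the first bpp bytes, where a and c are 0,  */
/* are handled separately):                                             */
/* for (x = bpp; x < rowbytes; x++)                                     */
/*    row[x] = (png_byte)((row[x] + paeth_predictor(row[x - bpp],       */
/*                         prev[x], prev[x - bpp])) & 0xff);            */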
case 1: /* bpp = 1 */
case 2: /* bpp = 2 */
default: /* bpp > 8 */
/* Do Paeth decode for remaining bytes */
xor ecx, ecx            /* zero ecx before using cl & cx in loop below */
sub edx, bpp            /* Set edx = ebx - bpp */
/* pav = p - a = (a + b - c) - a = b - c */
mov al, [esi + ebx]     /* load Prior(x) into al */
mov cl, [esi + edx]     /* load Prior(x-bpp) into cl */
sub eax, ecx            /* subtract Prior(x-bpp) */
mov patemp, eax         /* Save pav for later use */
/* pbv = p - b = (a + b - c) - b = a - c */
mov al, [edi + edx]     /* load Raw(x-bpp) into al */
sub eax, ecx            /* subtract Prior(x-bpp) */
/* pcv = p - c = (a + b - c) - c = (a - c) + (b - c) = pav + pbv */
add eax, patemp         /* pcv = pav + pbv */
test eax, 0x80000000
neg eax                 /* reverse sign of neg values */
mov pctemp, eax         /* save pc for later use */
test ecx, 0x80000000
neg ecx                 /* reverse sign of neg values */
mov pbtemp, ecx         /* save pb for later use */
test eax, 0x80000000
neg eax                 /* reverse sign of neg values */
mov patemp, eax         /* save pa for later use */
/* test if pa <= pb */
/* pa > pb; now test if pb <= pc */
/* pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp) */
mov cl, [esi + edx]     /* load Prior(x-bpp) into cl */
/* pb <= pc; Raw(x) = Paeth(x) + Prior(x) */
mov cl, [esi + ebx]     /* load Prior(x) into cl */
/* pa <= pb; now test if pa <= pc */
/* pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp) */
mov cl, [esi + edx]     /* load Prior(x-bpp) into cl */
/* pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp) */
mov cl, [edi + edx]     /* load Raw(x-bpp) into cl */
/* Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256 */
add [edi + ebx - 1], cl
} /* end _asm block */
return;                 /* No need to go further with this one */
} /* end switch ( bpp ) */
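/* The test/neg pairs above (their conditional jumps are not shown in   */
/* this listing) are conditional-negate absolute values; roughly, in C: */
/* pa = (pav & 0x80000000) ? -pav : pav;  -- test eax,0x80000000 / neg  */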
/* MMX acceleration complete; now do clean-up */
/* Check if any remaining bytes are left to decode */
/* Do Paeth decode for remaining bytes */
xor ecx, ecx            /* zero ecx before using cl & cx in loop below */
sub edx, bpp            /* Set edx = ebx - bpp */
/* pav = p - a = (a + b - c) - a = b - c */
mov al, [esi + ebx]     /* load Prior(x) into al */
mov cl, [esi + edx]     /* load Prior(x-bpp) into cl */
sub eax, ecx            /* subtract Prior(x-bpp) */
mov patemp, eax         /* Save pav for later use */
/* pbv = p - b = (a + b - c) - b = a - c */
mov al, [edi + edx]     /* load Raw(x-bpp) into al */
sub eax, ecx            /* subtract Prior(x-bpp) */
/* pcv = p - c = (a + b - c) - c = (a - c) + (b - c) = pav + pbv */
add eax, patemp         /* pcv = pav + pbv */
test eax, 0x80000000
neg eax                 /* reverse sign of neg values */
mov pctemp, eax         /* save pc for later use */
test ecx, 0x80000000
neg ecx                 /* reverse sign of neg values */
mov pbtemp, ecx         /* save pb for later use */
test eax, 0x80000000
neg eax                 /* reverse sign of neg values */
mov patemp, eax         /* save pa for later use */
/* test if pa <= pb */
/* pa > pb; now test if pb <= pc */
/* pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp) */
mov cl, [esi + edx]     /* load Prior(x-bpp) into cl */
/* pb <= pc; Raw(x) = Paeth(x) + Prior(x) */
mov cl, [esi + ebx]     /* load Prior(x) into cl */
/* pa <= pb; now test if pa <= pc */
/* pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp) */
mov cl, [esi + edx]     /* load Prior(x-bpp) into cl */
/* pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp) */
mov cl, [edi + edx]     /* load Raw(x-bpp) into cl */
/* Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256 */
add [edi + ebx - 1], cl
emms                    /* End MMX instructions; prep for possible FP instrs. */
} /* end _asm block */
/* Optimized code for PNG Sub filter decoder */
png_read_filter_row_mmx_sub(png_row_infop row_info, png_bytep row)
png_uint_32 FullLength;
png_uint_32 MMXLength;
bpp = (row_info->pixel_depth + 7) >> 3;   /* Get # bytes per pixel */
FullLength = row_info->rowbytes - bpp;    /* # of bytes to filter */
mov esi, edi            /* lp = row */
add edi, bpp            /* rp = row + bpp */
/* get # of bytes to alignment */
mov diff, edi           /* take start of row */
add diff, 0xf           /* add 7 + 8 to incr past */
                        /* alignment boundary */
and diff, 0xfffffff8    /* mask to alignment boundary */
sub diff, edi           /* subtract from start ==> value */
                        /* ebx at alignment */
sub edx, ebx            /* subtract alignment fix */
and edx, 0x00000007     /* calc bytes over mult of 8 */
sub ecx, edx            /* drop over bytes from length */
} /* end _asm block */
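/* A C model of the alignment computation above (pointer arithmetic     */
/* written out for illustration; diff lands between 8 and 15 because    */
/* 0xf is added before masking, and the registers holding FullLength    */
/* are not shown in this listing):                                      */
/* diff = (int)((((png_uint_32)row + bpp + 0xf) & 0xfffffff8)           */
/*              - ((png_uint_32)row + bpp));                            */
/* MMXLength = FullLength - ((FullLength - diff) & 0x00000007);         */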
/* Now do the math for the rest of the row */
ActiveMask.use = 0x0000ffffff000000;
ShiftBpp.use = 24;    /* == 3 * 8 */
ShiftRem.use = 40;    /* == 64 - 24 */
movq mm7, ActiveMask    /* Load ActiveMask for 2nd active byte group */
mov esi, edi            /* lp = row */
add edi, bpp            /* rp = row + bpp */
psllq mm6, ShiftBpp     /* Move mask in mm6 to cover 3rd active byte group */
/* PRIME the pump (load the first Raw(x-bpp) data set) */
movq mm1, [edi+ebx-8]
psrlq mm1, ShiftRem     /* Shift data for adding 1st bpp bytes */
                        /* no need for mask; shift clears inactive bytes */
/* Add 1st active group */
/* Add 2nd active group */
movq mm1, mm0           /* mov updated Raws to mm1 */
psllq mm1, ShiftBpp     /* shift data to position correctly */
pand mm1, mm7           /* mask to use only 2nd active group */
/* Add 3rd active group */
movq mm1, mm0           /* mov updated Raws to mm1 */
psllq mm1, ShiftBpp     /* shift data to position correctly */
pand mm1, mm6           /* mask to use only 3rd active group */
movq [edi+ebx-8], mm0   /* Write updated Raws back to array */
/* Prep for doing 1st add at top of loop */
} /* end _asm block */
/* Placed here just in case this is a duplicate of the */
/* non-MMX code for the SUB filter in png_read_filter_row below */
/* png_uint_32 i; */
/* bpp = (row_info->pixel_depth + 7) >> 3; */
/* for (i = (png_uint_32)bpp, rp = row + bpp, lp = row; */
/*    i < row_info->rowbytes; i++, rp++, lp++) */
/*    *rp = (png_byte)(((int)(*rp) + (int)(*lp)) & 0xff); */
mov esi, edi            /* lp = row */
add edi, bpp            /* rp = row + bpp */
} /* end _asm block */
ShiftBpp.use = bpp << 3;
ShiftRem.use = 64 - ShiftBpp.use;
mov esi, edi            /* lp = row */
add edi, bpp            /* rp = row + bpp */
/* PRIME the pump (load the first Raw(x-bpp) data set) */
movq mm1, [edi+ebx-8]
psrlq mm1, ShiftRem     /* Shift data for adding 1st bpp bytes */
                        /* no need for mask; shift clears inactive bytes */
/* Add 2nd active group */
movq mm1, mm0           /* mov updated Raws to mm1 */
psllq mm1, ShiftBpp     /* shift data to position correctly */
                        /* there is no need for any mask */
                        /* since shift clears inactive bits/bytes */
movq [edi+ebx-8], mm0
movq mm1, mm0           /* Prep for doing 1st add at top of loop */
} /* end _asm block */
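/* No pand is needed in this case because the left shift itself zeroes  */
/* the bytes below the active group; as a 64-bit sketch (ShiftBpp ==    */
/* bpp * 8, as set above; q1/q2 are illustrative names):                */
/* q2 = q1 << ShiftBpp;  -- the low bpp bytes shift out as zeros, so    */
/*                          they cannot disturb the following paddb     */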
ActiveMask.use = 0x00000000ffff0000;
ShiftBpp.use = 16;    /* == 2 * 8 */
ShiftRem.use = 48;    /* == 64 - 16 */
movq mm7, ActiveMask    /* Load ActiveMask for 2nd active byte group */
psllq mm6, ShiftBpp     /* Move mask in mm6 to cover 3rd active byte group */
mov esi, edi            /* lp = row */
add edi, bpp            /* rp = row + bpp */
psllq mm5, ShiftBpp     /* Move mask in mm5 to cover 4th active byte group */
/* PRIME the pump (load the first Raw(x-bpp) data set) */
movq mm1, [edi+ebx-8]
/* Add 1st active group */
psrlq mm1, ShiftRem     /* Shift data for adding 1st bpp bytes */
                        /* no need for mask; shift clears inactive bytes */
/* Add 2nd active group */
movq mm1, mm0           /* mov updated Raws to mm1 */
psllq mm1, ShiftBpp     /* shift data to position correctly */
pand mm1, mm7           /* mask to use only 2nd active group */
/* Add 3rd active group */
movq mm1, mm0           /* mov updated Raws to mm1 */
psllq mm1, ShiftBpp     /* shift data to position correctly */
pand mm1, mm6           /* mask to use only 3rd active group */
/* Add 4th active group */
movq mm1, mm0           /* mov updated Raws to mm1 */
psllq mm1, ShiftBpp     /* shift data to position correctly */
pand mm1, mm5           /* mask to use only 4th active group */
movq [edi+ebx-8], mm0   /* Write updated Raws back to array */
movq mm1, mm0           /* Prep for doing 1st add at top of loop */
} /* end _asm block */
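/* With bpp == 2 a qword spans 8/2 = 4 pixel groups, so three shifted,  */
/* masked adds follow the initial one.  The extra masks come from       */
/* shifting ActiveMask, matching the psllq mm6/mm5 setup above          */
/* (constants written out as a sketch):                                 */
/* mask2nd = 0x00000000ffff0000;  -- ActiveMask as loaded               */
/* mask3rd = mask2nd << 16;       -- == 0x0000ffff00000000              */
/* mask4th = mask3rd << 16;       -- == 0xffff000000000000              */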
mov esi, edi            /* lp = row */
add edi, bpp            /* rp = row + bpp */
movq mm7, [edi+ebx-8]   /* PRIME the pump (load the first */
                        /* Raw(x-bpp) data set) */
and ecx, 0x0000003f     /* calc bytes over mult of 64 */
movq mm0, [edi+ebx]     /* Load Sub(x) for 1st 8 bytes */
movq mm1, [edi+ebx+8]   /* Load Sub(x) for 2nd 8 bytes */
movq [edi+ebx], mm0     /* Write Raw(x) for 1st 8 bytes */
/* Now mm0 will be used as Raw(x-bpp) for */
/* the 2nd group of 8 bytes. This will be */
/* repeated for each group of 8 bytes with */
/* the 8th group being used as the Raw(x-bpp) */
/* for the 1st group of the next loop. */
movq mm2, [edi+ebx+16]  /* Load Sub(x) for 3rd 8 bytes */
movq [edi+ebx+8], mm1   /* Write Raw(x) for 2nd 8 bytes */
movq mm3, [edi+ebx+24]  /* Load Sub(x) for 4th 8 bytes */
movq [edi+ebx+16], mm2  /* Write Raw(x) for 3rd 8 bytes */
movq mm4, [edi+ebx+32]  /* Load Sub(x) for 5th 8 bytes */
movq [edi+ebx+24], mm3  /* Write Raw(x) for 4th 8 bytes */
movq mm5, [edi+ebx+40]  /* Load Sub(x) for 6th 8 bytes */
movq [edi+ebx+32], mm4  /* Write Raw(x) for 5th 8 bytes */
movq mm6, [edi+ebx+48]  /* Load Sub(x) for 7th 8 bytes */
movq [edi+ebx+40], mm5  /* Write Raw(x) for 6th 8 bytes */
movq mm7, [edi+ebx+56]  /* Load Sub(x) for 8th 8 bytes */
movq [edi+ebx+48], mm6  /* Write Raw(x) for 7th 8 bytes */
movq [edi+ebx-8], mm7   /* Write Raw(x) for 8th 8 bytes */
movq [edi+ebx-8], mm0   /* use -8 to offset early add to ebx */
movq mm7, mm0           /* Move calculated Raw(x) data to mm7 to */
                        /* be the new Raw(x-bpp) for the next loop */
} /* end _asm block */
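/* With bpp == 8 each byte depends only on the byte eight positions     */
/* earlier, so a whole qword can be updated with one paddb.  A          */
/* byte-wise C model, ignoring the tail bytes that the cleanup code     */
/* below handles:                                                       */
/* for (x = 8; x + 8 <= rowbytes; x += 8)                               */
/*    for (k = 0; k < 8; k++)                                           */
/*       row[x + k] = (png_byte)(row[x + k] + row[x + k - 8]);          */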
default: /* bpp greater than 8 bytes */
mov esi, edi            /* lp = row */
add edi, bpp            /* rp = row + bpp */
movq [edi+ebx-8], mm0   /* mov does not affect flags; -8 to offset add ebx */
} /* end _asm block */
} /* end switch ( bpp ) */
mov esi, edi            /* lp = row */
add edi, bpp            /* rp = row + bpp */
emms                    /* End MMX instructions; prep for possible FP instrs. */
} /* end _asm block */
/* Optimized code for PNG Up filter decoder */
png_read_filter_row_mmx_up(png_row_infop row_info, png_bytep row,
   png_bytep prev_row)
len = row_info->rowbytes;   /* # of bytes to filter */
/* get # of bytes to alignment */
mov [edi + ebx - 1], al /* mov does not affect flags; -1 to offset inc ebx */
sub edx, ebx            /* subtract alignment fix */
and edx, 0x0000003f     /* calc bytes over mult of 64 */
sub ecx, edx            /* drop over bytes from length */
/* Unrolled loop - use all MMX registers and interleave to reduce */
/* number of branch instructions (loops) and reduce partial stalls */
movq mm3, [esi+ebx+8]
movq mm2, [edi+ebx+8]
movq mm5, [esi+ebx+16]
movq [edi+ebx+8], mm2
movq mm4, [edi+ebx+16]
movq mm7, [esi+ebx+24]
movq mm6, [edi+ebx+24]
movq [edi+ebx+16], mm4
movq mm1, [esi+ebx+32]
movq [edi+ebx+24], mm6
movq mm0, [edi+ebx+32]
movq mm3, [esi+ebx+40]
movq mm2, [edi+ebx+40]
movq [edi+ebx+32], mm0
movq mm5, [esi+ebx+48]
movq [edi+ebx+40], mm2
movq mm4, [edi+ebx+48]
movq mm7, [esi+ebx+56]
movq mm6, [edi+ebx+56]
movq [edi+ebx+48], mm4
movq [edi+ebx-8], mm6   /* (+56)movq does not affect flags; */
                        /* -8 to offset add ebx */
cmp edx, 0              /* Test for bytes over mult of 64 */
/* 2 lines added by lcreeve@netins.net */
/* (mail 11 Jul 98 in png-implement list) */
cmp edx, 8              /* test for less than 8 bytes */
and edx, 0x00000007     /* calc bytes over mult of 8 */
sub ecx, edx            /* drop over bytes from length */
/* Loop using MMX registers mm0 & mm1 to update 8 bytes simultaneously */
movq [edi+ebx-8], mm0   /* movq does not affect flags; -8 to offset add ebx */
cmp edx, 0              /* Test for bytes over mult of 8 */
add ecx, edx            /* move over byte count into counter */
/* Loop using x86 registers to update remaining bytes */
mov [edi + ebx - 1], al /* mov does not affect flags; -1 to offset inc ebx */
/* Conversion of filtered row completed */
emms                    /* End MMX instructions; prep for possible FP instrs. */
} /* end _asm block */
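/* All of the interleaved loads and stores above implement the simple   */
/* Up recurrence; the scalar reference (the same logic serves as the C  */
/* fallback in png_read_filter_row below):                              */
/* for (x = 0; x < rowbytes; x++)                                       */
/*    row[x] = (png_byte)(((int)row[x] + (int)prev_row[x]) & 0xff);     */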
/* Optimized png_read_filter_row routines */
png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep
   row, png_bytep prev_row, int filter)
if (mmx_supported == 2) {
   /* this should have happened in png_init_mmx_flags() already */
   png_warning(png_ptr, "asm_flags may not have been initialized");
png_debug(1, "in png_read_filter_row\n");
case 0: sprintf(filnm, "none");
case 1: sprintf(filnm, "sub-%s",
   (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB)? "MMX" : "x86");
case 2: sprintf(filnm, "up-%s",
   (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP)? "MMX" : "x86");
case 3: sprintf(filnm, "avg-%s",
   (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG)? "MMX" : "x86");
case 4: sprintf(filnm, "Paeth-%s",
   (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH)? "MMX" : "x86");
default: sprintf(filnm, "unknw");
png_debug2(0, "row=%5d, %s, ", png_ptr->row_number, filnm);
png_debug2(0, "pd=%2d, b=%d, ", (int)row_info->pixel_depth,
   (int)((row_info->pixel_depth + 7) >> 3));
png_debug1(0, "len=%8d, ", row_info->rowbytes);
#endif /* PNG_DEBUG */
case PNG_FILTER_VALUE_NONE:
case PNG_FILTER_VALUE_SUB:
   if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB) &&
       (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
       (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
      png_read_filter_row_mmx_sub(row_info, row);
png_uint_32 istop = row_info->rowbytes;
png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
png_bytep rp = row + bpp;
for (i = bpp; i < istop; i++)
   *rp = (png_byte)(((int)(*rp) + (int)(*lp++)) & 0xff);
case PNG_FILTER_VALUE_UP:
   if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP) &&
       (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
       (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
      png_read_filter_row_mmx_up(row_info, row, prev_row);
png_uint_32 istop = row_info->rowbytes;
png_bytep pp = prev_row;
for (i = 0; i < istop; ++i)
   *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
case PNG_FILTER_VALUE_AVG:
   if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG) &&
       (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
       (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
      png_read_filter_row_mmx_avg(row_info, row, prev_row);
png_bytep pp = prev_row;
png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
png_uint_32 istop = row_info->rowbytes - bpp;
for (i = 0; i < bpp; i++)
   *rp = (png_byte)(((int)(*rp) +
      ((int)(*pp++) >> 1)) & 0xff);
for (i = 0; i < istop; i++)
   *rp = (png_byte)(((int)(*rp) +
      ((int)(*pp++ + *lp++) >> 1)) & 0xff);
case PNG_FILTER_VALUE_PAETH:
   if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH) &&
       (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
       (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
      png_read_filter_row_mmx_paeth(row_info, row, prev_row);
png_bytep pp = prev_row;
png_bytep cp = prev_row;
png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
png_uint_32 istop = row_info->rowbytes - bpp;
for (i = 0; i < bpp; i++)
   *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
for (i = 0; i < istop; i++)   /* use leftover rp,pp */
int a, b, c, pa, pb, pc, p;
pa = p < 0 ? -p : p;
pb = pc < 0 ? -pc : pc;
pc = (p + pc) < 0 ? -(p + pc) : p + pc;
/* if (pa <= pb && pa <= pc) p = a; else if (pb <= pc) p = b;
   else p = c;  -- branching form of the selection below */
p = (pa <= pb && pa <= pc) ? a : (pb <= pc) ? b : c;
*rp = (png_byte)(((int)(*rp) + p) & 0xff);
png_warning(png_ptr, "Ignoring bad row filter type");
#endif /* PNG_ASSEMBLER_CODE_SUPPORTED && PNG_USE_PNGVCRD */