1 /* pnggccrd.c - mixed C/assembler version of utilities to read a PNG file
3 * For Intel x86 CPU (Pentium-MMX or later) and GNU C compiler.
5 * See http://www.intel.com/drg/pentiumII/appnotes/916/916.htm
6 * and http://www.intel.com/drg/pentiumII/appnotes/923/923.htm
7 * for Intel's performance analysis of the MMX vs. non-MMX code.
9 * libpng version 1.2.5rc3 - September 18, 2002
10 * For conditions of distribution and use, see copyright notice in png.h
11 * Copyright (c) 1998-2002 Glenn Randers-Pehrson
12 * Copyright (c) 1998, Intel Corporation
14 * Based on MSVC code contributed by Nirav Chhatrapati, Intel Corp., 1998.
15 * Interface to libpng contributed by Gilles Vollant, 1999.
16 * GNU C port by Greg Roelofs, 1999-2001.
18 * Lines 2350-4300 converted in place with intel2gas 1.3.1:
20 * intel2gas -mdI pnggccrd.c.partially-msvc -o pnggccrd.c
22 * and then cleaned up by hand. See http://hermes.terminal.at/intel2gas/ .
24 * NOTE: A sufficiently recent version of GNU as (or as.exe under DOS/Windows)
25 * is required to assemble the newer MMX instructions such as movq.
28 * For djgpp, see ftp://ftp.simtel.net/pub/simtelnet/gnu/djgpp/v2gnu/bnu281b.zip
30 * (or a later version in the same directory). For Linux, check your
31 * distribution's web site(s) or try these links:
33 * http://rufus.w3.org/linux/RPM/binutils.html
34 * http://www.debian.org/Packages/stable/devel/binutils.html
35 * ftp://ftp.slackware.com/pub/linux/slackware/slackware/slakware/d1/
38 * For other platforms, see the main GNU site:
40 * ftp://ftp.gnu.org/pub/gnu/binutils/
42 * Version 2.5.2l.15 is definitely too old...
46 * TEMPORARY PORTING NOTES AND CHANGELOG (mostly by Greg Roelofs)
47 * =====================================
50 * - fixed sign error in post-MMX cleanup code (16- & 32-bit cases)
53 * - additional optimizations (possible or definite):
54 * x [DONE] write MMX code for 64-bit case (pixel_bytes == 8) [not tested]
55 * - write MMX code for 48-bit case (pixel_bytes == 6)
56 * - figure out what's up with 24-bit case (pixel_bytes == 3):
57 * why subtract 8 from width_mmx in the pass 4/5 case?
58 * (only width_mmx case) (near line 1606)
59 * x [DONE] replace pixel_bytes within each block with the true
60 * constant value (or are compilers smart enough to do that?)
61 * - rewrite all MMX interlacing code so it's aligned with
62 * the *beginning* of the row buffer, not the end. This
63 * would not only allow one to eliminate half of the memory
64 * writes for odd passes (that is, pass == odd), it may also
65 * eliminate some unaligned-data-access exceptions (assuming
66 * there's a penalty for not aligning 64-bit accesses on
67 * 64-bit boundaries). The only catch is that the "leftover"
68 * pixel(s) at the end of the row would have to be saved,
69 * but there are enough unused MMX registers in every case,
70 * so this is not a problem. A further benefit is that the
71 * post-MMX cleanup code (C code) in at least some of the
72 * cases could be done within the assembler block.
73 * x [DONE] the "v3 v2 v1 v0 v7 v6 v5 v4" comments are confusing,
74 * inconsistent, and don't match the MMX Programmer's Reference
75 * Manual conventions anyway. They should be changed to
76 * "b7 b6 b5 b4 b3 b2 b1 b0," where b0 indicates the byte that
77 * was lowest in memory (e.g., corresponding to a left pixel)
78 * and b7 is the byte that was highest (e.g., a right pixel).
81 * - Brennan's Guide notwithstanding, gcc under Linux does *not*
82 * want globals prefixed by underscores when referencing them--
83 * i.e., if the variable is const4, then refer to it as const4,
84 * not _const4. This seems to be a djgpp-specific requirement.
85 * Also, such variables apparently *must* be declared outside
86 * of functions; neither static nor automatic variables work if
87 * defined within the scope of a single function, but both
88 * static and truly global (multi-module) variables work fine.
91 * - fixed png_combine_row() non-MMX replication bug (odd passes only?)
92 * - switched from string-concatenation-with-macros to cleaner method of
93 * renaming global variables for djgpp--i.e., always use prefixes in
94 * inlined assembler code (== strings) and conditionally rename the
95 * variables, not the other way around. Hence _const4, _mask8_0, etc.
98 * - fixed mmxsupport()/png_do_read_interlace() first-row bug
99 * This one was severely weird: even though mmxsupport() doesn't touch
100 * ebx (where "row" pointer was stored), it nevertheless managed to zero
101 * the register (even in static/non-fPIC code--see below), which in turn
102 * caused png_do_read_interlace() to return prematurely on the first row of
103 * interlaced images (i.e., without expanding the interlaced pixels).
104 * Inspection of the generated assembly code didn't turn up any clues,
105 * although it did point at a minor optimization (i.e., get rid of
106 * mmx_supported_local variable and just use eax). Possibly the CPUID
107 * instruction is more destructive than it looks? (Not yet checked.)
108 * - "info gcc" was next to useless, so compared fPIC and non-fPIC assembly
109 * listings... Apparently register spillage has to do with ebx, since
110 * it's used to index the global offset table. Commenting it out of the
111 * input-reg lists in png_combine_row() eliminated compiler barfage, so
112 * ifdef'd with __PIC__ macro: if defined, use a global for unmask
115 * - verified CPUID clobberage: 12-char string constant ("GenuineIntel",
116 * "AuthenticAMD", etc.) placed in ebx:ecx:edx. Still need to polish.
119 * - made "diff" variable (now "_dif") global to simplify conversion of
120 * filtering routines (running out of regs, sigh). "diff" is still used
121 * in interlacing routines, however.
122 * - fixed up both versions of mmxsupport() (ORIG_THAT_USED_TO_CLOBBER_EBX
123 * macro determines which is used); original not yet tested.
126 * - when compiling with gcc, be sure to use -fomit-frame-pointer
129 * - fixed a register-name typo in png_do_read_interlace(), default (MMX) case,
130 * pass == 4 or 5, that caused visible corruption of interlaced images
133 * - Various problems were reported with gcc 2.95.2 in the Cygwin environment,
134 * many of the form "forbidden register 0 (ax) was spilled for class AREG."
135 * This is explained at http://gcc.gnu.org/fom_serv/cache/23.html, and
136 * Chuck Wilson supplied a patch involving dummy output registers. See
137 * http://sourceforge.net/bugs/?func=detailbug&bug_id=108741&group_id=5624
138 * for the original (anonymous) SourceForge bug report.
141 * - Chuck Wilson passed along these remaining gcc 2.95.2 errors:
142 * pnggccrd.c: In function `png_combine_row':
143 * pnggccrd.c:525: more than 10 operands in `asm'
144 * pnggccrd.c:669: more than 10 operands in `asm'
145 * pnggccrd.c:828: more than 10 operands in `asm'
146 * pnggccrd.c:994: more than 10 operands in `asm'
147 * pnggccrd.c:1177: more than 10 operands in `asm'
148 * They are all the same problem and can be worked around by using the
149 * global _unmask variable unconditionally, not just in the -fPIC case.
150 * Reportedly earlier versions of gcc also have the problem with more than
151 * 10 operands; they just don't report it. Much strangeness ensues, etc.
154 * - enabled png_read_filter_row_mmx_up() (shortest remaining unconverted
155 * MMX routine); began converting png_read_filter_row_mmx_sub()
156 * - to finish remaining sections:
157 * - clean up indentation and comments
158 * - preload local variables
159 * - add output and input regs (order of former determines numerical
 * mapping of latter)
161 * - avoid all usage of ebx (including bx, bh, bl) register [20000823]
162 * - remove "$" from addressing of Shift and Mask variables [20000823]
165 * - global union vars causing segfaults in png_read_filter_row_mmx_sub()?
168 * - ARGH, stupid png_read_filter_row_mmx_sub() segfault only happens with
169 * shared-library (-fPIC) version! Code works just fine as part of static
170 * library. Damn damn damn damn damn, should have tested that sooner.
171 * ebx is getting clobbered again (explicitly this time); need to save it
172 * on stack or rewrite asm code to avoid using it altogether. Blargh!
175 * - first section was trickiest; all remaining sections have ebx -> edx now.
176 * (-fPIC works again.) Also added missing underscores to various Shift*
177 * and *Mask* globals and got rid of leading "$" signs.
180 * - added visual separators to help navigate microscopic printed copies
181 * (http://pobox.com/~newt/code/gpr-latest.zip, mode 10); started working
182 * on png_read_filter_row_mmx_avg()
185 * - finished png_read_filter_row_mmx_avg(): only Paeth left! (930 lines...)
186 * What the hell, did png_read_filter_row_mmx_paeth(), too. Comments not
187 * cleaned up/shortened in either routine, but functionality is complete
188 * and seems to be working fine.
191 * - ahhh, figured out last(?) bit of gcc/gas asm-fu: if register is listed
192 * as an input reg (with dummy output variables, etc.), then it *cannot*
193 * also appear in the clobber list or gcc 2.95.2 will barf. The solution
194 * is simple enough...
197 * - bug in png_read_filter_row_mmx_avg(): 16-bit grayscale not handled
198 * correctly (but 48-bit RGB just fine)
201 * - fixed bug in png_read_filter_row_mmx_avg(), bpp == 2 case; three errors:
202 * - "_ShiftBpp.use = 24;" should have been "_ShiftBpp.use = 16;"
203 * - "_ShiftRem.use = 40;" should have been "_ShiftRem.use = 48;"
204 * - "psllq _ShiftRem, %%mm2" should have been "psrlq _ShiftRem, %%mm2"
207 * - added new png_init_mmx_flags() function (here only because it needs to
208 * call mmxsupport(), which should probably become global png_mmxsupport());
209 * modified other MMX routines to run conditionally (png_ptr->asm_flags)
212 * - renamed mmxsupport() to png_mmx_support(), with auto-set of mmx_supported,
213 * and made it public; moved png_init_mmx_flags() to png.c as internal func
216 * - removed dependency on png_read_filter_row_c() (C code already duplicated
217 * within MMX version of png_read_filter_row()) so no longer necessary to
218 * compile it into pngrutil.o
221 * - fixed buffer-overrun bug in png_combine_row() C code (non-MMX)
224 * - eliminated incorrect use of width_mmx in pixel_bytes == 8 case
227 * - test png_do_read_interlace() 64-bit case (pixel_bytes == 8)
228 * - write MMX code for 48-bit case (pixel_bytes == 6)
229 * - figure out what's up with 24-bit case (pixel_bytes == 3):
230 * why subtract 8 from width_mmx in the pass 4/5 case?
231 * (only width_mmx case) (near line 1606)
232 * - rewrite all MMX interlacing code so it's aligned with beginning
233 * of the row buffer, not the end (see 19991007 for details)
234 * x pick one version of mmxsupport() and get rid of the other
235 * - add error messages to any remaining bogus default cases
236 * - enable pixel_depth == 8 cases in png_read_filter_row()? (test speed)
237 * x add support for runtime enable/disable/query of various MMX routines
 */
243 #if defined(PNG_USE_PNGGCCRD)
245 int PNGAPI
png_mmx_support(void);
247 #ifdef PNG_USE_LOCAL_ARRAYS
248 static const int FARDATA png_pass_start
[7] = {0, 4, 0, 2, 0, 1, 0};
249 static const int FARDATA png_pass_inc
[7] = {8, 8, 4, 4, 2, 2, 1};
250 static const int FARDATA png_pass_width
[7] = {8, 4, 4, 2, 2, 1, 1};
253 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
254 /* djgpp, Win32, and Cygwin add their own underscores to global variables,
255 * so define them without: */
/* Symbol-name mapping for toolchains that prepend "_" to C globals.
 * The inline-assembler strings in this file always spell the globals with a
 * leading underscore (_const4, _mask8_0, ...).  On djgpp, Win32 and Cygwin
 * the toolchain adds that underscore itself, so each underscore-prefixed
 * name is redefined here to the bare identifier.
 * NOTE(review): the #endif lines that close these conditionals are not
 * visible in this excerpt -- confirm they are present in the full file.
 */
256 #if defined(__DJGPP__) || defined(WIN32) || defined(__CYGWIN__)
257 # define _mmx_supported mmx_supported
258 # define _const4 const4
259 # define _const6 const6
/* per-pixel-size byte-select masks used by png_combine_row() */
260 # define _mask8_0 mask8_0
261 # define _mask16_1 mask16_1
262 # define _mask16_0 mask16_0
263 # define _mask24_2 mask24_2
264 # define _mask24_1 mask24_1
265 # define _mask24_0 mask24_0
266 # define _mask32_3 mask32_3
267 # define _mask32_2 mask32_2
268 # define _mask32_1 mask32_1
269 # define _mask32_0 mask32_0
270 # define _mask48_5 mask48_5
271 # define _mask48_4 mask48_4
272 # define _mask48_3 mask48_3
273 # define _mask48_2 mask48_2
274 # define _mask48_1 mask48_1
275 # define _mask48_0 mask48_0
/* names used by the row-filter routines (see the changelog entries on
 * _ShiftBpp/_ShiftRem and the *Mask* globals) */
276 # define _LBCarryMask LBCarryMask
277 # define _HBClearMask HBClearMask
278 # define _ActiveMask ActiveMask
279 # define _ActiveMask2 ActiveMask2
280 # define _ActiveMaskEnd ActiveMaskEnd
281 # define _ShiftBpp ShiftBpp
282 # define _ShiftRem ShiftRem
/* writable scratch globals; only mapped when the thread-unsafe build
 * option is enabled */
283 #ifdef PNG_THREAD_UNSAFE_OK
284 # define _unmask unmask
285 # define _FullLength FullLength
286 # define _MMXLength MMXLength
288 # define _patemp patemp
289 # define _pbtemp pbtemp
290 # define _pctemp pctemp
295 /* These constants are used in the inlined MMX assembly code.
296 Ignore gcc's "At top level: defined but not used" warnings. */
298 /* GRR 20000706: originally _unmask was needed only when compiling with -fPIC,
299 * since that case uses the %ebx register for indexing the Global Offset Table
300 * and there were no other registers available. But gcc 2.95 and later emit
301 * "more than 10 operands in `asm'" errors when %ebx is used to preload unmask
302 * in the non-PIC case, so we'll just use the global unconditionally now.
 */
304 #ifdef PNG_THREAD_UNSAFE_OK
/* Byte-select masks for the MMX paths of png_combine_row().
 *
 * Every byte of a mask holds exactly one bit: 0x80 stands for the first
 * pixel of a group of eight, 0x40 for the second, ... 0x01 for the eighth.
 * For N-bit pixels each bit value is repeated N/8 times, so the masks
 * _maskN_0, _maskN_1, ... together describe 8 pixels (N bytes), with
 * _maskN_0 covering the first 8 bytes in memory.  x86 is little-endian, so
 * the least-significant byte of each constant corresponds to the lowest
 * address.
 *
 * The assembler replicates the low byte of _unmask (~mask) into all eight
 * bytes of an MMX register, ANDs it with one of these masks and compares
 * the result with zero (pcmpeqb).  That yields 0xFF in every byte belonging
 * to a pixel whose bit is set in the caller's mask, and 0x00 elsewhere;
 * the row is then merged as (src & m) | (dst & ~m).
 *
 * These must remain writable file-scope statics: the assembler refers to
 * them by symbol name, and png_squelch_warnings() assigns them to
 * themselves to silence gcc's "defined but not used" warnings.
 */

/* 8-bit pixels: 8 pixels in 8 bytes */
static unsigned long long _mask8_0  = 0x0102040810204080LL;

/* 16-bit pixels: 8 pixels in 16 bytes */
static unsigned long long _mask16_1 = 0x0101020204040808LL;
static unsigned long long _mask16_0 = 0x1010202040408080LL;

/* 24-bit pixels: 8 pixels in 24 bytes */
static unsigned long long _mask24_2 = 0x0101010202020404LL;
static unsigned long long _mask24_1 = 0x0408080810101020LL;
static unsigned long long _mask24_0 = 0x2020404040808080LL;

/* 32-bit pixels: 8 pixels in 32 bytes */
static unsigned long long _mask32_3 = 0x0101010102020202LL;
static unsigned long long _mask32_2 = 0x0404040408080808LL;
static unsigned long long _mask32_1 = 0x1010101020202020LL;
static unsigned long long _mask32_0 = 0x4040404080808080LL;

/* 48-bit pixels: 8 pixels in 48 bytes */
static unsigned long long _mask48_5 = 0x0101010101010202LL;
static unsigned long long _mask48_4 = 0x0202020204040404LL;
static unsigned long long _mask48_3 = 0x0404080808080808LL;
static unsigned long long _mask48_2 = 0x1010101010102020LL;
static unsigned long long _mask48_1 = 0x2020202040404040LL;
static unsigned long long _mask48_0 = 0x4040808080808080LL;

/* Low-24-bit and low-8-bit selectors used by other inlined MMX code (their
 * users are not in this part of the file).  A third constant, _const5
 * (0x000000FFFFFF0000), was NOT USED and is deliberately not defined.
 */
static unsigned long long _const4   = 0x0000000000FFFFFFLL;
static unsigned long long _const6   = 0x00000000000000FFLL;
333 /* These are used in the row-filter routines and should/would be local */
334 /* variables if not for gcc addressing limitations. */
335 /* WARNING: Their presence probably defeats the thread safety of libpng. */
337 #ifdef PNG_THREAD_UNSAFE_OK
338 static png_uint_32 _FullLength
;
339 static png_uint_32 _MMXLength
;
341 static int _patemp
; /* temp variables for Paeth routine */
/* png_squelch_warnings: assigns file-scope variables to themselves.
 * The variables are otherwise referenced only from inline-assembler
 * strings, which gcc does not count as a use; the self-assignments appear
 * to exist solely to suppress its "defined but not used" warnings.  The
 * visible statements have no other effect.
 * NOTE(review): the return type, the braces and some statements of this
 * function are not visible in this excerpt.
 */
347 png_squelch_warnings(void)
/* the row-filter scratch globals exist only in thread-unsafe builds */
349 #ifdef PNG_THREAD_UNSAFE_OK
354 _MMXLength
= _MMXLength
;
/* png_combine_row() byte-select masks, one group per pixel size */
359 _mask16_1
= _mask16_1
;
360 _mask16_0
= _mask16_0
;
361 _mask24_2
= _mask24_2
;
362 _mask24_1
= _mask24_1
;
363 _mask24_0
= _mask24_0
;
364 _mask32_3
= _mask32_3
;
365 _mask32_2
= _mask32_2
;
366 _mask32_1
= _mask32_1
;
367 _mask32_0
= _mask32_0
;
368 _mask48_5
= _mask48_5
;
369 _mask48_4
= _mask48_4
;
370 _mask48_3
= _mask48_3
;
371 _mask48_2
= _mask48_2
;
372 _mask48_1
= _mask48_1
;
373 _mask48_0
= _mask48_0
;
375 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
/* MMX availability flag.  The initial value 2 is a sentinel meaning "not
 * determined yet": png_combine_row() warns that asm_flags may not have been
 * initialized when it still sees 2.
 * NOTE(review): presumably png_mmx_support() later sets this to 0 or 1
 * (the changelog mentions an "auto-set of mmx_supported") -- confirm.
 */
static int _mmx_supported = 2;
380 /*===========================================================================*/
382 /* P N G _ C O M B I N E _ R O W */
384 /*===========================================================================*/
386 #if defined(PNG_HAVE_ASSEMBLER_COMBINE_ROW)
/* Bytes per pixel (a.k.a. pixel_bytes) for the individual
 * png_combine_row() cases.  Named constants are used only to help avoid
 * cut-and-paste errors between the near-identical per-depth code blocks.
 */
#define BPP3  3
#define BPP6  6
394 /* Combines the row recently read in with the previous row.
395 This routine takes care of alpha and transparency if requested.
396 This routine also handles the two methods of progressive display
397 of interlaced images, depending on the mask value.
398 The mask value describes which pixels are to be combined with
399 the row. The pattern always repeats every 8 pixels, so just 8
400 bits are needed. A one indicates the pixel is to be combined; a
401 zero indicates the pixel is to be skipped. This is in addition
402 to any alpha or transparency value associated with the pixel.
403 If you want all pixels to be combined, pass 0xff (255) in mask. */
405 /* Use this routine for the x86 platform - it uses a faster MMX routine
406 if the machine supports MMX. */
409 png_combine_row(png_structp png_ptr
, png_bytep row
, int mask
)
411 png_debug(1, "in png_combine_row (pnggccrd.c)\n");
413 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
414 if (_mmx_supported
== 2) {
415 /* this should have happened in png_init_mmx_flags() already */
416 png_warning(png_ptr
, "asm_flags may not have been initialized");
423 png_debug(2,"mask == 0xff: doing single png_memcpy()\n");
424 png_memcpy(row
, png_ptr
->row_buf
+ 1,
425 (png_size_t
)((png_ptr
->width
* png_ptr
->row_info
.pixel_depth
+ 7) >> 3));
427 else /* (png_combine_row() is never called with mask == 0) */
429 switch (png_ptr
->row_info
.pixel_depth
)
431 case 1: /* png_ptr->row_info.pixel_depth */
435 int s_inc
, s_start
, s_end
;
440 sp
= png_ptr
->row_buf
+ 1;
443 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
444 if (png_ptr
->transformations
& PNG_PACKSWAP
)
460 for (i
= 0; i
< png_ptr
->width
; i
++)
466 value
= (*sp
>> shift
) & 0x1;
467 *dp
&= (png_byte
)((0x7f7f >> (7 - shift
)) & 0xff);
468 *dp
|= (png_byte
)(value
<< shift
);
488 case 2: /* png_ptr->row_info.pixel_depth */
492 int s_start
, s_end
, s_inc
;
498 sp
= png_ptr
->row_buf
+ 1;
501 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
502 if (png_ptr
->transformations
& PNG_PACKSWAP
)
518 for (i
= 0; i
< png_ptr
->width
; i
++)
522 value
= (*sp
>> shift
) & 0x3;
523 *dp
&= (png_byte
)((0x3f3f >> (6 - shift
)) & 0xff);
524 *dp
|= (png_byte
)(value
<< shift
);
543 case 4: /* png_ptr->row_info.pixel_depth */
547 int s_start
, s_end
, s_inc
;
553 sp
= png_ptr
->row_buf
+ 1;
556 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
557 if (png_ptr
->transformations
& PNG_PACKSWAP
)
572 for (i
= 0; i
< png_ptr
->width
; i
++)
576 value
= (*sp
>> shift
) & 0xf;
577 *dp
&= (png_byte
)((0xf0f >> (4 - shift
)) & 0xff);
578 *dp
|= (png_byte
)(value
<< shift
);
597 case 8: /* png_ptr->row_info.pixel_depth */
602 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
603 #if !defined(PNG_1_0_X)
604 if ((png_ptr
->asm_flags
& PNG_ASM_FLAG_MMX_READ_COMBINE_ROW
)
605 /* && _mmx_supported */ )
612 int dummy_value_a
; /* fix 'forbidden register spilled' error */
617 _unmask
= ~mask
; /* global variable for -fPIC version */
618 srcptr
= png_ptr
->row_buf
+ 1;
620 len
= png_ptr
->width
&~7; /* reduce to multiple of 8 */
621 diff
= (int) (png_ptr
->width
& 7); /* amount lost */
623 __asm__
__volatile__ (
624 "movd _unmask, %%mm7 \n\t" /* load bit pattern */
625 "psubb %%mm6, %%mm6 \n\t" /* zero mm6 */
626 "punpcklbw %%mm7, %%mm7 \n\t"
627 "punpcklwd %%mm7, %%mm7 \n\t"
628 "punpckldq %%mm7, %%mm7 \n\t" /* fill reg with 8 masks */
630 "movq _mask8_0, %%mm0 \n\t"
631 "pand %%mm7, %%mm0 \n\t" /* nonzero if keep byte */
632 "pcmpeqb %%mm6, %%mm0 \n\t" /* zeros->1s, v versa */
634 /* preload "movl len, %%ecx \n\t" // load length of line */
635 /* preload "movl srcptr, %%esi \n\t" // load source */
636 /* preload "movl dstptr, %%edi \n\t" // load dest */
638 "cmpl $0, %%ecx \n\t" /* len == 0 ? */
639 "je mainloop8end \n\t"
642 "movq (%%esi), %%mm4 \n\t" /* *srcptr */
643 "pand %%mm0, %%mm4 \n\t"
644 "movq %%mm0, %%mm6 \n\t"
645 "pandn (%%edi), %%mm6 \n\t" /* *dstptr */
646 "por %%mm6, %%mm4 \n\t"
647 "movq %%mm4, (%%edi) \n\t"
648 "addl $8, %%esi \n\t" /* inc by 8 bytes processed */
649 "addl $8, %%edi \n\t"
650 "subl $8, %%ecx \n\t" /* dec by 8 pixels processed */
654 /* preload "movl diff, %%ecx \n\t" // (diff is in eax) */
655 "movl %%eax, %%ecx \n\t"
656 "cmpl $0, %%ecx \n\t"
658 /* preload "movl mask, %%edx \n\t" */
659 "sall $24, %%edx \n\t" /* make low byte, high byte */
662 "sall %%edx \n\t" /* move high bit to CF */
663 "jnc skip8 \n\t" /* if CF = 0 */
664 "movb (%%esi), %%al \n\t"
665 "movb %%al, (%%edi) \n\t"
671 "jnz secondloop8 \n\t"
674 "EMMS \n\t" /* DONE */
676 : "=a" (dummy_value_a
), /* output regs (dummy) */
677 "=d" (dummy_value_d
),
678 "=c" (dummy_value_c
),
679 "=S" (dummy_value_S
),
682 : "3" (srcptr
), /* esi // input regs */
683 "4" (dstptr
), /* edi */
684 "0" (diff
), /* eax */
685 /* was (unmask) "b" RESERVED // ebx // Global Offset Table idx */
689 #if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
690 : "%mm0", "%mm4", "%mm6", "%mm7" /* clobber list */
694 else /* mmx _not supported - Use modified C routine */
695 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
697 register png_uint_32 i
;
698 png_uint_32 initial_val
= png_pass_start
[png_ptr
->pass
];
699 /* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
700 register int stride
= png_pass_inc
[png_ptr
->pass
];
701 /* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
702 register int rep_bytes
= png_pass_width
[png_ptr
->pass
];
703 /* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
704 png_uint_32 len
= png_ptr
->width
&~7; /* reduce to mult. of 8 */
705 int diff
= (int) (png_ptr
->width
& 7); /* amount lost */
706 register png_uint_32 final_val
= len
; /* GRR bugfix */
708 srcptr
= png_ptr
->row_buf
+ 1 + initial_val
;
709 dstptr
= row
+ initial_val
;
711 for (i
= initial_val
; i
< final_val
; i
+= stride
)
713 png_memcpy(dstptr
, srcptr
, rep_bytes
);
717 if (diff
) /* number of leftover pixels: 3 for pngtest */
719 final_val
+=diff
/* *BPP1 */ ;
720 for (; i
< final_val
; i
+= stride
)
722 if (rep_bytes
> (int)(final_val
-i
))
723 rep_bytes
= (int)(final_val
-i
);
724 png_memcpy(dstptr
, srcptr
, rep_bytes
);
730 } /* end of else (_mmx_supported) */
735 case 16: /* png_ptr->row_info.pixel_depth */
740 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
741 #if !defined(PNG_1_0_X)
742 if ((png_ptr
->asm_flags
& PNG_ASM_FLAG_MMX_READ_COMBINE_ROW
)
743 /* && _mmx_supported */ )
750 int dummy_value_a
; /* fix 'forbidden register spilled' error */
755 _unmask
= ~mask
; /* global variable for -fPIC version */
756 srcptr
= png_ptr
->row_buf
+ 1;
758 len
= png_ptr
->width
&~7; /* reduce to multiple of 8 */
759 diff
= (int) (png_ptr
->width
& 7); /* amount lost // */
761 __asm__
__volatile__ (
762 "movd _unmask, %%mm7 \n\t" /* load bit pattern */
763 "psubb %%mm6, %%mm6 \n\t" /* zero mm6 */
764 "punpcklbw %%mm7, %%mm7 \n\t"
765 "punpcklwd %%mm7, %%mm7 \n\t"
766 "punpckldq %%mm7, %%mm7 \n\t" /* fill reg with 8 masks */
768 "movq _mask16_0, %%mm0 \n\t"
769 "movq _mask16_1, %%mm1 \n\t"
771 "pand %%mm7, %%mm0 \n\t"
772 "pand %%mm7, %%mm1 \n\t"
774 "pcmpeqb %%mm6, %%mm0 \n\t"
775 "pcmpeqb %%mm6, %%mm1 \n\t"
777 /* preload "movl len, %%ecx \n\t" // load length of line */
778 /* preload "movl srcptr, %%esi \n\t" // load source */
779 /* preload "movl dstptr, %%edi \n\t" // load dest */
781 "cmpl $0, %%ecx \n\t"
782 "jz mainloop16end \n\t"
785 "movq (%%esi), %%mm4 \n\t"
786 "pand %%mm0, %%mm4 \n\t"
787 "movq %%mm0, %%mm6 \n\t"
788 "movq (%%edi), %%mm7 \n\t"
789 "pandn %%mm7, %%mm6 \n\t"
790 "por %%mm6, %%mm4 \n\t"
791 "movq %%mm4, (%%edi) \n\t"
793 "movq 8(%%esi), %%mm5 \n\t"
794 "pand %%mm1, %%mm5 \n\t"
795 "movq %%mm1, %%mm7 \n\t"
796 "movq 8(%%edi), %%mm6 \n\t"
797 "pandn %%mm6, %%mm7 \n\t"
798 "por %%mm7, %%mm5 \n\t"
799 "movq %%mm5, 8(%%edi) \n\t"
801 "addl $16, %%esi \n\t" /* inc by 16 bytes processed */
802 "addl $16, %%edi \n\t"
803 "subl $8, %%ecx \n\t" /* dec by 8 pixels processed */
806 "mainloop16end: \n\t"
807 /* preload "movl diff, %%ecx \n\t" // (diff is in eax) */
808 "movl %%eax, %%ecx \n\t"
809 "cmpl $0, %%ecx \n\t"
811 /* preload "movl mask, %%edx \n\t" */
812 "sall $24, %%edx \n\t" /* make low byte, high byte */
815 "sall %%edx \n\t" /* move high bit to CF */
816 "jnc skip16 \n\t" /* if CF = 0 */
817 "movw (%%esi), %%ax \n\t"
818 "movw %%ax, (%%edi) \n\t"
821 "addl $2, %%esi \n\t"
822 "addl $2, %%edi \n\t"
824 "jnz secondloop16 \n\t"
827 "EMMS \n\t" /* DONE */
829 : "=a" (dummy_value_a
), /* output regs (dummy) */
830 "=c" (dummy_value_c
),
831 "=d" (dummy_value_d
),
832 "=S" (dummy_value_S
),
835 : "0" (diff
), /* eax // input regs */
836 /* was (unmask) " " RESERVED // ebx // Global Offset Table idx */
838 "2" (mask
), /* edx */
839 "3" (srcptr
), /* esi */
840 "4" (dstptr
) /* edi */
842 #if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
843 : "%mm0", "%mm1", "%mm4" /* clobber list */
844 , "%mm5", "%mm6", "%mm7"
848 else /* mmx _not supported - Use modified C routine */
849 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
851 register png_uint_32 i
;
852 png_uint_32 initial_val
= BPP2
* png_pass_start
[png_ptr
->pass
];
853 /* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
854 register int stride
= BPP2
* png_pass_inc
[png_ptr
->pass
];
855 /* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
856 register int rep_bytes
= BPP2
* png_pass_width
[png_ptr
->pass
];
857 /* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
858 png_uint_32 len
= png_ptr
->width
&~7; /* reduce to mult. of 8 */
859 int diff
= (int) (png_ptr
->width
& 7); /* amount lost */
860 register png_uint_32 final_val
= BPP2
* len
; /* GRR bugfix */
862 srcptr
= png_ptr
->row_buf
+ 1 + initial_val
;
863 dstptr
= row
+ initial_val
;
865 for (i
= initial_val
; i
< final_val
; i
+= stride
)
867 png_memcpy(dstptr
, srcptr
, rep_bytes
);
871 if (diff
) /* number of leftover pixels: 3 for pngtest */
873 final_val
+=diff
*BPP2
;
874 for (; i
< final_val
; i
+= stride
)
876 if (rep_bytes
> (int)(final_val
-i
))
877 rep_bytes
= (int)(final_val
-i
);
878 png_memcpy(dstptr
, srcptr
, rep_bytes
);
883 } /* end of else (_mmx_supported) */
888 case 24: /* png_ptr->row_info.pixel_depth */
893 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
894 #if !defined(PNG_1_0_X)
895 if ((png_ptr
->asm_flags
& PNG_ASM_FLAG_MMX_READ_COMBINE_ROW
)
896 /* && _mmx_supported */ )
903 int dummy_value_a
; /* fix 'forbidden register spilled' error */
908 _unmask
= ~mask
; /* global variable for -fPIC version */
909 srcptr
= png_ptr
->row_buf
+ 1;
911 len
= png_ptr
->width
&~7; /* reduce to multiple of 8 */
912 diff
= (int) (png_ptr
->width
& 7); /* amount lost // */
914 __asm__
__volatile__ (
915 "movd _unmask, %%mm7 \n\t" /* load bit pattern */
916 "psubb %%mm6, %%mm6 \n\t" /* zero mm6 */
917 "punpcklbw %%mm7, %%mm7 \n\t"
918 "punpcklwd %%mm7, %%mm7 \n\t"
919 "punpckldq %%mm7, %%mm7 \n\t" /* fill reg with 8 masks */
921 "movq _mask24_0, %%mm0 \n\t"
922 "movq _mask24_1, %%mm1 \n\t"
923 "movq _mask24_2, %%mm2 \n\t"
925 "pand %%mm7, %%mm0 \n\t"
926 "pand %%mm7, %%mm1 \n\t"
927 "pand %%mm7, %%mm2 \n\t"
929 "pcmpeqb %%mm6, %%mm0 \n\t"
930 "pcmpeqb %%mm6, %%mm1 \n\t"
931 "pcmpeqb %%mm6, %%mm2 \n\t"
933 /* preload "movl len, %%ecx \n\t" // load length of line */
934 /* preload "movl srcptr, %%esi \n\t" // load source */
935 /* preload "movl dstptr, %%edi \n\t" // load dest */
937 "cmpl $0, %%ecx \n\t"
938 "jz mainloop24end \n\t"
941 "movq (%%esi), %%mm4 \n\t"
942 "pand %%mm0, %%mm4 \n\t"
943 "movq %%mm0, %%mm6 \n\t"
944 "movq (%%edi), %%mm7 \n\t"
945 "pandn %%mm7, %%mm6 \n\t"
946 "por %%mm6, %%mm4 \n\t"
947 "movq %%mm4, (%%edi) \n\t"
949 "movq 8(%%esi), %%mm5 \n\t"
950 "pand %%mm1, %%mm5 \n\t"
951 "movq %%mm1, %%mm7 \n\t"
952 "movq 8(%%edi), %%mm6 \n\t"
953 "pandn %%mm6, %%mm7 \n\t"
954 "por %%mm7, %%mm5 \n\t"
955 "movq %%mm5, 8(%%edi) \n\t"
957 "movq 16(%%esi), %%mm6 \n\t"
958 "pand %%mm2, %%mm6 \n\t"
959 "movq %%mm2, %%mm4 \n\t"
960 "movq 16(%%edi), %%mm7 \n\t"
961 "pandn %%mm7, %%mm4 \n\t"
962 "por %%mm4, %%mm6 \n\t"
963 "movq %%mm6, 16(%%edi) \n\t"
965 "addl $24, %%esi \n\t" /* inc by 24 bytes processed */
966 "addl $24, %%edi \n\t"
967 "subl $8, %%ecx \n\t" /* dec by 8 pixels processed */
971 "mainloop24end: \n\t"
972 /* preload "movl diff, %%ecx \n\t" // (diff is in eax) */
973 "movl %%eax, %%ecx \n\t"
974 "cmpl $0, %%ecx \n\t"
976 /* preload "movl mask, %%edx \n\t" */
977 "sall $24, %%edx \n\t" /* make low byte, high byte */
980 "sall %%edx \n\t" /* move high bit to CF */
981 "jnc skip24 \n\t" /* if CF = 0 */
982 "movw (%%esi), %%ax \n\t"
983 "movw %%ax, (%%edi) \n\t"
984 "xorl %%eax, %%eax \n\t"
985 "movb 2(%%esi), %%al \n\t"
986 "movb %%al, 2(%%edi) \n\t"
989 "addl $3, %%esi \n\t"
990 "addl $3, %%edi \n\t"
992 "jnz secondloop24 \n\t"
995 "EMMS \n\t" /* DONE */
997 : "=a" (dummy_value_a
), /* output regs (dummy) */
998 "=d" (dummy_value_d
),
999 "=c" (dummy_value_c
),
1000 "=S" (dummy_value_S
),
1001 "=D" (dummy_value_D
)
1003 : "3" (srcptr
), /* esi // input regs */
1004 "4" (dstptr
), /* edi */
1005 "0" (diff
), /* eax */
1006 /* was (unmask) "b" RESERVED // ebx // Global Offset Table idx */
1007 "2" (len
), /* ecx */
1008 "1" (mask
) /* edx */
1010 #if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
1011 : "%mm0", "%mm1", "%mm2" /* clobber list */
1012 , "%mm4", "%mm5", "%mm6", "%mm7"
1016 else /* mmx _not supported - Use modified C routine */
1017 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
1019 register png_uint_32 i
;
1020 png_uint_32 initial_val
= BPP3
* png_pass_start
[png_ptr
->pass
];
1021 /* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
1022 register int stride
= BPP3
* png_pass_inc
[png_ptr
->pass
];
1023 /* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
1024 register int rep_bytes
= BPP3
* png_pass_width
[png_ptr
->pass
];
1025 /* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
1026 png_uint_32 len
= png_ptr
->width
&~7; /* reduce to mult. of 8 */
1027 int diff
= (int) (png_ptr
->width
& 7); /* amount lost */
1028 register png_uint_32 final_val
= BPP3
* len
; /* GRR bugfix */
1030 srcptr
= png_ptr
->row_buf
+ 1 + initial_val
;
1031 dstptr
= row
+ initial_val
;
1033 for (i
= initial_val
; i
< final_val
; i
+= stride
)
1035 png_memcpy(dstptr
, srcptr
, rep_bytes
);
1039 if (diff
) /* number of leftover pixels: 3 for pngtest */
1041 final_val
+=diff
*BPP3
;
1042 for (; i
< final_val
; i
+= stride
)
1044 if (rep_bytes
> (int)(final_val
-i
))
1045 rep_bytes
= (int)(final_val
-i
);
1046 png_memcpy(dstptr
, srcptr
, rep_bytes
);
1051 } /* end of else (_mmx_supported) */
1056 case 32: /* png_ptr->row_info.pixel_depth */
1061 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
1062 #if !defined(PNG_1_0_X)
1063 if ((png_ptr
->asm_flags
& PNG_ASM_FLAG_MMX_READ_COMBINE_ROW
)
1064 /* && _mmx_supported */ )
1071 int dummy_value_a
; /* fix 'forbidden register spilled' error */
1076 _unmask
= ~mask
; /* global variable for -fPIC version */
1077 srcptr
= png_ptr
->row_buf
+ 1;
1079 len
= png_ptr
->width
&~7; /* reduce to multiple of 8 */
1080 diff
= (int) (png_ptr
->width
& 7); /* amount lost // */
1082 __asm__
__volatile__ (
1083 "movd _unmask, %%mm7 \n\t" /* load bit pattern */
1084 "psubb %%mm6, %%mm6 \n\t" /* zero mm6 */
1085 "punpcklbw %%mm7, %%mm7 \n\t"
1086 "punpcklwd %%mm7, %%mm7 \n\t"
1087 "punpckldq %%mm7, %%mm7 \n\t" /* fill reg with 8 masks */
1089 "movq _mask32_0, %%mm0 \n\t"
1090 "movq _mask32_1, %%mm1 \n\t"
1091 "movq _mask32_2, %%mm2 \n\t"
1092 "movq _mask32_3, %%mm3 \n\t"
1094 "pand %%mm7, %%mm0 \n\t"
1095 "pand %%mm7, %%mm1 \n\t"
1096 "pand %%mm7, %%mm2 \n\t"
1097 "pand %%mm7, %%mm3 \n\t"
1099 "pcmpeqb %%mm6, %%mm0 \n\t"
1100 "pcmpeqb %%mm6, %%mm1 \n\t"
1101 "pcmpeqb %%mm6, %%mm2 \n\t"
1102 "pcmpeqb %%mm6, %%mm3 \n\t"
1104 /* preload "movl len, %%ecx \n\t" // load length of line */
1105 /* preload "movl srcptr, %%esi \n\t" // load source */
1106 /* preload "movl dstptr, %%edi \n\t" // load dest */
1108 "cmpl $0, %%ecx \n\t" /* lcr */
1109 "jz mainloop32end \n\t"
1112 "movq (%%esi), %%mm4 \n\t"
1113 "pand %%mm0, %%mm4 \n\t"
1114 "movq %%mm0, %%mm6 \n\t"
1115 "movq (%%edi), %%mm7 \n\t"
1116 "pandn %%mm7, %%mm6 \n\t"
1117 "por %%mm6, %%mm4 \n\t"
1118 "movq %%mm4, (%%edi) \n\t"
1120 "movq 8(%%esi), %%mm5 \n\t"
1121 "pand %%mm1, %%mm5 \n\t"
1122 "movq %%mm1, %%mm7 \n\t"
1123 "movq 8(%%edi), %%mm6 \n\t"
1124 "pandn %%mm6, %%mm7 \n\t"
1125 "por %%mm7, %%mm5 \n\t"
1126 "movq %%mm5, 8(%%edi) \n\t"
1128 "movq 16(%%esi), %%mm6 \n\t"
1129 "pand %%mm2, %%mm6 \n\t"
1130 "movq %%mm2, %%mm4 \n\t"
1131 "movq 16(%%edi), %%mm7 \n\t"
1132 "pandn %%mm7, %%mm4 \n\t"
1133 "por %%mm4, %%mm6 \n\t"
1134 "movq %%mm6, 16(%%edi) \n\t"
1136 "movq 24(%%esi), %%mm7 \n\t"
1137 "pand %%mm3, %%mm7 \n\t"
1138 "movq %%mm3, %%mm5 \n\t"
1139 "movq 24(%%edi), %%mm4 \n\t"
1140 "pandn %%mm4, %%mm5 \n\t"
1141 "por %%mm5, %%mm7 \n\t"
1142 "movq %%mm7, 24(%%edi) \n\t"
1144 "addl $32, %%esi \n\t" /* inc by 32 bytes processed */
1145 "addl $32, %%edi \n\t"
1146 "subl $8, %%ecx \n\t" /* dec by 8 pixels processed */
1147 "ja mainloop32 \n\t"
1149 "mainloop32end: \n\t"
1150 /* preload "movl diff, %%ecx \n\t" // (diff is in eax) */
1151 "movl %%eax, %%ecx \n\t"
1152 "cmpl $0, %%ecx \n\t"
1154 /* preload "movl mask, %%edx \n\t" */
1155 "sall $24, %%edx \n\t" /* low byte => high byte */
1157 "secondloop32: \n\t"
1158 "sall %%edx \n\t" /* move high bit to CF */
1159 "jnc skip32 \n\t" /* if CF = 0 */
1160 "movl (%%esi), %%eax \n\t"
1161 "movl %%eax, (%%edi) \n\t"
1164 "addl $4, %%esi \n\t"
1165 "addl $4, %%edi \n\t"
1167 "jnz secondloop32 \n\t"
1170 "EMMS \n\t" /* DONE */
1172 : "=a" (dummy_value_a
), /* output regs (dummy) */
1173 "=d" (dummy_value_d
),
1174 "=c" (dummy_value_c
),
1175 "=S" (dummy_value_S
),
1176 "=D" (dummy_value_D
)
1178 : "3" (srcptr
), /* esi // input regs */
1179 "4" (dstptr
), /* edi */
1180 "0" (diff
), /* eax */
1181 /* was (unmask) "b" RESERVED // ebx // Global Offset Table idx */
1182 "2" (len
), /* ecx */
1183 "1" (mask
) /* edx */
1185 #if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
1186 : "%mm0", "%mm1", "%mm2", "%mm3" /* clobber list */
1187 , "%mm4", "%mm5", "%mm6", "%mm7"
1191 else /* mmx _not supported - Use modified C routine */
1192 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
1194 register png_uint_32 i
;
1195 png_uint_32 initial_val
= BPP4
* png_pass_start
[png_ptr
->pass
];
1196 /* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
1197 register int stride
= BPP4
* png_pass_inc
[png_ptr
->pass
];
1198 /* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
1199 register int rep_bytes
= BPP4
* png_pass_width
[png_ptr
->pass
];
1200 /* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
1201 png_uint_32 len
= png_ptr
->width
&~7; /* reduce to mult. of 8 */
1202 int diff
= (int) (png_ptr
->width
& 7); /* amount lost */
1203 register png_uint_32 final_val
= BPP4
* len
; /* GRR bugfix */
1205 srcptr
= png_ptr
->row_buf
+ 1 + initial_val
;
1206 dstptr
= row
+ initial_val
;
1208 for (i
= initial_val
; i
< final_val
; i
+= stride
)
1210 png_memcpy(dstptr
, srcptr
, rep_bytes
);
1214 if (diff
) /* number of leftover pixels: 3 for pngtest */
1216 final_val
+=diff
*BPP4
;
1217 for (; i
< final_val
; i
+= stride
)
1219 if (rep_bytes
> (int)(final_val
-i
))
1220 rep_bytes
= (int)(final_val
-i
);
1221 png_memcpy(dstptr
, srcptr
, rep_bytes
);
1226 } /* end of else (_mmx_supported) */
1231 case 48: /* png_ptr->row_info.pixel_depth */
1236 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
1237 #if !defined(PNG_1_0_X)
1238 if ((png_ptr
->asm_flags
& PNG_ASM_FLAG_MMX_READ_COMBINE_ROW
)
1239 /* && _mmx_supported */ )
1246 int dummy_value_a
; /* fix 'forbidden register spilled' error */
1251 _unmask
= ~mask
; /* global variable for -fPIC version */
1252 srcptr
= png_ptr
->row_buf
+ 1;
1254 len
= png_ptr
->width
&~7; /* reduce to multiple of 8 */
1255 diff
= (int) (png_ptr
->width
& 7); /* amount lost // */
1257 __asm__
__volatile__ (
1258 "movd _unmask, %%mm7 \n\t" /* load bit pattern */
1259 "psubb %%mm6, %%mm6 \n\t" /* zero mm6 */
1260 "punpcklbw %%mm7, %%mm7 \n\t"
1261 "punpcklwd %%mm7, %%mm7 \n\t"
1262 "punpckldq %%mm7, %%mm7 \n\t" /* fill reg with 8 masks */
1264 "movq _mask48_0, %%mm0 \n\t"
1265 "movq _mask48_1, %%mm1 \n\t"
1266 "movq _mask48_2, %%mm2 \n\t"
1267 "movq _mask48_3, %%mm3 \n\t"
1268 "movq _mask48_4, %%mm4 \n\t"
1269 "movq _mask48_5, %%mm5 \n\t"
1271 "pand %%mm7, %%mm0 \n\t"
1272 "pand %%mm7, %%mm1 \n\t"
1273 "pand %%mm7, %%mm2 \n\t"
1274 "pand %%mm7, %%mm3 \n\t"
1275 "pand %%mm7, %%mm4 \n\t"
1276 "pand %%mm7, %%mm5 \n\t"
1278 "pcmpeqb %%mm6, %%mm0 \n\t"
1279 "pcmpeqb %%mm6, %%mm1 \n\t"
1280 "pcmpeqb %%mm6, %%mm2 \n\t"
1281 "pcmpeqb %%mm6, %%mm3 \n\t"
1282 "pcmpeqb %%mm6, %%mm4 \n\t"
1283 "pcmpeqb %%mm6, %%mm5 \n\t"
1285 /* preload "movl len, %%ecx \n\t" // load length of line */
1286 /* preload "movl srcptr, %%esi \n\t" // load source */
1287 /* preload "movl dstptr, %%edi \n\t" // load dest */
1289 "cmpl $0, %%ecx \n\t"
1290 "jz mainloop48end \n\t"
1293 "movq (%%esi), %%mm7 \n\t"
1294 "pand %%mm0, %%mm7 \n\t"
1295 "movq %%mm0, %%mm6 \n\t"
1296 "pandn (%%edi), %%mm6 \n\t"
1297 "por %%mm6, %%mm7 \n\t"
1298 "movq %%mm7, (%%edi) \n\t"
1300 "movq 8(%%esi), %%mm6 \n\t"
1301 "pand %%mm1, %%mm6 \n\t"
1302 "movq %%mm1, %%mm7 \n\t"
1303 "pandn 8(%%edi), %%mm7 \n\t"
1304 "por %%mm7, %%mm6 \n\t"
1305 "movq %%mm6, 8(%%edi) \n\t"
1307 "movq 16(%%esi), %%mm6 \n\t"
1308 "pand %%mm2, %%mm6 \n\t"
1309 "movq %%mm2, %%mm7 \n\t"
1310 "pandn 16(%%edi), %%mm7 \n\t"
1311 "por %%mm7, %%mm6 \n\t"
1312 "movq %%mm6, 16(%%edi) \n\t"
1314 "movq 24(%%esi), %%mm7 \n\t"
1315 "pand %%mm3, %%mm7 \n\t"
1316 "movq %%mm3, %%mm6 \n\t"
1317 "pandn 24(%%edi), %%mm6 \n\t"
1318 "por %%mm6, %%mm7 \n\t"
1319 "movq %%mm7, 24(%%edi) \n\t"
1321 "movq 32(%%esi), %%mm6 \n\t"
1322 "pand %%mm4, %%mm6 \n\t"
1323 "movq %%mm4, %%mm7 \n\t"
1324 "pandn 32(%%edi), %%mm7 \n\t"
1325 "por %%mm7, %%mm6 \n\t"
1326 "movq %%mm6, 32(%%edi) \n\t"
1328 "movq 40(%%esi), %%mm7 \n\t"
1329 "pand %%mm5, %%mm7 \n\t"
1330 "movq %%mm5, %%mm6 \n\t"
1331 "pandn 40(%%edi), %%mm6 \n\t"
1332 "por %%mm6, %%mm7 \n\t"
1333 "movq %%mm7, 40(%%edi) \n\t"
1335 "addl $48, %%esi \n\t" /* inc by 48 bytes processed */
1336 "addl $48, %%edi \n\t"
1337 "subl $8, %%ecx \n\t" /* dec by 8 pixels processed */
1339 "ja mainloop48 \n\t"
1341 "mainloop48end: \n\t"
1342 /* preload "movl diff, %%ecx \n\t" // (diff is in eax) */
1343 "movl %%eax, %%ecx \n\t"
1344 "cmpl $0, %%ecx \n\t"
1346 /* preload "movl mask, %%edx \n\t" */
1347 "sall $24, %%edx \n\t" /* make low byte, high byte */
1349 "secondloop48: \n\t"
1350 "sall %%edx \n\t" /* move high bit to CF */
1351 "jnc skip48 \n\t" /* if CF = 0 */
1352 "movl (%%esi), %%eax \n\t"
1353 "movl %%eax, (%%edi) \n\t"
1356 "addl $4, %%esi \n\t"
1357 "addl $4, %%edi \n\t"
1359 "jnz secondloop48 \n\t"
1362 "EMMS \n\t" /* DONE */
1364 : "=a" (dummy_value_a
), /* output regs (dummy) */
1365 "=d" (dummy_value_d
),
1366 "=c" (dummy_value_c
),
1367 "=S" (dummy_value_S
),
1368 "=D" (dummy_value_D
)
1370 : "3" (srcptr
), /* esi // input regs */
1371 "4" (dstptr
), /* edi */
1372 "0" (diff
), /* eax */
1373 /* was (unmask) "b" RESERVED // ebx // Global Offset Table idx */
1374 "2" (len
), /* ecx */
1375 "1" (mask
) /* edx */
1377 #if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
1378 : "%mm0", "%mm1", "%mm2", "%mm3" /* clobber list */
1379 , "%mm4", "%mm5", "%mm6", "%mm7"
1383 else /* mmx _not supported - Use modified C routine */
1384 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
1386 register png_uint_32 i
;
1387 png_uint_32 initial_val
= BPP6
* png_pass_start
[png_ptr
->pass
];
1388 /* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
1389 register int stride
= BPP6
* png_pass_inc
[png_ptr
->pass
];
1390 /* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
1391 register int rep_bytes
= BPP6
* png_pass_width
[png_ptr
->pass
];
1392 /* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
1393 png_uint_32 len
= png_ptr
->width
&~7; /* reduce to mult. of 8 */
1394 int diff
= (int) (png_ptr
->width
& 7); /* amount lost */
1395 register png_uint_32 final_val
= BPP6
* len
; /* GRR bugfix */
1397 srcptr
= png_ptr
->row_buf
+ 1 + initial_val
;
1398 dstptr
= row
+ initial_val
;
1400 for (i
= initial_val
; i
< final_val
; i
+= stride
)
1402 png_memcpy(dstptr
, srcptr
, rep_bytes
);
1406 if (diff
) /* number of leftover pixels: 3 for pngtest */
1408 final_val
+=diff
*BPP6
;
1409 for (; i
< final_val
; i
+= stride
)
1411 if (rep_bytes
> (int)(final_val
-i
))
1412 rep_bytes
= (int)(final_val
-i
);
1413 png_memcpy(dstptr
, srcptr
, rep_bytes
);
1418 } /* end of else (_mmx_supported) */
1423 case 64: /* png_ptr->row_info.pixel_depth */
1427 register png_uint_32 i
;
1428 png_uint_32 initial_val
= BPP8
* png_pass_start
[png_ptr
->pass
];
1429 /* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
1430 register int stride
= BPP8
* png_pass_inc
[png_ptr
->pass
];
1431 /* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
1432 register int rep_bytes
= BPP8
* png_pass_width
[png_ptr
->pass
];
1433 /* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
1434 png_uint_32 len
= png_ptr
->width
&~7; /* reduce to mult. of 8 */
1435 int diff
= (int) (png_ptr
->width
& 7); /* amount lost */
1436 register png_uint_32 final_val
= BPP8
* len
; /* GRR bugfix */
1438 srcptr
= png_ptr
->row_buf
+ 1 + initial_val
;
1439 dstptr
= row
+ initial_val
;
1441 for (i
= initial_val
; i
< final_val
; i
+= stride
)
1443 png_memcpy(dstptr
, srcptr
, rep_bytes
);
1447 if (diff
) /* number of leftover pixels: 3 for pngtest */
1449 final_val
+=diff
*BPP8
;
1450 for (; i
< final_val
; i
+= stride
)
1452 if (rep_bytes
> (int)(final_val
-i
))
1453 rep_bytes
= (int)(final_val
-i
);
1454 png_memcpy(dstptr
, srcptr
, rep_bytes
);
1463 default: /* png_ptr->row_info.pixel_depth != 1,2,4,8,16,24,32,48,64 */
1465 /* this should never happen */
1466 png_warning(png_ptr
, "Invalid row_info.pixel_depth in pnggccrd");
1469 } /* end switch (png_ptr->row_info.pixel_depth) */
1471 } /* end if (non-trivial mask) */
1473 } /* end png_combine_row() */
1475 #endif /* PNG_HAVE_ASSEMBLER_COMBINE_ROW */
1480 /*===========================================================================*/
1482 /* P N G _ D O _ R E A D _ I N T E R L A C E */
1484 /*===========================================================================*/
1486 #if defined(PNG_READ_INTERLACING_SUPPORTED)
1487 #if defined(PNG_HAVE_ASSEMBLER_READ_INTERLACE)
1489 /* png_do_read_interlace() is called after any 16-bit to 8-bit conversion
1490 * has taken place. [GRR: what other steps come before and/or after?]
1494 png_do_read_interlace(png_structp png_ptr
)
1496 png_row_infop row_info
= &(png_ptr
->row_info
);
1497 png_bytep row
= png_ptr
->row_buf
+ 1;
1498 int pass
= png_ptr
->pass
;
1499 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
1500 png_uint_32 transformations
= png_ptr
->transformations
;
1503 png_debug(1, "in png_do_read_interlace (pnggccrd.c)\n");
1505 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
1506 if (_mmx_supported
== 2) {
1507 #if !defined(PNG_1_0_X)
1508 /* this should have happened in png_init_mmx_flags() already */
1509 png_warning(png_ptr
, "asm_flags may not have been initialized");
1515 if (row
!= NULL
&& row_info
!= NULL
)
1517 png_uint_32 final_width
;
1519 final_width
= row_info
->width
* png_pass_inc
[pass
];
1521 switch (row_info
->pixel_depth
)
1527 int s_start
, s_end
, s_inc
;
1532 sp
= row
+ (png_size_t
)((row_info
->width
- 1) >> 3);
1533 dp
= row
+ (png_size_t
)((final_width
- 1) >> 3);
1534 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
1535 if (transformations
& PNG_PACKSWAP
)
1537 sshift
= (int)((row_info
->width
+ 7) & 7);
1538 dshift
= (int)((final_width
+ 7) & 7);
1546 sshift
= 7 - (int)((row_info
->width
+ 7) & 7);
1547 dshift
= 7 - (int)((final_width
+ 7) & 7);
1553 for (i
= row_info
->width
; i
; i
--)
1555 v
= (png_byte
)((*sp
>> sshift
) & 0x1);
1556 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
1558 *dp
&= (png_byte
)((0x7f7f >> (7 - dshift
)) & 0xff);
1559 *dp
|= (png_byte
)(v
<< dshift
);
1560 if (dshift
== s_end
)
1568 if (sshift
== s_end
)
1583 int s_start
, s_end
, s_inc
;
1586 sp
= row
+ (png_size_t
)((row_info
->width
- 1) >> 2);
1587 dp
= row
+ (png_size_t
)((final_width
- 1) >> 2);
1588 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
1589 if (transformations
& PNG_PACKSWAP
)
1591 sshift
= (png_size_t
)(((row_info
->width
+ 3) & 3) << 1);
1592 dshift
= (png_size_t
)(((final_width
+ 3) & 3) << 1);
1600 sshift
= (png_size_t
)((3 - ((row_info
->width
+ 3) & 3)) << 1);
1601 dshift
= (png_size_t
)((3 - ((final_width
+ 3) & 3)) << 1);
1607 for (i
= row_info
->width
; i
; i
--)
1612 v
= (png_byte
)((*sp
>> sshift
) & 0x3);
1613 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
1615 *dp
&= (png_byte
)((0x3f3f >> (6 - dshift
)) & 0xff);
1616 *dp
|= (png_byte
)(v
<< dshift
);
1617 if (dshift
== s_end
)
1625 if (sshift
== s_end
)
1640 int s_start
, s_end
, s_inc
;
1643 sp
= row
+ (png_size_t
)((row_info
->width
- 1) >> 1);
1644 dp
= row
+ (png_size_t
)((final_width
- 1) >> 1);
1645 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
1646 if (transformations
& PNG_PACKSWAP
)
1648 sshift
= (png_size_t
)(((row_info
->width
+ 1) & 1) << 2);
1649 dshift
= (png_size_t
)(((final_width
+ 1) & 1) << 2);
1657 sshift
= (png_size_t
)((1 - ((row_info
->width
+ 1) & 1)) << 2);
1658 dshift
= (png_size_t
)((1 - ((final_width
+ 1) & 1)) << 2);
1664 for (i
= row_info
->width
; i
; i
--)
1669 v
= (png_byte
)((*sp
>> sshift
) & 0xf);
1670 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
1672 *dp
&= (png_byte
)((0xf0f >> (4 - dshift
)) & 0xff);
1673 *dp
|= (png_byte
)(v
<< dshift
);
1674 if (dshift
== s_end
)
1682 if (sshift
== s_end
)
1693 /*====================================================================*/
1695 default: /* 8-bit or larger (this is where the routine is modified) */
1698 /* static unsigned long long _const4 = 0x0000000000FFFFFFLL; no good */
1699 /* static unsigned long long const4 = 0x0000000000FFFFFFLL; no good */
1700 /* unsigned long long _const4 = 0x0000000000FFFFFFLL; no good */
1701 /* unsigned long long const4 = 0x0000000000FFFFFFLL; no good */
1705 png_size_t pixel_bytes
;
1706 int width
= (int)row_info
->width
;
1708 pixel_bytes
= (row_info
->pixel_depth
>> 3);
1710 /* point sptr at the last pixel in the pre-expanded row: */
1711 sptr
= row
+ (width
- 1) * pixel_bytes
;
1713 /* point dp at the last pixel position in the expanded row: */
1714 dp
= row
+ (final_width
- 1) * pixel_bytes
;
1716 /* New code by Nirav Chhatrapati - Intel Corporation */
1718 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
1719 #if !defined(PNG_1_0_X)
1720 if ((png_ptr
->asm_flags
& PNG_ASM_FLAG_MMX_READ_INTERLACE
)
1721 /* && _mmx_supported */ )
1726 //--------------------------------------------------------------
1727 if (pixel_bytes
== 3)
1729 if (((pass
== 0) || (pass
== 1)) && width
)
1731 int dummy_value_c
; /* fix 'forbidden register spilled' */
1735 __asm__
__volatile__ (
1736 "subl $21, %%edi \n\t"
1737 /* (png_pass_inc[pass] - 1)*pixel_bytes */
1739 ".loop3_pass0: \n\t"
1740 "movd (%%esi), %%mm0 \n\t" /* x x x x x 2 1 0 */
1741 "pand _const4, %%mm0 \n\t" /* z z z z z 2 1 0 */
1742 "movq %%mm0, %%mm1 \n\t" /* z z z z z 2 1 0 */
1743 "psllq $16, %%mm0 \n\t" /* z z z 2 1 0 z z */
1744 "movq %%mm0, %%mm2 \n\t" /* z z z 2 1 0 z z */
1745 "psllq $24, %%mm0 \n\t" /* 2 1 0 z z z z z */
1746 "psrlq $8, %%mm1 \n\t" /* z z z z z z 2 1 */
1747 "por %%mm2, %%mm0 \n\t" /* 2 1 0 2 1 0 z z */
1748 "por %%mm1, %%mm0 \n\t" /* 2 1 0 2 1 0 2 1 */
1749 "movq %%mm0, %%mm3 \n\t" /* 2 1 0 2 1 0 2 1 */
1750 "psllq $16, %%mm0 \n\t" /* 0 2 1 0 2 1 z z */
1751 "movq %%mm3, %%mm4 \n\t" /* 2 1 0 2 1 0 2 1 */
1752 "punpckhdq %%mm0, %%mm3 \n\t" /* 0 2 1 0 2 1 0 2 */
1753 "movq %%mm4, 16(%%edi) \n\t"
1754 "psrlq $32, %%mm0 \n\t" /* z z z z 0 2 1 0 */
1755 "movq %%mm3, 8(%%edi) \n\t"
1756 "punpckldq %%mm4, %%mm0 \n\t" /* 1 0 2 1 0 2 1 0 */
1757 "subl $3, %%esi \n\t"
1758 "movq %%mm0, (%%edi) \n\t"
1759 "subl $24, %%edi \n\t"
1761 "jnz .loop3_pass0 \n\t"
1762 "EMMS \n\t" /* DONE */
1764 : "=c" (dummy_value_c
), /* output regs (dummy) */
1765 "=S" (dummy_value_S
),
1766 "=D" (dummy_value_D
)
1768 : "1" (sptr
), /* esi // input regs */
1770 "0" (width
) /* ecx */
1771 /* doesn't work "i" (0x0000000000FFFFFFLL) // %1 (a.k.a. _const4) */
1773 #if 0 /* %mm0, ..., %mm4 not supported by gcc 2.7.2.3 or egcs 1.1 */
1774 : "%mm0", "%mm1", "%mm2" /* clobber list */
1779 else if (((pass
== 2) || (pass
== 3)) && width
)
1781 int dummy_value_c
; /* fix 'forbidden register spilled' */
1785 __asm__
__volatile__ (
1786 "subl $9, %%edi \n\t"
1787 /* (png_pass_inc[pass] - 1)*pixel_bytes */
1789 ".loop3_pass2: \n\t"
1790 "movd (%%esi), %%mm0 \n\t" /* x x x x x 2 1 0 */
1791 "pand _const4, %%mm0 \n\t" /* z z z z z 2 1 0 */
1792 "movq %%mm0, %%mm1 \n\t" /* z z z z z 2 1 0 */
1793 "psllq $16, %%mm0 \n\t" /* z z z 2 1 0 z z */
1794 "movq %%mm0, %%mm2 \n\t" /* z z z 2 1 0 z z */
1795 "psllq $24, %%mm0 \n\t" /* 2 1 0 z z z z z */
1796 "psrlq $8, %%mm1 \n\t" /* z z z z z z 2 1 */
1797 "por %%mm2, %%mm0 \n\t" /* 2 1 0 2 1 0 z z */
1798 "por %%mm1, %%mm0 \n\t" /* 2 1 0 2 1 0 2 1 */
1799 "movq %%mm0, 4(%%edi) \n\t"
1800 "psrlq $16, %%mm0 \n\t" /* z z 2 1 0 2 1 0 */
1801 "subl $3, %%esi \n\t"
1802 "movd %%mm0, (%%edi) \n\t"
1803 "subl $12, %%edi \n\t"
1805 "jnz .loop3_pass2 \n\t"
1806 "EMMS \n\t" /* DONE */
1808 : "=c" (dummy_value_c
), /* output regs (dummy) */
1809 "=S" (dummy_value_S
),
1810 "=D" (dummy_value_D
)
1812 : "1" (sptr
), /* esi // input regs */
1814 "0" (width
) /* ecx */
1816 #if 0 /* %mm0, ..., %mm2 not supported by gcc 2.7.2.3 or egcs 1.1 */
1817 : "%mm0", "%mm1", "%mm2" /* clobber list */
1821 else if (width
) /* && ((pass == 4) || (pass == 5)) */
1823 int width_mmx
= ((width
>> 1) << 1) - 8; /* GRR: huh? */
1826 width
-= width_mmx
; /* 8 or 9 pix, 24 or 27 bytes */
1829 /* png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
1830 /* sptr points at last pixel in pre-expanded row */
1831 /* dp points at last pixel position in expanded row */
1832 int dummy_value_c
; /* fix 'forbidden register spilled' */
1836 __asm__
__volatile__ (
1837 "subl $3, %%esi \n\t"
1838 "subl $9, %%edi \n\t"
1839 /* (png_pass_inc[pass] + 1)*pixel_bytes */
1841 ".loop3_pass4: \n\t"
1842 "movq (%%esi), %%mm0 \n\t" /* x x 5 4 3 2 1 0 */
1843 "movq %%mm0, %%mm1 \n\t" /* x x 5 4 3 2 1 0 */
1844 "movq %%mm0, %%mm2 \n\t" /* x x 5 4 3 2 1 0 */
1845 "psllq $24, %%mm0 \n\t" /* 4 3 2 1 0 z z z */
1846 "pand _const4, %%mm1 \n\t" /* z z z z z 2 1 0 */
1847 "psrlq $24, %%mm2 \n\t" /* z z z x x 5 4 3 */
1848 "por %%mm1, %%mm0 \n\t" /* 4 3 2 1 0 2 1 0 */
1849 "movq %%mm2, %%mm3 \n\t" /* z z z x x 5 4 3 */
1850 "psllq $8, %%mm2 \n\t" /* z z x x 5 4 3 z */
1851 "movq %%mm0, (%%edi) \n\t"
1852 "psrlq $16, %%mm3 \n\t" /* z z z z z x x 5 */
1853 "pand _const6, %%mm3 \n\t" /* z z z z z z z 5 */
1854 "por %%mm3, %%mm2 \n\t" /* z z x x 5 4 3 5 */
1855 "subl $6, %%esi \n\t"
1856 "movd %%mm2, 8(%%edi) \n\t"
1857 "subl $12, %%edi \n\t"
1858 "subl $2, %%ecx \n\t"
1859 "jnz .loop3_pass4 \n\t"
1860 "EMMS \n\t" /* DONE */
1862 : "=c" (dummy_value_c
), /* output regs (dummy) */
1863 "=S" (dummy_value_S
),
1864 "=D" (dummy_value_D
)
1866 : "1" (sptr
), /* esi // input regs */
1868 "0" (width_mmx
) /* ecx */
1870 #if 0 /* %mm0, ..., %mm3 not supported by gcc 2.7.2.3 or egcs 1.1 */
1871 : "%mm0", "%mm1" /* clobber list */
1877 sptr
-= width_mmx
*3;
1879 for (i
= width
; i
; i
--)
1884 png_memcpy(v
, sptr
, 3);
1885 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
1887 png_memcpy(dp
, v
, 3);
1893 } /* end of pixel_bytes == 3 */
1895 //--------------------------------------------------------------
1896 else if (pixel_bytes
== 1)
1898 if (((pass
== 0) || (pass
== 1)) && width
)
1900 int width_mmx
= ((width
>> 2) << 2);
1901 width
-= width_mmx
; /* 0-3 pixels => 0-3 bytes */
1904 int dummy_value_c
; /* fix 'forbidden register spilled' */
1908 __asm__
__volatile__ (
1909 "subl $3, %%esi \n\t"
1910 "subl $31, %%edi \n\t"
1912 ".loop1_pass0: \n\t"
1913 "movd (%%esi), %%mm0 \n\t" /* x x x x 3 2 1 0 */
1914 "movq %%mm0, %%mm1 \n\t" /* x x x x 3 2 1 0 */
1915 "punpcklbw %%mm0, %%mm0 \n\t" /* 3 3 2 2 1 1 0 0 */
1916 "movq %%mm0, %%mm2 \n\t" /* 3 3 2 2 1 1 0 0 */
1917 "punpcklwd %%mm0, %%mm0 \n\t" /* 1 1 1 1 0 0 0 0 */
1918 "movq %%mm0, %%mm3 \n\t" /* 1 1 1 1 0 0 0 0 */
1919 "punpckldq %%mm0, %%mm0 \n\t" /* 0 0 0 0 0 0 0 0 */
1920 "punpckhdq %%mm3, %%mm3 \n\t" /* 1 1 1 1 1 1 1 1 */
1921 "movq %%mm0, (%%edi) \n\t"
1922 "punpckhwd %%mm2, %%mm2 \n\t" /* 3 3 3 3 2 2 2 2 */
1923 "movq %%mm3, 8(%%edi) \n\t"
1924 "movq %%mm2, %%mm4 \n\t" /* 3 3 3 3 2 2 2 2 */
1925 "punpckldq %%mm2, %%mm2 \n\t" /* 2 2 2 2 2 2 2 2 */
1926 "punpckhdq %%mm4, %%mm4 \n\t" /* 3 3 3 3 3 3 3 3 */
1927 "movq %%mm2, 16(%%edi) \n\t"
1928 "subl $4, %%esi \n\t"
1929 "movq %%mm4, 24(%%edi) \n\t"
1930 "subl $32, %%edi \n\t"
1931 "subl $4, %%ecx \n\t"
1932 "jnz .loop1_pass0 \n\t"
1933 "EMMS \n\t" /* DONE */
1935 : "=c" (dummy_value_c
), /* output regs (dummy) */
1936 "=S" (dummy_value_S
),
1937 "=D" (dummy_value_D
)
1939 : "1" (sptr
), /* esi // input regs */
1941 "0" (width_mmx
) /* ecx */
1943 #if 0 /* %mm0, ..., %mm4 not supported by gcc 2.7.2.3 or egcs 1.1 */
1944 : "%mm0", "%mm1", "%mm2" /* clobber list */
1952 for (i
= width
; i
; i
--)
1956 /* I simplified this part in version 1.0.4e
1957 * here and in several other instances where
1958 * pixel_bytes == 1 -- GR-P
1963 * png_memcpy(v, sptr, pixel_bytes);
1964 * for (j = 0; j < png_pass_inc[pass]; j++)
1966 * png_memcpy(dp, v, pixel_bytes);
1967 * dp -= pixel_bytes;
1969 * sptr -= pixel_bytes;
1971 * Replacement code is in the next three lines:
1974 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
1981 else if (((pass
== 2) || (pass
== 3)) && width
)
1983 int width_mmx
= ((width
>> 2) << 2);
1984 width
-= width_mmx
; /* 0-3 pixels => 0-3 bytes */
1987 int dummy_value_c
; /* fix 'forbidden register spilled' */
1991 __asm__
__volatile__ (
1992 "subl $3, %%esi \n\t"
1993 "subl $15, %%edi \n\t"
1995 ".loop1_pass2: \n\t"
1996 "movd (%%esi), %%mm0 \n\t" /* x x x x 3 2 1 0 */
1997 "punpcklbw %%mm0, %%mm0 \n\t" /* 3 3 2 2 1 1 0 0 */
1998 "movq %%mm0, %%mm1 \n\t" /* 3 3 2 2 1 1 0 0 */
1999 "punpcklwd %%mm0, %%mm0 \n\t" /* 1 1 1 1 0 0 0 0 */
2000 "punpckhwd %%mm1, %%mm1 \n\t" /* 3 3 3 3 2 2 2 2 */
2001 "movq %%mm0, (%%edi) \n\t"
2002 "subl $4, %%esi \n\t"
2003 "movq %%mm1, 8(%%edi) \n\t"
2004 "subl $16, %%edi \n\t"
2005 "subl $4, %%ecx \n\t"
2006 "jnz .loop1_pass2 \n\t"
2007 "EMMS \n\t" /* DONE */
2009 : "=c" (dummy_value_c
), /* output regs (dummy) */
2010 "=S" (dummy_value_S
),
2011 "=D" (dummy_value_D
)
2013 : "1" (sptr
), /* esi // input regs */
2015 "0" (width_mmx
) /* ecx */
2017 #if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2018 : "%mm0", "%mm1" /* clobber list */
2025 for (i
= width
; i
; i
--)
2029 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
2036 else if (width
) /* && ((pass == 4) || (pass == 5)) */
2038 int width_mmx
= ((width
>> 3) << 3);
2039 width
-= width_mmx
; /* 0-7 pixels => 0-7 bytes */
2042 int dummy_value_c
; /* fix 'forbidden register spilled' */
2046 __asm__
__volatile__ (
2047 "subl $7, %%esi \n\t"
2048 "subl $15, %%edi \n\t"
2050 ".loop1_pass4: \n\t"
2051 "movq (%%esi), %%mm0 \n\t" /* 7 6 5 4 3 2 1 0 */
2052 "movq %%mm0, %%mm1 \n\t" /* 7 6 5 4 3 2 1 0 */
2053 "punpcklbw %%mm0, %%mm0 \n\t" /* 3 3 2 2 1 1 0 0 */
2054 "punpckhbw %%mm1, %%mm1 \n\t" /* 7 7 6 6 5 5 4 4 */
2055 "movq %%mm1, 8(%%edi) \n\t"
2056 "subl $8, %%esi \n\t"
2057 "movq %%mm0, (%%edi) \n\t"
2058 "subl $16, %%edi \n\t"
2059 "subl $8, %%ecx \n\t"
2060 "jnz .loop1_pass4 \n\t"
2061 "EMMS \n\t" /* DONE */
2063 : "=c" (dummy_value_c
), /* output regs (dummy) */
2064 "=S" (dummy_value_S
),
2065 "=D" (dummy_value_D
)
2067 : "1" (sptr
), /* esi // input regs */
2069 "0" (width_mmx
) /* ecx */
2071 #if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2072 : "%mm0", "%mm1" /* clobber list */
2079 for (i
= width
; i
; i
--)
2083 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
2090 } /* end of pixel_bytes == 1 */
2092 //--------------------------------------------------------------
2093 else if (pixel_bytes
== 2)
2095 if (((pass
== 0) || (pass
== 1)) && width
)
2097 int width_mmx
= ((width
>> 1) << 1);
2098 width
-= width_mmx
; /* 0,1 pixels => 0,2 bytes */
2101 int dummy_value_c
; /* fix 'forbidden register spilled' */
2105 __asm__
__volatile__ (
2106 "subl $2, %%esi \n\t"
2107 "subl $30, %%edi \n\t"
2109 ".loop2_pass0: \n\t"
2110 "movd (%%esi), %%mm0 \n\t" /* x x x x 3 2 1 0 */
2111 "punpcklwd %%mm0, %%mm0 \n\t" /* 3 2 3 2 1 0 1 0 */
2112 "movq %%mm0, %%mm1 \n\t" /* 3 2 3 2 1 0 1 0 */
2113 "punpckldq %%mm0, %%mm0 \n\t" /* 1 0 1 0 1 0 1 0 */
2114 "punpckhdq %%mm1, %%mm1 \n\t" /* 3 2 3 2 3 2 3 2 */
2115 "movq %%mm0, (%%edi) \n\t"
2116 "movq %%mm0, 8(%%edi) \n\t"
2117 "movq %%mm1, 16(%%edi) \n\t"
2118 "subl $4, %%esi \n\t"
2119 "movq %%mm1, 24(%%edi) \n\t"
2120 "subl $32, %%edi \n\t"
2121 "subl $2, %%ecx \n\t"
2122 "jnz .loop2_pass0 \n\t"
2123 "EMMS \n\t" /* DONE */
2125 : "=c" (dummy_value_c
), /* output regs (dummy) */
2126 "=S" (dummy_value_S
),
2127 "=D" (dummy_value_D
)
2129 : "1" (sptr
), /* esi // input regs */
2131 "0" (width_mmx
) /* ecx */
2133 #if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2134 : "%mm0", "%mm1" /* clobber list */
2139 sptr
-= (width_mmx
*2 - 2); /* sign fixed */
2140 dp
-= (width_mmx
*16 - 2); /* sign fixed */
2141 for (i
= width
; i
; i
--)
2146 png_memcpy(v
, sptr
, 2);
2147 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
2150 png_memcpy(dp
, v
, 2);
2154 else if (((pass
== 2) || (pass
== 3)) && width
)
2156 int width_mmx
= ((width
>> 1) << 1) ;
2157 width
-= width_mmx
; /* 0,1 pixels => 0,2 bytes */
2160 int dummy_value_c
; /* fix 'forbidden register spilled' */
2164 __asm__
__volatile__ (
2165 "subl $2, %%esi \n\t"
2166 "subl $14, %%edi \n\t"
2168 ".loop2_pass2: \n\t"
2169 "movd (%%esi), %%mm0 \n\t" /* x x x x 3 2 1 0 */
2170 "punpcklwd %%mm0, %%mm0 \n\t" /* 3 2 3 2 1 0 1 0 */
2171 "movq %%mm0, %%mm1 \n\t" /* 3 2 3 2 1 0 1 0 */
2172 "punpckldq %%mm0, %%mm0 \n\t" /* 1 0 1 0 1 0 1 0 */
2173 "punpckhdq %%mm1, %%mm1 \n\t" /* 3 2 3 2 3 2 3 2 */
2174 "movq %%mm0, (%%edi) \n\t"
2175 "subl $4, %%esi \n\t"
2176 "movq %%mm1, 8(%%edi) \n\t"
2177 "subl $16, %%edi \n\t"
2178 "subl $2, %%ecx \n\t"
2179 "jnz .loop2_pass2 \n\t"
2180 "EMMS \n\t" /* DONE */
2182 : "=c" (dummy_value_c
), /* output regs (dummy) */
2183 "=S" (dummy_value_S
),
2184 "=D" (dummy_value_D
)
2186 : "1" (sptr
), /* esi // input regs */
2188 "0" (width_mmx
) /* ecx */
2190 #if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2191 : "%mm0", "%mm1" /* clobber list */
2196 sptr
-= (width_mmx
*2 - 2); /* sign fixed */
2197 dp
-= (width_mmx
*8 - 2); /* sign fixed */
2198 for (i
= width
; i
; i
--)
2203 png_memcpy(v
, sptr
, 2);
2204 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
2207 png_memcpy(dp
, v
, 2);
2211 else if (width
) /* pass == 4 or 5 */
2213 int width_mmx
= ((width
>> 1) << 1) ;
2214 width
-= width_mmx
; /* 0,1 pixels => 0,2 bytes */
2217 int dummy_value_c
; /* fix 'forbidden register spilled' */
2221 __asm__
__volatile__ (
2222 "subl $2, %%esi \n\t"
2223 "subl $6, %%edi \n\t"
2225 ".loop2_pass4: \n\t"
2226 "movd (%%esi), %%mm0 \n\t" /* x x x x 3 2 1 0 */
2227 "punpcklwd %%mm0, %%mm0 \n\t" /* 3 2 3 2 1 0 1 0 */
2228 "subl $4, %%esi \n\t"
2229 "movq %%mm0, (%%edi) \n\t"
2230 "subl $8, %%edi \n\t"
2231 "subl $2, %%ecx \n\t"
2232 "jnz .loop2_pass4 \n\t"
2233 "EMMS \n\t" /* DONE */
2235 : "=c" (dummy_value_c
), /* output regs (dummy) */
2236 "=S" (dummy_value_S
),
2237 "=D" (dummy_value_D
)
2239 : "1" (sptr
), /* esi // input regs */
2241 "0" (width_mmx
) /* ecx */
2243 #if 0 /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
2244 : "%mm0" /* clobber list */
2249 sptr
-= (width_mmx
*2 - 2); /* sign fixed */
2250 dp
-= (width_mmx
*4 - 2); /* sign fixed */
2251 for (i
= width
; i
; i
--)
2256 png_memcpy(v
, sptr
, 2);
2257 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
2260 png_memcpy(dp
, v
, 2);
2264 } /* end of pixel_bytes == 2 */
2266 //--------------------------------------------------------------
2267 else if (pixel_bytes
== 4)
2269 if (((pass
== 0) || (pass
== 1)) && width
)
2271 int width_mmx
= ((width
>> 1) << 1);
2272 width
-= width_mmx
; /* 0,1 pixels => 0,4 bytes */
2275 int dummy_value_c
; /* fix 'forbidden register spilled' */
2279 __asm__
__volatile__ (
2280 "subl $4, %%esi \n\t"
2281 "subl $60, %%edi \n\t"
2283 ".loop4_pass0: \n\t"
2284 "movq (%%esi), %%mm0 \n\t" /* 7 6 5 4 3 2 1 0 */
2285 "movq %%mm0, %%mm1 \n\t" /* 7 6 5 4 3 2 1 0 */
2286 "punpckldq %%mm0, %%mm0 \n\t" /* 3 2 1 0 3 2 1 0 */
2287 "punpckhdq %%mm1, %%mm1 \n\t" /* 7 6 5 4 7 6 5 4 */
2288 "movq %%mm0, (%%edi) \n\t"
2289 "movq %%mm0, 8(%%edi) \n\t"
2290 "movq %%mm0, 16(%%edi) \n\t"
2291 "movq %%mm0, 24(%%edi) \n\t"
2292 "movq %%mm1, 32(%%edi) \n\t"
2293 "movq %%mm1, 40(%%edi) \n\t"
2294 "movq %%mm1, 48(%%edi) \n\t"
2295 "subl $8, %%esi \n\t"
2296 "movq %%mm1, 56(%%edi) \n\t"
2297 "subl $64, %%edi \n\t"
2298 "subl $2, %%ecx \n\t"
2299 "jnz .loop4_pass0 \n\t"
2300 "EMMS \n\t" /* DONE */
2302 : "=c" (dummy_value_c
), /* output regs (dummy) */
2303 "=S" (dummy_value_S
),
2304 "=D" (dummy_value_D
)
2306 : "1" (sptr
), /* esi // input regs */
2308 "0" (width_mmx
) /* ecx */
2310 #if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2311 : "%mm0", "%mm1" /* clobber list */
2316 sptr
-= (width_mmx
*4 - 4); /* sign fixed */
2317 dp
-= (width_mmx
*32 - 4); /* sign fixed */
2318 for (i
= width
; i
; i
--)
2323 png_memcpy(v
, sptr
, 4);
2324 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
2327 png_memcpy(dp
, v
, 4);
2331 else if (((pass
== 2) || (pass
== 3)) && width
)
2333 int width_mmx
= ((width
>> 1) << 1);
2334 width
-= width_mmx
; /* 0,1 pixels => 0,4 bytes */
2337 int dummy_value_c
; /* fix 'forbidden register spilled' */
2341 __asm__
__volatile__ (
2342 "subl $4, %%esi \n\t"
2343 "subl $28, %%edi \n\t"
2345 ".loop4_pass2: \n\t"
2346 "movq (%%esi), %%mm0 \n\t" /* 7 6 5 4 3 2 1 0 */
2347 "movq %%mm0, %%mm1 \n\t" /* 7 6 5 4 3 2 1 0 */
2348 "punpckldq %%mm0, %%mm0 \n\t" /* 3 2 1 0 3 2 1 0 */
2349 "punpckhdq %%mm1, %%mm1 \n\t" /* 7 6 5 4 7 6 5 4 */
2350 "movq %%mm0, (%%edi) \n\t"
2351 "movq %%mm0, 8(%%edi) \n\t"
2352 "movq %%mm1, 16(%%edi) \n\t"
2353 "movq %%mm1, 24(%%edi) \n\t"
2354 "subl $8, %%esi \n\t"
2355 "subl $32, %%edi \n\t"
2356 "subl $2, %%ecx \n\t"
2357 "jnz .loop4_pass2 \n\t"
2358 "EMMS \n\t" /* DONE */
2360 : "=c" (dummy_value_c
), /* output regs (dummy) */
2361 "=S" (dummy_value_S
),
2362 "=D" (dummy_value_D
)
2364 : "1" (sptr
), /* esi // input regs */
2366 "0" (width_mmx
) /* ecx */
2368 #if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2369 : "%mm0", "%mm1" /* clobber list */
2374 sptr
-= (width_mmx
*4 - 4); /* sign fixed */
2375 dp
-= (width_mmx
*16 - 4); /* sign fixed */
2376 for (i
= width
; i
; i
--)
2381 png_memcpy(v
, sptr
, 4);
2382 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
2385 png_memcpy(dp
, v
, 4);
2389 else if (width
) /* pass == 4 or 5 */
2391 int width_mmx
= ((width
>> 1) << 1) ;
2392 width
-= width_mmx
; /* 0,1 pixels => 0,4 bytes */
2395 int dummy_value_c
; /* fix 'forbidden register spilled' */
2399 __asm__
__volatile__ (
2400 "subl $4, %%esi \n\t"
2401 "subl $12, %%edi \n\t"
2403 ".loop4_pass4: \n\t"
2404 "movq (%%esi), %%mm0 \n\t" /* 7 6 5 4 3 2 1 0 */
2405 "movq %%mm0, %%mm1 \n\t" /* 7 6 5 4 3 2 1 0 */
2406 "punpckldq %%mm0, %%mm0 \n\t" /* 3 2 1 0 3 2 1 0 */
2407 "punpckhdq %%mm1, %%mm1 \n\t" /* 7 6 5 4 7 6 5 4 */
2408 "movq %%mm0, (%%edi) \n\t"
2409 "subl $8, %%esi \n\t"
2410 "movq %%mm1, 8(%%edi) \n\t"
2411 "subl $16, %%edi \n\t"
2412 "subl $2, %%ecx \n\t"
2413 "jnz .loop4_pass4 \n\t"
2414 "EMMS \n\t" /* DONE */
2416 : "=c" (dummy_value_c
), /* output regs (dummy) */
2417 "=S" (dummy_value_S
),
2418 "=D" (dummy_value_D
)
2420 : "1" (sptr
), /* esi // input regs */
2422 "0" (width_mmx
) /* ecx */
2424 #if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2425 : "%mm0", "%mm1" /* clobber list */
2430 sptr
-= (width_mmx
*4 - 4); /* sign fixed */
2431 dp
-= (width_mmx
*8 - 4); /* sign fixed */
2432 for (i
= width
; i
; i
--)
2437 png_memcpy(v
, sptr
, 4);
2438 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
2441 png_memcpy(dp
, v
, 4);
2445 } /* end of pixel_bytes == 4 */
2447 //--------------------------------------------------------------
2448 else if (pixel_bytes
== 8)
2450 /* GRR TEST: should work, but needs testing (special 64-bit version of rpng2?) */
2451 /* GRR NOTE: no need to combine passes here! */
2452 if (((pass
== 0) || (pass
== 1)) && width
)
2454 int dummy_value_c
; /* fix 'forbidden register spilled' */
2458 /* source is 8-byte RRGGBBAA */
2459 /* dest is 64-byte RRGGBBAA RRGGBBAA RRGGBBAA RRGGBBAA ... */
2460 __asm__
__volatile__ (
2461 "subl $56, %%edi \n\t" /* start of last block */
2463 ".loop8_pass0: \n\t"
2464 "movq (%%esi), %%mm0 \n\t" /* 7 6 5 4 3 2 1 0 */
2465 "movq %%mm0, (%%edi) \n\t"
2466 "movq %%mm0, 8(%%edi) \n\t"
2467 "movq %%mm0, 16(%%edi) \n\t"
2468 "movq %%mm0, 24(%%edi) \n\t"
2469 "movq %%mm0, 32(%%edi) \n\t"
2470 "movq %%mm0, 40(%%edi) \n\t"
2471 "movq %%mm0, 48(%%edi) \n\t"
2472 "subl $8, %%esi \n\t"
2473 "movq %%mm0, 56(%%edi) \n\t"
2474 "subl $64, %%edi \n\t"
2476 "jnz .loop8_pass0 \n\t"
2477 "EMMS \n\t" /* DONE */
2479 : "=c" (dummy_value_c
), /* output regs (dummy) */
2480 "=S" (dummy_value_S
),
2481 "=D" (dummy_value_D
)
2483 : "1" (sptr
), /* esi // input regs */
2485 "0" (width
) /* ecx */
2487 #if 0 /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
2488 : "%mm0" /* clobber list */
2492 else if (((pass
== 2) || (pass
== 3)) && width
)
2494 /* source is 8-byte RRGGBBAA */
2495 /* dest is 32-byte RRGGBBAA RRGGBBAA RRGGBBAA RRGGBBAA */
2496 /* (recall that expansion is _in place_: sptr and dp */
2497 /* both point at locations within same row buffer) */
2499 int dummy_value_c
; /* fix 'forbidden register spilled' */
2503 __asm__
__volatile__ (
2504 "subl $24, %%edi \n\t" /* start of last block */
2506 ".loop8_pass2: \n\t"
2507 "movq (%%esi), %%mm0 \n\t" /* 7 6 5 4 3 2 1 0 */
2508 "movq %%mm0, (%%edi) \n\t"
2509 "movq %%mm0, 8(%%edi) \n\t"
2510 "movq %%mm0, 16(%%edi) \n\t"
2511 "subl $8, %%esi \n\t"
2512 "movq %%mm0, 24(%%edi) \n\t"
2513 "subl $32, %%edi \n\t"
2515 "jnz .loop8_pass2 \n\t"
2516 "EMMS \n\t" /* DONE */
2518 : "=c" (dummy_value_c
), /* output regs (dummy) */
2519 "=S" (dummy_value_S
),
2520 "=D" (dummy_value_D
)
2522 : "1" (sptr
), /* esi // input regs */
2524 "0" (width
) /* ecx */
2526 #if 0 /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
2527 : "%mm0" /* clobber list */
2532 else if (width
) /* pass == 4 or 5 */
2534 /* source is 8-byte RRGGBBAA */
2535 /* dest is 16-byte RRGGBBAA RRGGBBAA */
2537 int dummy_value_c
; /* fix 'forbidden register spilled' */
2541 __asm__
__volatile__ (
2542 "subl $8, %%edi \n\t" /* start of last block */
2544 ".loop8_pass4: \n\t"
2545 "movq (%%esi), %%mm0 \n\t" /* 7 6 5 4 3 2 1 0 */
2546 "movq %%mm0, (%%edi) \n\t"
2547 "subl $8, %%esi \n\t"
2548 "movq %%mm0, 8(%%edi) \n\t"
2549 "subl $16, %%edi \n\t"
2551 "jnz .loop8_pass4 \n\t"
2552 "EMMS \n\t" /* DONE */
2554 : "=c" (dummy_value_c
), /* output regs (dummy) */
2555 "=S" (dummy_value_S
),
2556 "=D" (dummy_value_D
)
2558 : "1" (sptr
), /* esi // input regs */
2560 "0" (width
) /* ecx */
2562 #if 0 /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
2563 : "%mm0" /* clobber list */
2569 } /* end of pixel_bytes == 8 */
2571 //--------------------------------------------------------------
2572 else if (pixel_bytes
== 6)
2574 for (i
= width
; i
; i
--)
2578 png_memcpy(v
, sptr
, 6);
2579 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
2581 png_memcpy(dp
, v
, 6);
2586 } /* end of pixel_bytes == 6 */
2588 //--------------------------------------------------------------
2591 for (i
= width
; i
; i
--)
2595 png_memcpy(v
, sptr
, pixel_bytes
);
2596 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
2598 png_memcpy(dp
, v
, pixel_bytes
);
2604 } /* end of _mmx_supported ======================================== */
2606 else /* MMX not supported: use modified C code - takes advantage
2607 * of inlining of png_memcpy for a constant */
2608 /* GRR 19991007: does it? or should pixel_bytes in each
2609 * block be replaced with immediate value (e.g., 1)? */
2610 /* GRR 19991017: replaced with constants in each case */
2611 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
2613 if (pixel_bytes
== 1)
2615 for (i
= width
; i
; i
--)
2618 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
2625 else if (pixel_bytes
== 3)
2627 for (i
= width
; i
; i
--)
2631 png_memcpy(v
, sptr
, 3);
2632 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
2634 png_memcpy(dp
, v
, 3);
2640 else if (pixel_bytes
== 2)
2642 for (i
= width
; i
; i
--)
2646 png_memcpy(v
, sptr
, 2);
2647 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
2649 png_memcpy(dp
, v
, 2);
2655 else if (pixel_bytes
== 4)
2657 for (i
= width
; i
; i
--)
2661 png_memcpy(v
, sptr
, 4);
2662 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
2665 if (dp
< row
|| dp
+3 > row
+png_ptr
->row_buf_size
)
2667 printf("dp out of bounds: row=%d, dp=%d, rp=%d\n",
2668 row
, dp
, row
+png_ptr
->row_buf_size
);
2669 printf("row_buf=%d\n",png_ptr
->row_buf_size
);
2672 png_memcpy(dp
, v
, 4);
2678 else if (pixel_bytes
== 6)
2680 for (i
= width
; i
; i
--)
2684 png_memcpy(v
, sptr
, 6);
2685 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
2687 png_memcpy(dp
, v
, 6);
2693 else if (pixel_bytes
== 8)
2695 for (i
= width
; i
; i
--)
2699 png_memcpy(v
, sptr
, 8);
2700 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
2702 png_memcpy(dp
, v
, 8);
2708 else /* GRR: should never be reached */
2710 for (i
= width
; i
; i
--)
2714 png_memcpy(v
, sptr
, pixel_bytes
);
2715 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
2717 png_memcpy(dp
, v
, pixel_bytes
);
2720 sptr
-= pixel_bytes
;
2724 } /* end if (MMX not supported) */
2727 } /* end switch (row_info->pixel_depth) */
2729 row_info
->width
= final_width
;
2730 row_info
->rowbytes
= ((final_width
*
2731 (png_uint_32
)row_info
->pixel_depth
+ 7) >> 3);
2734 } /* end png_do_read_interlace() */
2736 #endif /* PNG_HAVE_ASSEMBLER_READ_INTERLACE */
2737 #endif /* PNG_READ_INTERLACING_SUPPORTED */
2741 #if defined(PNG_HAVE_ASSEMBLER_READ_FILTER_ROW)
2742 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
2744 /* These variables are utilized in the functions below. They are declared */
2745 /* globally here to ensure alignment on 8-byte boundaries. */
2750 } _LBCarryMask
= {0x0101010101010101LL
},
2751 _HBClearMask
= {0x7f7f7f7f7f7f7f7fLL
},
2752 _ActiveMask
, _ActiveMask2
, _ActiveMaskEnd
, _ShiftBpp
, _ShiftRem
;
2754 #ifdef PNG_THREAD_UNSAFE_OK
2755 /*===========================================================================*/
2757 /* P N G _ R E A D _ F I L T E R _ R O W _ M M X _ A V G */
2759 /*===========================================================================*/
2761 /* Optimized code for PNG Average filter decoder */
/* Reconstructs one row that was encoded with the PNG Average filter,
 * overwriting the filtered bytes with
 *     Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2).
 * bpp is derived from row_info->pixel_depth and the byte count from
 * row_info->rowbytes.  The work is split over several asm statements that
 * hand state to each other through the variables _dif, _FullLength and
 * _MMXLength:
 *   1. the statement below decodes the first bpp bytes, then the bytes up
 *      to the offset stored in _dif, and finally computes _MMXLength;
 *   2. a per-bpp statement starts at offset _dif and compares its offset
 *      against _MMXLength, handling 8 bytes per step;
 *   3. a final byte loop starts at _MMXLength and compares against
 *      _FullLength.
 * NOTE(review): this excerpt is missing lines (the remaining parameter,
 * braces, loop labels, index increments, some asm operands); comments here
 * describe only what is visible.
 */
2763 static void /* PRIVATE */
2764 png_read_filter_row_mmx_avg(png_row_infop row_info
, png_bytep row
,
2768 int dummy_value_c
; /* fix 'forbidden register 2 (cx) was spilled' error */
2772 bpp
= (row_info
->pixel_depth
+ 7) >> 3; /* get # bytes per pixel */
2773 _FullLength
= row_info
->rowbytes
; /* # of bytes to filter */
2775 __asm__
__volatile__ (
2776 /* initialize address pointers and offset */
2778 "pushl %%ebx \n\t" /* save index to Global Offset Table */
2780 /*pre "movl row, %%edi \n\t" */ /* edi: Avg(x) */
2781 "xorl %%ebx, %%ebx \n\t" /* ebx: x */
2782 "movl %%edi, %%edx \n\t"
2783 /*pre "movl prev_row, %%esi \n\t" */ /* esi: Prior(x) */
2784 /*pre "subl bpp, %%edx \n\t" */ /* (bpp is preloaded into ecx) */
2785 "subl %%ecx, %%edx \n\t" /* edx: Raw(x-bpp) */
2787 "xorl %%eax,%%eax \n\t"
2789 /* Compute the Raw value for the first bpp bytes */
2790 /* Raw(x) = Avg(x) + (Prior(x)/2) */
/* NOTE(review): the avg_rlp label and the increment of ebx that the "-1"
 * displacements compensate for are not visible in this excerpt */
2792 "movb (%%esi,%%ebx,),%%al \n\t" /* load al with Prior(x) */
2794 "shrb %%al \n\t" /* divide by 2 */
2795 "addb -1(%%edi,%%ebx,),%%al \n\t" /* add Avg(x); -1 to offset inc ebx */
2796 /* pre "cmpl bpp, %%ebx \n\t" */ /* (bpp is preloaded into ecx) */
2797 "cmpl %%ecx, %%ebx \n\t"
2798 "movb %%al,-1(%%edi,%%ebx,) \n\t" /* write Raw(x); -1 to offset inc ebx */
2799 "jb avg_rlp \n\t" /* mov does not affect flags */
2801 /* get # of bytes to alignment */
/* net effect of the next five instructions:
 *     _dif = ((edi + ebx + 15) & ~7) - edi */
2802 "movl %%edi, _dif \n\t" /* take start of row */
2803 "addl %%ebx, _dif \n\t" /* add bpp */
2804 "addl $0xf, _dif \n\t" /* add 7+8 to incr past alignment bdry */
2805 "andl $0xfffffff8, _dif \n\t" /* mask to alignment boundary */
2806 "subl %%edi, _dif \n\t" /* subtract from start => value ebx at */
2807 "jz avg_go \n\t" /* alignment */
2810 /* Compute the Raw value for the bytes up to the alignment boundary */
2811 /* Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2) */
2812 "xorl %%ecx, %%ecx \n\t"
2815 "xorl %%eax, %%eax \n\t"
2816 "movb (%%esi,%%ebx,), %%cl \n\t" /* load cl with Prior(x) */
2817 "movb (%%edx,%%ebx,), %%al \n\t" /* load al with Raw(x-bpp) */
/* 16-bit add: eax and ecx were zeroed above, so the 9-bit sum of the two
 * bytes is kept intact until the shift */
2818 "addw %%cx, %%ax \n\t"
2820 "shrw %%ax \n\t" /* divide by 2 */
2821 "addb -1(%%edi,%%ebx,), %%al \n\t" /* add Avg(x); -1 to offset inc ebx */
2822 "cmpl _dif, %%ebx \n\t" /* check if at alignment boundary */
2823 "movb %%al, -1(%%edi,%%ebx,) \n\t" /* write Raw(x); -1 to offset inc ebx */
2824 "jb avg_lp1 \n\t" /* repeat until at alignment boundary */
/* net effect of the next six instructions:
 *     _MMXLength = _FullLength - ((_FullLength - ebx) & 7) */
2827 "movl _FullLength, %%eax \n\t"
2828 "movl %%eax, %%ecx \n\t"
2829 "subl %%ebx, %%eax \n\t" /* subtract alignment fix */
2830 "andl $0x00000007, %%eax \n\t" /* calc bytes over mult of 8 */
2831 "subl %%eax, %%ecx \n\t" /* drop over bytes from original length */
2832 "movl %%ecx, _MMXLength \n\t"
2834 "popl %%ebx \n\t" /* restore index to Global Offset Table */
2837 : "=c" (dummy_value_c
), /* output regs (dummy) */
2838 "=S" (dummy_value_S
),
2839 "=D" (dummy_value_D
)
2841 : "0" (bpp
), /* ecx // input regs */
2842 "1" (prev_row
), /* esi */
2845 : "%eax", "%edx" /* clobber list */
2849 /* GRR: INCLUDE "memory" as clobbered? (_dif, _MMXLength) */
2850 /* (seems to work fine without...) */
/* NOTE(review): the statement above stores to _dif and _MMXLength and to
 * the row bytes, yet no "memory" clobber is visible here -- confirm
 * against the complete file */
2853 /* now do the math for the rest of the row */
/* 3 bytes per pixel (see the "end 3 bpp" marker at the bottom): each step
 * handles 8 bytes as three "active groups".  The mask starts on bytes 0-2
 * and is shifted left by _ShiftBpp (24 bits) for each following group, so
 * it covers bytes 3-5 and then bytes 6-7.
 * Per-byte averages are built without overflow as
 *     (a >> 1) + (b >> 1) + (a & b & 1)
 * using _HBClearMask (0x7f per byte) to drop the bit that the 64-bit shift
 * carries in from the neighbouring byte, and _LBCarryMask (0x01 per byte)
 * to extract the low bits.
 * NOTE(review): the loop label and the conditional jump that follows the
 * cmpl are not visible in this excerpt. */
2858 _ActiveMask
.use
= 0x0000000000ffffffLL
;
2859 _ShiftBpp
.use
= 24; /* == 3 * 8 */
2860 _ShiftRem
.use
= 40; /* == 64 - 24 */
2862 __asm__
__volatile__ (
2863 /* re-init address pointers and offset */
2864 "movq _ActiveMask, %%mm7 \n\t"
2865 "movl _dif, %%ecx \n\t" /* ecx: x = offset to */
2866 "movq _LBCarryMask, %%mm5 \n\t" /* alignment boundary */
2867 /* preload "movl row, %%edi \n\t" // edi: Avg(x) */
2868 "movq _HBClearMask, %%mm4 \n\t"
2869 /* preload "movl prev_row, %%esi \n\t" // esi: Prior(x) */
2871 /* prime the pump: load the first Raw(x-bpp) data set */
2872 "movq -8(%%edi,%%ecx,), %%mm2 \n\t" /* load previous aligned 8 bytes */
2873 /* (correct pos. in loop below) */
2875 "movq (%%edi,%%ecx,), %%mm0 \n\t" /* load mm0 with Avg(x) */
2876 "movq %%mm5, %%mm3 \n\t"
2877 "psrlq _ShiftRem, %%mm2 \n\t" /* correct position Raw(x-bpp) */
2879 "movq (%%esi,%%ecx,), %%mm1 \n\t" /* load mm1 with Prior(x) */
2880 "movq %%mm7, %%mm6 \n\t"
2881 "pand %%mm1, %%mm3 \n\t" /* get lsb for each prev_row byte */
2882 "psrlq $1, %%mm1 \n\t" /* divide prev_row bytes by 2 */
2883 "pand %%mm4, %%mm1 \n\t" /* clear invalid bit 7 of each */
2885 "paddb %%mm1, %%mm0 \n\t" /* add (Prev_row/2) to Avg for */
2887 /* add 1st active group (Raw(x-bpp)/2) to average with LBCarry */
2888 "movq %%mm3, %%mm1 \n\t" /* now use mm1 for getting */
2890 "pand %%mm2, %%mm1 \n\t" /* get LBCarrys for each byte */
2892 /* lsb's were == 1 (only valid for active group) */
2893 "psrlq $1, %%mm2 \n\t" /* divide raw bytes by 2 */
2894 "pand %%mm4, %%mm2 \n\t" /* clear invalid bit 7 of each */
2896 "paddb %%mm1, %%mm2 \n\t" /* add LBCarrys to (Raw(x-bpp)/2) */
2898 "pand %%mm6, %%mm2 \n\t" /* leave only Active Group 1 */
2899 /* bytes to add to Avg */
2900 "paddb %%mm2, %%mm0 \n\t" /* add (Raw/2) + LBCarrys to */
2901 /* Avg for each Active */
2903 /* add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry */
2904 "psllq _ShiftBpp, %%mm6 \n\t" /* shift the mm6 mask to cover */
2906 "movq %%mm0, %%mm2 \n\t" /* mov updated Raws to mm2 */
2907 "psllq _ShiftBpp, %%mm2 \n\t" /* shift data to pos. correctly */
2908 "movq %%mm3, %%mm1 \n\t" /* now use mm1 for getting */
2910 "pand %%mm2, %%mm1 \n\t" /* get LBCarrys for each byte */
2912 /* lsb's were == 1 (only valid for active group) */
2913 "psrlq $1, %%mm2 \n\t" /* divide raw bytes by 2 */
2914 "pand %%mm4, %%mm2 \n\t" /* clear invalid bit 7 of each */
2916 "paddb %%mm1, %%mm2 \n\t" /* add LBCarrys to (Raw(x-bpp)/2) */
2918 "pand %%mm6, %%mm2 \n\t" /* leave only Active Group 2 */
2919 /* bytes to add to Avg */
2920 "paddb %%mm2, %%mm0 \n\t" /* add (Raw/2) + LBCarrys to */
2921 /* Avg for each Active */
2924 /* add 3rd active group (Raw(x-bpp)/2) to average with _LBCarry */
2925 "psllq _ShiftBpp, %%mm6 \n\t" /* shift mm6 mask to cover last */
2928 "movq %%mm0, %%mm2 \n\t" /* mov updated Raws to mm2 */
2929 "psllq _ShiftBpp, %%mm2 \n\t" /* shift data to pos. correctly */
2930 /* Data only needs to be shifted once here to */
2931 /* get the correct x-bpp offset. */
2932 "movq %%mm3, %%mm1 \n\t" /* now use mm1 for getting */
2934 "pand %%mm2, %%mm1 \n\t" /* get LBCarrys for each byte */
2936 /* lsb's were == 1 (only valid for active group) */
2937 "psrlq $1, %%mm2 \n\t" /* divide raw bytes by 2 */
2938 "pand %%mm4, %%mm2 \n\t" /* clear invalid bit 7 of each */
2940 "paddb %%mm1, %%mm2 \n\t" /* add LBCarrys to (Raw(x-bpp)/2) */
2942 "pand %%mm6, %%mm2 \n\t" /* leave only Active Group 3 */
2943 /* bytes to add to Avg */
2944 "addl $8, %%ecx \n\t"
2945 "paddb %%mm2, %%mm0 \n\t" /* add (Raw/2) + LBCarrys to */
2946 /* Avg for each Active */
2948 /* now ready to write back to memory */
2949 "movq %%mm0, -8(%%edi,%%ecx,) \n\t" /* -8: ecx was already advanced */
2950 /* move updated Raw(x) to use as Raw(x-bpp) for next loop */
2951 "cmpl _MMXLength, %%ecx \n\t"
2952 "movq %%mm0, %%mm2 \n\t" /* mov updated Raw(x) to mm2 */
2955 : "=S" (dummy_value_S
), /* output regs (dummy) */
2956 "=D" (dummy_value_D
)
2958 : "0" (prev_row
), /* esi // input regs */
2961 : "%ecx" /* clobber list */
2962 #if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
2963 , "%mm0", "%mm1", "%mm2", "%mm3"
2964 , "%mm4", "%mm5", "%mm6", "%mm7"
2968 break; /* end 3 bpp */
/* 4 or 6 bytes per pixel (see the "end 4,6 bpp" marker at the bottom): each
 * step handles 8 bytes as two active groups.  The masks are derived from
 * bpp at run time: the all-ones _ActiveMask shifted right by _ShiftRem
 * leaves the low bpp bytes set (mm7, group 1); that mask shifted left by
 * _ShiftBpp selects the remaining bytes (mm6, group 2).
 * NOTE(review): the case labels, the loop label and the conditional jump
 * that follows the cmpl are not visible in this excerpt. */
2972 //case 7: /* who wrote this? PNG doesn't support 5 or 7 bytes/pixel */
2973 //case 5: /* GRR BOGUS */
2975 _ActiveMask
.use
= 0xffffffffffffffffLL
; /* use shift below to clear */
2976 /* appropriate inactive bytes */
2977 _ShiftBpp
.use
= bpp
<< 3;
2978 _ShiftRem
.use
= 64 - _ShiftBpp
.use
;
2980 __asm__
__volatile__ (
2981 "movq _HBClearMask, %%mm4 \n\t"
2983 /* re-init address pointers and offset */
2984 "movl _dif, %%ecx \n\t" /* ecx: x = offset to */
2985 /* alignment boundary */
2987 /* load _ActiveMask and clear all bytes except for 1st active group */
2988 "movq _ActiveMask, %%mm7 \n\t"
2989 /* preload "movl row, %%edi \n\t" // edi: Avg(x) */
2990 "psrlq _ShiftRem, %%mm7 \n\t"
2991 /* preload "movl prev_row, %%esi \n\t" // esi: Prior(x) */
2992 "movq %%mm7, %%mm6 \n\t"
2993 "movq _LBCarryMask, %%mm5 \n\t"
2994 "psllq _ShiftBpp, %%mm6 \n\t" /* create mask for 2nd active */
2997 /* prime the pump: load the first Raw(x-bpp) data set */
2998 "movq -8(%%edi,%%ecx,), %%mm2 \n\t" /* load previous aligned 8 bytes */
2999 /* (we correct pos. in loop below) */
3001 "movq (%%edi,%%ecx,), %%mm0 \n\t"
3002 "psrlq _ShiftRem, %%mm2 \n\t" /* shift data to pos. correctly */
3003 "movq (%%esi,%%ecx,), %%mm1 \n\t"
3004 /* add (Prev_row/2) to average */
3005 "movq %%mm5, %%mm3 \n\t"
3006 "pand %%mm1, %%mm3 \n\t" /* get lsb for each prev_row byte */
3007 "psrlq $1, %%mm1 \n\t" /* divide prev_row bytes by 2 */
3008 "pand %%mm4, %%mm1 \n\t" /* clear invalid bit 7 of each */
3010 "paddb %%mm1, %%mm0 \n\t" /* add (Prev_row/2) to Avg for */
3012 /* add 1st active group (Raw(x-bpp)/2) to average with _LBCarry */
3013 "movq %%mm3, %%mm1 \n\t" /* now use mm1 for getting */
3015 "pand %%mm2, %%mm1 \n\t" /* get LBCarrys for each byte */
3017 /* lsb's were == 1 (only valid for active group) */
3018 "psrlq $1, %%mm2 \n\t" /* divide raw bytes by 2 */
3019 "pand %%mm4, %%mm2 \n\t" /* clear invalid bit 7 of each */
3021 "paddb %%mm1, %%mm2 \n\t" /* add LBCarrys to (Raw(x-bpp)/2) */
3023 "pand %%mm7, %%mm2 \n\t" /* leave only Active Group 1 */
3024 /* bytes to add to Avg */
3025 "paddb %%mm2, %%mm0 \n\t" /* add (Raw/2) + LBCarrys to Avg */
3026 /* for each Active */
3028 /* add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry */
3029 "movq %%mm0, %%mm2 \n\t" /* mov updated Raws to mm2 */
3030 "psllq _ShiftBpp, %%mm2 \n\t" /* shift data to pos. correctly */
3031 "addl $8, %%ecx \n\t"
3032 "movq %%mm3, %%mm1 \n\t" /* now use mm1 for getting */
3034 "pand %%mm2, %%mm1 \n\t" /* get LBCarrys for each byte */
3036 /* lsb's were == 1 (only valid for active group) */
3037 "psrlq $1, %%mm2 \n\t" /* divide raw bytes by 2 */
3038 "pand %%mm4, %%mm2 \n\t" /* clear invalid bit 7 of each */
3040 "paddb %%mm1, %%mm2 \n\t" /* add LBCarrys to (Raw(x-bpp)/2) */
3042 "pand %%mm6, %%mm2 \n\t" /* leave only Active Group 2 */
3043 /* bytes to add to Avg */
3044 "paddb %%mm2, %%mm0 \n\t" /* add (Raw/2) + LBCarrys to */
3045 /* Avg for each Active */
3047 "cmpl _MMXLength, %%ecx \n\t"
3048 /* now ready to write back to memory */
3049 "movq %%mm0, -8(%%edi,%%ecx,) \n\t" /* -8: ecx was already advanced */
3050 /* prep Raw(x-bpp) for next loop */
3051 "movq %%mm0, %%mm2 \n\t" /* mov updated Raws to mm2 */
3054 : "=S" (dummy_value_S
), /* output regs (dummy) */
3055 "=D" (dummy_value_D
)
3057 : "0" (prev_row
), /* esi // input regs */
3060 : "%ecx" /* clobber list */
3061 #if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
3062 , "%mm0", "%mm1", "%mm2", "%mm3"
3063 , "%mm4", "%mm5", "%mm6", "%mm7"
3067 break; /* end 4,6 bpp */
/* 2 bytes per pixel (see the "end 2 bpp" marker at the bottom): each step
 * handles 8 bytes as four active groups of 2 bytes.  The mask in mm6
 * starts on bytes 0-1 and is shifted left by _ShiftBpp (16 bits) before
 * each of groups 2, 3 and 4; every group uses the bytes just reconstructed
 * by the previous group as its Raw(x-bpp).
 * NOTE(review): the loop label and the conditional jump that follows the
 * cmpl are not visible in this excerpt. */
3071 _ActiveMask
.use
= 0x000000000000ffffLL
;
3072 _ShiftBpp
.use
= 16; /* == 2 * 8 */
3073 _ShiftRem
.use
= 48; /* == 64 - 16 */
3075 __asm__
__volatile__ (
3076 /* load _ActiveMask */
3077 "movq _ActiveMask, %%mm7 \n\t"
3078 /* re-init address pointers and offset */
3079 "movl _dif, %%ecx \n\t" /* ecx: x = offset to alignment */
3081 "movq _LBCarryMask, %%mm5 \n\t"
3082 /* preload "movl row, %%edi \n\t" // edi: Avg(x) */
3083 "movq _HBClearMask, %%mm4 \n\t"
3084 /* preload "movl prev_row, %%esi \n\t" // esi: Prior(x) */
3086 /* prime the pump: load the first Raw(x-bpp) data set */
3087 "movq -8(%%edi,%%ecx,), %%mm2 \n\t" /* load previous aligned 8 bytes */
3088 /* (we correct pos. in loop below) */
3090 "movq (%%edi,%%ecx,), %%mm0 \n\t"
3091 "psrlq _ShiftRem, %%mm2 \n\t" /* shift data to pos. correctly */
3092 "movq (%%esi,%%ecx,), %%mm1 \n\t" /* (GRR BUGFIX: was psllq) */
3093 /* add (Prev_row/2) to average */
3094 "movq %%mm5, %%mm3 \n\t"
3095 "pand %%mm1, %%mm3 \n\t" /* get lsb for each prev_row byte */
3096 "psrlq $1, %%mm1 \n\t" /* divide prev_row bytes by 2 */
3097 "pand %%mm4, %%mm1 \n\t" /* clear invalid bit 7 of each */
3099 "movq %%mm7, %%mm6 \n\t"
3100 "paddb %%mm1, %%mm0 \n\t" /* add (Prev_row/2) to Avg for */
3103 /* add 1st active group (Raw(x-bpp)/2) to average with _LBCarry */
3104 "movq %%mm3, %%mm1 \n\t" /* now use mm1 for getting */
3106 "pand %%mm2, %%mm1 \n\t" /* get LBCarrys for each byte */
3108 /* lsb's were == 1 (only valid */
3109 /* for active group) */
3110 "psrlq $1, %%mm2 \n\t" /* divide raw bytes by 2 */
3111 "pand %%mm4, %%mm2 \n\t" /* clear invalid bit 7 of each */
3113 "paddb %%mm1, %%mm2 \n\t" /* add LBCarrys to (Raw(x-bpp)/2) */
3115 "pand %%mm6, %%mm2 \n\t" /* leave only Active Group 1 */
3116 /* bytes to add to Avg */
3117 "paddb %%mm2, %%mm0 \n\t" /* add (Raw/2) + LBCarrys to Avg */
3118 /* for each Active byte */
3120 /* add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry */
3121 "psllq _ShiftBpp, %%mm6 \n\t" /* shift the mm6 mask to cover */
3123 "movq %%mm0, %%mm2 \n\t" /* mov updated Raws to mm2 */
3124 "psllq _ShiftBpp, %%mm2 \n\t" /* shift data to pos. correctly */
3125 "movq %%mm3, %%mm1 \n\t" /* now use mm1 for getting */
3127 "pand %%mm2, %%mm1 \n\t" /* get LBCarrys for each byte */
3129 /* lsb's were == 1 (only valid */
3130 /* for active group) */
3131 "psrlq $1, %%mm2 \n\t" /* divide raw bytes by 2 */
3132 "pand %%mm4, %%mm2 \n\t" /* clear invalid bit 7 of each */
3134 "paddb %%mm1, %%mm2 \n\t" /* add LBCarrys to (Raw(x-bpp)/2) */
3136 "pand %%mm6, %%mm2 \n\t" /* leave only Active Group 2 */
3137 /* bytes to add to Avg */
3138 "paddb %%mm2, %%mm0 \n\t" /* add (Raw/2) + LBCarrys to */
3139 /* Avg for each Active byte */
3141 /* add 3rd active group (Raw(x-bpp)/2) to average with _LBCarry */
3142 "psllq _ShiftBpp, %%mm6 \n\t" /* shift the mm6 mask to cover */
3144 "movq %%mm0, %%mm2 \n\t" /* mov updated Raws to mm2 */
3145 "psllq _ShiftBpp, %%mm2 \n\t" /* shift data to pos. correctly */
3146 "movq %%mm3, %%mm1 \n\t" /* now use mm1 for getting */
3148 "pand %%mm2, %%mm1 \n\t" /* get LBCarrys for each byte */
3149 /* where both lsb's were == 1 */
3150 /* (only valid for active group) */
3151 "psrlq $1, %%mm2 \n\t" /* divide raw bytes by 2 */
3152 "pand %%mm4, %%mm2 \n\t" /* clear invalid bit 7 of each */
3154 "paddb %%mm1, %%mm2 \n\t" /* add LBCarrys to (Raw(x-bpp)/2) */
3156 "pand %%mm6, %%mm2 \n\t" /* leave only Active Group 3 */
3157 /* bytes to add to Avg */
3158 "paddb %%mm2, %%mm0 \n\t" /* add (Raw/2) + LBCarrys to */
3159 /* Avg for each Active byte */
3161 /* add 4th active group (Raw(x-bpp)/2) to average with _LBCarry */
3162 "psllq _ShiftBpp, %%mm6 \n\t" /* shift the mm6 mask to cover */
3164 "movq %%mm0, %%mm2 \n\t" /* mov updated Raws to mm2 */
3165 "psllq _ShiftBpp, %%mm2 \n\t" /* shift data to pos. correctly */
3166 "addl $8, %%ecx \n\t"
3167 "movq %%mm3, %%mm1 \n\t" /* now use mm1 for getting */
3169 "pand %%mm2, %%mm1 \n\t" /* get LBCarrys for each byte */
3171 /* lsb's were == 1 (only valid */
3172 /* for active group) */
3173 "psrlq $1, %%mm2 \n\t" /* divide raw bytes by 2 */
3174 "pand %%mm4, %%mm2 \n\t" /* clear invalid bit 7 of each */
3176 "paddb %%mm1, %%mm2 \n\t" /* add LBCarrys to (Raw(x-bpp)/2) */
3178 "pand %%mm6, %%mm2 \n\t" /* leave only Active Group 4 */
3179 /* bytes to add to Avg */
3180 "paddb %%mm2, %%mm0 \n\t" /* add (Raw/2) + LBCarrys to */
3181 /* Avg for each Active byte */
3183 "cmpl _MMXLength, %%ecx \n\t"
3184 /* now ready to write back to memory */
3185 "movq %%mm0, -8(%%edi,%%ecx,) \n\t" /* -8: ecx was already advanced */
3186 /* prep Raw(x-bpp) for next loop */
3187 "movq %%mm0, %%mm2 \n\t" /* mov updated Raws to mm2 */
3190 : "=S" (dummy_value_S
), /* output regs (dummy) */
3191 "=D" (dummy_value_D
)
3193 : "0" (prev_row
), /* esi // input regs */
3196 : "%ecx" /* clobber list */
3197 #if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
3198 , "%mm0", "%mm1", "%mm2", "%mm3"
3199 , "%mm4", "%mm5", "%mm6", "%mm7"
3203 break; /* end 2 bpp */
/* 1 byte per pixel (see the "end 1 bpp" marker at the bottom): the visible
 * instructions use only general-purpose registers.  The byte loop starts at
 * offset _dif and compares against _FullLength, and the case ends with a
 * return rather than a break.
 * NOTE(review): the loop label, the ebx increment and the conditional
 * jumps are not visible in this excerpt. */
3207 __asm__
__volatile__ (
3208 /* re-init address pointers and offset */
3210 "pushl %%ebx \n\t" /* save Global Offset Table index */
3212 "movl _dif, %%ebx \n\t" /* ebx: x = offset to alignment */
3214 /* preload "movl row, %%edi \n\t" // edi: Avg(x) */
3215 "cmpl _FullLength, %%ebx \n\t" /* test if offset at end of array */
3217 /* do Avg decode for remaining bytes */
3218 /* preload "movl prev_row, %%esi \n\t" // esi: Prior(x) */
3219 "movl %%edi, %%edx \n\t"
3220 /* preload "subl bpp, %%edx \n\t" // (bpp is preloaded into ecx) */
3221 "subl %%ecx, %%edx \n\t" /* edx: Raw(x-bpp) */
3222 "xorl %%ecx, %%ecx \n\t" /* zero ecx before using cl & cx */
3225 /* Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2) */
3226 "xorl %%eax, %%eax \n\t"
3227 "movb (%%esi,%%ebx,), %%cl \n\t" /* load cl with Prior(x) */
3228 "movb (%%edx,%%ebx,), %%al \n\t" /* load al with Raw(x-bpp) */
/* 16-bit add keeps the 9-bit sum of the two bytes intact until the shift */
3229 "addw %%cx, %%ax \n\t"
3231 "shrw %%ax \n\t" /* divide by 2 */
3232 "addb -1(%%edi,%%ebx,), %%al \n\t" /* add Avg(x); -1 to offset */
3234 "cmpl _FullLength, %%ebx \n\t" /* check if at end of array */
3235 "movb %%al, -1(%%edi,%%ebx,) \n\t" /* write back Raw(x); */
3236 /* mov does not affect flags; -1 to offset inc ebx */
3241 "popl %%ebx \n\t" /* restore Global Offset Table index */
3244 : "=c" (dummy_value_c
), /* output regs (dummy) */
3245 "=S" (dummy_value_S
),
3246 "=D" (dummy_value_D
)
3248 : "0" (bpp
), /* ecx // input regs */
3249 "1" (prev_row
), /* esi */
3252 : "%eax", "%edx" /* clobber list */
3258 return; /* end 1 bpp */
/* 8 bytes per pixel (see the "end 8 bpp" marker at the bottom): Raw(x-bpp)
 * for a whole 8-byte group is the previous 8-byte group, so no shifting or
 * active-group masking is used; the result of one step is kept in mm2 for
 * the next.  Each byte is computed as
 *     Avg + (Prior >> 1) + (Raw >> 1) + (Prior & Raw & 1).
 * NOTE(review): the loop label and the conditional jump that follows the
 * cmpl are not visible in this excerpt. */
3262 __asm__
__volatile__ (
3263 /* re-init address pointers and offset */
3264 "movl _dif, %%ecx \n\t" /* ecx: x == offset to alignment */
3265 "movq _LBCarryMask, %%mm5 \n\t" /* boundary */
3266 /* preload "movl row, %%edi \n\t" // edi: Avg(x) */
3267 "movq _HBClearMask, %%mm4 \n\t"
3268 /* preload "movl prev_row, %%esi \n\t" // esi: Prior(x) */
3270 /* prime the pump: load the first Raw(x-bpp) data set */
3271 "movq -8(%%edi,%%ecx,), %%mm2 \n\t" /* load previous aligned 8 bytes */
3272 /* (NO NEED to correct pos. in loop below) */
3275 "movq (%%edi,%%ecx,), %%mm0 \n\t"
3276 "movq %%mm5, %%mm3 \n\t"
3277 "movq (%%esi,%%ecx,), %%mm1 \n\t"
3278 "addl $8, %%ecx \n\t"
3279 "pand %%mm1, %%mm3 \n\t" /* get lsb for each prev_row byte */
3280 "psrlq $1, %%mm1 \n\t" /* divide prev_row bytes by 2 */
3281 "pand %%mm2, %%mm3 \n\t" /* get LBCarrys for each byte */
3282 /* where both lsb's were == 1 */
3283 "psrlq $1, %%mm2 \n\t" /* divide raw bytes by 2 */
3284 "pand %%mm4, %%mm1 \n\t" /* clear invalid bit 7, each byte */
3285 "paddb %%mm3, %%mm0 \n\t" /* add LBCarrys to Avg, each byte */
3286 "pand %%mm4, %%mm2 \n\t" /* clear invalid bit 7, each byte */
3287 "paddb %%mm1, %%mm0 \n\t" /* add (Prev_row/2) to Avg, each */
3288 "paddb %%mm2, %%mm0 \n\t" /* add (Raw/2) to Avg for each */
3289 "cmpl _MMXLength, %%ecx \n\t"
3290 "movq %%mm0, -8(%%edi,%%ecx,) \n\t" /* -8: ecx was already advanced */
3291 "movq %%mm0, %%mm2 \n\t" /* reuse as Raw(x-bpp) */
3294 : "=S" (dummy_value_S
), /* output regs (dummy) */
3295 "=D" (dummy_value_D
)
3297 : "0" (prev_row
), /* esi // input regs */
3300 : "%ecx" /* clobber list */
3301 #if 0 /* %mm0, ..., %mm5 not supported by gcc 2.7.2.3 or egcs 1.1 */
3302 , "%mm0", "%mm1", "%mm2"
3303 , "%mm3", "%mm4", "%mm5"
3307 break; /* end 8 bpp */
/* Fallback for any other bpp.  The closing "#endif - 0 - NEVER REACHED"
 * below indicates that at least the asm statement is compiled out; it still
 * carries FIXASM/CHECKASM placeholders and names row, prev_row and bpp
 * directly in the asm text instead of passing them as operands.  Unlike the
 * cases above it reloads Raw(x-bpp) from memory through edx on every step.
 * NOTE(review): the matching #if 0 and the call that prints the error
 * string are not visible in this excerpt -- confirm against the complete
 * file. */
3309 default: /* bpp greater than 8 (!= 1,2,3,4,[5],6,[7],8) */
3313 /* GRR: PRINT ERROR HERE: SHOULD NEVER BE REACHED */
3315 "Internal logic error in pnggccrd (png_read_filter_row_mmx_avg())\n");
3319 __asm__
__volatile__ (
3320 "movq _LBCarryMask, %%mm5 \n\t"
3321 /* re-init address pointers and offset */
3322 "movl _dif, %%ebx \n\t" /* ebx: x = offset to */
3323 /* alignment boundary */
3324 "movl row, %%edi \n\t" /* edi: Avg(x) */
3325 "movq _HBClearMask, %%mm4 \n\t"
3326 "movl %%edi, %%edx \n\t"
3327 "movl prev_row, %%esi \n\t" /* esi: Prior(x) */
3328 "subl bpp, %%edx \n\t" /* edx: Raw(x-bpp) */
3330 "movq (%%edi,%%ebx,), %%mm0 \n\t"
3331 "movq %%mm5, %%mm3 \n\t"
3332 "movq (%%esi,%%ebx,), %%mm1 \n\t"
3333 "pand %%mm1, %%mm3 \n\t" /* get lsb for each prev_row byte */
3334 "movq (%%edx,%%ebx,), %%mm2 \n\t"
3335 "psrlq $1, %%mm1 \n\t" /* divide prev_row bytes by 2 */
3336 "pand %%mm2, %%mm3 \n\t" /* get LBCarrys for each byte */
3337 /* where both lsb's were == 1 */
3338 "psrlq $1, %%mm2 \n\t" /* divide raw bytes by 2 */
3339 "pand %%mm4, %%mm1 \n\t" /* clear invalid bit 7 of each */
3341 "paddb %%mm3, %%mm0 \n\t" /* add LBCarrys to Avg for each */
3343 "pand %%mm4, %%mm2 \n\t" /* clear invalid bit 7 of each */
3345 "paddb %%mm1, %%mm0 \n\t" /* add (Prev_row/2) to Avg for */
3347 "addl $8, %%ebx \n\t"
3348 "paddb %%mm2, %%mm0 \n\t" /* add (Raw/2) to Avg for each */
3350 "cmpl _MMXLength, %%ebx \n\t"
3351 "movq %%mm0, -8(%%edi,%%ebx,) \n\t"
3354 : /* FIXASM: output regs/vars go here, e.g.: "=m" (memory_var) */
3356 : /* FIXASM: input regs, e.g.: "c" (count), "S" (src), "D" (dest) */
3358 : "%ebx", "%edx", "%edi", "%esi" /* CHECKASM: clobber list */
3360 #endif /* 0 - NEVER REACHED */
3364 } /* end switch (bpp) */
/* Final stage: byte-at-a-time Avg decode for the offsets from _MMXLength
 * up to _FullLength (the tail that the 8-byte MMX steps left over), then
 * EMMS to leave MMX state before returning.
 * NOTE(review): the avg_lp2 label, the ebx increment and the early-exit
 * jump after the first cmpl are not visible in this excerpt. */
3366 __asm__
__volatile__ (
3367 /* MMX acceleration complete; now do clean-up */
3368 /* check if any remaining bytes left to decode */
3370 "pushl %%ebx \n\t" /* save index to Global Offset Table */
3372 "movl _MMXLength, %%ebx \n\t" /* ebx: x == offset bytes after MMX */
3373 /* pre "movl row, %%edi \n\t" */ /* edi: Avg(x) */
3374 "cmpl _FullLength, %%ebx \n\t" /* test if offset at end of array */
3377 /* do Avg decode for remaining bytes */
3378 /*pre "movl prev_row, %%esi \n\t" */ /* esi: Prior(x) */
3379 "movl %%edi, %%edx \n\t"
3380 /*pre "subl bpp, %%edx \n\t" */ /* (bpp is preloaded into ecx) */
3381 "subl %%ecx, %%edx \n\t" /* edx: Raw(x-bpp) */
3382 "xorl %%ecx, %%ecx \n\t" /* zero ecx before using cl & cx below */
3385 /* Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2) */
3386 "xorl %%eax, %%eax \n\t"
3387 "movb (%%esi,%%ebx,), %%cl \n\t" /* load cl with Prior(x) */
3388 "movb (%%edx,%%ebx,), %%al \n\t" /* load al with Raw(x-bpp) */
/* 16-bit add keeps the 9-bit sum of the two bytes intact until the shift */
3389 "addw %%cx, %%ax \n\t"
3391 "shrw %%ax \n\t" /* divide by 2 */
3392 "addb -1(%%edi,%%ebx,), %%al \n\t" /* add Avg(x); -1 to offset inc ebx */
3393 "cmpl _FullLength, %%ebx \n\t" /* check if at end of array */
3394 "movb %%al, -1(%%edi,%%ebx,) \n\t" /* write back Raw(x) [mov does not */
3395 "jb avg_lp2 \n\t" /* affect flags; -1 to offset inc ebx] */
3398 "EMMS \n\t" /* end MMX; prep for poss. FP instrs. */
3400 "popl %%ebx \n\t" /* restore index to Global Offset Table */
3403 : "=c" (dummy_value_c
), /* output regs (dummy) */
3404 "=S" (dummy_value_S
),
3405 "=D" (dummy_value_D
)
3407 : "0" (bpp
), /* ecx // input regs */
3408 "1" (prev_row
), /* esi */
3411 : "%eax", "%edx" /* clobber list */
3417 } /* end png_read_filter_row_mmx_avg() */
3422 #ifdef PNG_THREAD_UNSAFE_OK
3423 /*===========================================================================*/
3425 /* P N G _ R E A D _ F I L T E R _ R O W _ M M X _ P A E T H */
3427 /*===========================================================================*/
3429 /* Optimized code for PNG Paeth filter decoder */
3431 static void /* PRIVATE */
3432 png_read_filter_row_mmx_paeth(png_row_infop row_info
, png_bytep row
,
3436 int dummy_value_c
; /* fix 'forbidden register 2 (cx) was spilled' error */
3440 bpp
= (row_info
->pixel_depth
+ 7) >> 3; /* Get # bytes per pixel */
3441 _FullLength
= row_info
->rowbytes
; /* # of bytes to filter */
3443 __asm__
__volatile__ (
3445 "pushl %%ebx \n\t" /* save index to Global Offset Table */
3447 "xorl %%ebx, %%ebx \n\t" /* ebx: x offset */
3448 /*pre "movl row, %%edi \n\t" */
3449 "xorl %%edx, %%edx \n\t" /* edx: x-bpp offset */
3450 /*pre "movl prev_row, %%esi \n\t" */
3451 "xorl %%eax, %%eax \n\t"
3453 /* Compute the Raw value for the first bpp bytes */
3454 /* Note: the formula works out to be always */
3455 /* Paeth(x) = Raw(x) + Prior(x) where x < bpp */
3457 "movb (%%edi,%%ebx,), %%al \n\t"
3458 "addb (%%esi,%%ebx,), %%al \n\t"
3460 /*pre "cmpl bpp, %%ebx \n\t" (bpp is preloaded into ecx) */
3461 "cmpl %%ecx, %%ebx \n\t"
3462 "movb %%al, -1(%%edi,%%ebx,) \n\t"
3464 /* get # of bytes to alignment */
3465 "movl %%edi, _dif \n\t" /* take start of row */
3466 "addl %%ebx, _dif \n\t" /* add bpp */
3467 "xorl %%ecx, %%ecx \n\t"
3468 "addl $0xf, _dif \n\t" /* add 7 + 8 to incr past alignment */
3470 "andl $0xfffffff8, _dif \n\t" /* mask to alignment boundary */
3471 "subl %%edi, _dif \n\t" /* subtract from start ==> value ebx */
3477 "xorl %%eax, %%eax \n\t"
3478 /* pav = p - a = (a + b - c) - a = b - c */
3479 "movb (%%esi,%%ebx,), %%al \n\t" /* load Prior(x) into al */
3480 "movb (%%esi,%%edx,), %%cl \n\t" /* load Prior(x-bpp) into cl */
3481 "subl %%ecx, %%eax \n\t" /* subtract Prior(x-bpp) */
3482 "movl %%eax, _patemp \n\t" /* Save pav for later use */
3483 "xorl %%eax, %%eax \n\t"
3484 /* pbv = p - b = (a + b - c) - b = a - c */
3485 "movb (%%edi,%%edx,), %%al \n\t" /* load Raw(x-bpp) into al */
3486 "subl %%ecx, %%eax \n\t" /* subtract Prior(x-bpp) */
3487 "movl %%eax, %%ecx \n\t"
3488 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
3489 "addl _patemp, %%eax \n\t" /* pcv = pav + pbv */
3491 "testl $0x80000000, %%eax \n\t"
3493 "negl %%eax \n\t" /* reverse sign of neg values */
3496 "movl %%eax, _pctemp \n\t" /* save pc for later use */
3498 "testl $0x80000000, %%ecx \n\t"
3500 "negl %%ecx \n\t" /* reverse sign of neg values */
3503 "movl %%ecx, _pbtemp \n\t" /* save pb for later use */
3505 "movl _patemp, %%eax \n\t"
3506 "testl $0x80000000, %%eax \n\t"
3508 "negl %%eax \n\t" /* reverse sign of neg values */
3511 "movl %%eax, _patemp \n\t" /* save pa for later use */
3512 /* test if pa <= pb */
3513 "cmpl %%ecx, %%eax \n\t"
3514 "jna paeth_abb \n\t"
3515 /* pa > pb; now test if pb <= pc */
3516 "cmpl _pctemp, %%ecx \n\t"
3517 "jna paeth_bbc \n\t"
3518 /* pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp) */
3519 "movb (%%esi,%%edx,), %%cl \n\t" /* load Prior(x-bpp) into cl */
3520 "jmp paeth_paeth \n\t"
3523 /* pb <= pc; Raw(x) = Paeth(x) + Prior(x) */
3524 "movb (%%esi,%%ebx,), %%cl \n\t" /* load Prior(x) into cl */
3525 "jmp paeth_paeth \n\t"
3528 /* pa <= pb; now test if pa <= pc */
3529 "cmpl _pctemp, %%eax \n\t"
3530 "jna paeth_abc \n\t"
3531 /* pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp) */
3532 "movb (%%esi,%%edx,), %%cl \n\t" /* load Prior(x-bpp) into cl */
3533 "jmp paeth_paeth \n\t"
3536 /* pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp) */
3537 "movb (%%edi,%%edx,), %%cl \n\t" /* load Raw(x-bpp) into cl */
3542 /* Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256 */
3543 "addb %%cl, -1(%%edi,%%ebx,) \n\t"
3544 "cmpl _dif, %%ebx \n\t"
3548 "movl _FullLength, %%ecx \n\t"
3549 "movl %%ecx, %%eax \n\t"
3550 "subl %%ebx, %%eax \n\t" /* subtract alignment fix */
3551 "andl $0x00000007, %%eax \n\t" /* calc bytes over mult of 8 */
3552 "subl %%eax, %%ecx \n\t" /* drop over bytes from original length */
3553 "movl %%ecx, _MMXLength \n\t"
3555 "popl %%ebx \n\t" /* restore index to Global Offset Table */
3558 : "=c" (dummy_value_c
), /* output regs (dummy) */
3559 "=S" (dummy_value_S
),
3560 "=D" (dummy_value_D
)
3562 : "0" (bpp
), /* ecx // input regs */
3563 "1" (prev_row
), /* esi */
3566 : "%eax", "%edx" /* clobber list */
3572 /* now do the math for the rest of the row */
3577 _ActiveMask
.use
= 0x0000000000ffffffLL
;
3578 _ActiveMaskEnd
.use
= 0xffff000000000000LL
;
3579 _ShiftBpp
.use
= 24; /* == bpp(3) * 8 */
3580 _ShiftRem
.use
= 40; /* == 64 - 24 */
3582 __asm__
__volatile__ (
3583 "movl _dif, %%ecx \n\t"
3584 /* preload "movl row, %%edi \n\t" */
3585 /* preload "movl prev_row, %%esi \n\t" */
3586 "pxor %%mm0, %%mm0 \n\t"
3587 /* prime the pump: load the first Raw(x-bpp) data set */
3588 "movq -8(%%edi,%%ecx,), %%mm1 \n\t"
3590 "psrlq _ShiftRem, %%mm1 \n\t" /* shift last 3 bytes to 1st */
3592 "movq (%%esi,%%ecx,), %%mm2 \n\t" /* load b=Prior(x) */
3593 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack High bytes of a */
3594 "movq -8(%%esi,%%ecx,), %%mm3 \n\t" /* prep c=Prior(x-bpp) bytes */
3595 "punpcklbw %%mm0, %%mm2 \n\t" /* unpack High bytes of b */
3596 "psrlq _ShiftRem, %%mm3 \n\t" /* shift last 3 bytes to 1st */
3598 /* pav = p - a = (a + b - c) - a = b - c */
3599 "movq %%mm2, %%mm4 \n\t"
3600 "punpcklbw %%mm0, %%mm3 \n\t" /* unpack High bytes of c */
3601 /* pbv = p - b = (a + b - c) - b = a - c */
3602 "movq %%mm1, %%mm5 \n\t"
3603 "psubw %%mm3, %%mm4 \n\t"
3604 "pxor %%mm7, %%mm7 \n\t"
3605 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
3606 "movq %%mm4, %%mm6 \n\t"
3607 "psubw %%mm3, %%mm5 \n\t"
3609 /* pa = abs(p-a) = abs(pav) */
3610 /* pb = abs(p-b) = abs(pbv) */
3611 /* pc = abs(p-c) = abs(pcv) */
3612 "pcmpgtw %%mm4, %%mm0 \n\t" /* create mask pav bytes < 0 */
3613 "paddw %%mm5, %%mm6 \n\t"
3614 "pand %%mm4, %%mm0 \n\t" /* only pav bytes < 0 in mm7 */
3615 "pcmpgtw %%mm5, %%mm7 \n\t" /* create mask pbv bytes < 0 */
3616 "psubw %%mm0, %%mm4 \n\t"
3617 "pand %%mm5, %%mm7 \n\t" /* only pbv bytes < 0 in mm0 */
3618 "psubw %%mm0, %%mm4 \n\t"
3619 "psubw %%mm7, %%mm5 \n\t"
3620 "pxor %%mm0, %%mm0 \n\t"
3621 "pcmpgtw %%mm6, %%mm0 \n\t" /* create mask pcv bytes < 0 */
3622 "pand %%mm6, %%mm0 \n\t" /* only pav bytes < 0 in mm7 */
3623 "psubw %%mm7, %%mm5 \n\t"
3624 "psubw %%mm0, %%mm6 \n\t"
3626 "movq %%mm4, %%mm7 \n\t"
3627 "psubw %%mm0, %%mm6 \n\t"
3628 "pcmpgtw %%mm5, %%mm7 \n\t" /* pa > pb? */
3629 "movq %%mm7, %%mm0 \n\t"
3630 /* use mm7 mask to merge pa & pb */
3631 "pand %%mm7, %%mm5 \n\t"
3632 /* use mm0 mask copy to merge a & b */
3633 "pand %%mm0, %%mm2 \n\t"
3634 "pandn %%mm4, %%mm7 \n\t"
3635 "pandn %%mm1, %%mm0 \n\t"
3636 "paddw %%mm5, %%mm7 \n\t"
3637 "paddw %%mm2, %%mm0 \n\t"
3638 /* test ((pa <= pb)? pa:pb) <= pc */
3639 "pcmpgtw %%mm6, %%mm7 \n\t" /* pab > pc? */
3640 "pxor %%mm1, %%mm1 \n\t"
3641 "pand %%mm7, %%mm3 \n\t"
3642 "pandn %%mm0, %%mm7 \n\t"
3643 "paddw %%mm3, %%mm7 \n\t"
3644 "pxor %%mm0, %%mm0 \n\t"
3645 "packuswb %%mm1, %%mm7 \n\t"
3646 "movq (%%esi,%%ecx,), %%mm3 \n\t" /* load c=Prior(x-bpp) */
3647 "pand _ActiveMask, %%mm7 \n\t"
3648 "movq %%mm3, %%mm2 \n\t" /* load b=Prior(x) step 1 */
3649 "paddb (%%edi,%%ecx,), %%mm7 \n\t" /* add Paeth predictor with Raw(x) */
3650 "punpcklbw %%mm0, %%mm3 \n\t" /* unpack High bytes of c */
3651 "movq %%mm7, (%%edi,%%ecx,) \n\t" /* write back updated value */
3652 "movq %%mm7, %%mm1 \n\t" /* now mm1 will be used as */
3654 /* now do Paeth for 2nd set of bytes (3-5) */
3655 "psrlq _ShiftBpp, %%mm2 \n\t" /* load b=Prior(x) step 2 */
3656 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack High bytes of a */
3657 "pxor %%mm7, %%mm7 \n\t"
3658 "punpcklbw %%mm0, %%mm2 \n\t" /* unpack High bytes of b */
3659 /* pbv = p - b = (a + b - c) - b = a - c */
3660 "movq %%mm1, %%mm5 \n\t"
3661 /* pav = p - a = (a + b - c) - a = b - c */
3662 "movq %%mm2, %%mm4 \n\t"
3663 "psubw %%mm3, %%mm5 \n\t"
3664 "psubw %%mm3, %%mm4 \n\t"
3665 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = */
3666 /* pav + pbv = pbv + pav */
3667 "movq %%mm5, %%mm6 \n\t"
3668 "paddw %%mm4, %%mm6 \n\t"
3670 /* pa = abs(p-a) = abs(pav) */
3671 /* pb = abs(p-b) = abs(pbv) */
3672 /* pc = abs(p-c) = abs(pcv) */
3673 "pcmpgtw %%mm5, %%mm0 \n\t" /* create mask pbv bytes < 0 */
3674 "pcmpgtw %%mm4, %%mm7 \n\t" /* create mask pav bytes < 0 */
3675 "pand %%mm5, %%mm0 \n\t" /* only pbv bytes < 0 in mm0 */
3676 "pand %%mm4, %%mm7 \n\t" /* only pav bytes < 0 in mm7 */
3677 "psubw %%mm0, %%mm5 \n\t"
3678 "psubw %%mm7, %%mm4 \n\t"
3679 "psubw %%mm0, %%mm5 \n\t"
3680 "psubw %%mm7, %%mm4 \n\t"
3681 "pxor %%mm0, %%mm0 \n\t"
3682 "pcmpgtw %%mm6, %%mm0 \n\t" /* create mask pcv bytes < 0 */
3683 "pand %%mm6, %%mm0 \n\t" /* only pav bytes < 0 in mm7 */
3684 "psubw %%mm0, %%mm6 \n\t"
3686 "movq %%mm4, %%mm7 \n\t"
3687 "psubw %%mm0, %%mm6 \n\t"
3688 "pcmpgtw %%mm5, %%mm7 \n\t" /* pa > pb? */
3689 "movq %%mm7, %%mm0 \n\t"
3690 /* use mm7 mask to merge pa & pb */
3691 "pand %%mm7, %%mm5 \n\t"
3692 /* use mm0 mask copy to merge a & b */
3693 "pand %%mm0, %%mm2 \n\t"
3694 "pandn %%mm4, %%mm7 \n\t"
3695 "pandn %%mm1, %%mm0 \n\t"
3696 "paddw %%mm5, %%mm7 \n\t"
3697 "paddw %%mm2, %%mm0 \n\t"
3698 /* test ((pa <= pb)? pa:pb) <= pc */
3699 "pcmpgtw %%mm6, %%mm7 \n\t" /* pab > pc? */
3700 "movq (%%esi,%%ecx,), %%mm2 \n\t" /* load b=Prior(x) */
3701 "pand %%mm7, %%mm3 \n\t"
3702 "pandn %%mm0, %%mm7 \n\t"
3703 "pxor %%mm1, %%mm1 \n\t"
3704 "paddw %%mm3, %%mm7 \n\t"
3705 "pxor %%mm0, %%mm0 \n\t"
3706 "packuswb %%mm1, %%mm7 \n\t"
3707 "movq %%mm2, %%mm3 \n\t" /* load c=Prior(x-bpp) step 1 */
3708 "pand _ActiveMask, %%mm7 \n\t"
3709 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack High bytes of b */
3710 "psllq _ShiftBpp, %%mm7 \n\t" /* shift bytes to 2nd group of */
3712 /* pav = p - a = (a + b - c) - a = b - c */
3713 "movq %%mm2, %%mm4 \n\t"
3714 "paddb (%%edi,%%ecx,), %%mm7 \n\t" /* add Paeth predictor with Raw(x) */
3715 "psllq _ShiftBpp, %%mm3 \n\t" /* load c=Prior(x-bpp) step 2 */
3716 "movq %%mm7, (%%edi,%%ecx,) \n\t" /* write back updated value */
3717 "movq %%mm7, %%mm1 \n\t"
3718 "punpckhbw %%mm0, %%mm3 \n\t" /* unpack High bytes of c */
3719 "psllq _ShiftBpp, %%mm1 \n\t" /* shift bytes */
3720 /* now mm1 will be used as Raw(x-bpp) */
3721 /* now do Paeth for 3rd, and final, set of bytes (6-7) */
3722 "pxor %%mm7, %%mm7 \n\t"
3723 "punpckhbw %%mm0, %%mm1 \n\t" /* unpack High bytes of a */
3724 "psubw %%mm3, %%mm4 \n\t"
3725 /* pbv = p - b = (a + b - c) - b = a - c */
3726 "movq %%mm1, %%mm5 \n\t"
3727 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
3728 "movq %%mm4, %%mm6 \n\t"
3729 "psubw %%mm3, %%mm5 \n\t"
3730 "pxor %%mm0, %%mm0 \n\t"
3731 "paddw %%mm5, %%mm6 \n\t"
3733 /* pa = abs(p-a) = abs(pav) */
3734 /* pb = abs(p-b) = abs(pbv) */
3735 /* pc = abs(p-c) = abs(pcv) */
3736 "pcmpgtw %%mm4, %%mm0 \n\t" /* create mask pav bytes < 0 */
3737 "pcmpgtw %%mm5, %%mm7 \n\t" /* create mask pbv bytes < 0 */
3738 "pand %%mm4, %%mm0 \n\t" /* only pav bytes < 0 in mm7 */
3739 "pand %%mm5, %%mm7 \n\t" /* only pbv bytes < 0 in mm0 */
3740 "psubw %%mm0, %%mm4 \n\t"
3741 "psubw %%mm7, %%mm5 \n\t"
3742 "psubw %%mm0, %%mm4 \n\t"
3743 "psubw %%mm7, %%mm5 \n\t"
3744 "pxor %%mm0, %%mm0 \n\t"
3745 "pcmpgtw %%mm6, %%mm0 \n\t" /* create mask pcv bytes < 0 */
3746 "pand %%mm6, %%mm0 \n\t" /* only pav bytes < 0 in mm7 */
3747 "psubw %%mm0, %%mm6 \n\t"
3749 "movq %%mm4, %%mm7 \n\t"
3750 "psubw %%mm0, %%mm6 \n\t"
3751 "pcmpgtw %%mm5, %%mm7 \n\t" /* pa > pb? */
3752 "movq %%mm7, %%mm0 \n\t"
3753 /* use mm0 mask copy to merge a & b */
3754 "pand %%mm0, %%mm2 \n\t"
3755 /* use mm7 mask to merge pa & pb */
3756 "pand %%mm7, %%mm5 \n\t"
3757 "pandn %%mm1, %%mm0 \n\t"
3758 "pandn %%mm4, %%mm7 \n\t"
3759 "paddw %%mm2, %%mm0 \n\t"
3760 "paddw %%mm5, %%mm7 \n\t"
3761 /* test ((pa <= pb)? pa:pb) <= pc */
3762 "pcmpgtw %%mm6, %%mm7 \n\t" /* pab > pc? */
3763 "pand %%mm7, %%mm3 \n\t"
3764 "pandn %%mm0, %%mm7 \n\t"
3765 "paddw %%mm3, %%mm7 \n\t"
3766 "pxor %%mm1, %%mm1 \n\t"
3767 "packuswb %%mm7, %%mm1 \n\t"
3768 /* step ecx to next set of 8 bytes and repeat loop til done */
3769 "addl $8, %%ecx \n\t"
3770 "pand _ActiveMaskEnd, %%mm1 \n\t"
3771 "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" /* add Paeth predictor with */
3774 "cmpl _MMXLength, %%ecx \n\t"
3775 "pxor %%mm0, %%mm0 \n\t" /* pxor does not affect flags */
3776 "movq %%mm1, -8(%%edi,%%ecx,) \n\t" /* write back updated value */
3777 /* mm1 will be used as Raw(x-bpp) next loop */
3778 /* mm3 ready to be used as Prior(x-bpp) next loop */
3781 : "=S" (dummy_value_S
), /* output regs (dummy) */
3782 "=D" (dummy_value_D
)
3784 : "0" (prev_row
), /* esi // input regs */
3787 : "%ecx" /* clobber list */
3788 #if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
3789 , "%mm0", "%mm1", "%mm2", "%mm3"
3790 , "%mm4", "%mm5", "%mm6", "%mm7"
3794 break; /* end 3 bpp */
3797 //case 7: /* GRR BOGUS */
3798 //case 5: /* GRR BOGUS */
3800 _ActiveMask
.use
= 0x00000000ffffffffLL
;
3801 _ActiveMask2
.use
= 0xffffffff00000000LL
;
3802 _ShiftBpp
.use
= bpp
<< 3; /* == bpp * 8 */
3803 _ShiftRem
.use
= 64 - _ShiftBpp
.use
;
3805 __asm__
__volatile__ (
3806 "movl _dif, %%ecx \n\t"
3807 /* preload "movl row, %%edi \n\t" */
3808 /* preload "movl prev_row, %%esi \n\t" */
3809 /* prime the pump: load the first Raw(x-bpp) data set */
3810 "movq -8(%%edi,%%ecx,), %%mm1 \n\t"
3811 "pxor %%mm0, %%mm0 \n\t"
3814 /* must shift to position Raw(x-bpp) data */
3815 "psrlq _ShiftRem, %%mm1 \n\t"
3816 /* do first set of 4 bytes */
3817 "movq -8(%%esi,%%ecx,), %%mm3 \n\t" /* read c=Prior(x-bpp) bytes */
3818 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack Low bytes of a */
3819 "movq (%%esi,%%ecx,), %%mm2 \n\t" /* load b=Prior(x) */
3820 "punpcklbw %%mm0, %%mm2 \n\t" /* unpack Low bytes of b */
3821 /* must shift to position Prior(x-bpp) data */
3822 "psrlq _ShiftRem, %%mm3 \n\t"
3823 /* pav = p - a = (a + b - c) - a = b - c */
3824 "movq %%mm2, %%mm4 \n\t"
3825 "punpcklbw %%mm0, %%mm3 \n\t" /* unpack Low bytes of c */
3826 /* pbv = p - b = (a + b - c) - b = a - c */
3827 "movq %%mm1, %%mm5 \n\t"
3828 "psubw %%mm3, %%mm4 \n\t"
3829 "pxor %%mm7, %%mm7 \n\t"
3830 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
3831 "movq %%mm4, %%mm6 \n\t"
3832 "psubw %%mm3, %%mm5 \n\t"
3833 /* pa = abs(p-a) = abs(pav) */
3834 /* pb = abs(p-b) = abs(pbv) */
3835 /* pc = abs(p-c) = abs(pcv) */
3836 "pcmpgtw %%mm4, %%mm0 \n\t" /* create mask pav bytes < 0 */
3837 "paddw %%mm5, %%mm6 \n\t"
3838 "pand %%mm4, %%mm0 \n\t" /* only pav bytes < 0 in mm7 */
3839 "pcmpgtw %%mm5, %%mm7 \n\t" /* create mask pbv bytes < 0 */
3840 "psubw %%mm0, %%mm4 \n\t"
3841 "pand %%mm5, %%mm7 \n\t" /* only pbv bytes < 0 in mm0 */
3842 "psubw %%mm0, %%mm4 \n\t"
3843 "psubw %%mm7, %%mm5 \n\t"
3844 "pxor %%mm0, %%mm0 \n\t"
3845 "pcmpgtw %%mm6, %%mm0 \n\t" /* create mask pcv bytes < 0 */
3846 "pand %%mm6, %%mm0 \n\t" /* only pav bytes < 0 in mm7 */
3847 "psubw %%mm7, %%mm5 \n\t"
3848 "psubw %%mm0, %%mm6 \n\t"
3850 "movq %%mm4, %%mm7 \n\t"
3851 "psubw %%mm0, %%mm6 \n\t"
3852 "pcmpgtw %%mm5, %%mm7 \n\t" /* pa > pb? */
3853 "movq %%mm7, %%mm0 \n\t"
3854 /* use mm7 mask to merge pa & pb */
3855 "pand %%mm7, %%mm5 \n\t"
3856 /* use mm0 mask copy to merge a & b */
3857 "pand %%mm0, %%mm2 \n\t"
3858 "pandn %%mm4, %%mm7 \n\t"
3859 "pandn %%mm1, %%mm0 \n\t"
3860 "paddw %%mm5, %%mm7 \n\t"
3861 "paddw %%mm2, %%mm0 \n\t"
3862 /* test ((pa <= pb)? pa:pb) <= pc */
3863 "pcmpgtw %%mm6, %%mm7 \n\t" /* pab > pc? */
3864 "pxor %%mm1, %%mm1 \n\t"
3865 "pand %%mm7, %%mm3 \n\t"
3866 "pandn %%mm0, %%mm7 \n\t"
3867 "paddw %%mm3, %%mm7 \n\t"
3868 "pxor %%mm0, %%mm0 \n\t"
3869 "packuswb %%mm1, %%mm7 \n\t"
3870 "movq -8(%%esi,%%ecx,), %%mm3 \n\t" /* load c=Prior(x-bpp) */
3871 "pand _ActiveMask, %%mm7 \n\t"
3872 "psrlq _ShiftRem, %%mm3 \n\t"
3873 "movq (%%esi,%%ecx,), %%mm2 \n\t" /* load b=Prior(x) step 1 */
3874 "paddb (%%edi,%%ecx,), %%mm7 \n\t" /* add Paeth predictor and Raw(x) */
3875 "movq %%mm2, %%mm6 \n\t"
3876 "movq %%mm7, (%%edi,%%ecx,) \n\t" /* write back updated value */
3877 "movq -8(%%edi,%%ecx,), %%mm1 \n\t"
3878 "psllq _ShiftBpp, %%mm6 \n\t"
3879 "movq %%mm7, %%mm5 \n\t"
3880 "psrlq _ShiftRem, %%mm1 \n\t"
3881 "por %%mm6, %%mm3 \n\t"
3882 "psllq _ShiftBpp, %%mm5 \n\t"
3883 "punpckhbw %%mm0, %%mm3 \n\t" /* unpack High bytes of c */
3884 "por %%mm5, %%mm1 \n\t"
3885 /* do second set of 4 bytes */
3886 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack High bytes of b */
3887 "punpckhbw %%mm0, %%mm1 \n\t" /* unpack High bytes of a */
3888 /* pav = p - a = (a + b - c) - a = b - c */
3889 "movq %%mm2, %%mm4 \n\t"
3890 /* pbv = p - b = (a + b - c) - b = a - c */
3891 "movq %%mm1, %%mm5 \n\t"
3892 "psubw %%mm3, %%mm4 \n\t"
3893 "pxor %%mm7, %%mm7 \n\t"
3894 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
3895 "movq %%mm4, %%mm6 \n\t"
3896 "psubw %%mm3, %%mm5 \n\t"
3897 /* pa = abs(p-a) = abs(pav) */
3898 /* pb = abs(p-b) = abs(pbv) */
3899 /* pc = abs(p-c) = abs(pcv) */
3900 "pcmpgtw %%mm4, %%mm0 \n\t" /* create mask pav bytes < 0 */
3901 "paddw %%mm5, %%mm6 \n\t"
3902 "pand %%mm4, %%mm0 \n\t" /* only pav bytes < 0 in mm7 */
3903 "pcmpgtw %%mm5, %%mm7 \n\t" /* create mask pbv bytes < 0 */
3904 "psubw %%mm0, %%mm4 \n\t"
3905 "pand %%mm5, %%mm7 \n\t" /* only pbv bytes < 0 in mm0 */
3906 "psubw %%mm0, %%mm4 \n\t"
3907 "psubw %%mm7, %%mm5 \n\t"
3908 "pxor %%mm0, %%mm0 \n\t"
3909 "pcmpgtw %%mm6, %%mm0 \n\t" /* create mask pcv bytes < 0 */
3910 "pand %%mm6, %%mm0 \n\t" /* only pav bytes < 0 in mm7 */
3911 "psubw %%mm7, %%mm5 \n\t"
3912 "psubw %%mm0, %%mm6 \n\t"
3914 "movq %%mm4, %%mm7 \n\t"
3915 "psubw %%mm0, %%mm6 \n\t"
3916 "pcmpgtw %%mm5, %%mm7 \n\t" /* pa > pb? */
3917 "movq %%mm7, %%mm0 \n\t"
3918 /* use mm7 mask to merge pa & pb */
3919 "pand %%mm7, %%mm5 \n\t"
3920 /* use mm0 mask copy to merge a & b */
3921 "pand %%mm0, %%mm2 \n\t"
3922 "pandn %%mm4, %%mm7 \n\t"
3923 "pandn %%mm1, %%mm0 \n\t"
3924 "paddw %%mm5, %%mm7 \n\t"
3925 "paddw %%mm2, %%mm0 \n\t"
3926 /* test ((pa <= pb)? pa:pb) <= pc */
3927 "pcmpgtw %%mm6, %%mm7 \n\t" /* pab > pc? */
3928 "pxor %%mm1, %%mm1 \n\t"
3929 "pand %%mm7, %%mm3 \n\t"
3930 "pandn %%mm0, %%mm7 \n\t"
3931 "pxor %%mm1, %%mm1 \n\t"
3932 "paddw %%mm3, %%mm7 \n\t"
3933 "pxor %%mm0, %%mm0 \n\t"
3934 /* step ecx to next set of 8 bytes and repeat loop til done */
3935 "addl $8, %%ecx \n\t"
3936 "packuswb %%mm7, %%mm1 \n\t"
3937 "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" /* add Paeth predictor with Raw(x) */
3938 "cmpl _MMXLength, %%ecx \n\t"
3939 "movq %%mm1, -8(%%edi,%%ecx,) \n\t" /* write back updated value */
3940 /* mm1 will be used as Raw(x-bpp) next loop */
3943 : "=S" (dummy_value_S
), /* output regs (dummy) */
3944 "=D" (dummy_value_D
)
3946 : "0" (prev_row
), /* esi // input regs */
3949 : "%ecx" /* clobber list */
3950 #if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
3951 , "%mm0", "%mm1", "%mm2", "%mm3"
3952 , "%mm4", "%mm5", "%mm6", "%mm7"
3956 break; /* end 6 bpp */
3960 _ActiveMask
.use
= 0x00000000ffffffffLL
;
3962 __asm__
__volatile__ (
3963 "movl _dif, %%ecx \n\t"
3964 /* preload "movl row, %%edi \n\t" */
3965 /* preload "movl prev_row, %%esi \n\t" */
3966 "pxor %%mm0, %%mm0 \n\t"
3967 /* prime the pump: load the first Raw(x-bpp) data set */
3968 "movq -8(%%edi,%%ecx,), %%mm1 \n\t" /* only time should need to read */
3969 /* a=Raw(x-bpp) bytes */
3971 /* do first set of 4 bytes */
3972 "movq -8(%%esi,%%ecx,), %%mm3 \n\t" /* read c=Prior(x-bpp) bytes */
3973 "punpckhbw %%mm0, %%mm1 \n\t" /* unpack Low bytes of a */
3974 "movq (%%esi,%%ecx,), %%mm2 \n\t" /* load b=Prior(x) */
3975 "punpcklbw %%mm0, %%mm2 \n\t" /* unpack High bytes of b */
3976 /* pav = p - a = (a + b - c) - a = b - c */
3977 "movq %%mm2, %%mm4 \n\t"
3978 "punpckhbw %%mm0, %%mm3 \n\t" /* unpack High bytes of c */
3979 /* pbv = p - b = (a + b - c) - b = a - c */
3980 "movq %%mm1, %%mm5 \n\t"
3981 "psubw %%mm3, %%mm4 \n\t"
3982 "pxor %%mm7, %%mm7 \n\t"
3983 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
3984 "movq %%mm4, %%mm6 \n\t"
3985 "psubw %%mm3, %%mm5 \n\t"
3986 /* pa = abs(p-a) = abs(pav) */
3987 /* pb = abs(p-b) = abs(pbv) */
3988 /* pc = abs(p-c) = abs(pcv) */
3989 "pcmpgtw %%mm4, %%mm0 \n\t" /* create mask pav bytes < 0 */
3990 "paddw %%mm5, %%mm6 \n\t"
3991 "pand %%mm4, %%mm0 \n\t" /* only pav bytes < 0 in mm7 */
3992 "pcmpgtw %%mm5, %%mm7 \n\t" /* create mask pbv bytes < 0 */
3993 "psubw %%mm0, %%mm4 \n\t"
3994 "pand %%mm5, %%mm7 \n\t" /* only pbv bytes < 0 in mm0 */
3995 "psubw %%mm0, %%mm4 \n\t"
3996 "psubw %%mm7, %%mm5 \n\t"
3997 "pxor %%mm0, %%mm0 \n\t"
3998 "pcmpgtw %%mm6, %%mm0 \n\t" /* create mask pcv bytes < 0 */
3999 "pand %%mm6, %%mm0 \n\t" /* only pav bytes < 0 in mm7 */
4000 "psubw %%mm7, %%mm5 \n\t"
4001 "psubw %%mm0, %%mm6 \n\t"
4003 "movq %%mm4, %%mm7 \n\t"
4004 "psubw %%mm0, %%mm6 \n\t"
4005 "pcmpgtw %%mm5, %%mm7 \n\t" /* pa > pb? */
4006 "movq %%mm7, %%mm0 \n\t"
4007 /* use mm7 mask to merge pa & pb */
4008 "pand %%mm7, %%mm5 \n\t"
4009 /* use mm0 mask copy to merge a & b */
4010 "pand %%mm0, %%mm2 \n\t"
4011 "pandn %%mm4, %%mm7 \n\t"
4012 "pandn %%mm1, %%mm0 \n\t"
4013 "paddw %%mm5, %%mm7 \n\t"
4014 "paddw %%mm2, %%mm0 \n\t"
4015 /* test ((pa <= pb)? pa:pb) <= pc */
4016 "pcmpgtw %%mm6, %%mm7 \n\t" /* pab > pc? */
4017 "pxor %%mm1, %%mm1 \n\t"
4018 "pand %%mm7, %%mm3 \n\t"
4019 "pandn %%mm0, %%mm7 \n\t"
4020 "paddw %%mm3, %%mm7 \n\t"
4021 "pxor %%mm0, %%mm0 \n\t"
4022 "packuswb %%mm1, %%mm7 \n\t"
4023 "movq (%%esi,%%ecx,), %%mm3 \n\t" /* load c=Prior(x-bpp) */
4024 "pand _ActiveMask, %%mm7 \n\t"
4025 "movq %%mm3, %%mm2 \n\t" /* load b=Prior(x) step 1 */
4026 "paddb (%%edi,%%ecx,), %%mm7 \n\t" /* add Paeth predictor with Raw(x) */
4027 "punpcklbw %%mm0, %%mm3 \n\t" /* unpack High bytes of c */
4028 "movq %%mm7, (%%edi,%%ecx,) \n\t" /* write back updated value */
4029 "movq %%mm7, %%mm1 \n\t" /* now mm1 will be used as Raw(x-bpp) */
4030 /* do second set of 4 bytes */
4031 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack Low bytes of b */
4032 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack Low bytes of a */
4033 /* pav = p - a = (a + b - c) - a = b - c */
4034 "movq %%mm2, %%mm4 \n\t"
4035 /* pbv = p - b = (a + b - c) - b = a - c */
4036 "movq %%mm1, %%mm5 \n\t"
4037 "psubw %%mm3, %%mm4 \n\t"
4038 "pxor %%mm7, %%mm7 \n\t"
4039 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
4040 "movq %%mm4, %%mm6 \n\t"
4041 "psubw %%mm3, %%mm5 \n\t"
4042 /* pa = abs(p-a) = abs(pav) */
4043 /* pb = abs(p-b) = abs(pbv) */
4044 /* pc = abs(p-c) = abs(pcv) */
4045 "pcmpgtw %%mm4, %%mm0 \n\t" /* create mask pav bytes < 0 */
4046 "paddw %%mm5, %%mm6 \n\t"
4047 "pand %%mm4, %%mm0 \n\t" /* only pav bytes < 0 in mm7 */
4048 "pcmpgtw %%mm5, %%mm7 \n\t" /* create mask pbv bytes < 0 */
4049 "psubw %%mm0, %%mm4 \n\t"
4050 "pand %%mm5, %%mm7 \n\t" /* only pbv bytes < 0 in mm0 */
4051 "psubw %%mm0, %%mm4 \n\t"
4052 "psubw %%mm7, %%mm5 \n\t"
4053 "pxor %%mm0, %%mm0 \n\t"
4054 "pcmpgtw %%mm6, %%mm0 \n\t" /* create mask pcv bytes < 0 */
4055 "pand %%mm6, %%mm0 \n\t" /* only pav bytes < 0 in mm7 */
4056 "psubw %%mm7, %%mm5 \n\t"
4057 "psubw %%mm0, %%mm6 \n\t"
4059 "movq %%mm4, %%mm7 \n\t"
4060 "psubw %%mm0, %%mm6 \n\t"
4061 "pcmpgtw %%mm5, %%mm7 \n\t" /* pa > pb? */
4062 "movq %%mm7, %%mm0 \n\t"
4063 /* use mm7 mask to merge pa & pb */
4064 "pand %%mm7, %%mm5 \n\t"
4065 /* use mm0 mask copy to merge a & b */
4066 "pand %%mm0, %%mm2 \n\t"
4067 "pandn %%mm4, %%mm7 \n\t"
4068 "pandn %%mm1, %%mm0 \n\t"
4069 "paddw %%mm5, %%mm7 \n\t"
4070 "paddw %%mm2, %%mm0 \n\t"
4071 /* test ((pa <= pb)? pa:pb) <= pc */
4072 "pcmpgtw %%mm6, %%mm7 \n\t" /* pab > pc? */
4073 "pxor %%mm1, %%mm1 \n\t"
4074 "pand %%mm7, %%mm3 \n\t"
4075 "pandn %%mm0, %%mm7 \n\t"
4076 "pxor %%mm1, %%mm1 \n\t"
4077 "paddw %%mm3, %%mm7 \n\t"
4078 "pxor %%mm0, %%mm0 \n\t"
4079 /* step ecx to next set of 8 bytes and repeat loop til done */
4080 "addl $8, %%ecx \n\t"
4081 "packuswb %%mm7, %%mm1 \n\t"
4082 "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" /* add predictor with Raw(x) */
4083 "cmpl _MMXLength, %%ecx \n\t"
4084 "movq %%mm1, -8(%%edi,%%ecx,) \n\t" /* write back updated value */
4085 /* mm1 will be used as Raw(x-bpp) next loop */
4088 : "=S" (dummy_value_S
), /* output regs (dummy) */
4089 "=D" (dummy_value_D
)
4091 : "0" (prev_row
), /* esi // input regs */
4094 : "%ecx" /* clobber list */
4095 #if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
4096 , "%mm0", "%mm1", "%mm2", "%mm3"
4097 , "%mm4", "%mm5", "%mm6", "%mm7"
4101 break; /* end 4 bpp */
4103 case 8: /* bpp == 8 */
/* Paeth defilter for bpp == 8, working on one 8-byte quadword per pass.
 * With this bpp, a = Raw(x-bpp) and c = Prior(x-bpp) are exactly the
 * quadword at offset -8 from the current one, so no shifting is needed.
 * Each quadword is handled as two groups of 4 bytes, each widened to
 * 16-bit words: the low half first, then the high half.
 * Per the "preload" notes below, edi is expected to hold row and esi
 * prev_row; ecx is the running byte offset and starts at _dif.
 * NOTE(review): the loop label, the conditional branch that consumes the
 * cmpl against _MMXLength, and the statement's closing tokens are not
 * visible in this excerpt -- confirm against the full file.
 */
4105 _ActiveMask
.use
= 0x00000000ffffffffLL
; /* selects the low 4 bytes of a quadword */
4107 __asm__
__volatile__ (
4108 "movl _dif, %%ecx \n\t" /* ecx: byte offset into the row */
4109 /* preload "movl row, %%edi \n\t" */
4110 /* preload "movl prev_row, %%esi \n\t" */
4111 "pxor %%mm0, %%mm0 \n\t" /* mm0 = 0: zero source for unpacks and compares */
4112 /* prime the pump: load the first Raw(x-bpp) data set */
4113 "movq -8(%%edi,%%ecx,), %%mm1 \n\t" /* only time should need to read */
4114 /* a=Raw(x-bpp) bytes */
4116 /* do first set of 4 bytes */
4117 "movq -8(%%esi,%%ecx,), %%mm3 \n\t" /* read c=Prior(x-bpp) bytes */
4118 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack Low bytes of a into words */
4119 "movq (%%esi,%%ecx,), %%mm2 \n\t" /* load b=Prior(x) */
4120 "punpcklbw %%mm0, %%mm2 \n\t" /* unpack Low bytes of b into words */
4121 /* pav = p - a = (a + b - c) - a = b - c */
4122 "movq %%mm2, %%mm4 \n\t" /* mm4 = b */
4123 "punpcklbw %%mm0, %%mm3 \n\t" /* unpack Low bytes of c into words */
4124 /* pbv = p - b = (a + b - c) - b = a - c */
4125 "movq %%mm1, %%mm5 \n\t" /* mm5 = a */
4126 "psubw %%mm3, %%mm4 \n\t" /* mm4 = pav = b - c */
4127 "pxor %%mm7, %%mm7 \n\t" /* mm7 = 0 for the pbv sign compare */
4128 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
4129 "movq %%mm4, %%mm6 \n\t" /* mm6 = pav */
4130 "psubw %%mm3, %%mm5 \n\t" /* mm5 = pbv = a - c */
4131 /* pa = abs(p-a) = abs(pav) */
4132 /* pb = abs(p-b) = abs(pbv) */
4133 /* pc = abs(p-c) = abs(pcv) */
/* abs() idiom used below: keep only the negative lanes of v in a */
/* temporary, then subtract that temporary from v twice (v - 2v = -v). */
4134 "pcmpgtw %%mm4, %%mm0 \n\t" /* create mask of pav words < 0 */
4135 "paddw %%mm5, %%mm6 \n\t" /* mm6 = pcv = pav + pbv */
4136 "pand %%mm4, %%mm0 \n\t" /* only pav words < 0 in mm0 */
4137 "pcmpgtw %%mm5, %%mm7 \n\t" /* create mask of pbv words < 0 */
4138 "psubw %%mm0, %%mm4 \n\t" /* first subtraction toward abs(pav) */
4139 "pand %%mm5, %%mm7 \n\t" /* only pbv words < 0 in mm7 */
4140 "psubw %%mm0, %%mm4 \n\t" /* mm4 = pa = abs(pav) */
4141 "psubw %%mm7, %%mm5 \n\t" /* first subtraction toward abs(pbv) */
4142 "pxor %%mm0, %%mm0 \n\t"
4143 "pcmpgtw %%mm6, %%mm0 \n\t" /* create mask of pcv words < 0 */
4144 "pand %%mm6, %%mm0 \n\t" /* only pcv words < 0 in mm0 */
4145 "psubw %%mm7, %%mm5 \n\t" /* mm5 = pb = abs(pbv) */
4146 "psubw %%mm0, %%mm6 \n\t" /* first subtraction toward abs(pcv) */
4148 "movq %%mm4, %%mm7 \n\t" /* mm7 = pa */
4149 "psubw %%mm0, %%mm6 \n\t" /* mm6 = pc = abs(pcv) */
4150 "pcmpgtw %%mm5, %%mm7 \n\t" /* pa > pb? */
4151 "movq %%mm7, %%mm0 \n\t" /* mm0 = copy of the (pa > pb) mask */
4152 /* use mm7 mask to merge pa & pb */
4153 "pand %%mm7, %%mm5 \n\t" /* pb where pa > pb */
4154 /* use mm0 mask copy to merge a & b */
4155 "pand %%mm0, %%mm2 \n\t" /* b where pa > pb */
4156 "pandn %%mm4, %%mm7 \n\t" /* pa where pa <= pb */
4157 "pandn %%mm1, %%mm0 \n\t" /* a where pa <= pb */
4158 "paddw %%mm5, %%mm7 \n\t" /* mm7 = (pa <= pb) ? pa : pb */
4159 "paddw %%mm2, %%mm0 \n\t" /* mm0 = (pa <= pb) ? a : b */
4160 /* test ((pa <= pb)? pa:pb) <= pc */
4161 "pcmpgtw %%mm6, %%mm7 \n\t" /* pab > pc? */
4162 "pxor %%mm1, %%mm1 \n\t" /* mm1 = 0: supplies the upper half for packuswb */
4163 "pand %%mm7, %%mm3 \n\t" /* c where pab > pc */
4164 "pandn %%mm0, %%mm7 \n\t" /* a or b where pab <= pc */
4165 "paddw %%mm3, %%mm7 \n\t" /* mm7 = Paeth predictor, one per word */
4166 "pxor %%mm0, %%mm0 \n\t"
4167 "packuswb %%mm1, %%mm7 \n\t" /* predictor bytes in low half, zeros in high half */
4168 "movq -8(%%esi,%%ecx,), %%mm3 \n\t" /* read c=Prior(x-bpp) bytes */
4169 "pand _ActiveMask, %%mm7 \n\t" /* keep only the low 4 predictor bytes */
4170 "movq (%%esi,%%ecx,), %%mm2 \n\t" /* load b=Prior(x) */
4171 "paddb (%%edi,%%ecx,), %%mm7 \n\t" /* add Paeth predictor with Raw(x) */
4172 "punpckhbw %%mm0, %%mm3 \n\t" /* unpack High bytes of c */
4173 "movq %%mm7, (%%edi,%%ecx,) \n\t" /* write back updated value */
4174 "movq -8(%%edi,%%ecx,), %%mm1 \n\t" /* read a=Raw(x-bpp) bytes */
4176 /* do second set of 4 bytes */
4177 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack High bytes of b */
4178 "punpckhbw %%mm0, %%mm1 \n\t" /* unpack High bytes of a */
4179 /* pav = p - a = (a + b - c) - a = b - c */
4180 "movq %%mm2, %%mm4 \n\t" /* mm4 = b */
4181 /* pbv = p - b = (a + b - c) - b = a - c */
4182 "movq %%mm1, %%mm5 \n\t" /* mm5 = a */
4183 "psubw %%mm3, %%mm4 \n\t" /* mm4 = pav = b - c */
4184 "pxor %%mm7, %%mm7 \n\t"
4185 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
4186 "movq %%mm4, %%mm6 \n\t" /* mm6 = pav */
4187 "psubw %%mm3, %%mm5 \n\t" /* mm5 = pbv = a - c */
4188 /* pa = abs(p-a) = abs(pav) */
4189 /* pb = abs(p-b) = abs(pbv) */
4190 /* pc = abs(p-c) = abs(pcv) */
4191 "pcmpgtw %%mm4, %%mm0 \n\t" /* create mask of pav words < 0 */
4192 "paddw %%mm5, %%mm6 \n\t" /* mm6 = pcv = pav + pbv */
4193 "pand %%mm4, %%mm0 \n\t" /* only pav words < 0 in mm0 */
4194 "pcmpgtw %%mm5, %%mm7 \n\t" /* create mask of pbv words < 0 */
4195 "psubw %%mm0, %%mm4 \n\t" /* first subtraction toward abs(pav) */
4196 "pand %%mm5, %%mm7 \n\t" /* only pbv words < 0 in mm7 */
4197 "psubw %%mm0, %%mm4 \n\t" /* mm4 = pa = abs(pav) */
4198 "psubw %%mm7, %%mm5 \n\t" /* first subtraction toward abs(pbv) */
4199 "pxor %%mm0, %%mm0 \n\t"
4200 "pcmpgtw %%mm6, %%mm0 \n\t" /* create mask of pcv words < 0 */
4201 "pand %%mm6, %%mm0 \n\t" /* only pcv words < 0 in mm0 */
4202 "psubw %%mm7, %%mm5 \n\t" /* mm5 = pb = abs(pbv) */
4203 "psubw %%mm0, %%mm6 \n\t" /* first subtraction toward abs(pcv) */
4205 "movq %%mm4, %%mm7 \n\t" /* mm7 = pa */
4206 "psubw %%mm0, %%mm6 \n\t" /* mm6 = pc = abs(pcv) */
4207 "pcmpgtw %%mm5, %%mm7 \n\t" /* pa > pb? */
4208 "movq %%mm7, %%mm0 \n\t" /* mm0 = copy of the (pa > pb) mask */
4209 /* use mm7 mask to merge pa & pb */
4210 "pand %%mm7, %%mm5 \n\t" /* pb where pa > pb */
4211 /* use mm0 mask copy to merge a & b */
4212 "pand %%mm0, %%mm2 \n\t" /* b where pa > pb */
4213 "pandn %%mm4, %%mm7 \n\t" /* pa where pa <= pb */
4214 "pandn %%mm1, %%mm0 \n\t" /* a where pa <= pb */
4215 "paddw %%mm5, %%mm7 \n\t" /* mm7 = (pa <= pb) ? pa : pb */
4216 "paddw %%mm2, %%mm0 \n\t" /* mm0 = (pa <= pb) ? a : b */
4217 /* test ((pa <= pb)? pa:pb) <= pc */
4218 "pcmpgtw %%mm6, %%mm7 \n\t" /* pab > pc? */
4219 "pxor %%mm1, %%mm1 \n\t" /* mm1 = 0: becomes the low half after packuswb */
4220 "pand %%mm7, %%mm3 \n\t" /* c where pab > pc */
4221 "pandn %%mm0, %%mm7 \n\t" /* a or b where pab <= pc */
4222 "pxor %%mm1, %%mm1 \n\t" /* redundant: mm1 was zeroed above and not written since */
4223 "paddw %%mm3, %%mm7 \n\t" /* mm7 = Paeth predictor, one per word */
4224 "pxor %%mm0, %%mm0 \n\t"
4225 /* step ecx to next set of 8 bytes and repeat loop til done */
4226 "addl $8, %%ecx \n\t"
4227 "packuswb %%mm7, %%mm1 \n\t" /* zeros in low half, predictor bytes in high half */
4228 "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" /* add Paeth predictor with Raw(x) */
4229 "cmpl _MMXLength, %%ecx \n\t" /* movq below leaves these flags intact */
4230 "movq %%mm1, -8(%%edi,%%ecx,) \n\t" /* write back updated value */
4231 /* mm1 will be used as Raw(x-bpp) next loop */
4234 : "=S" (dummy_value_S
), /* output regs (dummy) */
4235 "=D" (dummy_value_D
)
4237 : "0" (prev_row
), /* esi // input regs */
4240 : "%ecx" /* clobber list */
4241 #if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
4242 , "%mm0", "%mm1", "%mm2", "%mm3"
4243 , "%mm4", "%mm5", "%mm6", "%mm7"
4247 break; /* end 8 bpp */
4249 case 1: /* bpp = 1 */
4250 case 2: /* bpp = 2 */
4251 default: /* every bpp without a dedicated case (original note: bpp > 8) */
/* Scalar Paeth defilter using general-purpose registers only (no MMX).
 * It starts at byte offset _dif and compares against _FullLength, so it
 * covers the remainder of the row by itself; that is why the code
 * returns afterwards instead of falling into the post-switch cleanup.
 * Register roles, per the comments below: edi = row, esi = prev_row,
 * ebx = x offset, edx = x - bpp offset, ecx = bpp on entry and then
 * scratch.
 * NOTE(review): the jump targets paeth_dend, paeth_dpca, paeth_dpba,
 * paeth_dpaa, paeth_dabb, paeth_dbbc and paeth_dabc, the loop label,
 * the ebx/edx increments and the loop branch are not visible in this
 * excerpt -- confirm against the full file.
 */
4253 __asm__
__volatile__ (
4255 "pushl %%ebx \n\t" /* save Global Offset Table index */
4257 "movl _dif, %%ebx \n\t" /* ebx: x offset, starts at _dif */
4258 "cmpl _FullLength, %%ebx \n\t"
4259 "jnb paeth_dend \n\t" /* nothing to do if _dif >= _FullLength */
4261 /* preload "movl row, %%edi \n\t" */
4262 /* preload "movl prev_row, %%esi \n\t" */
4263 /* do Paeth decode for remaining bytes */
4264 "movl %%ebx, %%edx \n\t"
4265 /* preload "subl bpp, %%edx \n\t" // (bpp is preloaded into ecx) */
4266 "subl %%ecx, %%edx \n\t" /* edx = ebx - bpp */
4267 "xorl %%ecx, %%ecx \n\t" /* zero ecx before using cl & cx */
4270 "xorl %%eax, %%eax \n\t" /* clear eax so the byte load below is zero-extended */
4271 /* pav = p - a = (a + b - c) - a = b - c */
4272 "movb (%%esi,%%ebx,), %%al \n\t" /* load Prior(x) into al */
4273 "movb (%%esi,%%edx,), %%cl \n\t" /* load Prior(x-bpp) into cl */
4274 "subl %%ecx, %%eax \n\t" /* subtract Prior(x-bpp) */
4275 "movl %%eax, _patemp \n\t" /* Save pav for later use */
4276 "xorl %%eax, %%eax \n\t"
4277 /* pbv = p - b = (a + b - c) - b = a - c */
4278 "movb (%%edi,%%edx,), %%al \n\t" /* load Raw(x-bpp) into al */
4279 "subl %%ecx, %%eax \n\t" /* subtract Prior(x-bpp) */
4280 "movl %%eax, %%ecx \n\t" /* ecx = pbv */
4281 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
4282 "addl _patemp, %%eax \n\t" /* pcv = pav + pbv */
4284 "testl $0x80000000, %%eax \n\t" /* sign bit of pcv set? */
4285 "jz paeth_dpca \n\t" /* non-negative: skip the negation */
4286 "negl %%eax \n\t" /* reverse sign of neg values */
4289 "movl %%eax, _pctemp \n\t" /* save pc for later use */
4291 "testl $0x80000000, %%ecx \n\t" /* sign bit of pbv set? */
4292 "jz paeth_dpba \n\t" /* non-negative: skip the negation */
4293 "negl %%ecx \n\t" /* reverse sign of neg values */
4296 "movl %%ecx, _pbtemp \n\t" /* save pb (not read again in the visible part of this block) */
4298 "movl _patemp, %%eax \n\t"
4299 "testl $0x80000000, %%eax \n\t" /* sign bit of pav set? */
4300 "jz paeth_dpaa \n\t" /* non-negative: skip the negation */
4301 "negl %%eax \n\t" /* reverse sign of neg values */
4304 "movl %%eax, _patemp \n\t" /* save pa for later use */
4305 /* test if pa <= pb */
4306 "cmpl %%ecx, %%eax \n\t" /* eax = pa, ecx = pb */
4307 "jna paeth_dabb \n\t"
4308 /* pa > pb; now test if pb <= pc */
4309 "cmpl _pctemp, %%ecx \n\t"
4310 "jna paeth_dbbc \n\t"
4311 /* pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp) */
4312 "movb (%%esi,%%edx,), %%cl \n\t" /* load Prior(x-bpp) into cl */
4313 "jmp paeth_dpaeth \n\t"
4316 /* pb <= pc; Raw(x) = Paeth(x) + Prior(x) */
4317 "movb (%%esi,%%ebx,), %%cl \n\t" /* load Prior(x) into cl */
4318 "jmp paeth_dpaeth \n\t"
4321 /* pa <= pb; now test if pa <= pc */
4322 "cmpl _pctemp, %%eax \n\t"
4323 "jna paeth_dabc \n\t"
4324 /* pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp) */
4325 "movb (%%esi,%%edx,), %%cl \n\t" /* load Prior(x-bpp) into cl */
4326 "jmp paeth_dpaeth \n\t"
4329 /* pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp) */
4330 "movb (%%edi,%%edx,), %%cl \n\t" /* load Raw(x-bpp) into cl */
4332 "paeth_dpaeth: \n\t"
4335 /* Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256 */
/* NOTE(review): the -1 displacement implies ebx was already advanced */
/* past the current byte on a line not visible here -- confirm. */
4336 "addb %%cl, -1(%%edi,%%ebx,) \n\t" /* cl holds the selected predictor byte */
4337 "cmpl _FullLength, %%ebx \n\t" /* end-of-row test */
4342 "popl %%ebx \n\t" /* restore index to Global Offset Table */
4345 : "=c" (dummy_value_c
), /* output regs (dummy) */
4346 "=S" (dummy_value_S
),
4347 "=D" (dummy_value_D
)
4349 : "0" (bpp
), /* ecx // input regs */
4350 "1" (prev_row
), /* esi */
4353 : "%eax", "%edx" /* clobber list */
4359 return; /* No need to go further with this one */
4361 } /* end switch (bpp) */
4363 __asm__
__volatile__ (
4364 /* MMX acceleration complete; now do clean-up */
4365 /* check if any remaining bytes left to decode */
4367 "pushl %%ebx \n\t" /* save index to Global Offset Table */
4369 "movl _MMXLength, %%ebx \n\t"
4370 "cmpl _FullLength, %%ebx \n\t"
4371 "jnb paeth_end \n\t"
4372 /*pre "movl row, %%edi \n\t" */
4373 /*pre "movl prev_row, %%esi \n\t" */
4374 /* do Paeth decode for remaining bytes */
4375 "movl %%ebx, %%edx \n\t"
4376 /*pre "subl bpp, %%edx \n\t" */ /* (bpp is preloaded into ecx) */
4377 "subl %%ecx, %%edx \n\t" /* edx = ebx - bpp */
4378 "xorl %%ecx, %%ecx \n\t" /* zero ecx before using cl & cx below */
4381 "xorl %%eax, %%eax \n\t"
4382 /* pav = p - a = (a + b - c) - a = b - c */
4383 "movb (%%esi,%%ebx,), %%al \n\t" /* load Prior(x) into al */
4384 "movb (%%esi,%%edx,), %%cl \n\t" /* load Prior(x-bpp) into cl */
4385 "subl %%ecx, %%eax \n\t" /* subtract Prior(x-bpp) */
4386 "movl %%eax, _patemp \n\t" /* Save pav for later use */
4387 "xorl %%eax, %%eax \n\t"
4388 /* pbv = p - b = (a + b - c) - b = a - c */
4389 "movb (%%edi,%%edx,), %%al \n\t" /* load Raw(x-bpp) into al */
4390 "subl %%ecx, %%eax \n\t" /* subtract Prior(x-bpp) */
4391 "movl %%eax, %%ecx \n\t"
4392 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
4393 "addl _patemp, %%eax \n\t" /* pcv = pav + pbv */
4395 "testl $0x80000000, %%eax \n\t"
4396 "jz paeth_pca2 \n\t"
4397 "negl %%eax \n\t" /* reverse sign of neg values */
4400 "movl %%eax, _pctemp \n\t" /* save pc for later use */
4402 "testl $0x80000000, %%ecx \n\t"
4403 "jz paeth_pba2 \n\t"
4404 "negl %%ecx \n\t" /* reverse sign of neg values */
4407 "movl %%ecx, _pbtemp \n\t" /* save pb for later use */
4409 "movl _patemp, %%eax \n\t"
4410 "testl $0x80000000, %%eax \n\t"
4411 "jz paeth_paa2 \n\t"
4412 "negl %%eax \n\t" /* reverse sign of neg values */
4415 "movl %%eax, _patemp \n\t" /* save pa for later use */
4416 /* test if pa <= pb */
4417 "cmpl %%ecx, %%eax \n\t"
4418 "jna paeth_abb2 \n\t"
4419 /* pa > pb; now test if pb <= pc */
4420 "cmpl _pctemp, %%ecx \n\t"
4421 "jna paeth_bbc2 \n\t"
4422 /* pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp) */
4423 "movb (%%esi,%%edx,), %%cl \n\t" /* load Prior(x-bpp) into cl */
4424 "jmp paeth_paeth2 \n\t"
4427 /* pb <= pc; Raw(x) = Paeth(x) + Prior(x) */
4428 "movb (%%esi,%%ebx,), %%cl \n\t" /* load Prior(x) into cl */
4429 "jmp paeth_paeth2 \n\t"
4432 /* pa <= pb; now test if pa <= pc */
4433 "cmpl _pctemp, %%eax \n\t"
4434 "jna paeth_abc2 \n\t"
4435 /* pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp) */
4436 "movb (%%esi,%%edx,), %%cl \n\t" /* load Prior(x-bpp) into cl */
4437 "jmp paeth_paeth2 \n\t"
4440 /* pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp) */
4441 "movb (%%edi,%%edx,), %%cl \n\t" /* load Raw(x-bpp) into cl */
4443 "paeth_paeth2: \n\t"
4446 /* Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256 */
4447 "addb %%cl, -1(%%edi,%%ebx,) \n\t"
4448 "cmpl _FullLength, %%ebx \n\t"
4452 "EMMS \n\t" /* end MMX; prep for poss. FP instrs. */
4454 "popl %%ebx \n\t" /* restore index to Global Offset Table */
4457 : "=c" (dummy_value_c
), /* output regs (dummy) */
4458 "=S" (dummy_value_S
),
4459 "=D" (dummy_value_D
)
4461 : "0" (bpp
), /* ecx // input regs */
4462 "1" (prev_row
), /* esi */
4465 : "%eax", "%edx" /* clobber list (no input regs!) */
4471 } /* end png_read_filter_row_mmx_paeth() */
4477 #ifdef PNG_THREAD_UNSAFE_OK
4478 /*===========================================================================*/
4480 /* P N G _ R E A D _ F I L T E R _ R O W _ M M X _ S U B */
4482 /*===========================================================================*/
4484 /* Optimized code for PNG Sub filter decoder */
/* Undo the PNG Sub filter on one row, in place.
 *
 * row_info  supplies pixel_depth (used to derive bpp, the bytes per pixel)
 *           and rowbytes (the row length in bytes).
 * row       points at the filtered row; every byte at offset >= bpp has
 *           the byte bpp positions to its left added to it (mod 256).
 *
 * The work is split into a bytewise lead-in up to an 8-byte aligned
 * address, an MMX loop selected by bpp, and a bytewise tail.  Intermediate
 * values are kept in the file-level variables _FullLength, _MMXLength,
 * _dif, _ActiveMask, _ShiftBpp and _ShiftRem, which the assembly refers to
 * by symbol name; presumably this shared state is why the routine is
 * guarded by PNG_THREAD_UNSAFE_OK -- confirm.
 */
4486 static void /* PRIVATE */
4487 png_read_filter_row_mmx_sub(png_row_infop row_info
, png_bytep row
)
4493 bpp
= (row_info
->pixel_depth
+ 7) >> 3; /* calc number of bytes per pixel */
/* the first bpp bytes have no left neighbour, so only rowbytes - bpp */
/* bytes (addressed relative to rp = row + bpp) are processed */
4494 _FullLength
= row_info
->rowbytes
- bpp
; /* number of bytes to filter */
/* Stage 1: compute _dif = ((rp + 0xf) & ~7) - rp, decode bytewise up to */
/* that offset, then derive _MMXLength, where the MMX loops must stop */
4496 __asm__
__volatile__ (
4497 /*pre "movl row, %%edi \n\t" */
4498 "movl %%edi, %%esi \n\t" /* lp = row */
4499 /*pre "movl bpp, %%eax \n\t" */
4500 "addl %%eax, %%edi \n\t" /* rp = row + bpp */
4501 /*irr "xorl %%eax, %%eax \n\t" */
4502 /* get # of bytes to alignment */
4503 "movl %%edi, _dif \n\t" /* take start of row */
4504 "addl $0xf, _dif \n\t" /* add 7 + 8 to incr past */
4505 /* alignment boundary */
4506 "xorl %%ecx, %%ecx \n\t"
4507 "andl $0xfffffff8, _dif \n\t" /* mask to alignment boundary */
4508 "subl %%edi, _dif \n\t" /* subtract from start ==> value */
4509 "jz sub_go \n\t" /* ecx at alignment */
4511 "sub_lp1: \n\t" /* fix alignment */
4512 "movb (%%esi,%%ecx,), %%al \n\t"
4513 "addb %%al, (%%edi,%%ecx,) \n\t"
4515 "cmpl _dif, %%ecx \n\t"
/* _MMXLength = _FullLength - ((_FullLength - ecx) & 7) */
4519 "movl _FullLength, %%eax \n\t"
4520 "movl %%eax, %%edx \n\t"
4521 "subl %%ecx, %%edx \n\t" /* subtract alignment fix */
4522 "andl $0x00000007, %%edx \n\t" /* calc bytes over mult of 8 */
4523 "subl %%edx, %%eax \n\t" /* drop over bytes from length */
4524 "movl %%eax, _MMXLength \n\t"
4526 : "=a" (dummy_value_a
), /* 0 // output regs (dummy) */
4527 "=D" (dummy_value_D
) /* 1 */
4529 : "0" (bpp
), /* eax // input regs */
4532 : "%ebx", "%ecx", "%edx" /* clobber list */
4535 #if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4536 , "%mm0", "%mm1", "%mm2", "%mm3"
4537 , "%mm4", "%mm5", "%mm6", "%mm7"
4541 /* now do the math for the rest of the row */
/* 24-bit step (loop sub_3lp): each quadword is completed in three adds, */
/* every add pulling in the bytes three positions to the left */
4546 _ActiveMask
.use
= 0x0000ffffff000000LL
;
4547 _ShiftBpp
.use
= 24; /* == 3 * 8 */
4548 _ShiftRem
.use
= 40; /* == 64 - 24 */
4550 __asm__
__volatile__ (
4551 /* preload "movl row, %%edi \n\t" */
4552 "movq _ActiveMask, %%mm7 \n\t" /* load _ActiveMask for 2nd */
4553 /* active byte group */
4554 "movl %%edi, %%esi \n\t" /* lp = row */
4555 /* preload "movl bpp, %%eax \n\t" */
4556 "addl %%eax, %%edi \n\t" /* rp = row + bpp */
4557 "movq %%mm7, %%mm6 \n\t"
4558 "movl _dif, %%edx \n\t"
4559 "psllq _ShiftBpp, %%mm6 \n\t" /* move mask in mm6 to cover */
4560 /* 3rd active byte group */
4561 /* prime the pump: load the first Raw(x-bpp) data set */
4562 "movq -8(%%edi,%%edx,), %%mm1 \n\t"
4564 "sub_3lp: \n\t" /* shift data for adding first */
4565 "psrlq _ShiftRem, %%mm1 \n\t" /* bpp bytes (no need for mask; */
4566 /* shift clears inactive bytes) */
4567 /* add 1st active group */
4568 "movq (%%edi,%%edx,), %%mm0 \n\t"
4569 "paddb %%mm1, %%mm0 \n\t"
4571 /* add 2nd active group */
4572 "movq %%mm0, %%mm1 \n\t" /* mov updated Raws to mm1 */
4573 "psllq _ShiftBpp, %%mm1 \n\t" /* shift data to pos. correctly */
4574 "pand %%mm7, %%mm1 \n\t" /* mask to use 2nd active group */
4575 "paddb %%mm1, %%mm0 \n\t"
4577 /* add 3rd active group */
4578 "movq %%mm0, %%mm1 \n\t" /* mov updated Raws to mm1 */
4579 "psllq _ShiftBpp, %%mm1 \n\t" /* shift data to pos. correctly */
4580 "pand %%mm6, %%mm1 \n\t" /* mask to use 3rd active group */
4581 "addl $8, %%edx \n\t"
4582 "paddb %%mm1, %%mm0 \n\t"
4584 "cmpl _MMXLength, %%edx \n\t"
4585 "movq %%mm0, -8(%%edi,%%edx,) \n\t" /* write updated Raws to array */
4586 "movq %%mm0, %%mm1 \n\t" /* prep 1st add at top of loop */
4589 : "=a" (dummy_value_a
), /* 0 // output regs (dummy) */
4590 "=D" (dummy_value_D
) /* 1 */
4592 : "0" (bpp
), /* eax // input regs */
4595 : "%edx", "%esi" /* clobber list */
4596 #if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4597 , "%mm0", "%mm1", "%mm6", "%mm7"
/* Bytewise variant: walks offsets from _dif up to _FullLength. */
/* NOTE(review): eax is preloaded with bpp (see the input list), but the */
/* xorl below clears it before it is added to edi, so edi stays at row */
/* instead of row + bpp as the comment states.  The tail loop at the end */
/* of this function adds first and clears afterwards.  Confirm and */
/* reorder if this path is reachable. */
4605 __asm__
__volatile__ (
4606 "movl _dif, %%edx \n\t"
4607 /* preload "movl row, %%edi \n\t" */
4608 "cmpl _FullLength, %%edx \n\t"
4610 "movl %%edi, %%esi \n\t" /* lp = row */
4611 "xorl %%eax, %%eax \n\t"
4612 /* preload "movl bpp, %%eax \n\t" */
4613 "addl %%eax, %%edi \n\t" /* rp = row + bpp */
4616 "movb (%%esi,%%edx,), %%al \n\t"
4617 "addb %%al, (%%edi,%%edx,) \n\t"
4619 "cmpl _FullLength, %%edx \n\t"
4624 : "=a" (dummy_value_a
), /* 0 // output regs (dummy) */
4625 "=D" (dummy_value_D
) /* 1 */
4627 : "0" (bpp
), /* eax // input regs */
4630 : "%edx", "%esi" /* clobber list */
4637 //case 7: /* GRR BOGUS */
4638 //case 5: /* GRR BOGUS */
/* generic two-step variant (loop sub_4lp): the shift counts are derived */
/* from bpp at run time, _ShiftBpp = 8 * bpp and _ShiftRem = 64 - that */
4640 _ShiftBpp
.use
= bpp
<< 3;
4641 _ShiftRem
.use
= 64 - _ShiftBpp
.use
;
4643 __asm__
__volatile__ (
4644 /* preload "movl row, %%edi \n\t" */
4645 "movl _dif, %%edx \n\t"
4646 "movl %%edi, %%esi \n\t" /* lp = row */
4647 /* preload "movl bpp, %%eax \n\t" */
4648 "addl %%eax, %%edi \n\t" /* rp = row + bpp */
4650 /* prime the pump: load the first Raw(x-bpp) data set */
4651 "movq -8(%%edi,%%edx,), %%mm1 \n\t"
4653 "sub_4lp: \n\t" /* shift data for adding first */
4654 "psrlq _ShiftRem, %%mm1 \n\t" /* bpp bytes (no need for mask; */
4655 /* shift clears inactive bytes) */
4656 "movq (%%edi,%%edx,), %%mm0 \n\t"
4657 "paddb %%mm1, %%mm0 \n\t"
4659 /* add 2nd active group */
4660 "movq %%mm0, %%mm1 \n\t" /* mov updated Raws to mm1 */
4661 "psllq _ShiftBpp, %%mm1 \n\t" /* shift data to pos. correctly */
4662 "addl $8, %%edx \n\t"
4663 "paddb %%mm1, %%mm0 \n\t"
4665 "cmpl _MMXLength, %%edx \n\t"
4666 "movq %%mm0, -8(%%edi,%%edx,) \n\t"
4667 "movq %%mm0, %%mm1 \n\t" /* prep 1st add at top of loop */
4670 : "=a" (dummy_value_a
), /* 0 // output regs (dummy) */
4671 "=D" (dummy_value_D
) /* 1 */
4673 : "0" (bpp
), /* eax // input regs */
4676 : "%edx", "%esi" /* clobber list */
4677 #if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
/* 16-bit step (loop sub_2lp): four 2-byte groups per quadword, with the */
/* group masks held in mm7, mm6 and mm5 */
4686 _ActiveMask
.use
= 0x00000000ffff0000LL
;
4687 _ShiftBpp
.use
= 16; /* == 2 * 8 */
4688 _ShiftRem
.use
= 48; /* == 64 - 16 */
4690 __asm__
__volatile__ (
4691 "movq _ActiveMask, %%mm7 \n\t" /* load _ActiveMask for 2nd */
4692 /* active byte group */
4693 "movl _dif, %%edx \n\t"
4694 "movq %%mm7, %%mm6 \n\t"
4695 /* preload "movl row, %%edi \n\t" */
4696 "psllq _ShiftBpp, %%mm6 \n\t" /* move mask in mm6 to cover */
4697 /* 3rd active byte group */
4698 "movl %%edi, %%esi \n\t" /* lp = row */
4699 "movq %%mm6, %%mm5 \n\t"
4700 /* preload "movl bpp, %%eax \n\t" */
4701 "addl %%eax, %%edi \n\t" /* rp = row + bpp */
4702 "psllq _ShiftBpp, %%mm5 \n\t" /* move mask in mm5 to cover */
4703 /* 4th active byte group */
4704 /* prime the pump: load the first Raw(x-bpp) data set */
4705 "movq -8(%%edi,%%edx,), %%mm1 \n\t"
4707 "sub_2lp: \n\t" /* shift data for adding first */
4708 "psrlq _ShiftRem, %%mm1 \n\t" /* bpp bytes (no need for mask; */
4709 /* shift clears inactive bytes) */
4710 /* add 1st active group */
4711 "movq (%%edi,%%edx,), %%mm0 \n\t"
4712 "paddb %%mm1, %%mm0 \n\t"
4714 /* add 2nd active group */
4715 "movq %%mm0, %%mm1 \n\t" /* mov updated Raws to mm1 */
4716 "psllq _ShiftBpp, %%mm1 \n\t" /* shift data to pos. correctly */
4717 "pand %%mm7, %%mm1 \n\t" /* mask to use 2nd active group */
4718 "paddb %%mm1, %%mm0 \n\t"
4720 /* add 3rd active group */
4721 "movq %%mm0, %%mm1 \n\t" /* mov updated Raws to mm1 */
4722 "psllq _ShiftBpp, %%mm1 \n\t" /* shift data to pos. correctly */
4723 "pand %%mm6, %%mm1 \n\t" /* mask to use 3rd active group */
4724 "paddb %%mm1, %%mm0 \n\t"
4726 /* add 4th active group */
4727 "movq %%mm0, %%mm1 \n\t" /* mov updated Raws to mm1 */
4728 "psllq _ShiftBpp, %%mm1 \n\t" /* shift data to pos. correctly */
4729 "pand %%mm5, %%mm1 \n\t" /* mask to use 4th active group */
4730 "addl $8, %%edx \n\t"
4731 "paddb %%mm1, %%mm0 \n\t"
4732 "cmpl _MMXLength, %%edx \n\t"
4733 "movq %%mm0, -8(%%edi,%%edx,) \n\t" /* write updated Raws to array */
4734 "movq %%mm0, %%mm1 \n\t" /* prep 1st add at top of loop */
4737 : "=a" (dummy_value_a
), /* 0 // output regs (dummy) */
4738 "=D" (dummy_value_D
) /* 1 */
4740 : "0" (bpp
), /* eax // input regs */
4743 : "%edx", "%esi" /* clobber list */
4744 #if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4745 , "%mm0", "%mm1", "%mm5", "%mm6", "%mm7"
/* whole-quadword variant: mm7 carries the previously decoded 8 bytes; */
/* eight quadwords are chained per pass (64 bytes), after which the */
/* remaining quadwords up to _MMXLength are handled one at a time */
4753 __asm__
__volatile__ (
4754 /* preload "movl row, %%edi \n\t" */
4755 "movl _dif, %%edx \n\t"
4756 "movl %%edi, %%esi \n\t" /* lp = row */
4757 /* preload "movl bpp, %%eax \n\t" */
4758 "addl %%eax, %%edi \n\t" /* rp = row + bpp */
4759 "movl _MMXLength, %%ecx \n\t"
4761 /* prime the pump: load the first Raw(x-bpp) data set */
4762 "movq -8(%%edi,%%edx,), %%mm7 \n\t"
4763 "andl $0x0000003f, %%ecx \n\t" /* calc bytes over mult of 64 */
4766 "movq (%%edi,%%edx,), %%mm0 \n\t" /* load Sub(x) for 1st 8 bytes */
4767 "paddb %%mm7, %%mm0 \n\t"
4768 "movq 8(%%edi,%%edx,), %%mm1 \n\t" /* load Sub(x) for 2nd 8 bytes */
4769 "movq %%mm0, (%%edi,%%edx,) \n\t" /* write Raw(x) for 1st 8 bytes */
4771 /* Now mm0 will be used as Raw(x-bpp) for the 2nd group of 8 bytes. */
4772 /* This will be repeated for each group of 8 bytes with the 8th */
4773 /* group being used as the Raw(x-bpp) for the 1st group of the */
4776 "paddb %%mm0, %%mm1 \n\t"
4777 "movq 16(%%edi,%%edx,), %%mm2 \n\t" /* load Sub(x) for 3rd 8 bytes */
4778 "movq %%mm1, 8(%%edi,%%edx,) \n\t" /* write Raw(x) for 2nd 8 bytes */
4779 "paddb %%mm1, %%mm2 \n\t"
4780 "movq 24(%%edi,%%edx,), %%mm3 \n\t" /* load Sub(x) for 4th 8 bytes */
4781 "movq %%mm2, 16(%%edi,%%edx,) \n\t" /* write Raw(x) for 3rd 8 bytes */
4782 "paddb %%mm2, %%mm3 \n\t"
4783 "movq 32(%%edi,%%edx,), %%mm4 \n\t" /* load Sub(x) for 5th 8 bytes */
4784 "movq %%mm3, 24(%%edi,%%edx,) \n\t" /* write Raw(x) for 4th 8 bytes */
4785 "paddb %%mm3, %%mm4 \n\t"
4786 "movq 40(%%edi,%%edx,), %%mm5 \n\t" /* load Sub(x) for 6th 8 bytes */
4787 "movq %%mm4, 32(%%edi,%%edx,) \n\t" /* write Raw(x) for 5th 8 bytes */
4788 "paddb %%mm4, %%mm5 \n\t"
4789 "movq 48(%%edi,%%edx,), %%mm6 \n\t" /* load Sub(x) for 7th 8 bytes */
4790 "movq %%mm5, 40(%%edi,%%edx,) \n\t" /* write Raw(x) for 6th 8 bytes */
4791 "paddb %%mm5, %%mm6 \n\t"
4792 "movq 56(%%edi,%%edx,), %%mm7 \n\t" /* load Sub(x) for 8th 8 bytes */
4793 "movq %%mm6, 48(%%edi,%%edx,) \n\t" /* write Raw(x) for 7th 8 bytes */
4794 "addl $64, %%edx \n\t"
4795 "paddb %%mm6, %%mm7 \n\t"
4796 "cmpl %%ecx, %%edx \n\t"
4797 "movq %%mm7, -8(%%edi,%%edx,) \n\t" /* write Raw(x) for 8th 8 bytes */
4800 "cmpl _MMXLength, %%edx \n\t"
4804 "movq (%%edi,%%edx,), %%mm0 \n\t"
4805 "addl $8, %%edx \n\t"
4806 "paddb %%mm7, %%mm0 \n\t"
4807 "cmpl _MMXLength, %%edx \n\t"
4808 "movq %%mm0, -8(%%edi,%%edx,) \n\t" /* -8 to offset early addl edx */
4809 "movq %%mm0, %%mm7 \n\t" /* move calculated Raw(x) data */
4810 /* to mm7 to be new Raw(x-bpp) */
4816 : "=a" (dummy_value_a
), /* 0 // output regs (dummy) */
4817 "=D" (dummy_value_D
) /* 1 */
4819 : "0" (bpp
), /* eax // input regs */
4822 : "%ecx", "%edx", "%esi" /* clobber list */
4823 #if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4824 , "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7"
4830 default: /* bpp greater than 8 bytes GRR BOGUS */
/* adds the quadword at lp + edx straight onto the quadword at rp + edx; */
/* presumably safe only because bpp > 8 keeps the two from overlapping */
/* -- confirm */
4832 __asm__
__volatile__ (
4833 "movl _dif, %%edx \n\t"
4834 /* preload "movl row, %%edi \n\t" */
4835 "movl %%edi, %%esi \n\t" /* lp = row */
4836 /* preload "movl bpp, %%eax \n\t" */
4837 "addl %%eax, %%edi \n\t" /* rp = row + bpp */
4840 "movq (%%edi,%%edx,), %%mm0 \n\t"
4841 "movq (%%esi,%%edx,), %%mm1 \n\t"
4842 "addl $8, %%edx \n\t"
4843 "paddb %%mm1, %%mm0 \n\t"
4844 "cmpl _MMXLength, %%edx \n\t"
4845 "movq %%mm0, -8(%%edi,%%edx,) \n\t" /* mov does not affect flags; */
4846 /* -8 to offset addl edx */
4849 : "=a" (dummy_value_a
), /* 0 // output regs (dummy) */
4850 "=D" (dummy_value_D
) /* 1 */
4852 : "0" (bpp
), /* eax // input regs */
4855 : "%edx", "%esi" /* clobber list */
4856 #if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4863 } /* end switch (bpp) */
/* Tail: bytewise loop over the offsets from _MMXLength up to _FullLength */
/* that the MMX loops did not cover, followed by EMMS */
4865 __asm__
__volatile__ (
4866 "movl _MMXLength, %%edx \n\t"
4867 /* pre "movl row, %%edi \n\t" */
4868 "cmpl _FullLength, %%edx \n\t"
4871 "movl %%edi, %%esi \n\t" /* lp = row */
4872 /* pre "movl bpp, %%eax \n\t" */
4873 "addl %%eax, %%edi \n\t" /* rp = row + bpp */
4874 "xorl %%eax, %%eax \n\t"
4877 "movb (%%esi,%%edx,), %%al \n\t"
4878 "addb %%al, (%%edi,%%edx,) \n\t"
4880 "cmpl _FullLength, %%edx \n\t"
4884 "EMMS \n\t" /* end MMX instructions */
4886 : "=a" (dummy_value_a
), /* 0 // output regs (dummy) */
4887 "=D" (dummy_value_D
) /* 1 */
4889 : "0" (bpp
), /* eax // input regs */
4892 : "%edx", "%esi" /* clobber list */
4895 } /* end of png_read_filter_row_mmx_sub() */
4901 /*===========================================================================*/
4903 /* P N G _ R E A D _ F I L T E R _ R O W _ M M X _ U P */
4905 /*===========================================================================*/
4907 /* Optimized code for PNG Up filter decoder */
/* Undo the PNG Up filter on one row, in place: for every byte offset i in
 * [0, row_info->rowbytes) the byte prev_row[i] is added to row[i] (mod 256).
 *
 * A single asm statement does the whole row in four stages, with ebx as the
 * running byte offset and ecx as the end offset of the current stage:
 *   up_lp1   bytewise, until row + ebx reaches an 8-byte boundary
 *   (64-byte) unrolled MMX loop, eight quadwords per pass
 *   up_lpA   MMX loop, one quadword per pass
 *   up_lp2   bytewise, for the final bytes
 * EMMS is issued at the end.
 */
4909 static void /* PRIVATE */
4910 png_read_filter_row_mmx_up(png_row_infop row_info
, png_bytep row
,
4914 int dummy_value_d
; /* fix 'forbidden register 3 (dx) was spilled' error */
4918 len
= row_info
->rowbytes
; /* number of bytes to filter */
4920 __asm__
__volatile__ (
4921 /* pre "movl row, %%edi \n\t" */
4922 /* get # of bytes to alignment */
/* ecx = ((row + 7) & ~7) - row; ebx = 0 is the running offset */
4926 "movl %%edi, %%ecx \n\t"
4927 "xorl %%ebx, %%ebx \n\t"
4928 "addl $0x7, %%ecx \n\t"
4929 "xorl %%eax, %%eax \n\t"
4930 "andl $0xfffffff8, %%ecx \n\t"
4931 /* pre "movl prev_row, %%esi \n\t" */
4932 "subl %%edi, %%ecx \n\t"
4935 "up_lp1: \n\t" /* fix alignment */
4936 "movb (%%edi,%%ebx,), %%al \n\t"
4937 "addb (%%esi,%%ebx,), %%al \n\t"
4939 "cmpl %%ecx, %%ebx \n\t"
4940 "movb %%al, -1(%%edi,%%ebx,) \n\t" /* mov does not affect flags; -1 to */
4941 "jb up_lp1 \n\t" /* offset incl ebx */
/* edx = (len - ebx) & 0x3f is the leftover after whole 64-byte passes; */
/* ecx = len - edx is the end offset for the unrolled loop */
4944 /* pre "movl len, %%edx \n\t" */
4945 "movl %%edx, %%ecx \n\t"
4946 "subl %%ebx, %%edx \n\t" /* subtract alignment fix */
4947 "andl $0x0000003f, %%edx \n\t" /* calc bytes over mult of 64 */
4948 "subl %%edx, %%ecx \n\t" /* drop over bytes from length */
4950 /* unrolled loop - use all MMX registers and interleave to reduce */
4951 /* number of branch instructions (loops) and reduce partial stalls */
4953 "movq (%%esi,%%ebx,), %%mm1 \n\t"
4954 "movq (%%edi,%%ebx,), %%mm0 \n\t"
4955 "movq 8(%%esi,%%ebx,), %%mm3 \n\t"
4956 "paddb %%mm1, %%mm0 \n\t"
4957 "movq 8(%%edi,%%ebx,), %%mm2 \n\t"
4958 "movq %%mm0, (%%edi,%%ebx,) \n\t"
4959 "paddb %%mm3, %%mm2 \n\t"
4960 "movq 16(%%esi,%%ebx,), %%mm5 \n\t"
4961 "movq %%mm2, 8(%%edi,%%ebx,) \n\t"
4962 "movq 16(%%edi,%%ebx,), %%mm4 \n\t"
4963 "movq 24(%%esi,%%ebx,), %%mm7 \n\t"
4964 "paddb %%mm5, %%mm4 \n\t"
4965 "movq 24(%%edi,%%ebx,), %%mm6 \n\t"
4966 "movq %%mm4, 16(%%edi,%%ebx,) \n\t"
4967 "paddb %%mm7, %%mm6 \n\t"
4968 "movq 32(%%esi,%%ebx,), %%mm1 \n\t"
4969 "movq %%mm6, 24(%%edi,%%ebx,) \n\t"
4970 "movq 32(%%edi,%%ebx,), %%mm0 \n\t"
4971 "movq 40(%%esi,%%ebx,), %%mm3 \n\t"
4972 "paddb %%mm1, %%mm0 \n\t"
4973 "movq 40(%%edi,%%ebx,), %%mm2 \n\t"
4974 "movq %%mm0, 32(%%edi,%%ebx,) \n\t"
4975 "paddb %%mm3, %%mm2 \n\t"
4976 "movq 48(%%esi,%%ebx,), %%mm5 \n\t"
4977 "movq %%mm2, 40(%%edi,%%ebx,) \n\t"
4978 "movq 48(%%edi,%%ebx,), %%mm4 \n\t"
4979 "movq 56(%%esi,%%ebx,), %%mm7 \n\t"
4980 "paddb %%mm5, %%mm4 \n\t"
4981 "movq 56(%%edi,%%ebx,), %%mm6 \n\t"
4982 "movq %%mm4, 48(%%edi,%%ebx,) \n\t"
4983 "addl $64, %%ebx \n\t"
4984 "paddb %%mm7, %%mm6 \n\t"
4985 "cmpl %%ecx, %%ebx \n\t"
4986 "movq %%mm6, -8(%%edi,%%ebx,) \n\t" /* (+56)movq does not affect flags; */
4987 "jb up_loop \n\t" /* -8 to offset addl ebx */
4989 "cmpl $0, %%edx \n\t" /* test for bytes over mult of 64 */
4992 "cmpl $8, %%edx \n\t" /* test for less than 8 bytes */
4993 "jb up_lt8 \n\t" /* [added by lcreeve@netins.net] */
/* extend ecx by the leftover, then trim it to a multiple of 8: */
/* ecx becomes the end offset for the one-quadword loop, edx the rest */
4995 "addl %%edx, %%ecx \n\t"
4996 "andl $0x00000007, %%edx \n\t" /* calc bytes over mult of 8 */
4997 "subl %%edx, %%ecx \n\t" /* drop over bytes from length */
5000 "up_lpA: \n\t" /* use MMX regs to update 8 bytes sim. */
5001 "movq (%%esi,%%ebx,), %%mm1 \n\t"
5002 "movq (%%edi,%%ebx,), %%mm0 \n\t"
5003 "addl $8, %%ebx \n\t"
5004 "paddb %%mm1, %%mm0 \n\t"
5005 "cmpl %%ecx, %%ebx \n\t"
5006 "movq %%mm0, -8(%%edi,%%ebx,) \n\t" /* movq does not affect flags; -8 to */
5007 "jb up_lpA \n\t" /* offset add ebx */
5008 "cmpl $0, %%edx \n\t" /* test for bytes over mult of 8 */
5012 "xorl %%eax, %%eax \n\t"
5013 "addl %%edx, %%ecx \n\t" /* move over byte count into counter */
5015 "up_lp2: \n\t" /* use x86 regs for remaining bytes */
5016 "movb (%%edi,%%ebx,), %%al \n\t"
5017 "addb (%%esi,%%ebx,), %%al \n\t"
5019 "cmpl %%ecx, %%ebx \n\t"
5020 "movb %%al, -1(%%edi,%%ebx,) \n\t" /* mov does not affect flags; -1 to */
5021 "jb up_lp2 \n\t" /* offset inc ebx */
5024 "EMMS \n\t" /* conversion of filtered row complete */
5029 : "=d" (dummy_value_d
), /* 0 // output regs (dummy) */
5030 "=S" (dummy_value_S
), /* 1 */
5031 "=D" (dummy_value_D
) /* 2 */
5033 : "0" (len
), /* edx // input regs */
5034 "1" (prev_row
), /* esi */
5037 : "%eax", "%ebx", "%ecx" /* clobber list (no input regs!) */
5039 #if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
5040 , "%mm0", "%mm1", "%mm2", "%mm3"
5041 , "%mm4", "%mm5", "%mm6", "%mm7"
5045 } /* end of png_read_filter_row_mmx_up() */
5047 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
5052 /*===========================================================================*/
5054 /* P N G _ R E A D _ F I L T E R _ R O W */
5056 /*===========================================================================*/
5059 /* Optimized png_read_filter_row routines */
/* Reverse the PNG row filter identified by filter on row, in place.
 *
 * png_ptr   supplies asm_flags, the MMX thresholds and the warning sink.
 * row_info  supplies pixel_depth and rowbytes for the row.
 * row       the filtered row, overwritten with the reconstructed bytes.
 * prev_row  the previous reconstructed row, read by the Up, Avg and Paeth
 *           filters.
 * filter    one of the PNG_FILTER_VALUE_* constants; any other value only
 *           produces a png_warning.
 *
 * For Sub, Up, Avg and Paeth the matching png_read_filter_row_mmx_*()
 * routine is used when its PNG_ASM_FLAG_MMX_READ_FILTER_* bit is set in
 * png_ptr->asm_flags and both row_info->pixel_depth and row_info->rowbytes
 * reach the thresholds held in png_ptr; the plain C loops after each call
 * are the non-MMX path.
 */
5062 png_read_filter_row(png_structp png_ptr
, png_row_infop row_info
, png_bytep
5063 row
, png_bytep prev_row
, int filter
)
5069 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
5070 /* GRR: these are superseded by png_ptr->asm_flags: */
5071 #define UseMMX_sub 1 /* GRR: converted 20000730 */
5072 #define UseMMX_up 1 /* GRR: converted 20000729 */
5073 #define UseMMX_avg 1 /* GRR: converted 20000828 (+ 16-bit bugfix 20000916) */
5074 #define UseMMX_paeth 1 /* GRR: converted 20000828 */
/* _mmx_supported == 2 is treated here as not yet initialized */
5076 if (_mmx_supported
== 2) {
5077 /* this should have happened in png_init_mmx_flags() already */
5078 #if !defined(PNG_1_0_X)
5079 png_warning(png_ptr
, "asm_flags may not have been initialized");
5083 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
5086 png_debug(1, "in png_read_filter_row (pnggccrd.c)\n");
/* debug only: build a short filter name in filnm for the trace below */
5089 case 0: sprintf(filnm
, "none");
5091 case 1: sprintf(filnm
, "sub-%s",
5092 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
5093 #if !defined(PNG_1_0_X)
5094 (png_ptr
->asm_flags
& PNG_ASM_FLAG_MMX_READ_FILTER_SUB
)? "MMX" :
5099 case 2: sprintf(filnm
, "up-%s",
5100 #ifdef PNG_ASSEMBLER_CODE_SUPPORTED
5101 #if !defined(PNG_1_0_X)
5102 (png_ptr
->asm_flags
& PNG_ASM_FLAG_MMX_READ_FILTER_UP
)? "MMX" :
5107 case 3: sprintf(filnm
, "avg-%s",
5108 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
5109 #if !defined(PNG_1_0_X)
5110 (png_ptr
->asm_flags
& PNG_ASM_FLAG_MMX_READ_FILTER_AVG
)? "MMX" :
5115 case 4: sprintf(filnm
, "Paeth-%s",
5116 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
5117 #if !defined(PNG_1_0_X)
5118 (png_ptr
->asm_flags
& PNG_ASM_FLAG_MMX_READ_FILTER_PAETH
)? "MMX":
5123 default: sprintf(filnm
, "unknw");
5126 png_debug2(0, "row_number=%5ld, %5s, ", png_ptr
->row_number
, filnm
);
5127 png_debug1(0, "row=0x%08lx, ", (unsigned long)row
);
5128 png_debug2(0, "pixdepth=%2d, bytes=%d, ", (int)row_info
->pixel_depth
,
5129 (int)((row_info
->pixel_depth
+ 7) >> 3));
5130 png_debug1(0,"rowbytes=%8ld\n", row_info
->rowbytes
);
5131 #endif /* PNG_DEBUG */
5135 case PNG_FILTER_VALUE_NONE
:
/* Sub: add the byte bpp positions to the left, starting at offset bpp */
5138 case PNG_FILTER_VALUE_SUB
:
5139 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
5140 #if !defined(PNG_1_0_X)
5141 if ((png_ptr
->asm_flags
& PNG_ASM_FLAG_MMX_READ_FILTER_SUB
) &&
5142 (row_info
->pixel_depth
>= png_ptr
->mmx_bitdepth_threshold
) &&
5143 (row_info
->rowbytes
>= png_ptr
->mmx_rowbytes_threshold
))
5148 png_read_filter_row_mmx_sub(row_info
, row
);
5151 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
5154 png_uint_32 istop
= row_info
->rowbytes
;
5155 png_uint_32 bpp
= (row_info
->pixel_depth
+ 7) >> 3;
5156 png_bytep rp
= row
+ bpp
;
5159 for (i
= bpp
; i
< istop
; i
++)
5161 *rp
= (png_byte
)(((int)(*rp
) + (int)(*lp
++)) & 0xff);
5164 } /* end !UseMMX_sub */
/* Up: add the byte at the same offset in prev_row */
5167 case PNG_FILTER_VALUE_UP
:
5168 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
5169 #if !defined(PNG_1_0_X)
5170 if ((png_ptr
->asm_flags
& PNG_ASM_FLAG_MMX_READ_FILTER_UP
) &&
5171 (row_info
->pixel_depth
>= png_ptr
->mmx_bitdepth_threshold
) &&
5172 (row_info
->rowbytes
>= png_ptr
->mmx_rowbytes_threshold
))
5177 png_read_filter_row_mmx_up(row_info
, row
, prev_row
);
5180 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
5183 png_uint_32 istop
= row_info
->rowbytes
;
5185 png_bytep pp
= prev_row
;
5187 for (i
= 0; i
< istop
; ++i
)
5189 *rp
= (png_byte
)(((int)(*rp
) + (int)(*pp
++)) & 0xff);
5192 } /* end !UseMMX_up */
/* Avg: the first bpp bytes add half of the byte above; the remaining */
/* rowbytes - bpp bytes add half of the sum of the bytes above and left */
5195 case PNG_FILTER_VALUE_AVG
:
5196 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
5197 #if !defined(PNG_1_0_X)
5198 if ((png_ptr
->asm_flags
& PNG_ASM_FLAG_MMX_READ_FILTER_AVG
) &&
5199 (row_info
->pixel_depth
>= png_ptr
->mmx_bitdepth_threshold
) &&
5200 (row_info
->rowbytes
>= png_ptr
->mmx_rowbytes_threshold
))
5205 png_read_filter_row_mmx_avg(row_info
, row
, prev_row
);
5208 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
5212 png_bytep pp
= prev_row
;
5214 png_uint_32 bpp
= (row_info
->pixel_depth
+ 7) >> 3;
5215 png_uint_32 istop
= row_info
->rowbytes
- bpp
;
5217 for (i
= 0; i
< bpp
; i
++)
5219 *rp
= (png_byte
)(((int)(*rp
) +
5220 ((int)(*pp
++) >> 1)) & 0xff);
5224 for (i
= 0; i
< istop
; i
++)
5226 *rp
= (png_byte
)(((int)(*rp
) +
5227 ((int)(*pp
++ + *lp
++) >> 1)) & 0xff);
5230 } /* end !UseMMX_avg */
/* Paeth: the first bpp bytes add the byte above; the remaining bytes */
/* add whichever of a, b, c the pa/pb/pc comparison below selects */
5233 case PNG_FILTER_VALUE_PAETH
:
5234 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
5235 #if !defined(PNG_1_0_X)
5236 if ((png_ptr
->asm_flags
& PNG_ASM_FLAG_MMX_READ_FILTER_PAETH
) &&
5237 (row_info
->pixel_depth
>= png_ptr
->mmx_bitdepth_threshold
) &&
5238 (row_info
->rowbytes
>= png_ptr
->mmx_rowbytes_threshold
))
5243 png_read_filter_row_mmx_paeth(row_info
, row
, prev_row
);
5246 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
5250 png_bytep pp
= prev_row
;
5252 png_bytep cp
= prev_row
;
5253 png_uint_32 bpp
= (row_info
->pixel_depth
+ 7) >> 3;
5254 png_uint_32 istop
= row_info
->rowbytes
- bpp
;
5256 for (i
= 0; i
< bpp
; i
++)
5258 *rp
= (png_byte
)(((int)(*rp
) + (int)(*pp
++)) & 0xff);
5262 for (i
= 0; i
< istop
; i
++) /* use leftover rp,pp */
5264 int a
, b
, c
, pa
, pb
, pc
, p
;
/* pa, pb, pc become the absolute values of p, pc and p + pc */
5278 pa
= p
< 0 ? -p
: p
;
5279 pb
= pc
< 0 ? -pc
: pc
;
5280 pc
= (p
+ pc
) < 0 ? -(p
+ pc
) : p
+ pc
;
5284 if (pa <= pb && pa <= pc)
/* predictor choice: a if pa is smallest, else b if pb <= pc, else c */
5292 p
= (pa
<= pb
&& pa
<= pc
) ? a
: (pb
<= pc
) ? b
: c
;
5294 *rp
= (png_byte
)(((int)(*rp
) + p
) & 0xff);
5297 } /* end !UseMMX_paeth */
5301 png_warning(png_ptr
, "Ignoring bad row-filter type");
5307 #endif /* PNG_HAVE_ASSEMBLER_READ_FILTER_ROW */
5310 /*===========================================================================*/
5312 /* P N G _ M M X _ S U P P O R T */
5314 /*===========================================================================*/
5316 /* GRR NOTES: (1) the following code assumes 386 or better (pushfl/popfl)
5317 * (2) all instructions compile with gcc 2.7.2.3 and later
5318 * (3) the function is moved down here to prevent gcc from
5319 * inlining it in multiple places and then barfing be-
5320 * cause the ".NOT_SUPPORTED" label is multiply defined
5321 * [is there a way to signal that a *single* function should
5322 * not be inlined? is there a way to modify the label for
5323 * each inlined instance, e.g., by appending _1, _2, etc.?
5324 * maybe if don't use leading "." in label name? (nope...sigh)]
/* Report whether the CPU supports MMX.
 *
 * When PNG_MMX_CODE_SUPPORTED is defined, the asm below
 *   1. toggles bit 21 (ID) of EFLAGS and checks that the change sticks,
 *      which shows that the CPUID instruction exists;
 *   2. runs CPUID with eax = 0 and requires the returned eax to be >= 1;
 *   3. runs CPUID with eax = 1 and tests bit 23 of edx (MMX);
 * and stores 1 (supported) or 0 (not supported) in _mmx_supported.
 * Without PNG_MMX_CODE_SUPPORTED the asm is compiled out and
 * _mmx_supported is returned as it stands.
 *
 * Returns the value of _mmx_supported.
 */
5328 png_mmx_support(void)
5330 #if defined(PNG_MMX_CODE_SUPPORTED)
/* NOTE(review): the asm stores to _mmx_supported by symbol name, but the */
/* variable is neither an output operand nor covered by a memory clobber */
/* (that clobber is commented out below); confirm the compiler cannot */
/* return a stale copy at the final return statement. */
5331 __asm__
__volatile__ (
5332 "pushl %%ebx \n\t" /* ebx gets clobbered by CPUID instruction */
5333 "pushl %%ecx \n\t" /* so does ecx... */
5334 "pushl %%edx \n\t" /* ...and edx (but ecx & edx safe on Linux) */
5335 /* ".byte 0x66 \n\t" // convert 16-bit pushf to 32-bit pushfd */
5336 /* "pushf \n\t" // 16-bit pushf */
5337 "pushfl \n\t" /* save Eflag to stack */
5338 "popl %%eax \n\t" /* get Eflag from stack into eax */
5339 "movl %%eax, %%ecx \n\t" /* make another copy of Eflag in ecx */
5340 "xorl $0x200000, %%eax \n\t" /* toggle ID bit in Eflag (i.e., bit 21) */
5341 "pushl %%eax \n\t" /* save modified Eflag back to stack */
5342 /* ".byte 0x66 \n\t" // convert 16-bit popf to 32-bit popfd */
5343 /* "popf \n\t" // 16-bit popf */
5344 "popfl \n\t" /* restore modified value to Eflag reg */
5345 "pushfl \n\t" /* save Eflag to stack */
5346 "popl %%eax \n\t" /* get Eflag from stack */
5347 "pushl %%ecx \n\t" /* save original Eflag to stack */
5348 "popfl \n\t" /* restore original Eflag */
5349 "xorl %%ecx, %%eax \n\t" /* compare new Eflag with original Eflag */
5350 "jz 0f \n\t" /* if same, CPUID instr. is not supported */
5352 "xorl %%eax, %%eax \n\t" /* set eax to zero */
5353 /* ".byte 0x0f, 0xa2 \n\t" // CPUID instruction (two-byte opcode) */
5354 "cpuid \n\t" /* get the CPU identification info */
5355 "cmpl $1, %%eax \n\t" /* make sure eax return non-zero value */
5356 "jl 0f \n\t" /* if eax is zero, MMX is not supported */
5358 "xorl %%eax, %%eax \n\t" /* set eax to zero and... */
5359 "incl %%eax \n\t" /* ...increment eax to 1. This pair is */
5360 /* faster than the instruction "mov eax, 1" */
5361 "cpuid \n\t" /* get the CPU identification info again */
5362 "andl $0x800000, %%edx \n\t" /* mask out all bits but MMX bit (23) */
5363 "cmpl $0, %%edx \n\t" /* 0 = MMX not supported */
5364 "jz 0f \n\t" /* non-zero = yes, MMX IS supported */
5366 "movl $1, %%eax \n\t" /* set return value to 1 */
5367 "jmp 1f \n\t" /* DONE: have MMX support */
5369 "0: \n\t" /* .NOT_SUPPORTED: target label for jump instructions */
5370 "movl $0, %%eax \n\t" /* set return value to 0 */
5371 "1: \n\t" /* .RETURN: target label for jump instructions */
5372 "movl %%eax, _mmx_supported \n\t" /* save in global static variable, too */
5373 "popl %%edx \n\t" /* restore edx */
5374 "popl %%ecx \n\t" /* restore ecx */
5375 "popl %%ebx \n\t" /* restore ebx */
5377 /* "ret \n\t" // DONE: no MMX support */
5378 /* (fall through to standard C "ret") */
5380 : /* output list (none) */
5382 : /* any variables used on input (none) */
5384 : "%eax" /* clobber list */
5385 /* , "%ebx", "%ecx", "%edx" // GRR: we handle these manually */
5386 /* , "memory" // if write to a variable gcc thought was in a reg */
5387 /* , "cc" // "condition codes" (flag bits) */
5391 #endif /* PNG_MMX_CODE_SUPPORTED */
5393 return _mmx_supported
;
5397 #endif /* PNG_USE_PNGGCCRD */