1 /* pnggccrd.c - mixed C/assembler version of utilities to read a PNG file
3 * For Intel x86 CPU (Pentium-MMX or later) and GNU C compiler.
5 * See http://www.intel.com/drg/pentiumII/appnotes/916/916.htm
6 * and http://www.intel.com/drg/pentiumII/appnotes/923/923.htm
7 * for Intel's performance analysis of the MMX vs. non-MMX code.
9 * libpng version 1.2.7 - September 12, 2004
10 * For conditions of distribution and use, see copyright notice in png.h
11 * Copyright (c) 1998-2004 Glenn Randers-Pehrson
12 * Copyright (c) 1998, Intel Corporation
14 * Based on MSVC code contributed by Nirav Chhatrapati, Intel Corp., 1998.
15 * Interface to libpng contributed by Gilles Vollant, 1999.
16 * GNU C port by Greg Roelofs, 1999-2001.
18 * Lines 2350-4300 converted in place with intel2gas 1.3.1:
20 * intel2gas -mdI pnggccrd.c.partially-msvc -o pnggccrd.c
22 * and then cleaned up by hand. See http://hermes.terminal.at/intel2gas/ .
24 * NOTE: A sufficiently recent version of GNU as (or as.exe under DOS/Windows)
25 * is required to assemble the newer MMX instructions such as movq.
 * For djgpp, see:
28 * ftp://ftp.simtel.net/pub/simtelnet/gnu/djgpp/v2gnu/bnu281b.zip
30 * (or a later version in the same directory). For Linux, check your
31 * distribution's web site(s) or try these links:
33 * http://rufus.w3.org/linux/RPM/binutils.html
34 * http://www.debian.org/Packages/stable/devel/binutils.html
35 * ftp://ftp.slackware.com/pub/linux/slackware/slackware/slakware/d1/
38 * For other platforms, see the main GNU site:
40 * ftp://ftp.gnu.org/pub/gnu/binutils/
42 * Version 2.5.2l.15 is definitely too old...
46 * TEMPORARY PORTING NOTES AND CHANGELOG (mostly by Greg Roelofs)
47 * =====================================
50 * - fixed sign error in post-MMX cleanup code (16- & 32-bit cases)
53 * - additional optimizations (possible or definite):
54 * x [DONE] write MMX code for 64-bit case (pixel_bytes == 8) [not tested]
55 * - write MMX code for 48-bit case (pixel_bytes == 6)
56 * - figure out what's up with 24-bit case (pixel_bytes == 3):
57 * why subtract 8 from width_mmx in the pass 4/5 case?
58 * (only width_mmx case) (near line 1606)
59 * x [DONE] replace pixel_bytes within each block with the true
60 * constant value (or are compilers smart enough to do that?)
61 * - rewrite all MMX interlacing code so it's aligned with
62 * the *beginning* of the row buffer, not the end. This
63 * would not only allow one to eliminate half of the memory
64 * writes for odd passes (that is, pass == odd), it may also
65 * eliminate some unaligned-data-access exceptions (assuming
66 * there's a penalty for not aligning 64-bit accesses on
67 * 64-bit boundaries). The only catch is that the "leftover"
68 * pixel(s) at the end of the row would have to be saved,
69 * but there are enough unused MMX registers in every case,
70 * so this is not a problem. A further benefit is that the
71 * post-MMX cleanup code (C code) in at least some of the
72 * cases could be done within the assembler block.
73 * x [DONE] the "v3 v2 v1 v0 v7 v6 v5 v4" comments are confusing,
74 * inconsistent, and don't match the MMX Programmer's Reference
75 * Manual conventions anyway. They should be changed to
76 * "b7 b6 b5 b4 b3 b2 b1 b0," where b0 indicates the byte that
77 * was lowest in memory (e.g., corresponding to a left pixel)
78 * and b7 is the byte that was highest (e.g., a right pixel).
81 * - Brennan's Guide notwithstanding, gcc under Linux does *not*
82 * want globals prefixed by underscores when referencing them--
83 * i.e., if the variable is const4, then refer to it as const4,
84 * not _const4. This seems to be a djgpp-specific requirement.
85 * Also, such variables apparently *must* be declared outside
86 * of functions; neither static nor automatic variables work if
87 * defined within the scope of a single function, but both
88 * static and truly global (multi-module) variables work fine.
91 * - fixed png_combine_row() non-MMX replication bug (odd passes only?)
92 * - switched from string-concatenation-with-macros to cleaner method of
93 * renaming global variables for djgpp--i.e., always use prefixes in
94 * inlined assembler code (== strings) and conditionally rename the
95 * variables, not the other way around. Hence _const4, _mask8_0, etc.
98 * - fixed mmxsupport()/png_do_read_interlace() first-row bug
99 * This one was severely weird: even though mmxsupport() doesn't touch
100 * ebx (where "row" pointer was stored), it nevertheless managed to zero
101 * the register (even in static/non-fPIC code--see below), which in turn
102 * caused png_do_read_interlace() to return prematurely on the first row of
103 * interlaced images (i.e., without expanding the interlaced pixels).
104 * Inspection of the generated assembly code didn't turn up any clues,
105 * although it did point at a minor optimization (i.e., get rid of
106 * mmx_supported_local variable and just use eax). Possibly the CPUID
107 * instruction is more destructive than it looks? (Not yet checked.)
108 * - "info gcc" was next to useless, so compared fPIC and non-fPIC assembly
109 * listings... Apparently register spillage has to do with ebx, since
110 * it's used to index the global offset table. Commenting it out of the
111 * input-reg lists in png_combine_row() eliminated compiler barfage, so
112 * ifdef'd with __PIC__ macro: if defined, use a global for unmask
115 * - verified CPUID clobberage: 12-char string constant ("GenuineIntel",
116 * "AuthenticAMD", etc.) placed in ebx:ecx:edx. Still need to polish.
119 * - made "diff" variable (now "_dif") global to simplify conversion of
120 * filtering routines (running out of regs, sigh). "diff" is still used
121 * in interlacing routines, however.
122 * - fixed up both versions of mmxsupport() (ORIG_THAT_USED_TO_CLOBBER_EBX
123 * macro determines which is used); original not yet tested.
126 * - when compiling with gcc, be sure to use -fomit-frame-pointer
129 * - fixed a register-name typo in png_do_read_interlace(), default (MMX) case,
130 * pass == 4 or 5, that caused visible corruption of interlaced images
133 * - Various problems were reported with gcc 2.95.2 in the Cygwin environment,
134 * many of the form "forbidden register 0 (ax) was spilled for class AREG."
135 * This is explained at http://gcc.gnu.org/fom_serv/cache/23.html, and
136 * Chuck Wilson supplied a patch involving dummy output registers. See
137 * http://sourceforge.net/bugs/?func=detailbug&bug_id=108741&group_id=5624
138 * for the original (anonymous) SourceForge bug report.
141 * - Chuck Wilson passed along these remaining gcc 2.95.2 errors:
142 * pnggccrd.c: In function `png_combine_row':
143 * pnggccrd.c:525: more than 10 operands in `asm'
144 * pnggccrd.c:669: more than 10 operands in `asm'
145 * pnggccrd.c:828: more than 10 operands in `asm'
146 * pnggccrd.c:994: more than 10 operands in `asm'
147 * pnggccrd.c:1177: more than 10 operands in `asm'
148 * They are all the same problem and can be worked around by using the
149 * global _unmask variable unconditionally, not just in the -fPIC case.
150 * Reportedly earlier versions of gcc also have the problem with more than
151 * 10 operands; they just don't report it. Much strangeness ensues, etc.
154 * - enabled png_read_filter_row_mmx_up() (shortest remaining unconverted
155 * MMX routine); began converting png_read_filter_row_mmx_sub()
156 * - to finish remaining sections:
157 * - clean up indentation and comments
158 * - preload local variables
159 * - add output and input regs (order of former determines numerical
 * index of latter)
161 * - avoid all usage of ebx (including bx, bh, bl) register [20000823]
162 * - remove "$" from addressing of Shift and Mask variables [20000823]
165 * - global union vars causing segfaults in png_read_filter_row_mmx_sub()?
168 * - ARGH, stupid png_read_filter_row_mmx_sub() segfault only happens with
169 * shared-library (-fPIC) version! Code works just fine as part of static
170 * library. Damn damn damn damn damn, should have tested that sooner.
171 * ebx is getting clobbered again (explicitly this time); need to save it
172 * on stack or rewrite asm code to avoid using it altogether. Blargh!
175 * - first section was trickiest; all remaining sections have ebx -> edx now.
176 * (-fPIC works again.) Also added missing underscores to various Shift*
177 * and *Mask* globals and got rid of leading "$" signs.
180 * - added visual separators to help navigate microscopic printed copies
181 * (http://pobox.com/~newt/code/gpr-latest.zip, mode 10); started working
182 * on png_read_filter_row_mmx_avg()
185 * - finished png_read_filter_row_mmx_avg(): only Paeth left! (930 lines...)
186 * What the hell, did png_read_filter_row_mmx_paeth(), too. Comments not
187 * cleaned up/shortened in either routine, but functionality is complete
188 * and seems to be working fine.
191 * - ahhh, figured out last(?) bit of gcc/gas asm-fu: if register is listed
192 * as an input reg (with dummy output variables, etc.), then it *cannot*
193 * also appear in the clobber list or gcc 2.95.2 will barf. The solution
194 * is simple enough...
197 * - bug in png_read_filter_row_mmx_avg(): 16-bit grayscale not handled
198 * correctly (but 48-bit RGB just fine)
201 * - fixed bug in png_read_filter_row_mmx_avg(), bpp == 2 case; three errors:
202 * - "_ShiftBpp.use = 24;" should have been "_ShiftBpp.use = 16;"
203 * - "_ShiftRem.use = 40;" should have been "_ShiftRem.use = 48;"
204 * - "psllq _ShiftRem, %%mm2" should have been "psrlq _ShiftRem, %%mm2"
207 * - added new png_init_mmx_flags() function (here only because it needs to
208 * call mmxsupport(), which should probably become global png_mmxsupport());
209 * modified other MMX routines to run conditionally (png_ptr->asm_flags)
212 * - renamed mmxsupport() to png_mmx_support(), with auto-set of mmx_supported,
213 * and made it public; moved png_init_mmx_flags() to png.c as internal func
216 * - removed dependency on png_read_filter_row_c() (C code already duplicated
217 * within MMX version of png_read_filter_row()) so no longer necessary to
218 * compile it into pngrutil.o
221 * - fixed buffer-overrun bug in png_combine_row() C code (non-MMX)
224 * - eliminated incorrect use of width_mmx in pixel_bytes == 8 case
227 * - more tinkering with clobber list at lines 4529 and 5033, to get
228 * it to compile on gcc-3.4.
231 * - test png_do_read_interlace() 64-bit case (pixel_bytes == 8)
232 * - write MMX code for 48-bit case (pixel_bytes == 6)
233 * - figure out what's up with 24-bit case (pixel_bytes == 3):
234 * why subtract 8 from width_mmx in the pass 4/5 case?
235 * (only width_mmx case) (near line 1606)
236 * - rewrite all MMX interlacing code so it's aligned with beginning
237 * of the row buffer, not the end (see 19991007 for details)
238 * x pick one version of mmxsupport() and get rid of the other
239 * - add error messages to any remaining bogus default cases
240 * - enable pixel_depth == 8 cases in png_read_filter_row()? (test speed)
241 * x add support for runtime enable/disable/query of various MMX routines
 */
247 #if defined(PNG_USE_PNGGCCRD)
/* Forward declaration of the public MMX capability probe.  The changelog at
 * the top of this file records that it was renamed from mmxsupport() and
 * made public.  Its definition is not visible in this part of the file. */
249 int PNGAPI
png_mmx_support(void);
251 #ifdef PNG_USE_LOCAL_ARRAYS
/* Per-pass lookup tables, seven entries each, indexed by png_ptr->pass.
 * The C fallback paths of png_combine_row() read them as follows:
 *   png_pass_start -> initial_val  (first byte offset handled in the row)
 *   png_pass_inc   -> stride       (step between handled offsets)
 *   png_pass_width -> rep_bytes    (number of bytes copied at each step)
 * For pixel depths above 8 bits each value is first multiplied by the pixel
 * size in bytes (BPP2, BPP3, ...).  Comments inside png_combine_row() quote
 * the same values as coming from png.c. */
252 static const int FARDATA png_pass_start
[7] = {0, 4, 0, 2, 0, 1, 0};
253 static const int FARDATA png_pass_inc
[7] = {8, 8, 4, 4, 2, 2, 1};
254 static const int FARDATA png_pass_width
[7] = {8, 4, 4, 2, 2, 1, 1};
257 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
258 /* djgpp, Win32, and Cygwin add their own underscores to global variables,
259 * so define them without: */
/* The inline-assembly strings in this file name these variables with a
 * leading underscore (for example "movq _mask8_0, %%mm0").  When one of the
 * platform macros tested below is defined, each C identifier is renamed to
 * its bare form, leaving the underscore to be supplied by the toolchain.
 * NOTE(review): the #endif lines that close this conditional and the nested
 * PNG_THREAD_UNSAFE_OK conditional are not present in this listing; confirm
 * exactly which names each one covers. */
260 #if defined(__DJGPP__) || defined(WIN32) || defined(__CYGWIN__)
261 # define _mmx_supported mmx_supported
262 # define _const4 const4
263 # define _const6 const6
264 # define _mask8_0 mask8_0
265 # define _mask16_1 mask16_1
266 # define _mask16_0 mask16_0
267 # define _mask24_2 mask24_2
268 # define _mask24_1 mask24_1
269 # define _mask24_0 mask24_0
270 # define _mask32_3 mask32_3
271 # define _mask32_2 mask32_2
272 # define _mask32_1 mask32_1
273 # define _mask32_0 mask32_0
274 # define _mask48_5 mask48_5
275 # define _mask48_4 mask48_4
276 # define _mask48_3 mask48_3
277 # define _mask48_2 mask48_2
278 # define _mask48_1 mask48_1
279 # define _mask48_0 mask48_0
280 # define _LBCarryMask LBCarryMask
281 # define _HBClearMask HBClearMask
282 # define _ActiveMask ActiveMask
283 # define _ActiveMask2 ActiveMask2
284 # define _ActiveMaskEnd ActiveMaskEnd
285 # define _ShiftBpp ShiftBpp
286 # define _ShiftRem ShiftRem
287 #ifdef PNG_THREAD_UNSAFE_OK
288 # define _unmask unmask
289 # define _FullLength FullLength
290 # define _MMXLength MMXLength
292 # define _patemp patemp
293 # define _pbtemp pbtemp
294 # define _pctemp pctemp
299 /* These constants are used in the inlined MMX assembly code.
300 Ignore gcc's "At top level: defined but not used" warnings. */
302 /* GRR 20000706: originally _unmask was needed only when compiling with -fPIC,
303 * since that case uses the %ebx register for indexing the Global Offset Table
304 * and there were no other registers available. But gcc 2.95 and later emit
305 * "more than 10 operands in `asm'" errors when %ebx is used to preload unmask
306 * in the non-PIC case, so we'll just use the global unconditionally now.
 */
308 #ifdef PNG_THREAD_UNSAFE_OK
/* Byte-lane selector tables for the MMX paths of png_combine_row().
 *
 * The assembly broadcasts the low byte of _unmask (set to ~mask) into all
 * eight bytes of an MMX register, ANDs that with one of these tables and
 * compares the result with zero (pcmpeqb).  The outcome is 0xFF in every
 * byte lane whose controlling mask bit is set and 0x00 elsewhere; it is then
 * used to choose between source and destination bytes.
 *
 * Each table byte therefore holds the single mask bit that governs its lane.
 * The least-significant byte of a constant (the lowest address once loaded
 * with movq on x86) belongs to the first pixel of a group of eight and
 * carries bit 0x80; later pixels use 0x40, 0x20, ... 0x01.
 *
 * _maskNN_k is the k-th consecutive 8-byte group for NN-bit pixels, so a
 * group of eight NN-bit pixels needs NN/8 tables. */

/* 8-bit pixels: one byte per pixel, one table covers all eight pixels. */
312 static unsigned long long _mask8_0
= 0x0102040810204080LL
;
/* 16-bit pixels: two bytes per pixel, _mask16_0 then _mask16_1. */
314 static unsigned long long _mask16_1
= 0x0101020204040808LL
;
315 static unsigned long long _mask16_0
= 0x1010202040408080LL
;
/* 24-bit pixels: three bytes per pixel, so pixels straddle table boundaries. */
317 static unsigned long long _mask24_2
= 0x0101010202020404LL
;
318 static unsigned long long _mask24_1
= 0x0408080810101020LL
;
319 static unsigned long long _mask24_0
= 0x2020404040808080LL
;
/* 32-bit pixels: four bytes per pixel, two pixels per table. */
321 static unsigned long long _mask32_3
= 0x0101010102020202LL
;
322 static unsigned long long _mask32_2
= 0x0404040408080808LL
;
323 static unsigned long long _mask32_1
= 0x1010101020202020LL
;
324 static unsigned long long _mask32_0
= 0x4040404080808080LL
;
/* 48-bit pixels: six bytes per pixel, same layout rule as above.
 * NOTE(review): the code that loads these is not in the visible part of the
 * file. */
326 static unsigned long long _mask48_5
= 0x0101010101010202LL
;
327 static unsigned long long _mask48_4
= 0x0202020204040404LL
;
328 static unsigned long long _mask48_3
= 0x0404080808080808LL
;
329 static unsigned long long _mask48_2
= 0x1010101010102020LL
;
330 static unsigned long long _mask48_1
= 0x2020202040404040LL
;
331 static unsigned long long _mask48_0
= 0x4040808080808080LL
;
/* _const4 keeps the low three bytes of a quadword and _const6 keeps the low
 * byte only.
 * NOTE(review): no use of either constant appears in the visible part of the
 * file; confirm the consumers before changing them. */
333 static unsigned long long _const4
= 0x0000000000FFFFFFLL
;
334 /* static unsigned long long _const5 = 0x000000FFFFFF0000LL; */ /* NOT USED */
335 static unsigned long long _const6
= 0x00000000000000FFLL
;
337 /* These are used in the row-filter routines and should/would be local */
338 /* variables if not for gcc addressing limitations. */
339 /* WARNING: Their presence probably defeats the thread safety of libpng. */
/* They are file-scope objects shared by every call, which is why they are
 * declared only when PNG_THREAD_UNSAFE_OK is defined.
 * NOTE(review): the closing #endif and any further declarations of this group
 * (the rename list above also mentions _pbtemp and _pctemp) are not present
 * in this listing. */
341 #ifdef PNG_THREAD_UNSAFE_OK
342 static png_uint_32 _FullLength
;
343 static png_uint_32 _MMXLength
;
345 static int _patemp
; /* temp variables for Paeth routine */
/* png_squelch_warnings: assigns each of the file-scope variables listed below
 * to itself.  The statements leave every stored value unchanged.  Presumably
 * they exist only so that the compiler sees a C-level use of each variable,
 * since the variables are otherwise referenced from inline-assembly strings
 * (compare the existing "defined but not used" note above their definitions).
 * Takes no arguments.
 * NOTE(review): the return-type line, the braces and some assignments are
 * missing from this listing; nothing visible here calls the function. */
351 png_squelch_warnings(void)
/* _MMXLength is touched only when PNG_THREAD_UNSAFE_OK is defined. */
353 #ifdef PNG_THREAD_UNSAFE_OK
358 _MMXLength
= _MMXLength
;
/* Self-assignments of the combine-row byte-lane tables. */
363 _mask16_1
= _mask16_1
;
364 _mask16_0
= _mask16_0
;
365 _mask24_2
= _mask24_2
;
366 _mask24_1
= _mask24_1
;
367 _mask24_0
= _mask24_0
;
368 _mask32_3
= _mask32_3
;
369 _mask32_2
= _mask32_2
;
370 _mask32_1
= _mask32_1
;
371 _mask32_0
= _mask32_0
;
372 _mask48_5
= _mask48_5
;
373 _mask48_4
= _mask48_4
;
374 _mask48_3
= _mask48_3
;
375 _mask48_2
= _mask48_2
;
376 _mask48_1
= _mask48_1
;
377 _mask48_0
= _mask48_0
;
379 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
/* File-scope MMX availability flag.  It starts at 2, and png_combine_row()
 * tests for exactly that value and, in the non-PNG_1_0_X build, warns that
 * "asm_flags may not have been initialized".  The changelog says
 * png_mmx_support() sets this variable automatically.
 * NOTE(review): the meanings of the other values are not shown in the visible
 * part of the file. */
382 static int _mmx_supported
= 2;
384 /*===========================================================================*/
386 /* P N G _ C O M B I N E _ R O W */
388 /*===========================================================================*/
390 #if defined(PNG_HAVE_ASSEMBLER_COMBINE_ROW)
/* Pixel sizes in bytes, used as named multipliers in png_combine_row(); for
 * example the 24-bit C fallback scales its offsets and strides by BPP3.
 * NOTE(review): BPP2 is used by the 16-bit case but its definition is not
 * present in this listing; other BPPn definitions may be missing too. */
393 #define BPP3 3 /* bytes per pixel (a.k.a. pixel_bytes) */
395 #define BPP6 6 /* (defined only to help avoid cut-and-paste errors) */
398 /* Combines the row recently read in with the previous row.
399 This routine takes care of alpha and transparency if requested.
400 This routine also handles the two methods of progressive display
401 of interlaced images, depending on the mask value.
402 The mask value describes which pixels are to be combined with
403 the row. The pattern always repeats every 8 pixels, so just 8
404 bits are needed. A one indicates the pixel is to be combined; a
405 zero indicates the pixel is to be skipped. This is in addition
406 to any alpha or transparency value associated with the pixel.
407 If you want all pixels to be combined, pass 0xff (255) in mask. */
409 /* Use this routine for the x86 platform - it uses a faster MMX routine
410 if the machine supports MMX. */
413 png_combine_row(png_structp png_ptr
, png_bytep row
, int mask
)
415 png_debug(1, "in png_combine_row (pnggccrd.c)\n");
417 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
418 if (_mmx_supported
== 2) {
419 #if !defined(PNG_1_0_X)
420 /* this should have happened in png_init_mmx_flags() already */
421 png_warning(png_ptr
, "asm_flags may not have been initialized");
429 png_debug(2,"mask == 0xff: doing single png_memcpy()\n");
430 png_memcpy(row
, png_ptr
->row_buf
+ 1,
431 (png_size_t
)PNG_ROWBYTES(png_ptr
->row_info
.pixel_depth
,png_ptr
->width
));
433 else /* (png_combine_row() is never called with mask == 0) */
435 switch (png_ptr
->row_info
.pixel_depth
)
437 case 1: /* png_ptr->row_info.pixel_depth */
441 int s_inc
, s_start
, s_end
;
446 sp
= png_ptr
->row_buf
+ 1;
449 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
450 if (png_ptr
->transformations
& PNG_PACKSWAP
)
466 for (i
= 0; i
< png_ptr
->width
; i
++)
472 value
= (*sp
>> shift
) & 0x1;
473 *dp
&= (png_byte
)((0x7f7f >> (7 - shift
)) & 0xff);
474 *dp
|= (png_byte
)(value
<< shift
);
494 case 2: /* png_ptr->row_info.pixel_depth */
498 int s_start
, s_end
, s_inc
;
504 sp
= png_ptr
->row_buf
+ 1;
507 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
508 if (png_ptr
->transformations
& PNG_PACKSWAP
)
524 for (i
= 0; i
< png_ptr
->width
; i
++)
528 value
= (*sp
>> shift
) & 0x3;
529 *dp
&= (png_byte
)((0x3f3f >> (6 - shift
)) & 0xff);
530 *dp
|= (png_byte
)(value
<< shift
);
549 case 4: /* png_ptr->row_info.pixel_depth */
553 int s_start
, s_end
, s_inc
;
559 sp
= png_ptr
->row_buf
+ 1;
562 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
563 if (png_ptr
->transformations
& PNG_PACKSWAP
)
578 for (i
= 0; i
< png_ptr
->width
; i
++)
582 value
= (*sp
>> shift
) & 0xf;
583 *dp
&= (png_byte
)((0xf0f >> (4 - shift
)) & 0xff);
584 *dp
|= (png_byte
)(value
<< shift
);
603 case 8: /* png_ptr->row_info.pixel_depth */
608 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
609 #if !defined(PNG_1_0_X)
610 if ((png_ptr
->asm_flags
& PNG_ASM_FLAG_MMX_READ_COMBINE_ROW
)
611 /* && _mmx_supported */ )
618 int dummy_value_a
; /* fix 'forbidden register spilled' error */
623 _unmask
= ~mask
; /* global variable for -fPIC version */
624 srcptr
= png_ptr
->row_buf
+ 1;
626 len
= png_ptr
->width
&~7; /* reduce to multiple of 8 */
627 diff
= (int) (png_ptr
->width
& 7); /* amount lost */
629 __asm__
__volatile__ (
630 "movd _unmask, %%mm7 \n\t" /* load bit pattern */
631 "psubb %%mm6, %%mm6 \n\t" /* zero mm6 */
632 "punpcklbw %%mm7, %%mm7 \n\t"
633 "punpcklwd %%mm7, %%mm7 \n\t"
634 "punpckldq %%mm7, %%mm7 \n\t" /* fill reg with 8 masks */
636 "movq _mask8_0, %%mm0 \n\t"
637 "pand %%mm7, %%mm0 \n\t" /* nonzero if keep byte */
638 "pcmpeqb %%mm6, %%mm0 \n\t" /* zeros->1s, v versa */
640 /* preload "movl len, %%ecx \n\t" // load length of line */
641 /* preload "movl srcptr, %%esi \n\t" // load source */
642 /* preload "movl dstptr, %%edi \n\t" // load dest */
644 "cmpl $0, %%ecx \n\t" /* len == 0 ? */
645 "je mainloop8end \n\t"
648 "movq (%%esi), %%mm4 \n\t" /* *srcptr */
649 "pand %%mm0, %%mm4 \n\t"
650 "movq %%mm0, %%mm6 \n\t"
651 "pandn (%%edi), %%mm6 \n\t" /* *dstptr */
652 "por %%mm6, %%mm4 \n\t"
653 "movq %%mm4, (%%edi) \n\t"
654 "addl $8, %%esi \n\t" /* inc by 8 bytes processed */
655 "addl $8, %%edi \n\t"
656 "subl $8, %%ecx \n\t" /* dec by 8 pixels processed */
660 /* preload "movl diff, %%ecx \n\t" // (diff is in eax) */
661 "movl %%eax, %%ecx \n\t"
662 "cmpl $0, %%ecx \n\t"
664 /* preload "movl mask, %%edx \n\t" */
665 "sall $24, %%edx \n\t" /* make low byte, high byte */
668 "sall %%edx \n\t" /* move high bit to CF */
669 "jnc skip8 \n\t" /* if CF = 0 */
670 "movb (%%esi), %%al \n\t"
671 "movb %%al, (%%edi) \n\t"
677 "jnz secondloop8 \n\t"
680 "EMMS \n\t" /* DONE */
682 : "=a" (dummy_value_a
), /* output regs (dummy) */
683 "=d" (dummy_value_d
),
684 "=c" (dummy_value_c
),
685 "=S" (dummy_value_S
),
688 : "3" (srcptr
), /* esi // input regs */
689 "4" (dstptr
), /* edi */
690 "0" (diff
), /* eax */
691 /* was (unmask) "b" RESERVED // ebx // Global Offset Table idx */
695 #if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
696 : "%mm0", "%mm4", "%mm6", "%mm7" /* clobber list */
700 else /* mmx _not supported - Use modified C routine */
701 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
703 register png_uint_32 i
;
704 png_uint_32 initial_val
= png_pass_start
[png_ptr
->pass
];
705 /* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
706 register int stride
= png_pass_inc
[png_ptr
->pass
];
707 /* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
708 register int rep_bytes
= png_pass_width
[png_ptr
->pass
];
709 /* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
710 png_uint_32 len
= png_ptr
->width
&~7; /* reduce to mult. of 8 */
711 int diff
= (int) (png_ptr
->width
& 7); /* amount lost */
712 register png_uint_32 final_val
= len
; /* GRR bugfix */
714 srcptr
= png_ptr
->row_buf
+ 1 + initial_val
;
715 dstptr
= row
+ initial_val
;
717 for (i
= initial_val
; i
< final_val
; i
+= stride
)
719 png_memcpy(dstptr
, srcptr
, rep_bytes
);
723 if (diff
) /* number of leftover pixels: 3 for pngtest */
725 final_val
+=diff
/* *BPP1 */ ;
726 for (; i
< final_val
; i
+= stride
)
728 if (rep_bytes
> (int)(final_val
-i
))
729 rep_bytes
= (int)(final_val
-i
);
730 png_memcpy(dstptr
, srcptr
, rep_bytes
);
736 } /* end of else (_mmx_supported) */
741 case 16: /* png_ptr->row_info.pixel_depth */
746 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
747 #if !defined(PNG_1_0_X)
748 if ((png_ptr
->asm_flags
& PNG_ASM_FLAG_MMX_READ_COMBINE_ROW
)
749 /* && _mmx_supported */ )
756 int dummy_value_a
; /* fix 'forbidden register spilled' error */
761 _unmask
= ~mask
; /* global variable for -fPIC version */
762 srcptr
= png_ptr
->row_buf
+ 1;
764 len
= png_ptr
->width
&~7; /* reduce to multiple of 8 */
765 diff
= (int) (png_ptr
->width
& 7); /* amount lost // */
767 __asm__
__volatile__ (
768 "movd _unmask, %%mm7 \n\t" /* load bit pattern */
769 "psubb %%mm6, %%mm6 \n\t" /* zero mm6 */
770 "punpcklbw %%mm7, %%mm7 \n\t"
771 "punpcklwd %%mm7, %%mm7 \n\t"
772 "punpckldq %%mm7, %%mm7 \n\t" /* fill reg with 8 masks */
774 "movq _mask16_0, %%mm0 \n\t"
775 "movq _mask16_1, %%mm1 \n\t"
777 "pand %%mm7, %%mm0 \n\t"
778 "pand %%mm7, %%mm1 \n\t"
780 "pcmpeqb %%mm6, %%mm0 \n\t"
781 "pcmpeqb %%mm6, %%mm1 \n\t"
783 /* preload "movl len, %%ecx \n\t" // load length of line */
784 /* preload "movl srcptr, %%esi \n\t" // load source */
785 /* preload "movl dstptr, %%edi \n\t" // load dest */
787 "cmpl $0, %%ecx \n\t"
788 "jz mainloop16end \n\t"
791 "movq (%%esi), %%mm4 \n\t"
792 "pand %%mm0, %%mm4 \n\t"
793 "movq %%mm0, %%mm6 \n\t"
794 "movq (%%edi), %%mm7 \n\t"
795 "pandn %%mm7, %%mm6 \n\t"
796 "por %%mm6, %%mm4 \n\t"
797 "movq %%mm4, (%%edi) \n\t"
799 "movq 8(%%esi), %%mm5 \n\t"
800 "pand %%mm1, %%mm5 \n\t"
801 "movq %%mm1, %%mm7 \n\t"
802 "movq 8(%%edi), %%mm6 \n\t"
803 "pandn %%mm6, %%mm7 \n\t"
804 "por %%mm7, %%mm5 \n\t"
805 "movq %%mm5, 8(%%edi) \n\t"
807 "addl $16, %%esi \n\t" /* inc by 16 bytes processed */
808 "addl $16, %%edi \n\t"
809 "subl $8, %%ecx \n\t" /* dec by 8 pixels processed */
812 "mainloop16end: \n\t"
813 /* preload "movl diff, %%ecx \n\t" // (diff is in eax) */
814 "movl %%eax, %%ecx \n\t"
815 "cmpl $0, %%ecx \n\t"
817 /* preload "movl mask, %%edx \n\t" */
818 "sall $24, %%edx \n\t" /* make low byte, high byte */
821 "sall %%edx \n\t" /* move high bit to CF */
822 "jnc skip16 \n\t" /* if CF = 0 */
823 "movw (%%esi), %%ax \n\t"
824 "movw %%ax, (%%edi) \n\t"
827 "addl $2, %%esi \n\t"
828 "addl $2, %%edi \n\t"
830 "jnz secondloop16 \n\t"
833 "EMMS \n\t" /* DONE */
835 : "=a" (dummy_value_a
), /* output regs (dummy) */
836 "=c" (dummy_value_c
),
837 "=d" (dummy_value_d
),
838 "=S" (dummy_value_S
),
841 : "0" (diff
), /* eax // input regs */
842 /* was (unmask) " " RESERVED // ebx // Global Offset Table idx */
844 "2" (mask
), /* edx */
845 "3" (srcptr
), /* esi */
846 "4" (dstptr
) /* edi */
848 #if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
849 : "%mm0", "%mm1", "%mm4" /* clobber list */
850 , "%mm5", "%mm6", "%mm7"
854 else /* mmx _not supported - Use modified C routine */
855 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
857 register png_uint_32 i
;
858 png_uint_32 initial_val
= BPP2
* png_pass_start
[png_ptr
->pass
];
859 /* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
860 register int stride
= BPP2
* png_pass_inc
[png_ptr
->pass
];
861 /* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
862 register int rep_bytes
= BPP2
* png_pass_width
[png_ptr
->pass
];
863 /* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
864 png_uint_32 len
= png_ptr
->width
&~7; /* reduce to mult. of 8 */
865 int diff
= (int) (png_ptr
->width
& 7); /* amount lost */
866 register png_uint_32 final_val
= BPP2
* len
; /* GRR bugfix */
868 srcptr
= png_ptr
->row_buf
+ 1 + initial_val
;
869 dstptr
= row
+ initial_val
;
871 for (i
= initial_val
; i
< final_val
; i
+= stride
)
873 png_memcpy(dstptr
, srcptr
, rep_bytes
);
877 if (diff
) /* number of leftover pixels: 3 for pngtest */
879 final_val
+=diff
*BPP2
;
880 for (; i
< final_val
; i
+= stride
)
882 if (rep_bytes
> (int)(final_val
-i
))
883 rep_bytes
= (int)(final_val
-i
);
884 png_memcpy(dstptr
, srcptr
, rep_bytes
);
889 } /* end of else (_mmx_supported) */
894 case 24: /* png_ptr->row_info.pixel_depth */
899 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
900 #if !defined(PNG_1_0_X)
901 if ((png_ptr
->asm_flags
& PNG_ASM_FLAG_MMX_READ_COMBINE_ROW
)
902 /* && _mmx_supported */ )
909 int dummy_value_a
; /* fix 'forbidden register spilled' error */
914 _unmask
= ~mask
; /* global variable for -fPIC version */
915 srcptr
= png_ptr
->row_buf
+ 1;
917 len
= png_ptr
->width
&~7; /* reduce to multiple of 8 */
918 diff
= (int) (png_ptr
->width
& 7); /* amount lost // */
920 __asm__
__volatile__ (
921 "movd _unmask, %%mm7 \n\t" /* load bit pattern */
922 "psubb %%mm6, %%mm6 \n\t" /* zero mm6 */
923 "punpcklbw %%mm7, %%mm7 \n\t"
924 "punpcklwd %%mm7, %%mm7 \n\t"
925 "punpckldq %%mm7, %%mm7 \n\t" /* fill reg with 8 masks */
927 "movq _mask24_0, %%mm0 \n\t"
928 "movq _mask24_1, %%mm1 \n\t"
929 "movq _mask24_2, %%mm2 \n\t"
931 "pand %%mm7, %%mm0 \n\t"
932 "pand %%mm7, %%mm1 \n\t"
933 "pand %%mm7, %%mm2 \n\t"
935 "pcmpeqb %%mm6, %%mm0 \n\t"
936 "pcmpeqb %%mm6, %%mm1 \n\t"
937 "pcmpeqb %%mm6, %%mm2 \n\t"
939 /* preload "movl len, %%ecx \n\t" // load length of line */
940 /* preload "movl srcptr, %%esi \n\t" // load source */
941 /* preload "movl dstptr, %%edi \n\t" // load dest */
943 "cmpl $0, %%ecx \n\t"
944 "jz mainloop24end \n\t"
947 "movq (%%esi), %%mm4 \n\t"
948 "pand %%mm0, %%mm4 \n\t"
949 "movq %%mm0, %%mm6 \n\t"
950 "movq (%%edi), %%mm7 \n\t"
951 "pandn %%mm7, %%mm6 \n\t"
952 "por %%mm6, %%mm4 \n\t"
953 "movq %%mm4, (%%edi) \n\t"
955 "movq 8(%%esi), %%mm5 \n\t"
956 "pand %%mm1, %%mm5 \n\t"
957 "movq %%mm1, %%mm7 \n\t"
958 "movq 8(%%edi), %%mm6 \n\t"
959 "pandn %%mm6, %%mm7 \n\t"
960 "por %%mm7, %%mm5 \n\t"
961 "movq %%mm5, 8(%%edi) \n\t"
963 "movq 16(%%esi), %%mm6 \n\t"
964 "pand %%mm2, %%mm6 \n\t"
965 "movq %%mm2, %%mm4 \n\t"
966 "movq 16(%%edi), %%mm7 \n\t"
967 "pandn %%mm7, %%mm4 \n\t"
968 "por %%mm4, %%mm6 \n\t"
969 "movq %%mm6, 16(%%edi) \n\t"
971 "addl $24, %%esi \n\t" /* inc by 24 bytes processed */
972 "addl $24, %%edi \n\t"
973 "subl $8, %%ecx \n\t" /* dec by 8 pixels processed */
977 "mainloop24end: \n\t"
978 /* preload "movl diff, %%ecx \n\t" // (diff is in eax) */
979 "movl %%eax, %%ecx \n\t"
980 "cmpl $0, %%ecx \n\t"
982 /* preload "movl mask, %%edx \n\t" */
983 "sall $24, %%edx \n\t" /* make low byte, high byte */
986 "sall %%edx \n\t" /* move high bit to CF */
987 "jnc skip24 \n\t" /* if CF = 0 */
988 "movw (%%esi), %%ax \n\t"
989 "movw %%ax, (%%edi) \n\t"
990 "xorl %%eax, %%eax \n\t"
991 "movb 2(%%esi), %%al \n\t"
992 "movb %%al, 2(%%edi) \n\t"
995 "addl $3, %%esi \n\t"
996 "addl $3, %%edi \n\t"
998 "jnz secondloop24 \n\t"
1001 "EMMS \n\t" /* DONE */
1003 : "=a" (dummy_value_a
), /* output regs (dummy) */
1004 "=d" (dummy_value_d
),
1005 "=c" (dummy_value_c
),
1006 "=S" (dummy_value_S
),
1007 "=D" (dummy_value_D
)
1009 : "3" (srcptr
), /* esi // input regs */
1010 "4" (dstptr
), /* edi */
1011 "0" (diff
), /* eax */
1012 /* was (unmask) "b" RESERVED // ebx // Global Offset Table idx */
1013 "2" (len
), /* ecx */
1014 "1" (mask
) /* edx */
1016 #if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
1017 : "%mm0", "%mm1", "%mm2" /* clobber list */
1018 , "%mm4", "%mm5", "%mm6", "%mm7"
1022 else /* mmx _not supported - Use modified C routine */
1023 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
1025 register png_uint_32 i
;
1026 png_uint_32 initial_val
= BPP3
* png_pass_start
[png_ptr
->pass
];
1027 /* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
1028 register int stride
= BPP3
* png_pass_inc
[png_ptr
->pass
];
1029 /* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
1030 register int rep_bytes
= BPP3
* png_pass_width
[png_ptr
->pass
];
1031 /* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
1032 png_uint_32 len
= png_ptr
->width
&~7; /* reduce to mult. of 8 */
1033 int diff
= (int) (png_ptr
->width
& 7); /* amount lost */
1034 register png_uint_32 final_val
= BPP3
* len
; /* GRR bugfix */
1036 srcptr
= png_ptr
->row_buf
+ 1 + initial_val
;
1037 dstptr
= row
+ initial_val
;
1039 for (i
= initial_val
; i
< final_val
; i
+= stride
)
1041 png_memcpy(dstptr
, srcptr
, rep_bytes
);
1045 if (diff
) /* number of leftover pixels: 3 for pngtest */
1047 final_val
+=diff
*BPP3
;
1048 for (; i
< final_val
; i
+= stride
)
1050 if (rep_bytes
> (int)(final_val
-i
))
1051 rep_bytes
= (int)(final_val
-i
);
1052 png_memcpy(dstptr
, srcptr
, rep_bytes
);
1057 } /* end of else (_mmx_supported) */
1062 case 32: /* png_ptr->row_info.pixel_depth */
1067 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
1068 #if !defined(PNG_1_0_X)
1069 if ((png_ptr
->asm_flags
& PNG_ASM_FLAG_MMX_READ_COMBINE_ROW
)
1070 /* && _mmx_supported */ )
1077 int dummy_value_a
; /* fix 'forbidden register spilled' error */
1082 _unmask
= ~mask
; /* global variable for -fPIC version */
1083 srcptr
= png_ptr
->row_buf
+ 1;
1085 len
= png_ptr
->width
&~7; /* reduce to multiple of 8 */
1086 diff
= (int) (png_ptr
->width
& 7); /* amount lost // */
1088 __asm__
__volatile__ (
1089 "movd _unmask, %%mm7 \n\t" /* load bit pattern */
1090 "psubb %%mm6, %%mm6 \n\t" /* zero mm6 */
1091 "punpcklbw %%mm7, %%mm7 \n\t"
1092 "punpcklwd %%mm7, %%mm7 \n\t"
1093 "punpckldq %%mm7, %%mm7 \n\t" /* fill reg with 8 masks */
1095 "movq _mask32_0, %%mm0 \n\t"
1096 "movq _mask32_1, %%mm1 \n\t"
1097 "movq _mask32_2, %%mm2 \n\t"
1098 "movq _mask32_3, %%mm3 \n\t"
1100 "pand %%mm7, %%mm0 \n\t"
1101 "pand %%mm7, %%mm1 \n\t"
1102 "pand %%mm7, %%mm2 \n\t"
1103 "pand %%mm7, %%mm3 \n\t"
1105 "pcmpeqb %%mm6, %%mm0 \n\t"
1106 "pcmpeqb %%mm6, %%mm1 \n\t"
1107 "pcmpeqb %%mm6, %%mm2 \n\t"
1108 "pcmpeqb %%mm6, %%mm3 \n\t"
1110 /* preload "movl len, %%ecx \n\t" // load length of line */
1111 /* preload "movl srcptr, %%esi \n\t" // load source */
1112 /* preload "movl dstptr, %%edi \n\t" // load dest */
1114 "cmpl $0, %%ecx \n\t" /* lcr */
1115 "jz mainloop32end \n\t"
1118 "movq (%%esi), %%mm4 \n\t"
1119 "pand %%mm0, %%mm4 \n\t"
1120 "movq %%mm0, %%mm6 \n\t"
1121 "movq (%%edi), %%mm7 \n\t"
1122 "pandn %%mm7, %%mm6 \n\t"
1123 "por %%mm6, %%mm4 \n\t"
1124 "movq %%mm4, (%%edi) \n\t"
1126 "movq 8(%%esi), %%mm5 \n\t"
1127 "pand %%mm1, %%mm5 \n\t"
1128 "movq %%mm1, %%mm7 \n\t"
1129 "movq 8(%%edi), %%mm6 \n\t"
1130 "pandn %%mm6, %%mm7 \n\t"
1131 "por %%mm7, %%mm5 \n\t"
1132 "movq %%mm5, 8(%%edi) \n\t"
1134 "movq 16(%%esi), %%mm6 \n\t"
1135 "pand %%mm2, %%mm6 \n\t"
1136 "movq %%mm2, %%mm4 \n\t"
1137 "movq 16(%%edi), %%mm7 \n\t"
1138 "pandn %%mm7, %%mm4 \n\t"
1139 "por %%mm4, %%mm6 \n\t"
1140 "movq %%mm6, 16(%%edi) \n\t"
1142 "movq 24(%%esi), %%mm7 \n\t"
1143 "pand %%mm3, %%mm7 \n\t"
1144 "movq %%mm3, %%mm5 \n\t"
1145 "movq 24(%%edi), %%mm4 \n\t"
1146 "pandn %%mm4, %%mm5 \n\t"
1147 "por %%mm5, %%mm7 \n\t"
1148 "movq %%mm7, 24(%%edi) \n\t"
1150 "addl $32, %%esi \n\t" /* inc by 32 bytes processed */
1151 "addl $32, %%edi \n\t"
1152 "subl $8, %%ecx \n\t" /* dec by 8 pixels processed */
1153 "ja mainloop32 \n\t"
1155 "mainloop32end: \n\t"
1156 /* preload "movl diff, %%ecx \n\t" // (diff is in eax) */
1157 "movl %%eax, %%ecx \n\t"
1158 "cmpl $0, %%ecx \n\t"
1160 /* preload "movl mask, %%edx \n\t" */
1161 "sall $24, %%edx \n\t" /* low byte => high byte */
1163 "secondloop32: \n\t"
1164 "sall %%edx \n\t" /* move high bit to CF */
1165 "jnc skip32 \n\t" /* if CF = 0 */
1166 "movl (%%esi), %%eax \n\t"
1167 "movl %%eax, (%%edi) \n\t"
1170 "addl $4, %%esi \n\t"
1171 "addl $4, %%edi \n\t"
1173 "jnz secondloop32 \n\t"
1176 "EMMS \n\t" /* DONE */
1178 : "=a" (dummy_value_a
), /* output regs (dummy) */
1179 "=d" (dummy_value_d
),
1180 "=c" (dummy_value_c
),
1181 "=S" (dummy_value_S
),
1182 "=D" (dummy_value_D
)
1184 : "3" (srcptr
), /* esi // input regs */
1185 "4" (dstptr
), /* edi */
1186 "0" (diff
), /* eax */
1187 /* was (unmask) "b" RESERVED // ebx // Global Offset Table idx */
1188 "2" (len
), /* ecx */
1189 "1" (mask
) /* edx */
1191 #if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
1192 : "%mm0", "%mm1", "%mm2", "%mm3" /* clobber list */
1193 , "%mm4", "%mm5", "%mm6", "%mm7"
1197 else /* mmx _not supported - Use modified C routine */
1198 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
1200 register png_uint_32 i
;
1201 png_uint_32 initial_val
= BPP4
* png_pass_start
[png_ptr
->pass
];
1202 /* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
1203 register int stride
= BPP4
* png_pass_inc
[png_ptr
->pass
];
1204 /* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
1205 register int rep_bytes
= BPP4
* png_pass_width
[png_ptr
->pass
];
1206 /* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
1207 png_uint_32 len
= png_ptr
->width
&~7; /* reduce to mult. of 8 */
1208 int diff
= (int) (png_ptr
->width
& 7); /* amount lost */
1209 register png_uint_32 final_val
= BPP4
* len
; /* GRR bugfix */
1211 srcptr
= png_ptr
->row_buf
+ 1 + initial_val
;
1212 dstptr
= row
+ initial_val
;
1214 for (i
= initial_val
; i
< final_val
; i
+= stride
)
1216 png_memcpy(dstptr
, srcptr
, rep_bytes
);
1220 if (diff
) /* number of leftover pixels: 3 for pngtest */
1222 final_val
+=diff
*BPP4
;
1223 for (; i
< final_val
; i
+= stride
)
1225 if (rep_bytes
> (int)(final_val
-i
))
1226 rep_bytes
= (int)(final_val
-i
);
1227 png_memcpy(dstptr
, srcptr
, rep_bytes
);
1232 } /* end of else (_mmx_supported) */
1237 case 48: /* png_ptr->row_info.pixel_depth */
1242 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
1243 #if !defined(PNG_1_0_X)
1244 if ((png_ptr
->asm_flags
& PNG_ASM_FLAG_MMX_READ_COMBINE_ROW
)
1245 /* && _mmx_supported */ )
1252 int dummy_value_a
; /* fix 'forbidden register spilled' error */
1257 _unmask
= ~mask
; /* global variable for -fPIC version */
1258 srcptr
= png_ptr
->row_buf
+ 1;
1260 len
= png_ptr
->width
&~7; /* reduce to multiple of 8 */
1261 diff
= (int) (png_ptr
->width
& 7); /* amount lost // */
1263 __asm__
__volatile__ (
1264 "movd _unmask, %%mm7 \n\t" /* load bit pattern */
1265 "psubb %%mm6, %%mm6 \n\t" /* zero mm6 */
1266 "punpcklbw %%mm7, %%mm7 \n\t"
1267 "punpcklwd %%mm7, %%mm7 \n\t"
1268 "punpckldq %%mm7, %%mm7 \n\t" /* fill reg with 8 masks */
1270 "movq _mask48_0, %%mm0 \n\t"
1271 "movq _mask48_1, %%mm1 \n\t"
1272 "movq _mask48_2, %%mm2 \n\t"
1273 "movq _mask48_3, %%mm3 \n\t"
1274 "movq _mask48_4, %%mm4 \n\t"
1275 "movq _mask48_5, %%mm5 \n\t"
1277 "pand %%mm7, %%mm0 \n\t"
1278 "pand %%mm7, %%mm1 \n\t"
1279 "pand %%mm7, %%mm2 \n\t"
1280 "pand %%mm7, %%mm3 \n\t"
1281 "pand %%mm7, %%mm4 \n\t"
1282 "pand %%mm7, %%mm5 \n\t"
1284 "pcmpeqb %%mm6, %%mm0 \n\t"
1285 "pcmpeqb %%mm6, %%mm1 \n\t"
1286 "pcmpeqb %%mm6, %%mm2 \n\t"
1287 "pcmpeqb %%mm6, %%mm3 \n\t"
1288 "pcmpeqb %%mm6, %%mm4 \n\t"
1289 "pcmpeqb %%mm6, %%mm5 \n\t"
1291 /* preload "movl len, %%ecx \n\t" // load length of line */
1292 /* preload "movl srcptr, %%esi \n\t" // load source */
1293 /* preload "movl dstptr, %%edi \n\t" // load dest */
1295 "cmpl $0, %%ecx \n\t"
1296 "jz mainloop48end \n\t"
1299 "movq (%%esi), %%mm7 \n\t"
1300 "pand %%mm0, %%mm7 \n\t"
1301 "movq %%mm0, %%mm6 \n\t"
1302 "pandn (%%edi), %%mm6 \n\t"
1303 "por %%mm6, %%mm7 \n\t"
1304 "movq %%mm7, (%%edi) \n\t"
1306 "movq 8(%%esi), %%mm6 \n\t"
1307 "pand %%mm1, %%mm6 \n\t"
1308 "movq %%mm1, %%mm7 \n\t"
1309 "pandn 8(%%edi), %%mm7 \n\t"
1310 "por %%mm7, %%mm6 \n\t"
1311 "movq %%mm6, 8(%%edi) \n\t"
1313 "movq 16(%%esi), %%mm6 \n\t"
1314 "pand %%mm2, %%mm6 \n\t"
1315 "movq %%mm2, %%mm7 \n\t"
1316 "pandn 16(%%edi), %%mm7 \n\t"
1317 "por %%mm7, %%mm6 \n\t"
1318 "movq %%mm6, 16(%%edi) \n\t"
1320 "movq 24(%%esi), %%mm7 \n\t"
1321 "pand %%mm3, %%mm7 \n\t"
1322 "movq %%mm3, %%mm6 \n\t"
1323 "pandn 24(%%edi), %%mm6 \n\t"
1324 "por %%mm6, %%mm7 \n\t"
1325 "movq %%mm7, 24(%%edi) \n\t"
1327 "movq 32(%%esi), %%mm6 \n\t"
1328 "pand %%mm4, %%mm6 \n\t"
1329 "movq %%mm4, %%mm7 \n\t"
1330 "pandn 32(%%edi), %%mm7 \n\t"
1331 "por %%mm7, %%mm6 \n\t"
1332 "movq %%mm6, 32(%%edi) \n\t"
1334 "movq 40(%%esi), %%mm7 \n\t"
1335 "pand %%mm5, %%mm7 \n\t"
1336 "movq %%mm5, %%mm6 \n\t"
1337 "pandn 40(%%edi), %%mm6 \n\t"
1338 "por %%mm6, %%mm7 \n\t"
1339 "movq %%mm7, 40(%%edi) \n\t"
1341 "addl $48, %%esi \n\t" /* inc by 48 bytes processed */
1342 "addl $48, %%edi \n\t"
1343 "subl $8, %%ecx \n\t" /* dec by 8 pixels processed */
1345 "ja mainloop48 \n\t"
1347 "mainloop48end: \n\t"
1348 /* preload "movl diff, %%ecx \n\t" // (diff is in eax) */
1349 "movl %%eax, %%ecx \n\t"
1350 "cmpl $0, %%ecx \n\t"
1352 /* preload "movl mask, %%edx \n\t" */
1353 "sall $24, %%edx \n\t" /* make low byte, high byte */
1355 "secondloop48: \n\t"
1356 "sall %%edx \n\t" /* move high bit to CF */
1357 "jnc skip48 \n\t" /* if CF = 0 */
1358 "movl (%%esi), %%eax \n\t"
1359 "movl %%eax, (%%edi) \n\t"
1362 "addl $4, %%esi \n\t"
1363 "addl $4, %%edi \n\t"
1365 "jnz secondloop48 \n\t"
1368 "EMMS \n\t" /* DONE */
1370 : "=a" (dummy_value_a
), /* output regs (dummy) */
1371 "=d" (dummy_value_d
),
1372 "=c" (dummy_value_c
),
1373 "=S" (dummy_value_S
),
1374 "=D" (dummy_value_D
)
1376 : "3" (srcptr
), /* esi // input regs */
1377 "4" (dstptr
), /* edi */
1378 "0" (diff
), /* eax */
1379 /* was (unmask) "b" RESERVED // ebx // Global Offset Table idx */
1380 "2" (len
), /* ecx */
1381 "1" (mask
) /* edx */
1383 #if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
1384 : "%mm0", "%mm1", "%mm2", "%mm3" /* clobber list */
1385 , "%mm4", "%mm5", "%mm6", "%mm7"
1389 else /* mmx _not supported - Use modified C routine */
1390 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
1392 register png_uint_32 i
;
1393 png_uint_32 initial_val
= BPP6
* png_pass_start
[png_ptr
->pass
];
1394 /* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
1395 register int stride
= BPP6
* png_pass_inc
[png_ptr
->pass
];
1396 /* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
1397 register int rep_bytes
= BPP6
* png_pass_width
[png_ptr
->pass
];
1398 /* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
1399 png_uint_32 len
= png_ptr
->width
&~7; /* reduce to mult. of 8 */
1400 int diff
= (int) (png_ptr
->width
& 7); /* amount lost */
1401 register png_uint_32 final_val
= BPP6
* len
; /* GRR bugfix */
1403 srcptr
= png_ptr
->row_buf
+ 1 + initial_val
;
1404 dstptr
= row
+ initial_val
;
1406 for (i
= initial_val
; i
< final_val
; i
+= stride
)
1408 png_memcpy(dstptr
, srcptr
, rep_bytes
);
1412 if (diff
) /* number of leftover pixels: 3 for pngtest */
1414 final_val
+=diff
*BPP6
;
1415 for (; i
< final_val
; i
+= stride
)
1417 if (rep_bytes
> (int)(final_val
-i
))
1418 rep_bytes
= (int)(final_val
-i
);
1419 png_memcpy(dstptr
, srcptr
, rep_bytes
);
1424 } /* end of else (_mmx_supported) */
1429 case 64: /* png_ptr->row_info.pixel_depth */
1433 register png_uint_32 i
;
1434 png_uint_32 initial_val
= BPP8
* png_pass_start
[png_ptr
->pass
];
1435 /* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
1436 register int stride
= BPP8
* png_pass_inc
[png_ptr
->pass
];
1437 /* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
1438 register int rep_bytes
= BPP8
* png_pass_width
[png_ptr
->pass
];
1439 /* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
1440 png_uint_32 len
= png_ptr
->width
&~7; /* reduce to mult. of 8 */
1441 int diff
= (int) (png_ptr
->width
& 7); /* amount lost */
1442 register png_uint_32 final_val
= BPP8
* len
; /* GRR bugfix */
1444 srcptr
= png_ptr
->row_buf
+ 1 + initial_val
;
1445 dstptr
= row
+ initial_val
;
1447 for (i
= initial_val
; i
< final_val
; i
+= stride
)
1449 png_memcpy(dstptr
, srcptr
, rep_bytes
);
1453 if (diff
) /* number of leftover pixels: 3 for pngtest */
1455 final_val
+=diff
*BPP8
;
1456 for (; i
< final_val
; i
+= stride
)
1458 if (rep_bytes
> (int)(final_val
-i
))
1459 rep_bytes
= (int)(final_val
-i
);
1460 png_memcpy(dstptr
, srcptr
, rep_bytes
);
1469 default: /* png_ptr->row_info.pixel_depth != 1,2,4,8,16,24,32,48,64 */
1471 /* this should never happen */
1472 png_warning(png_ptr
, "Invalid row_info.pixel_depth in pnggccrd");
1475 } /* end switch (png_ptr->row_info.pixel_depth) */
1477 } /* end if (non-trivial mask) */
1479 } /* end png_combine_row() */
1481 #endif /* PNG_HAVE_ASSEMBLER_COMBINE_ROW */
1486 /*===========================================================================*/
1488 /* P N G _ D O _ R E A D _ I N T E R L A C E */
1490 /*===========================================================================*/
1492 #if defined(PNG_READ_INTERLACING_SUPPORTED)
1493 #if defined(PNG_HAVE_ASSEMBLER_READ_INTERLACE)
1495 /* png_do_read_interlace() is called after any 16-bit to 8-bit conversion
1496 * has taken place. [GRR: what other steps come before and/or after?]
1500 png_do_read_interlace(png_structp png_ptr
)
1502 png_row_infop row_info
= &(png_ptr
->row_info
);
1503 png_bytep row
= png_ptr
->row_buf
+ 1;
1504 int pass
= png_ptr
->pass
;
1505 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
1506 png_uint_32 transformations
= png_ptr
->transformations
;
1509 png_debug(1, "in png_do_read_interlace (pnggccrd.c)\n");
1511 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
1512 if (_mmx_supported
== 2) {
1513 #if !defined(PNG_1_0_X)
1514 /* this should have happened in png_init_mmx_flags() already */
1515 png_warning(png_ptr
, "asm_flags may not have been initialized");
1521 if (row
!= NULL
&& row_info
!= NULL
)
1523 png_uint_32 final_width
;
1525 final_width
= row_info
->width
* png_pass_inc
[pass
];
1527 switch (row_info
->pixel_depth
)
1533 int s_start
, s_end
, s_inc
;
1538 sp
= row
+ (png_size_t
)((row_info
->width
- 1) >> 3);
1539 dp
= row
+ (png_size_t
)((final_width
- 1) >> 3);
1540 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
1541 if (transformations
& PNG_PACKSWAP
)
1543 sshift
= (int)((row_info
->width
+ 7) & 7);
1544 dshift
= (int)((final_width
+ 7) & 7);
1552 sshift
= 7 - (int)((row_info
->width
+ 7) & 7);
1553 dshift
= 7 - (int)((final_width
+ 7) & 7);
1559 for (i
= row_info
->width
; i
; i
--)
1561 v
= (png_byte
)((*sp
>> sshift
) & 0x1);
1562 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
1564 *dp
&= (png_byte
)((0x7f7f >> (7 - dshift
)) & 0xff);
1565 *dp
|= (png_byte
)(v
<< dshift
);
1566 if (dshift
== s_end
)
1574 if (sshift
== s_end
)
1589 int s_start
, s_end
, s_inc
;
1592 sp
= row
+ (png_size_t
)((row_info
->width
- 1) >> 2);
1593 dp
= row
+ (png_size_t
)((final_width
- 1) >> 2);
1594 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
1595 if (transformations
& PNG_PACKSWAP
)
1597 sshift
= (png_size_t
)(((row_info
->width
+ 3) & 3) << 1);
1598 dshift
= (png_size_t
)(((final_width
+ 3) & 3) << 1);
1606 sshift
= (png_size_t
)((3 - ((row_info
->width
+ 3) & 3)) << 1);
1607 dshift
= (png_size_t
)((3 - ((final_width
+ 3) & 3)) << 1);
1613 for (i
= row_info
->width
; i
; i
--)
1618 v
= (png_byte
)((*sp
>> sshift
) & 0x3);
1619 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
1621 *dp
&= (png_byte
)((0x3f3f >> (6 - dshift
)) & 0xff);
1622 *dp
|= (png_byte
)(v
<< dshift
);
1623 if (dshift
== s_end
)
1631 if (sshift
== s_end
)
1646 int s_start
, s_end
, s_inc
;
1649 sp
= row
+ (png_size_t
)((row_info
->width
- 1) >> 1);
1650 dp
= row
+ (png_size_t
)((final_width
- 1) >> 1);
1651 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
1652 if (transformations
& PNG_PACKSWAP
)
1654 sshift
= (png_size_t
)(((row_info
->width
+ 1) & 1) << 2);
1655 dshift
= (png_size_t
)(((final_width
+ 1) & 1) << 2);
1663 sshift
= (png_size_t
)((1 - ((row_info
->width
+ 1) & 1)) << 2);
1664 dshift
= (png_size_t
)((1 - ((final_width
+ 1) & 1)) << 2);
1670 for (i
= row_info
->width
; i
; i
--)
1675 v
= (png_byte
)((*sp
>> sshift
) & 0xf);
1676 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
1678 *dp
&= (png_byte
)((0xf0f >> (4 - dshift
)) & 0xff);
1679 *dp
|= (png_byte
)(v
<< dshift
);
1680 if (dshift
== s_end
)
1688 if (sshift
== s_end
)
1699 /*====================================================================*/
1701 default: /* 8-bit or larger (this is where the routine is modified) */
1704 /* static unsigned long long _const4 = 0x0000000000FFFFFFLL; no good */
1705 /* static unsigned long long const4 = 0x0000000000FFFFFFLL; no good */
1706 /* unsigned long long _const4 = 0x0000000000FFFFFFLL; no good */
1707 /* unsigned long long const4 = 0x0000000000FFFFFFLL; no good */
1711 png_size_t pixel_bytes
;
1712 int width
= (int)row_info
->width
;
1714 pixel_bytes
= (row_info
->pixel_depth
>> 3);
1716 /* point sptr at the last pixel in the pre-expanded row: */
1717 sptr
= row
+ (width
- 1) * pixel_bytes
;
1719 /* point dp at the last pixel position in the expanded row: */
1720 dp
= row
+ (final_width
- 1) * pixel_bytes
;
1722 /* New code by Nirav Chhatrapati - Intel Corporation */
1724 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
1725 #if !defined(PNG_1_0_X)
1726 if ((png_ptr
->asm_flags
& PNG_ASM_FLAG_MMX_READ_INTERLACE
)
1727 /* && _mmx_supported */ )
1732 //--------------------------------------------------------------
1733 if (pixel_bytes
== 3)
1735 if (((pass
== 0) || (pass
== 1)) && width
)
1737 int dummy_value_c
; /* fix 'forbidden register spilled' */
1741 __asm__
__volatile__ (
1742 "subl $21, %%edi \n\t"
1743 /* (png_pass_inc[pass] - 1)*pixel_bytes */
1745 ".loop3_pass0: \n\t"
1746 "movd (%%esi), %%mm0 \n\t" /* x x x x x 2 1 0 */
1747 "pand _const4, %%mm0 \n\t" /* z z z z z 2 1 0 */
1748 "movq %%mm0, %%mm1 \n\t" /* z z z z z 2 1 0 */
1749 "psllq $16, %%mm0 \n\t" /* z z z 2 1 0 z z */
1750 "movq %%mm0, %%mm2 \n\t" /* z z z 2 1 0 z z */
1751 "psllq $24, %%mm0 \n\t" /* 2 1 0 z z z z z */
1752 "psrlq $8, %%mm1 \n\t" /* z z z z z z 2 1 */
1753 "por %%mm2, %%mm0 \n\t" /* 2 1 0 2 1 0 z z */
1754 "por %%mm1, %%mm0 \n\t" /* 2 1 0 2 1 0 2 1 */
1755 "movq %%mm0, %%mm3 \n\t" /* 2 1 0 2 1 0 2 1 */
1756 "psllq $16, %%mm0 \n\t" /* 0 2 1 0 2 1 z z */
1757 "movq %%mm3, %%mm4 \n\t" /* 2 1 0 2 1 0 2 1 */
1758 "punpckhdq %%mm0, %%mm3 \n\t" /* 0 2 1 0 2 1 0 2 */
1759 "movq %%mm4, 16(%%edi) \n\t"
1760 "psrlq $32, %%mm0 \n\t" /* z z z z 0 2 1 0 */
1761 "movq %%mm3, 8(%%edi) \n\t"
1762 "punpckldq %%mm4, %%mm0 \n\t" /* 1 0 2 1 0 2 1 0 */
1763 "subl $3, %%esi \n\t"
1764 "movq %%mm0, (%%edi) \n\t"
1765 "subl $24, %%edi \n\t"
1767 "jnz .loop3_pass0 \n\t"
1768 "EMMS \n\t" /* DONE */
1770 : "=c" (dummy_value_c
), /* output regs (dummy) */
1771 "=S" (dummy_value_S
),
1772 "=D" (dummy_value_D
)
1774 : "1" (sptr
), // esi // input regs
1777 "rim" (_const4
) // %1(?) (0x0000000000FFFFFFLL)
1779 #if 0 /* %mm0, ..., %mm4 not supported by gcc 2.7.2.3 or egcs 1.1 */
1780 : "%mm0", "%mm1", "%mm2" /* clobber list */
1785 else if (((pass
== 2) || (pass
== 3)) && width
)
1787 int dummy_value_c
; /* fix 'forbidden register spilled' */
1791 __asm__
__volatile__ (
1792 "subl $9, %%edi \n\t"
1793 /* (png_pass_inc[pass] - 1)*pixel_bytes */
1795 ".loop3_pass2: \n\t"
1796 "movd (%%esi), %%mm0 \n\t" /* x x x x x 2 1 0 */
1797 "pand _const4, %%mm0 \n\t" /* z z z z z 2 1 0 */
1798 "movq %%mm0, %%mm1 \n\t" /* z z z z z 2 1 0 */
1799 "psllq $16, %%mm0 \n\t" /* z z z 2 1 0 z z */
1800 "movq %%mm0, %%mm2 \n\t" /* z z z 2 1 0 z z */
1801 "psllq $24, %%mm0 \n\t" /* 2 1 0 z z z z z */
1802 "psrlq $8, %%mm1 \n\t" /* z z z z z z 2 1 */
1803 "por %%mm2, %%mm0 \n\t" /* 2 1 0 2 1 0 z z */
1804 "por %%mm1, %%mm0 \n\t" /* 2 1 0 2 1 0 2 1 */
1805 "movq %%mm0, 4(%%edi) \n\t"
1806 "psrlq $16, %%mm0 \n\t" /* z z 2 1 0 2 1 0 */
1807 "subl $3, %%esi \n\t"
1808 "movd %%mm0, (%%edi) \n\t"
1809 "subl $12, %%edi \n\t"
1811 "jnz .loop3_pass2 \n\t"
1812 "EMMS \n\t" /* DONE */
1814 : "=c" (dummy_value_c
), /* output regs (dummy) */
1815 "=S" (dummy_value_S
),
1816 "=D" (dummy_value_D
)
1818 : "1" (sptr
), // esi // input regs
1821 "rim" (_const4
) // (0x0000000000FFFFFFLL)
1823 #if 0 /* %mm0, ..., %mm2 not supported by gcc 2.7.2.3 or egcs 1.1 */
1824 : "%mm0", "%mm1", "%mm2" /* clobber list */
1828 else if (width
) /* && ((pass == 4) || (pass == 5)) */
1830 int width_mmx
= ((width
>> 1) << 1) - 8; /* GRR: huh? */
1833 width
-= width_mmx
; /* 8 or 9 pix, 24 or 27 bytes */
1836 /* png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
1837 /* sptr points at last pixel in pre-expanded row */
1838 /* dp points at last pixel position in expanded row */
1839 int dummy_value_c
; /* fix 'forbidden register spilled' */
1843 __asm__
__volatile__ (
1844 "subl $3, %%esi \n\t"
1845 "subl $9, %%edi \n\t"
1846 /* (png_pass_inc[pass] + 1)*pixel_bytes */
1848 ".loop3_pass4: \n\t"
1849 "movq (%%esi), %%mm0 \n\t" /* x x 5 4 3 2 1 0 */
1850 "movq %%mm0, %%mm1 \n\t" /* x x 5 4 3 2 1 0 */
1851 "movq %%mm0, %%mm2 \n\t" /* x x 5 4 3 2 1 0 */
1852 "psllq $24, %%mm0 \n\t" /* 4 3 2 1 0 z z z */
1853 "pand _const4, %%mm1 \n\t" /* z z z z z 2 1 0 */
1854 "psrlq $24, %%mm2 \n\t" /* z z z x x 5 4 3 */
1855 "por %%mm1, %%mm0 \n\t" /* 4 3 2 1 0 2 1 0 */
1856 "movq %%mm2, %%mm3 \n\t" /* z z z x x 5 4 3 */
1857 "psllq $8, %%mm2 \n\t" /* z z x x 5 4 3 z */
1858 "movq %%mm0, (%%edi) \n\t"
1859 "psrlq $16, %%mm3 \n\t" /* z z z z z x x 5 */
1860 "pand _const6, %%mm3 \n\t" /* z z z z z z z 5 */
1861 "por %%mm3, %%mm2 \n\t" /* z z x x 5 4 3 5 */
1862 "subl $6, %%esi \n\t"
1863 "movd %%mm2, 8(%%edi) \n\t"
1864 "subl $12, %%edi \n\t"
1865 "subl $2, %%ecx \n\t"
1866 "jnz .loop3_pass4 \n\t"
1867 "EMMS \n\t" /* DONE */
1869 : "=c" (dummy_value_c
), /* output regs (dummy) */
1870 "=S" (dummy_value_S
),
1871 "=D" (dummy_value_D
)
1873 : "1" (sptr
), // esi // input regs
1875 "0" (width_mmx
), // ecx
1876 "rim" (_const4
), // 0x0000000000FFFFFFLL
1877 "rim" (_const6
) // 0x00000000000000FFLL
1879 #if 0 /* %mm0, ..., %mm3 not supported by gcc 2.7.2.3 or egcs 1.1 */
1880 : "%mm0", "%mm1" /* clobber list */
1886 sptr
-= width_mmx
*3;
1888 for (i
= width
; i
; i
--)
1893 png_memcpy(v
, sptr
, 3);
1894 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
1896 png_memcpy(dp
, v
, 3);
1902 } /* end of pixel_bytes == 3 */
1904 //--------------------------------------------------------------
1905 else if (pixel_bytes
== 1)
1907 if (((pass
== 0) || (pass
== 1)) && width
)
1909 int width_mmx
= ((width
>> 2) << 2);
1910 width
-= width_mmx
; /* 0-3 pixels => 0-3 bytes */
1913 int dummy_value_c
; /* fix 'forbidden register spilled' */
1917 __asm__
__volatile__ (
1918 "subl $3, %%esi \n\t"
1919 "subl $31, %%edi \n\t"
1921 ".loop1_pass0: \n\t"
1922 "movd (%%esi), %%mm0 \n\t" /* x x x x 3 2 1 0 */
1923 "movq %%mm0, %%mm1 \n\t" /* x x x x 3 2 1 0 */
1924 "punpcklbw %%mm0, %%mm0 \n\t" /* 3 3 2 2 1 1 0 0 */
1925 "movq %%mm0, %%mm2 \n\t" /* 3 3 2 2 1 1 0 0 */
1926 "punpcklwd %%mm0, %%mm0 \n\t" /* 1 1 1 1 0 0 0 0 */
1927 "movq %%mm0, %%mm3 \n\t" /* 1 1 1 1 0 0 0 0 */
1928 "punpckldq %%mm0, %%mm0 \n\t" /* 0 0 0 0 0 0 0 0 */
1929 "punpckhdq %%mm3, %%mm3 \n\t" /* 1 1 1 1 1 1 1 1 */
1930 "movq %%mm0, (%%edi) \n\t"
1931 "punpckhwd %%mm2, %%mm2 \n\t" /* 3 3 3 3 2 2 2 2 */
1932 "movq %%mm3, 8(%%edi) \n\t"
1933 "movq %%mm2, %%mm4 \n\t" /* 3 3 3 3 2 2 2 2 */
1934 "punpckldq %%mm2, %%mm2 \n\t" /* 2 2 2 2 2 2 2 2 */
1935 "punpckhdq %%mm4, %%mm4 \n\t" /* 3 3 3 3 3 3 3 3 */
1936 "movq %%mm2, 16(%%edi) \n\t"
1937 "subl $4, %%esi \n\t"
1938 "movq %%mm4, 24(%%edi) \n\t"
1939 "subl $32, %%edi \n\t"
1940 "subl $4, %%ecx \n\t"
1941 "jnz .loop1_pass0 \n\t"
1942 "EMMS \n\t" /* DONE */
1944 : "=c" (dummy_value_c
), /* output regs (dummy) */
1945 "=S" (dummy_value_S
),
1946 "=D" (dummy_value_D
)
1948 : "1" (sptr
), /* esi // input regs */
1950 "0" (width_mmx
) /* ecx */
1952 #if 0 /* %mm0, ..., %mm4 not supported by gcc 2.7.2.3 or egcs 1.1 */
1953 : "%mm0", "%mm1", "%mm2" /* clobber list */
1961 for (i
= width
; i
; i
--)
1965 /* I simplified this part in version 1.0.4e
1966 * here and in several other instances where
1967 * pixel_bytes == 1 -- GR-P
1972 * png_memcpy(v, sptr, pixel_bytes);
1973 * for (j = 0; j < png_pass_inc[pass]; j++)
1975 * png_memcpy(dp, v, pixel_bytes);
1976 * dp -= pixel_bytes;
1978 * sptr -= pixel_bytes;
1980 * Replacement code is in the next three lines:
1983 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
1990 else if (((pass
== 2) || (pass
== 3)) && width
)
1992 int width_mmx
= ((width
>> 2) << 2);
1993 width
-= width_mmx
; /* 0-3 pixels => 0-3 bytes */
1996 int dummy_value_c
; /* fix 'forbidden register spilled' */
2000 __asm__
__volatile__ (
2001 "subl $3, %%esi \n\t"
2002 "subl $15, %%edi \n\t"
2004 ".loop1_pass2: \n\t"
2005 "movd (%%esi), %%mm0 \n\t" /* x x x x 3 2 1 0 */
2006 "punpcklbw %%mm0, %%mm0 \n\t" /* 3 3 2 2 1 1 0 0 */
2007 "movq %%mm0, %%mm1 \n\t" /* 3 3 2 2 1 1 0 0 */
2008 "punpcklwd %%mm0, %%mm0 \n\t" /* 1 1 1 1 0 0 0 0 */
2009 "punpckhwd %%mm1, %%mm1 \n\t" /* 3 3 3 3 2 2 2 2 */
2010 "movq %%mm0, (%%edi) \n\t"
2011 "subl $4, %%esi \n\t"
2012 "movq %%mm1, 8(%%edi) \n\t"
2013 "subl $16, %%edi \n\t"
2014 "subl $4, %%ecx \n\t"
2015 "jnz .loop1_pass2 \n\t"
2016 "EMMS \n\t" /* DONE */
2018 : "=c" (dummy_value_c
), /* output regs (dummy) */
2019 "=S" (dummy_value_S
),
2020 "=D" (dummy_value_D
)
2022 : "1" (sptr
), /* esi // input regs */
2024 "0" (width_mmx
) /* ecx */
2026 #if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2027 : "%mm0", "%mm1" /* clobber list */
2034 for (i
= width
; i
; i
--)
2038 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
2045 else if (width
) /* && ((pass == 4) || (pass == 5)) */
2047 int width_mmx
= ((width
>> 3) << 3);
2048 width
-= width_mmx
; /* 0-3 pixels => 0-3 bytes */
2051 int dummy_value_c
; /* fix 'forbidden register spilled' */
2055 __asm__
__volatile__ (
2056 "subl $7, %%esi \n\t"
2057 "subl $15, %%edi \n\t"
2059 ".loop1_pass4: \n\t"
2060 "movq (%%esi), %%mm0 \n\t" /* 7 6 5 4 3 2 1 0 */
2061 "movq %%mm0, %%mm1 \n\t" /* 7 6 5 4 3 2 1 0 */
2062 "punpcklbw %%mm0, %%mm0 \n\t" /* 3 3 2 2 1 1 0 0 */
2063 "punpckhbw %%mm1, %%mm1 \n\t" /* 7 7 6 6 5 5 4 4 */
2064 "movq %%mm1, 8(%%edi) \n\t"
2065 "subl $8, %%esi \n\t"
2066 "movq %%mm0, (%%edi) \n\t"
2067 "subl $16, %%edi \n\t"
2068 "subl $8, %%ecx \n\t"
2069 "jnz .loop1_pass4 \n\t"
2070 "EMMS \n\t" /* DONE */
2072 : "=c" (dummy_value_c
), /* output regs (none) */
2073 "=S" (dummy_value_S
),
2074 "=D" (dummy_value_D
)
2076 : "1" (sptr
), /* esi // input regs */
2078 "0" (width_mmx
) /* ecx */
2080 #if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2081 : "%mm0", "%mm1" /* clobber list */
2088 for (i
= width
; i
; i
--)
2092 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
2099 } /* end of pixel_bytes == 1 */
2101 //--------------------------------------------------------------
2102 else if (pixel_bytes
== 2)
2104 if (((pass
== 0) || (pass
== 1)) && width
)
2106 int width_mmx
= ((width
>> 1) << 1);
2107 width
-= width_mmx
; /* 0,1 pixels => 0,2 bytes */
2110 int dummy_value_c
; /* fix 'forbidden register spilled' */
2114 __asm__
__volatile__ (
2115 "subl $2, %%esi \n\t"
2116 "subl $30, %%edi \n\t"
2118 ".loop2_pass0: \n\t"
2119 "movd (%%esi), %%mm0 \n\t" /* x x x x 3 2 1 0 */
2120 "punpcklwd %%mm0, %%mm0 \n\t" /* 3 2 3 2 1 0 1 0 */
2121 "movq %%mm0, %%mm1 \n\t" /* 3 2 3 2 1 0 1 0 */
2122 "punpckldq %%mm0, %%mm0 \n\t" /* 1 0 1 0 1 0 1 0 */
2123 "punpckhdq %%mm1, %%mm1 \n\t" /* 3 2 3 2 3 2 3 2 */
2124 "movq %%mm0, (%%edi) \n\t"
2125 "movq %%mm0, 8(%%edi) \n\t"
2126 "movq %%mm1, 16(%%edi) \n\t"
2127 "subl $4, %%esi \n\t"
2128 "movq %%mm1, 24(%%edi) \n\t"
2129 "subl $32, %%edi \n\t"
2130 "subl $2, %%ecx \n\t"
2131 "jnz .loop2_pass0 \n\t"
2132 "EMMS \n\t" /* DONE */
2134 : "=c" (dummy_value_c
), /* output regs (dummy) */
2135 "=S" (dummy_value_S
),
2136 "=D" (dummy_value_D
)
2138 : "1" (sptr
), /* esi // input regs */
2140 "0" (width_mmx
) /* ecx */
2142 #if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2143 : "%mm0", "%mm1" /* clobber list */
2148 sptr
-= (width_mmx
*2 - 2); /* sign fixed */
2149 dp
-= (width_mmx
*16 - 2); /* sign fixed */
2150 for (i
= width
; i
; i
--)
2155 png_memcpy(v
, sptr
, 2);
2156 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
2159 png_memcpy(dp
, v
, 2);
2163 else if (((pass
== 2) || (pass
== 3)) && width
)
2165 int width_mmx
= ((width
>> 1) << 1) ;
2166 width
-= width_mmx
; /* 0,1 pixels => 0,2 bytes */
2169 int dummy_value_c
; /* fix 'forbidden register spilled' */
2173 __asm__
__volatile__ (
2174 "subl $2, %%esi \n\t"
2175 "subl $14, %%edi \n\t"
2177 ".loop2_pass2: \n\t"
2178 "movd (%%esi), %%mm0 \n\t" /* x x x x 3 2 1 0 */
2179 "punpcklwd %%mm0, %%mm0 \n\t" /* 3 2 3 2 1 0 1 0 */
2180 "movq %%mm0, %%mm1 \n\t" /* 3 2 3 2 1 0 1 0 */
2181 "punpckldq %%mm0, %%mm0 \n\t" /* 1 0 1 0 1 0 1 0 */
2182 "punpckhdq %%mm1, %%mm1 \n\t" /* 3 2 3 2 3 2 3 2 */
2183 "movq %%mm0, (%%edi) \n\t"
2184 "subl $4, %%esi \n\t"
2185 "movq %%mm1, 8(%%edi) \n\t"
2186 "subl $16, %%edi \n\t"
2187 "subl $2, %%ecx \n\t"
2188 "jnz .loop2_pass2 \n\t"
2189 "EMMS \n\t" /* DONE */
2191 : "=c" (dummy_value_c
), /* output regs (dummy) */
2192 "=S" (dummy_value_S
),
2193 "=D" (dummy_value_D
)
2195 : "1" (sptr
), /* esi // input regs */
2197 "0" (width_mmx
) /* ecx */
2199 #if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2200 : "%mm0", "%mm1" /* clobber list */
2205 sptr
-= (width_mmx
*2 - 2); /* sign fixed */
2206 dp
-= (width_mmx
*8 - 2); /* sign fixed */
2207 for (i
= width
; i
; i
--)
2212 png_memcpy(v
, sptr
, 2);
2213 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
2216 png_memcpy(dp
, v
, 2);
2220 else if (width
) /* pass == 4 or 5 */
2222 int width_mmx
= ((width
>> 1) << 1) ;
2223 width
-= width_mmx
; /* 0,1 pixels => 0,2 bytes */
2226 int dummy_value_c
; /* fix 'forbidden register spilled' */
2230 __asm__
__volatile__ (
2231 "subl $2, %%esi \n\t"
2232 "subl $6, %%edi \n\t"
2234 ".loop2_pass4: \n\t"
2235 "movd (%%esi), %%mm0 \n\t" /* x x x x 3 2 1 0 */
2236 "punpcklwd %%mm0, %%mm0 \n\t" /* 3 2 3 2 1 0 1 0 */
2237 "subl $4, %%esi \n\t"
2238 "movq %%mm0, (%%edi) \n\t"
2239 "subl $8, %%edi \n\t"
2240 "subl $2, %%ecx \n\t"
2241 "jnz .loop2_pass4 \n\t"
2242 "EMMS \n\t" /* DONE */
2244 : "=c" (dummy_value_c
), /* output regs (dummy) */
2245 "=S" (dummy_value_S
),
2246 "=D" (dummy_value_D
)
2248 : "1" (sptr
), /* esi // input regs */
2250 "0" (width_mmx
) /* ecx */
2252 #if 0 /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
2253 : "%mm0" /* clobber list */
2258 sptr
-= (width_mmx
*2 - 2); /* sign fixed */
2259 dp
-= (width_mmx
*4 - 2); /* sign fixed */
2260 for (i
= width
; i
; i
--)
2265 png_memcpy(v
, sptr
, 2);
2266 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
2269 png_memcpy(dp
, v
, 2);
2273 } /* end of pixel_bytes == 2 */
2275 //--------------------------------------------------------------
2276 else if (pixel_bytes
== 4)
2278 if (((pass
== 0) || (pass
== 1)) && width
)
2280 int width_mmx
= ((width
>> 1) << 1);
2281 width
-= width_mmx
; /* 0,1 pixels => 0,4 bytes */
2284 int dummy_value_c
; /* fix 'forbidden register spilled' */
2288 __asm__
__volatile__ (
2289 "subl $4, %%esi \n\t"
2290 "subl $60, %%edi \n\t"
2292 ".loop4_pass0: \n\t"
2293 "movq (%%esi), %%mm0 \n\t" /* 7 6 5 4 3 2 1 0 */
2294 "movq %%mm0, %%mm1 \n\t" /* 7 6 5 4 3 2 1 0 */
2295 "punpckldq %%mm0, %%mm0 \n\t" /* 3 2 1 0 3 2 1 0 */
2296 "punpckhdq %%mm1, %%mm1 \n\t" /* 7 6 5 4 7 6 5 4 */
2297 "movq %%mm0, (%%edi) \n\t"
2298 "movq %%mm0, 8(%%edi) \n\t"
2299 "movq %%mm0, 16(%%edi) \n\t"
2300 "movq %%mm0, 24(%%edi) \n\t"
2301 "movq %%mm1, 32(%%edi) \n\t"
2302 "movq %%mm1, 40(%%edi) \n\t"
2303 "movq %%mm1, 48(%%edi) \n\t"
2304 "subl $8, %%esi \n\t"
2305 "movq %%mm1, 56(%%edi) \n\t"
2306 "subl $64, %%edi \n\t"
2307 "subl $2, %%ecx \n\t"
2308 "jnz .loop4_pass0 \n\t"
2309 "EMMS \n\t" /* DONE */
2311 : "=c" (dummy_value_c
), /* output regs (dummy) */
2312 "=S" (dummy_value_S
),
2313 "=D" (dummy_value_D
)
2315 : "1" (sptr
), /* esi // input regs */
2317 "0" (width_mmx
) /* ecx */
2319 #if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2320 : "%mm0", "%mm1" /* clobber list */
2325 sptr
-= (width_mmx
*4 - 4); /* sign fixed */
2326 dp
-= (width_mmx
*32 - 4); /* sign fixed */
2327 for (i
= width
; i
; i
--)
2332 png_memcpy(v
, sptr
, 4);
2333 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
2336 png_memcpy(dp
, v
, 4);
2340 else if (((pass
== 2) || (pass
== 3)) && width
)
2342 int width_mmx
= ((width
>> 1) << 1);
2343 width
-= width_mmx
; /* 0,1 pixels => 0,4 bytes */
2346 int dummy_value_c
; /* fix 'forbidden register spilled' */
2350 __asm__
__volatile__ (
2351 "subl $4, %%esi \n\t"
2352 "subl $28, %%edi \n\t"
2354 ".loop4_pass2: \n\t"
2355 "movq (%%esi), %%mm0 \n\t" /* 7 6 5 4 3 2 1 0 */
2356 "movq %%mm0, %%mm1 \n\t" /* 7 6 5 4 3 2 1 0 */
2357 "punpckldq %%mm0, %%mm0 \n\t" /* 3 2 1 0 3 2 1 0 */
2358 "punpckhdq %%mm1, %%mm1 \n\t" /* 7 6 5 4 7 6 5 4 */
2359 "movq %%mm0, (%%edi) \n\t"
2360 "movq %%mm0, 8(%%edi) \n\t"
2361 "movq %%mm1, 16(%%edi) \n\t"
2362 "movq %%mm1, 24(%%edi) \n\t"
2363 "subl $8, %%esi \n\t"
2364 "subl $32, %%edi \n\t"
2365 "subl $2, %%ecx \n\t"
2366 "jnz .loop4_pass2 \n\t"
2367 "EMMS \n\t" /* DONE */
2369 : "=c" (dummy_value_c
), /* output regs (dummy) */
2370 "=S" (dummy_value_S
),
2371 "=D" (dummy_value_D
)
2373 : "1" (sptr
), /* esi // input regs */
2375 "0" (width_mmx
) /* ecx */
2377 #if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2378 : "%mm0", "%mm1" /* clobber list */
2383 sptr
-= (width_mmx
*4 - 4); /* sign fixed */
2384 dp
-= (width_mmx
*16 - 4); /* sign fixed */
2385 for (i
= width
; i
; i
--)
2390 png_memcpy(v
, sptr
, 4);
2391 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
2394 png_memcpy(dp
, v
, 4);
2398 else if (width
) /* pass == 4 or 5 */
2400 int width_mmx
= ((width
>> 1) << 1) ;
2401 width
-= width_mmx
; /* 0,1 pixels => 0,4 bytes */
2404 int dummy_value_c
; /* fix 'forbidden register spilled' */
2408 __asm__
__volatile__ (
2409 "subl $4, %%esi \n\t"
2410 "subl $12, %%edi \n\t"
2412 ".loop4_pass4: \n\t"
2413 "movq (%%esi), %%mm0 \n\t" /* 7 6 5 4 3 2 1 0 */
2414 "movq %%mm0, %%mm1 \n\t" /* 7 6 5 4 3 2 1 0 */
2415 "punpckldq %%mm0, %%mm0 \n\t" /* 3 2 1 0 3 2 1 0 */
2416 "punpckhdq %%mm1, %%mm1 \n\t" /* 7 6 5 4 7 6 5 4 */
2417 "movq %%mm0, (%%edi) \n\t"
2418 "subl $8, %%esi \n\t"
2419 "movq %%mm1, 8(%%edi) \n\t"
2420 "subl $16, %%edi \n\t"
2421 "subl $2, %%ecx \n\t"
2422 "jnz .loop4_pass4 \n\t"
2423 "EMMS \n\t" /* DONE */
2425 : "=c" (dummy_value_c
), /* output regs (dummy) */
2426 "=S" (dummy_value_S
),
2427 "=D" (dummy_value_D
)
2429 : "1" (sptr
), /* esi // input regs */
2431 "0" (width_mmx
) /* ecx */
2433 #if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2434 : "%mm0", "%mm1" /* clobber list */
2439 sptr
-= (width_mmx
*4 - 4); /* sign fixed */
2440 dp
-= (width_mmx
*8 - 4); /* sign fixed */
2441 for (i
= width
; i
; i
--)
2446 png_memcpy(v
, sptr
, 4);
2447 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
2450 png_memcpy(dp
, v
, 4);
2454 } /* end of pixel_bytes == 4 */
2456 //--------------------------------------------------------------
2457 else if (pixel_bytes
== 8)
2459 /* GRR TEST: should work, but needs testing (special 64-bit version of rpng2?) */
2460 /* GRR NOTE: no need to combine passes here! */
2461 if (((pass
== 0) || (pass
== 1)) && width
)
2463 int dummy_value_c
; /* fix 'forbidden register spilled' */
2467 /* source is 8-byte RRGGBBAA */
2468 /* dest is 64-byte RRGGBBAA RRGGBBAA RRGGBBAA RRGGBBAA ... */
2469 __asm__
__volatile__ (
2470 "subl $56, %%edi \n\t" /* start of last block */
2472 ".loop8_pass0: \n\t"
2473 "movq (%%esi), %%mm0 \n\t" /* 7 6 5 4 3 2 1 0 */
2474 "movq %%mm0, (%%edi) \n\t"
2475 "movq %%mm0, 8(%%edi) \n\t"
2476 "movq %%mm0, 16(%%edi) \n\t"
2477 "movq %%mm0, 24(%%edi) \n\t"
2478 "movq %%mm0, 32(%%edi) \n\t"
2479 "movq %%mm0, 40(%%edi) \n\t"
2480 "movq %%mm0, 48(%%edi) \n\t"
2481 "subl $8, %%esi \n\t"
2482 "movq %%mm0, 56(%%edi) \n\t"
2483 "subl $64, %%edi \n\t"
2485 "jnz .loop8_pass0 \n\t"
2486 "EMMS \n\t" /* DONE */
2488 : "=c" (dummy_value_c
), /* output regs (dummy) */
2489 "=S" (dummy_value_S
),
2490 "=D" (dummy_value_D
)
2492 : "1" (sptr
), /* esi // input regs */
2494 "0" (width
) /* ecx */
2496 #if 0 /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
2497 : "%mm0" /* clobber list */
2501 else if (((pass
== 2) || (pass
== 3)) && width
)
2503 /* source is 8-byte RRGGBBAA */
2504 /* dest is 32-byte RRGGBBAA RRGGBBAA RRGGBBAA RRGGBBAA */
2505 /* (recall that expansion is _in place_: sptr and dp */
2506 /* both point at locations within same row buffer) */
2508 int dummy_value_c
; /* fix 'forbidden register spilled' */
2512 __asm__
__volatile__ (
2513 "subl $24, %%edi \n\t" /* start of last block */
2515 ".loop8_pass2: \n\t"
2516 "movq (%%esi), %%mm0 \n\t" /* 7 6 5 4 3 2 1 0 */
2517 "movq %%mm0, (%%edi) \n\t"
2518 "movq %%mm0, 8(%%edi) \n\t"
2519 "movq %%mm0, 16(%%edi) \n\t"
2520 "subl $8, %%esi \n\t"
2521 "movq %%mm0, 24(%%edi) \n\t"
2522 "subl $32, %%edi \n\t"
2524 "jnz .loop8_pass2 \n\t"
2525 "EMMS \n\t" /* DONE */
2527 : "=c" (dummy_value_c
), /* output regs (dummy) */
2528 "=S" (dummy_value_S
),
2529 "=D" (dummy_value_D
)
2531 : "1" (sptr
), /* esi // input regs */
2533 "0" (width
) /* ecx */
2535 #if 0 /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
2536 : "%mm0" /* clobber list */
2541 else if (width
) /* pass == 4 or 5 */
2543 /* source is 8-byte RRGGBBAA */
2544 /* dest is 16-byte RRGGBBAA RRGGBBAA */
2546 int dummy_value_c
; /* fix 'forbidden register spilled' */
2550 __asm__
__volatile__ (
2551 "subl $8, %%edi \n\t" /* start of last block */
2553 ".loop8_pass4: \n\t"
2554 "movq (%%esi), %%mm0 \n\t" /* 7 6 5 4 3 2 1 0 */
2555 "movq %%mm0, (%%edi) \n\t"
2556 "subl $8, %%esi \n\t"
2557 "movq %%mm0, 8(%%edi) \n\t"
2558 "subl $16, %%edi \n\t"
2560 "jnz .loop8_pass4 \n\t"
2561 "EMMS \n\t" /* DONE */
2563 : "=c" (dummy_value_c
), /* output regs (dummy) */
2564 "=S" (dummy_value_S
),
2565 "=D" (dummy_value_D
)
2567 : "1" (sptr
), /* esi // input regs */
2569 "0" (width
) /* ecx */
2571 #if 0 /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
2572 : "%mm0" /* clobber list */
2578 } /* end of pixel_bytes == 8 */
2580 //--------------------------------------------------------------
2581 else if (pixel_bytes
== 6)
2583 for (i
= width
; i
; i
--)
2587 png_memcpy(v
, sptr
, 6);
2588 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
2590 png_memcpy(dp
, v
, 6);
2595 } /* end of pixel_bytes == 6 */
2597 //--------------------------------------------------------------
2600 for (i
= width
; i
; i
--)
2604 png_memcpy(v
, sptr
, pixel_bytes
);
2605 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
2607 png_memcpy(dp
, v
, pixel_bytes
);
2613 } /* end of _mmx_supported ======================================== */
2615 else /* MMX not supported: use modified C code - takes advantage
2616 * of inlining of png_memcpy for a constant */
2617 /* GRR 19991007: does it? or should pixel_bytes in each
2618 * block be replaced with immediate value (e.g., 1)? */
2619 /* GRR 19991017: replaced with constants in each case */
2620 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
2622 if (pixel_bytes
== 1)
2624 for (i
= width
; i
; i
--)
2627 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
2634 else if (pixel_bytes
== 3)
2636 for (i
= width
; i
; i
--)
2640 png_memcpy(v
, sptr
, 3);
2641 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
2643 png_memcpy(dp
, v
, 3);
2649 else if (pixel_bytes
== 2)
2651 for (i
= width
; i
; i
--)
2655 png_memcpy(v
, sptr
, 2);
2656 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
2658 png_memcpy(dp
, v
, 2);
2664 else if (pixel_bytes
== 4)
2666 for (i
= width
; i
; i
--)
2670 png_memcpy(v
, sptr
, 4);
2671 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
2674 if (dp
< row
|| dp
+3 > row
+png_ptr
->row_buf_size
)
2676 printf("dp out of bounds: row=%d, dp=%d, rp=%d\n",
2677 row
, dp
, row
+png_ptr
->row_buf_size
);
2678 printf("row_buf=%d\n",png_ptr
->row_buf_size
);
2681 png_memcpy(dp
, v
, 4);
2687 else if (pixel_bytes
== 6)
2689 for (i
= width
; i
; i
--)
2693 png_memcpy(v
, sptr
, 6);
2694 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
2696 png_memcpy(dp
, v
, 6);
2702 else if (pixel_bytes
== 8)
2704 for (i
= width
; i
; i
--)
2708 png_memcpy(v
, sptr
, 8);
2709 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
2711 png_memcpy(dp
, v
, 8);
2717 else /* GRR: should never be reached */
2719 for (i
= width
; i
; i
--)
2723 png_memcpy(v
, sptr
, pixel_bytes
);
2724 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
2726 png_memcpy(dp
, v
, pixel_bytes
);
2729 sptr
-= pixel_bytes
;
2733 } /* end if (MMX not supported) */
2736 } /* end switch (row_info->pixel_depth) */
2738 row_info
->width
= final_width
;
2740 row_info
->rowbytes
= PNG_ROWBYTES(row_info
->pixel_depth
,final_width
);
2743 } /* end png_do_read_interlace() */
2745 #endif /* PNG_HAVE_ASSEMBLER_READ_INTERLACE */
2746 #endif /* PNG_READ_INTERLACING_SUPPORTED */
2750 #if defined(PNG_HAVE_ASSEMBLER_READ_FILTER_ROW)
2751 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
2753 /* These variables are utilized in the functions below. They are declared */
2754 /* globally here to ensure alignment on 8-byte boundaries. */
2759 } _LBCarryMask
= {0x0101010101010101LL
},
2760 _HBClearMask
= {0x7f7f7f7f7f7f7f7fLL
},
2761 _ActiveMask
, _ActiveMask2
, _ActiveMaskEnd
, _ShiftBpp
, _ShiftRem
;
2763 #ifdef PNG_THREAD_UNSAFE_OK
2764 /*===========================================================================*/
2766 /* P N G _ R E A D _ F I L T E R _ R O W _ M M X _ A V G */
2768 /*===========================================================================*/
2770 /* Optimized code for PNG Average filter decoder */
/*
 * png_read_filter_row_mmx_avg: reverse the PNG Average filter on one row.
 *
 * The decoded bytes are written back over the filtered bytes addressed
 * through edi (row); esi addresses the prior row (prev_row).  For each byte
 *
 *     Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x)) / 2)
 *
 * and for the first bpp bytes only Prior(x)/2 is added.
 *
 * Phases, in the order they appear below:
 *   1. Scalar asm: decode the first bpp bytes, derive _dif (offset of the
 *      8-byte alignment boundary), decode byte-by-byte up to _dif, and
 *      derive _MMXLength (row length with the trailing bytes past a
 *      multiple of 8 dropped).
 *   2. switch (bpp): MMX code for 3, 4/6, 2 and 8 bytes per pixel works on
 *      8 bytes at a time and compares the offset against _MMXLength; the
 *      1 byte per pixel case decodes the rest of the row with scalar code
 *      and returns directly.
 *   3. Scalar asm: decode the bytes between _MMXLength and _FullLength,
 *      then execute EMMS.
 *
 * The asm blocks exchange state through named memory operands (_dif,
 * _FullLength, _MMXLength, _ActiveMask, _ShiftBpp, _ShiftRem, _LBCarryMask,
 * _HBClearMask) rather than through asm operands.
 * NOTE(review): presumably these are file-scope variables, which would be
 * why the function is only built under PNG_THREAD_UNSAFE_OK - confirm
 * against their declarations.
 * NOTE(review): jump targets such as avg_rlp, avg_go, avg_lp1 and avg_lp2
 * are referenced here but their label lines are not visible in this
 * excerpt - confirm against the full file before editing the asm.
 */
2772 static void /* PRIVATE */
2773 png_read_filter_row_mmx_avg(png_row_infop row_info
, png_bytep row
,
2777 int dummy_value_c
; /* fix 'forbidden register 2 (cx) was spilled' error */
2781 bpp
= (row_info
->pixel_depth
+ 7) >> 3; /* get # bytes per pixel */
2782 _FullLength
= row_info
->rowbytes
; /* # of bytes to filter */
2784 __asm__
__volatile__ (
2785 /* initialize address pointers and offset */
2787 "pushl %%ebx \n\t" /* save index to Global Offset Table */
2789 /*pre "movl row, %%edi \n\t" */ /* edi: Avg(x) */
2790 "xorl %%ebx, %%ebx \n\t" /* ebx: x */
2791 "movl %%edi, %%edx \n\t"
2792 /*pre "movl prev_row, %%esi \n\t" */ /* esi: Prior(x) */
2793 /*pre "subl bpp, %%edx \n\t" */ /* (bpp is preloaded into ecx) */
2794 "subl %%ecx, %%edx \n\t" /* edx: Raw(x-bpp) */
2796 "xorl %%eax,%%eax \n\t"
2798 /* Compute the Raw value for the first bpp bytes */
2799 /* Raw(x) = Avg(x) + (Prior(x)/2) */
2801 "movb (%%esi,%%ebx,),%%al \n\t" /* load al with Prior(x) */
2803 "shrb %%al \n\t" /* divide by 2 */
2804 "addb -1(%%edi,%%ebx,),%%al \n\t" /* add Avg(x); -1 to offset inc ebx */
2805 /* pre "cmpl bpp, %%ebx \n\t" */ /* (bpp is preloaded into ecx) */
2806 "cmpl %%ecx, %%ebx \n\t"
2807 "movb %%al,-1(%%edi,%%ebx,) \n\t" /* write Raw(x); -1 to offset inc ebx */
2808 "jb avg_rlp \n\t" /* mov does not affect flags */
2810 /* get # of bytes to alignment: */
/* _dif = ((row + ebx + 0xf) & 0xfffffff8) - row */
2811 "movl %%edi, _dif \n\t" /* take start of row */
2812 "addl %%ebx, _dif \n\t" /* add bpp */
2813 "addl $0xf, _dif \n\t" /* add 7+8 to incr past alignment bdry */
2814 "andl $0xfffffff8, _dif \n\t" /* mask to alignment boundary */
2815 "subl %%edi, _dif \n\t" /* subtract from start => value ebx at */
2816 "jz avg_go \n\t" /* alignment */
2819 /* Compute the Raw value for the bytes up to the alignment boundary */
2820 /* Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2) */
2821 "xorl %%ecx, %%ecx \n\t"
2824 "xorl %%eax, %%eax \n\t"
2825 "movb (%%esi,%%ebx,), %%cl \n\t" /* load cl with Prior(x) */
2826 "movb (%%edx,%%ebx,), %%al \n\t" /* load al with Raw(x-bpp) */
2827 "addw %%cx, %%ax \n\t"
2829 "shrw %%ax \n\t" /* divide by 2 */
2830 "addb -1(%%edi,%%ebx,), %%al \n\t" /* add Avg(x); -1 to offset inc ebx */
2831 "cmpl _dif, %%ebx \n\t" /* check if at alignment boundary */
2832 "movb %%al, -1(%%edi,%%ebx,) \n\t" /* write Raw(x); -1 to offset inc ebx */
2833 "jb avg_lp1 \n\t" /* repeat until at alignment boundary */
/* _MMXLength = _FullLength - ((_FullLength - ebx) & 7) */
2836 "movl _FullLength, %%eax \n\t"
2837 "movl %%eax, %%ecx \n\t"
2838 "subl %%ebx, %%eax \n\t" /* subtract alignment fix */
2839 "andl $0x00000007, %%eax \n\t" /* calc bytes over mult of 8 */
2840 "subl %%eax, %%ecx \n\t" /* drop over bytes from original length */
2841 "movl %%ecx, _MMXLength \n\t"
2843 "popl %%ebx \n\t" /* restore index to Global Offset Table */
2846 : "=c" (dummy_value_c
), /* output regs (dummy) */
2847 "=S" (dummy_value_S
),
2848 "=D" (dummy_value_D
)
2850 : "0" (bpp
), /* ecx // input regs */
2851 "1" (prev_row
), /* esi */
2854 : "%eax", "%edx" /* clobber list */
2858 /* GRR: INCLUDE "memory" as clobbered? (_dif, _MMXLength) */
2859 /* (seems to work fine without...) */
2862 /* now do the math for the rest of the row */
/* 3 bytes per pixel: the active mask selects the low 3 bytes of a quadword */
2867 _ActiveMask
.use
= 0x0000000000ffffffLL
;
2868 _ShiftBpp
.use
= 24; /* == 3 * 8 */
2869 _ShiftRem
.use
= 40; /* == 64 - 24 */
2871 __asm__
__volatile__ (
/* Byte-parallel averaging used by all MMX cases below: psrlq $1 halves */
/* the whole quadword, so the result is ANDed with _HBClearMask (0x7f in */
/* every byte) to drop the bit shifted in from the neighbouring byte; */
/* _LBCarryMask (0x01 in every byte) isolates the low bits so that a */
/* carry can be added back where both operands had their lsb set. */
2872 /* re-init address pointers and offset */
2873 "movq _ActiveMask, %%mm7 \n\t"
2874 "movl _dif, %%ecx \n\t" /* ecx: x = offset to */
2875 "movq _LBCarryMask, %%mm5 \n\t" /* alignment boundary */
2876 /* preload "movl row, %%edi \n\t" // edi: Avg(x) */
2877 "movq _HBClearMask, %%mm4 \n\t"
2878 /* preload "movl prev_row, %%esi \n\t" // esi: Prior(x) */
2880 /* prime the pump: load the first Raw(x-bpp) data set */
2881 "movq -8(%%edi,%%ecx,), %%mm2 \n\t" /* load previous aligned 8 bytes */
2882 /* (correct pos. in loop below) */
2884 "movq (%%edi,%%ecx,), %%mm0 \n\t" /* load mm0 with Avg(x) */
2885 "movq %%mm5, %%mm3 \n\t"
2886 "psrlq _ShiftRem, %%mm2 \n\t" /* correct position Raw(x-bpp) */
2888 "movq (%%esi,%%ecx,), %%mm1 \n\t" /* load mm1 with Prior(x) */
2889 "movq %%mm7, %%mm6 \n\t"
2890 "pand %%mm1, %%mm3 \n\t" /* get lsb for each prev_row byte */
2891 "psrlq $1, %%mm1 \n\t" /* divide prev_row bytes by 2 */
2892 "pand %%mm4, %%mm1 \n\t" /* clear invalid bit 7 of each */
2894 "paddb %%mm1, %%mm0 \n\t" /* add (Prev_row/2) to Avg for each byte */
2896 /* add 1st active group (Raw(x-bpp)/2) to average with LBCarry */
2897 "movq %%mm3, %%mm1 \n\t" /* now use mm1 for getting LBCarrys */
2899 "pand %%mm2, %%mm1 \n\t" /* get LBCarrys for each byte */
2901 /* lsb's were == 1 (only valid for active group) */
2902 "psrlq $1, %%mm2 \n\t" /* divide raw bytes by 2 */
2903 "pand %%mm4, %%mm2 \n\t" /* clear invalid bit 7 of each */
2905 "paddb %%mm1, %%mm2 \n\t" /* add LBCarrys to (Raw(x-bpp)/2) */
2907 "pand %%mm6, %%mm2 \n\t" /* leave only Active Group 1 */
2908 /* bytes to add to Avg */
2909 "paddb %%mm2, %%mm0 \n\t" /* add (Raw/2) + LBCarrys to */
2910 /* Avg for each Active byte */
2912 /* add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry */
2913 "psllq _ShiftBpp, %%mm6 \n\t" /* shift the mm6 mask to cover the 2nd active group */
2915 "movq %%mm0, %%mm2 \n\t" /* mov updated Raws to mm2 */
2916 "psllq _ShiftBpp, %%mm2 \n\t" /* shift data to pos. correctly */
2917 "movq %%mm3, %%mm1 \n\t" /* now use mm1 for getting LBCarrys */
2919 "pand %%mm2, %%mm1 \n\t" /* get LBCarrys for each byte */
2921 /* lsb's were == 1 (only valid for active group) */
2922 "psrlq $1, %%mm2 \n\t" /* divide raw bytes by 2 */
2923 "pand %%mm4, %%mm2 \n\t" /* clear invalid bit 7 of each */
2925 "paddb %%mm1, %%mm2 \n\t" /* add LBCarrys to (Raw(x-bpp)/2) */
2927 "pand %%mm6, %%mm2 \n\t" /* leave only Active Group 2 */
2928 /* bytes to add to Avg */
2929 "paddb %%mm2, %%mm0 \n\t" /* add (Raw/2) + LBCarrys to */
2930 /* Avg for each Active byte */
2933 /* add 3rd active group (Raw(x-bpp)/2) to average with _LBCarry */
2934 "psllq _ShiftBpp, %%mm6 \n\t" /* shift mm6 mask to cover last active group */
2937 "movq %%mm0, %%mm2 \n\t" /* mov updated Raws to mm2 */
2938 "psllq _ShiftBpp, %%mm2 \n\t" /* shift data to pos. correctly */
2939 /* Data only needs to be shifted once here to */
2940 /* get the correct x-bpp offset. */
2941 "movq %%mm3, %%mm1 \n\t" /* now use mm1 for getting LBCarrys */
2943 "pand %%mm2, %%mm1 \n\t" /* get LBCarrys for each byte */
2945 /* lsb's were == 1 (only valid for active group) */
2946 "psrlq $1, %%mm2 \n\t" /* divide raw bytes by 2 */
2947 "pand %%mm4, %%mm2 \n\t" /* clear invalid bit 7 of each */
2949 "paddb %%mm1, %%mm2 \n\t" /* add LBCarrys to (Raw(x-bpp)/2) */
2951 "pand %%mm6, %%mm2 \n\t" /* leave only Active Group 3 */
2952 /* bytes to add to Avg */
2953 "addl $8, %%ecx \n\t"
2954 "paddb %%mm2, %%mm0 \n\t" /* add (Raw/2) + LBCarrys to */
2955 /* Avg for each Active byte */
2957 /* now ready to write back to memory */
2958 "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
2959 /* move updated Raw(x) to use as Raw(x-bpp) for next loop */
2960 "cmpl _MMXLength, %%ecx \n\t"
2961 "movq %%mm0, %%mm2 \n\t" /* mov updated Raw(x) to mm2 */
2964 : "=S" (dummy_value_S
), /* output regs (dummy) */
2965 "=D" (dummy_value_D
)
2967 : "0" (prev_row
), /* esi // input regs */
2970 : "%ecx" /* clobber list */
2971 #if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
2972 , "%mm0", "%mm1", "%mm2", "%mm3"
2973 , "%mm4", "%mm5", "%mm6", "%mm7"
2977 break; /* end 3 bpp */
2981 //case 7: /* who wrote this? PNG doesn't support 5 or 7 bytes/pixel */
2982 //case 5: /* GRR BOGUS */
/* 4 or 6 bytes per pixel: shifts are derived from bpp at run time */
2984 _ActiveMask
.use
= 0xffffffffffffffffLL
; /* use shift below to clear */
2985 /* appropriate inactive bytes */
2986 _ShiftBpp
.use
= bpp
<< 3;
2987 _ShiftRem
.use
= 64 - _ShiftBpp
.use
;
2989 __asm__
__volatile__ (
2990 "movq _HBClearMask, %%mm4 \n\t"
2992 /* re-init address pointers and offset */
2993 "movl _dif, %%ecx \n\t" /* ecx: x = offset to */
2994 /* alignment boundary */
2996 /* load _ActiveMask and clear all bytes except for 1st active group */
2997 "movq _ActiveMask, %%mm7 \n\t"
2998 /* preload "movl row, %%edi \n\t" // edi: Avg(x) */
2999 "psrlq _ShiftRem, %%mm7 \n\t"
3000 /* preload "movl prev_row, %%esi \n\t" // esi: Prior(x) */
3001 "movq %%mm7, %%mm6 \n\t"
3002 "movq _LBCarryMask, %%mm5 \n\t"
3003 "psllq _ShiftBpp, %%mm6 \n\t" /* create mask for 2nd active group */
3006 /* prime the pump: load the first Raw(x-bpp) data set */
3007 "movq -8(%%edi,%%ecx,), %%mm2 \n\t" /* load previous aligned 8 bytes */
3008 /* (we correct pos. in loop below) */
3010 "movq (%%edi,%%ecx,), %%mm0 \n\t"
3011 "psrlq _ShiftRem, %%mm2 \n\t" /* shift data to pos. correctly */
3012 "movq (%%esi,%%ecx,), %%mm1 \n\t"
3013 /* add (Prev_row/2) to average */
3014 "movq %%mm5, %%mm3 \n\t"
3015 "pand %%mm1, %%mm3 \n\t" /* get lsb for each prev_row byte */
3016 "psrlq $1, %%mm1 \n\t" /* divide prev_row bytes by 2 */
3017 "pand %%mm4, %%mm1 \n\t" /* clear invalid bit 7 of each */
3019 "paddb %%mm1, %%mm0 \n\t" /* add (Prev_row/2) to Avg for each byte */
3021 /* add 1st active group (Raw(x-bpp)/2) to average with _LBCarry */
3022 "movq %%mm3, %%mm1 \n\t" /* now use mm1 for getting LBCarrys */
3024 "pand %%mm2, %%mm1 \n\t" /* get LBCarrys for each byte */
3026 /* lsb's were == 1 (only valid for active group) */
3027 "psrlq $1, %%mm2 \n\t" /* divide raw bytes by 2 */
3028 "pand %%mm4, %%mm2 \n\t" /* clear invalid bit 7 of each */
3030 "paddb %%mm1, %%mm2 \n\t" /* add LBCarrys to (Raw(x-bpp)/2) */
3032 "pand %%mm7, %%mm2 \n\t" /* leave only Active Group 1 */
3033 /* bytes to add to Avg */
3034 "paddb %%mm2, %%mm0 \n\t" /* add (Raw/2) + LBCarrys to Avg */
3035 /* for each Active byte */
3037 /* add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry */
3038 "movq %%mm0, %%mm2 \n\t" /* mov updated Raws to mm2 */
3039 "psllq _ShiftBpp, %%mm2 \n\t" /* shift data to pos. correctly */
3040 "addl $8, %%ecx \n\t"
3041 "movq %%mm3, %%mm1 \n\t" /* now use mm1 for getting LBCarrys */
3043 "pand %%mm2, %%mm1 \n\t" /* get LBCarrys for each byte */
3045 /* lsb's were == 1 (only valid for active group) */
3046 "psrlq $1, %%mm2 \n\t" /* divide raw bytes by 2 */
3047 "pand %%mm4, %%mm2 \n\t" /* clear invalid bit 7 of each */
3049 "paddb %%mm1, %%mm2 \n\t" /* add LBCarrys to (Raw(x-bpp)/2) */
3051 "pand %%mm6, %%mm2 \n\t" /* leave only Active Group 2 */
3052 /* bytes to add to Avg */
3053 "paddb %%mm2, %%mm0 \n\t" /* add (Raw/2) + LBCarrys to */
3054 /* Avg for each Active byte */
3056 "cmpl _MMXLength, %%ecx \n\t"
3057 /* now ready to write back to memory */
3058 "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
3059 /* prep Raw(x-bpp) for next loop */
3060 "movq %%mm0, %%mm2 \n\t" /* mov updated Raws to mm2 */
3063 : "=S" (dummy_value_S
), /* output regs (dummy) */
3064 "=D" (dummy_value_D
)
3066 : "0" (prev_row
), /* esi // input regs */
3069 : "%ecx" /* clobber list */
3070 #if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
3071 , "%mm0", "%mm1", "%mm2", "%mm3"
3072 , "%mm4", "%mm5", "%mm6", "%mm7"
3076 break; /* end 4,6 bpp */
/* 2 bytes per pixel: four active groups of 2 bytes per quadword */
3080 _ActiveMask
.use
= 0x000000000000ffffLL
;
3081 _ShiftBpp
.use
= 16; /* == 2 * 8 */
3082 _ShiftRem
.use
= 48; /* == 64 - 16 */
3084 __asm__
__volatile__ (
3085 /* load _ActiveMask */
3086 "movq _ActiveMask, %%mm7 \n\t"
3087 /* re-init address pointers and offset */
3088 "movl _dif, %%ecx \n\t" /* ecx: x = offset to alignment boundary */
3090 "movq _LBCarryMask, %%mm5 \n\t"
3091 /* preload "movl row, %%edi \n\t" // edi: Avg(x) */
3092 "movq _HBClearMask, %%mm4 \n\t"
3093 /* preload "movl prev_row, %%esi \n\t" // esi: Prior(x) */
3095 /* prime the pump: load the first Raw(x-bpp) data set */
3096 "movq -8(%%edi,%%ecx,), %%mm2 \n\t" /* load previous aligned 8 bytes */
3097 /* (we correct pos. in loop below) */
3099 "movq (%%edi,%%ecx,), %%mm0 \n\t"
3100 "psrlq _ShiftRem, %%mm2 \n\t" /* shift data to pos. correctly */
3101 "movq (%%esi,%%ecx,), %%mm1 \n\t" /* (GRR BUGFIX: was psllq) */
3102 /* add (Prev_row/2) to average */
3103 "movq %%mm5, %%mm3 \n\t"
3104 "pand %%mm1, %%mm3 \n\t" /* get lsb for each prev_row byte */
3105 "psrlq $1, %%mm1 \n\t" /* divide prev_row bytes by 2 */
3106 "pand %%mm4, %%mm1 \n\t" /* clear invalid bit 7 of each */
3108 "movq %%mm7, %%mm6 \n\t"
3109 "paddb %%mm1, %%mm0 \n\t" /* add (Prev_row/2) to Avg for each byte */
3112 /* add 1st active group (Raw(x-bpp)/2) to average with _LBCarry */
3113 "movq %%mm3, %%mm1 \n\t" /* now use mm1 for getting LBCarrys */
3115 "pand %%mm2, %%mm1 \n\t" /* get LBCarrys for each byte */
3117 /* lsb's were == 1 (only valid */
3118 /* for active group) */
3119 "psrlq $1, %%mm2 \n\t" /* divide raw bytes by 2 */
3120 "pand %%mm4, %%mm2 \n\t" /* clear invalid bit 7 of each */
3122 "paddb %%mm1, %%mm2 \n\t" /* add LBCarrys to (Raw(x-bpp)/2) */
3124 "pand %%mm6, %%mm2 \n\t" /* leave only Active Group 1 */
3125 /* bytes to add to Avg */
3126 "paddb %%mm2, %%mm0 \n\t" /* add (Raw/2) + LBCarrys to Avg */
3127 /* for each Active byte */
3129 /* add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry */
3130 "psllq _ShiftBpp, %%mm6 \n\t" /* shift the mm6 mask to cover the 2nd active group */
3132 "movq %%mm0, %%mm2 \n\t" /* mov updated Raws to mm2 */
3133 "psllq _ShiftBpp, %%mm2 \n\t" /* shift data to pos. correctly */
3134 "movq %%mm3, %%mm1 \n\t" /* now use mm1 for getting LBCarrys */
3136 "pand %%mm2, %%mm1 \n\t" /* get LBCarrys for each byte */
3138 /* lsb's were == 1 (only valid */
3139 /* for active group) */
3140 "psrlq $1, %%mm2 \n\t" /* divide raw bytes by 2 */
3141 "pand %%mm4, %%mm2 \n\t" /* clear invalid bit 7 of each */
3143 "paddb %%mm1, %%mm2 \n\t" /* add LBCarrys to (Raw(x-bpp)/2) */
3145 "pand %%mm6, %%mm2 \n\t" /* leave only Active Group 2 */
3146 /* bytes to add to Avg */
3147 "paddb %%mm2, %%mm0 \n\t" /* add (Raw/2) + LBCarrys to */
3148 /* Avg for each Active byte */
3150 /* add 3rd active group (Raw(x-bpp)/2) to average with _LBCarry */
3151 "psllq _ShiftBpp, %%mm6 \n\t" /* shift the mm6 mask to cover the 3rd active group */
3153 "movq %%mm0, %%mm2 \n\t" /* mov updated Raws to mm2 */
3154 "psllq _ShiftBpp, %%mm2 \n\t" /* shift data to pos. correctly */
3155 "movq %%mm3, %%mm1 \n\t" /* now use mm1 for getting LBCarrys */
3157 "pand %%mm2, %%mm1 \n\t" /* get LBCarrys for each byte */
3158 /* where both lsb's were == 1 */
3159 /* (only valid for active group) */
3160 "psrlq $1, %%mm2 \n\t" /* divide raw bytes by 2 */
3161 "pand %%mm4, %%mm2 \n\t" /* clear invalid bit 7 of each */
3163 "paddb %%mm1, %%mm2 \n\t" /* add LBCarrys to (Raw(x-bpp)/2) */
3165 "pand %%mm6, %%mm2 \n\t" /* leave only Active Group 3 */
3166 /* bytes to add to Avg */
3167 "paddb %%mm2, %%mm0 \n\t" /* add (Raw/2) + LBCarrys to */
3168 /* Avg for each Active byte */
3170 /* add 4th active group (Raw(x-bpp)/2) to average with _LBCarry */
3171 "psllq _ShiftBpp, %%mm6 \n\t" /* shift the mm6 mask to cover the 4th active group */
3173 "movq %%mm0, %%mm2 \n\t" /* mov updated Raws to mm2 */
3174 "psllq _ShiftBpp, %%mm2 \n\t" /* shift data to pos. correctly */
3175 "addl $8, %%ecx \n\t"
3176 "movq %%mm3, %%mm1 \n\t" /* now use mm1 for getting LBCarrys */
3178 "pand %%mm2, %%mm1 \n\t" /* get LBCarrys for each byte */
3180 /* lsb's were == 1 (only valid */
3181 /* for active group) */
3182 "psrlq $1, %%mm2 \n\t" /* divide raw bytes by 2 */
3183 "pand %%mm4, %%mm2 \n\t" /* clear invalid bit 7 of each */
3185 "paddb %%mm1, %%mm2 \n\t" /* add LBCarrys to (Raw(x-bpp)/2) */
3187 "pand %%mm6, %%mm2 \n\t" /* leave only Active Group 4 */
3188 /* bytes to add to Avg */
3189 "paddb %%mm2, %%mm0 \n\t" /* add (Raw/2) + LBCarrys to */
3190 /* Avg for each Active byte */
3192 "cmpl _MMXLength, %%ecx \n\t"
3193 /* now ready to write back to memory */
3194 "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
3195 /* prep Raw(x-bpp) for next loop */
3196 "movq %%mm0, %%mm2 \n\t" /* mov updated Raws to mm2 */
3199 : "=S" (dummy_value_S
), /* output regs (dummy) */
3200 "=D" (dummy_value_D
)
3202 : "0" (prev_row
), /* esi // input regs */
3205 : "%ecx" /* clobber list */
3206 #if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
3207 , "%mm0", "%mm1", "%mm2", "%mm3"
3208 , "%mm4", "%mm5", "%mm6", "%mm7"
3212 break; /* end 2 bpp */
/* 1 byte per pixel: scalar code only, from _dif up to _FullLength; */
/* this case returns instead of falling into the shared clean-up asm */
3216 __asm__
__volatile__ (
3217 /* re-init address pointers and offset */
3219 "pushl %%ebx \n\t" /* save Global Offset Table index */
3221 "movl _dif, %%ebx \n\t" /* ebx: x = offset to alignment boundary */
3223 /* preload "movl row, %%edi \n\t" // edi: Avg(x) */
3224 "cmpl _FullLength, %%ebx \n\t" /* test if offset at end of array */
3226 /* do Avg decode for remaining bytes */
3227 /* preload "movl prev_row, %%esi \n\t" // esi: Prior(x) */
3228 "movl %%edi, %%edx \n\t"
3229 /* preload "subl bpp, %%edx \n\t" // (bpp is preloaded into ecx) */
3230 "subl %%ecx, %%edx \n\t" /* edx: Raw(x-bpp) */
3231 "xorl %%ecx, %%ecx \n\t" /* zero ecx before using cl & cx */
3234 /* Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2) */
3235 "xorl %%eax, %%eax \n\t"
3236 "movb (%%esi,%%ebx,), %%cl \n\t" /* load cl with Prior(x) */
3237 "movb (%%edx,%%ebx,), %%al \n\t" /* load al with Raw(x-bpp) */
3238 "addw %%cx, %%ax \n\t"
3240 "shrw %%ax \n\t" /* divide by 2 */
3241 "addb -1(%%edi,%%ebx,), %%al \n\t" /* add Avg(x); -1 to offset inc ebx */
3243 "cmpl _FullLength, %%ebx \n\t" /* check if at end of array */
3244 "movb %%al, -1(%%edi,%%ebx,) \n\t" /* write back Raw(x); */
3245 /* mov does not affect flags; -1 to offset inc ebx */
3250 "popl %%ebx \n\t" /* Global Offset Table index */
3253 : "=c" (dummy_value_c
), /* output regs (dummy) */
3254 "=S" (dummy_value_S
),
3255 "=D" (dummy_value_D
)
3257 : "0" (bpp
), /* ecx // input regs */
3258 "1" (prev_row
), /* esi */
3261 : "%eax", "%edx" /* clobber list */
3267 return; /* end 1 bpp */
/* 8 bytes per pixel: Raw(x-bpp) is exactly the previous quadword, so no */
/* active-group masking or shifting is needed */
3271 __asm__
__volatile__ (
3272 /* re-init address pointers and offset */
3273 "movl _dif, %%ecx \n\t" /* ecx: x == offset to alignment */
3274 "movq _LBCarryMask, %%mm5 \n\t" /* boundary */
3275 /* preload "movl row, %%edi \n\t" // edi: Avg(x) */
3276 "movq _HBClearMask, %%mm4 \n\t"
3277 /* preload "movl prev_row, %%esi \n\t" // esi: Prior(x) */
3279 /* prime the pump: load the first Raw(x-bpp) data set */
3280 "movq -8(%%edi,%%ecx,), %%mm2 \n\t" /* load previous aligned 8 bytes */
3281 /* (NO NEED to correct pos. in loop below) */
3284 "movq (%%edi,%%ecx,), %%mm0 \n\t"
3285 "movq %%mm5, %%mm3 \n\t"
3286 "movq (%%esi,%%ecx,), %%mm1 \n\t"
3287 "addl $8, %%ecx \n\t"
3288 "pand %%mm1, %%mm3 \n\t" /* get lsb for each prev_row byte */
3289 "psrlq $1, %%mm1 \n\t" /* divide prev_row bytes by 2 */
3290 "pand %%mm2, %%mm3 \n\t" /* get LBCarrys for each byte */
3291 /* where both lsb's were == 1 */
3292 "psrlq $1, %%mm2 \n\t" /* divide raw bytes by 2 */
3293 "pand %%mm4, %%mm1 \n\t" /* clear invalid bit 7, each byte */
3294 "paddb %%mm3, %%mm0 \n\t" /* add LBCarrys to Avg, each byte */
3295 "pand %%mm4, %%mm2 \n\t" /* clear invalid bit 7, each byte */
3296 "paddb %%mm1, %%mm0 \n\t" /* add (Prev_row/2) to Avg, each byte */
3297 "paddb %%mm2, %%mm0 \n\t" /* add (Raw/2) to Avg for each byte */
3298 "cmpl _MMXLength, %%ecx \n\t"
3299 "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
3300 "movq %%mm0, %%mm2 \n\t" /* reuse as Raw(x-bpp) */
3303 : "=S" (dummy_value_S
), /* output regs (dummy) */
3304 "=D" (dummy_value_D
)
3306 : "0" (prev_row
), /* esi // input regs */
3309 : "%ecx" /* clobber list */
3310 #if 0 /* %mm0, ..., %mm5 not supported by gcc 2.7.2.3 or egcs 1.1 */
3311 , "%mm0", "%mm1", "%mm2"
3312 , "%mm3", "%mm4", "%mm5"
3316 break; /* end 8 bpp */
3318 default: /* bpp greater than 8 (!= 1,2,3,4,[5],6,[7],8) */
3322 /* GRR: PRINT ERROR HERE: SHOULD NEVER BE REACHED */
3324 "Internal logic error in pnggccrd (png_read_filter_row_mmx_avg())\n");
/* NOTE(review): the asm below still carries FIXASM/CHECKASM markers and */
/* is followed by an endif tagged NEVER REACHED; presumably it is */
/* compiled out - confirm the matching conditional in the full file */
3328 __asm__
__volatile__ (
3329 "movq _LBCarryMask, %%mm5 \n\t"
3330 /* re-init address pointers and offset */
3331 "movl _dif, %%ebx \n\t" /* ebx: x = offset to */
3332 /* alignment boundary */
3333 "movl row, %%edi \n\t" /* edi: Avg(x) */
3334 "movq _HBClearMask, %%mm4 \n\t"
3335 "movl %%edi, %%edx \n\t"
3336 "movl prev_row, %%esi \n\t" /* esi: Prior(x) */
3337 "subl bpp, %%edx \n\t" /* edx: Raw(x-bpp) */
3339 "movq (%%edi,%%ebx,), %%mm0 \n\t"
3340 "movq %%mm5, %%mm3 \n\t"
3341 "movq (%%esi,%%ebx,), %%mm1 \n\t"
3342 "pand %%mm1, %%mm3 \n\t" /* get lsb for each prev_row byte */
3343 "movq (%%edx,%%ebx,), %%mm2 \n\t"
3344 "psrlq $1, %%mm1 \n\t" /* divide prev_row bytes by 2 */
3345 "pand %%mm2, %%mm3 \n\t" /* get LBCarrys for each byte */
3346 /* where both lsb's were == 1 */
3347 "psrlq $1, %%mm2 \n\t" /* divide raw bytes by 2 */
3348 "pand %%mm4, %%mm1 \n\t" /* clear invalid bit 7 of each */
3350 "paddb %%mm3, %%mm0 \n\t" /* add LBCarrys to Avg for each byte */
3352 "pand %%mm4, %%mm2 \n\t" /* clear invalid bit 7 of each */
3354 "paddb %%mm1, %%mm0 \n\t" /* add (Prev_row/2) to Avg for each byte */
3356 "addl $8, %%ebx \n\t"
3357 "paddb %%mm2, %%mm0 \n\t" /* add (Raw/2) to Avg for each byte */
3359 "cmpl _MMXLength, %%ebx \n\t"
3360 "movq %%mm0, -8(%%edi,%%ebx,) \n\t"
3363 : /* FIXASM: output regs/vars go here, e.g.: "=m" (memory_var) */
3365 : /* FIXASM: input regs, e.g.: "c" (count), "S" (src), "D" (dest) */
3367 : "%ebx", "%edx", "%edi", "%esi" /* CHECKASM: clobber list */
3369 #endif /* 0 - NEVER REACHED */
3373 } /* end switch (bpp) */
3375 __asm__
__volatile__ (
3376 /* MMX acceleration complete; now do clean-up */
3377 /* check if any remaining bytes left to decode */
3379 "pushl %%ebx \n\t" /* save index to Global Offset Table */
3381 "movl _MMXLength, %%ebx \n\t" /* ebx: x == offset bytes after MMX */
3382 /* pre "movl row, %%edi \n\t" */ /* edi: Avg(x) */
3383 "cmpl _FullLength, %%ebx \n\t" /* test if offset at end of array */
3386 /* do Avg decode for remaining bytes */
3387 /*pre "movl prev_row, %%esi \n\t" */ /* esi: Prior(x) */
3388 "movl %%edi, %%edx \n\t"
3389 /*pre "subl bpp, %%edx \n\t" */ /* (bpp is preloaded into ecx) */
3390 "subl %%ecx, %%edx \n\t" /* edx: Raw(x-bpp) */
3391 "xorl %%ecx, %%ecx \n\t" /* zero ecx before using cl & cx below */
3394 /* Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2) */
3395 "xorl %%eax, %%eax \n\t"
3396 "movb (%%esi,%%ebx,), %%cl \n\t" /* load cl with Prior(x) */
3397 "movb (%%edx,%%ebx,), %%al \n\t" /* load al with Raw(x-bpp) */
3398 "addw %%cx, %%ax \n\t"
3400 "shrw %%ax \n\t" /* divide by 2 */
3401 "addb -1(%%edi,%%ebx,), %%al \n\t" /* add Avg(x); -1 to offset inc ebx */
3402 "cmpl _FullLength, %%ebx \n\t" /* check if at end of array */
3403 "movb %%al, -1(%%edi,%%ebx,) \n\t" /* write back Raw(x) [mov does not */
3404 "jb avg_lp2 \n\t" /* affect flags; -1 to offset inc ebx] */
3407 "EMMS \n\t" /* end MMX; prep for poss. FP instrs. */
3409 "popl %%ebx \n\t" /* restore index to Global Offset Table */
3412 : "=c" (dummy_value_c
), /* output regs (dummy) */
3413 "=S" (dummy_value_S
),
3414 "=D" (dummy_value_D
)
3416 : "0" (bpp
), /* ecx // input regs */
3417 "1" (prev_row
), /* esi */
3420 : "%eax", "%edx" /* clobber list */
3426 } /* end png_read_filter_row_mmx_avg() */
3431 #ifdef PNG_THREAD_UNSAFE_OK
3432 /*===========================================================================*/
3434 /* P N G _ R E A D _ F I L T E R _ R O W _ M M X _ P A E T H */
3436 /*===========================================================================*/
3438 /* Optimized code for PNG Paeth filter decoder */
3440 static void /* PRIVATE */
3441 png_read_filter_row_mmx_paeth(png_row_infop row_info
, png_bytep row
,
3445 int dummy_value_c
; /* fix 'forbidden register 2 (cx) was spilled' error */
3449 bpp
= (row_info
->pixel_depth
+ 7) >> 3; /* Get # bytes per pixel */
3450 _FullLength
= row_info
->rowbytes
; /* # of bytes to filter */
3452 __asm__
__volatile__ (
3454 "pushl %%ebx \n\t" /* save index to Global Offset Table */
3456 "xorl %%ebx, %%ebx \n\t" /* ebx: x offset */
3457 /*pre "movl row, %%edi \n\t" */
3458 "xorl %%edx, %%edx \n\t" /* edx: x-bpp offset */
3459 /*pre "movl prev_row, %%esi \n\t" */
3460 "xorl %%eax, %%eax \n\t"
3462 /* Compute the Raw value for the first bpp bytes */
3463 /* Note: the formula works out to be always */
3464 /* Paeth(x) = Raw(x) + Prior(x) where x < bpp */
3466 "movb (%%edi,%%ebx,), %%al \n\t"
3467 "addb (%%esi,%%ebx,), %%al \n\t"
3469 /*pre "cmpl bpp, %%ebx \n\t" (bpp is preloaded into ecx) */
3470 "cmpl %%ecx, %%ebx \n\t"
3471 "movb %%al, -1(%%edi,%%ebx,) \n\t"
3473 /* get # of bytes to alignment */
3474 "movl %%edi, _dif \n\t" /* take start of row */
3475 "addl %%ebx, _dif \n\t" /* add bpp */
3476 "xorl %%ecx, %%ecx \n\t"
3477 "addl $0xf, _dif \n\t" /* add 7 + 8 to incr past alignment */
3479 "andl $0xfffffff8, _dif \n\t" /* mask to alignment boundary */
3480 "subl %%edi, _dif \n\t" /* subtract from start ==> value ebx */
3486 "xorl %%eax, %%eax \n\t"
3487 /* pav = p - a = (a + b - c) - a = b - c */
3488 "movb (%%esi,%%ebx,), %%al \n\t" /* load Prior(x) into al */
3489 "movb (%%esi,%%edx,), %%cl \n\t" /* load Prior(x-bpp) into cl */
3490 "subl %%ecx, %%eax \n\t" /* subtract Prior(x-bpp) */
3491 "movl %%eax, _patemp \n\t" /* Save pav for later use */
3492 "xorl %%eax, %%eax \n\t"
3493 /* pbv = p - b = (a + b - c) - b = a - c */
3494 "movb (%%edi,%%edx,), %%al \n\t" /* load Raw(x-bpp) into al */
3495 "subl %%ecx, %%eax \n\t" /* subtract Prior(x-bpp) */
3496 "movl %%eax, %%ecx \n\t"
3497 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
3498 "addl _patemp, %%eax \n\t" /* pcv = pav + pbv */
3500 "testl $0x80000000, %%eax \n\t"
3502 "negl %%eax \n\t" /* reverse sign of neg values */
3505 "movl %%eax, _pctemp \n\t" /* save pc for later use */
3507 "testl $0x80000000, %%ecx \n\t"
3509 "negl %%ecx \n\t" /* reverse sign of neg values */
3512 "movl %%ecx, _pbtemp \n\t" /* save pb for later use */
3514 "movl _patemp, %%eax \n\t"
3515 "testl $0x80000000, %%eax \n\t"
3517 "negl %%eax \n\t" /* reverse sign of neg values */
3520 "movl %%eax, _patemp \n\t" /* save pa for later use */
3521 /* test if pa <= pb */
3522 "cmpl %%ecx, %%eax \n\t"
3523 "jna paeth_abb \n\t"
3524 /* pa > pb; now test if pb <= pc */
3525 "cmpl _pctemp, %%ecx \n\t"
3526 "jna paeth_bbc \n\t"
3527 /* pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp) */
3528 "movb (%%esi,%%edx,), %%cl \n\t" /* load Prior(x-bpp) into cl */
3529 "jmp paeth_paeth \n\t"
3532 /* pb <= pc; Raw(x) = Paeth(x) + Prior(x) */
3533 "movb (%%esi,%%ebx,), %%cl \n\t" /* load Prior(x) into cl */
3534 "jmp paeth_paeth \n\t"
3537 /* pa <= pb; now test if pa <= pc */
3538 "cmpl _pctemp, %%eax \n\t"
3539 "jna paeth_abc \n\t"
3540 /* pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp) */
3541 "movb (%%esi,%%edx,), %%cl \n\t" /* load Prior(x-bpp) into cl */
3542 "jmp paeth_paeth \n\t"
3545 /* pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp) */
3546 "movb (%%edi,%%edx,), %%cl \n\t" /* load Raw(x-bpp) into cl */
3551 /* Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256 */
3552 "addb %%cl, -1(%%edi,%%ebx,) \n\t"
3553 "cmpl _dif, %%ebx \n\t"
3557 "movl _FullLength, %%ecx \n\t"
3558 "movl %%ecx, %%eax \n\t"
3559 "subl %%ebx, %%eax \n\t" /* subtract alignment fix */
3560 "andl $0x00000007, %%eax \n\t" /* calc bytes over mult of 8 */
3561 "subl %%eax, %%ecx \n\t" /* drop over bytes from original length */
3562 "movl %%ecx, _MMXLength \n\t"
3564 "popl %%ebx \n\t" /* restore index to Global Offset Table */
3567 : "=c" (dummy_value_c
), /* output regs (dummy) */
3568 "=S" (dummy_value_S
),
3569 "=D" (dummy_value_D
)
3571 : "0" (bpp
), /* ecx // input regs */
3572 "1" (prev_row
), /* esi */
3575 : "%eax", "%edx" /* clobber list */
3581 /* now do the math for the rest of the row */
3586 _ActiveMask
.use
= 0x0000000000ffffffLL
;
3587 _ActiveMaskEnd
.use
= 0xffff000000000000LL
;
3588 _ShiftBpp
.use
= 24; /* == bpp(3) * 8 */
3589 _ShiftRem
.use
= 40; /* == 64 - 24 */
3591 __asm__
__volatile__ (
3592 "movl _dif, %%ecx \n\t"
3593 /* preload "movl row, %%edi \n\t" */
3594 /* preload "movl prev_row, %%esi \n\t" */
3595 "pxor %%mm0, %%mm0 \n\t"
3596 /* prime the pump: load the first Raw(x-bpp) data set */
3597 "movq -8(%%edi,%%ecx,), %%mm1 \n\t"
3599 "psrlq _ShiftRem, %%mm1 \n\t" /* shift last 3 bytes to 1st */
3601 "movq (%%esi,%%ecx,), %%mm2 \n\t" /* load b=Prior(x) */
3602 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack High bytes of a */
3603 "movq -8(%%esi,%%ecx,), %%mm3 \n\t" /* prep c=Prior(x-bpp) bytes */
3604 "punpcklbw %%mm0, %%mm2 \n\t" /* unpack High bytes of b */
3605 "psrlq _ShiftRem, %%mm3 \n\t" /* shift last 3 bytes to 1st */
3607 /* pav = p - a = (a + b - c) - a = b - c */
3608 "movq %%mm2, %%mm4 \n\t"
3609 "punpcklbw %%mm0, %%mm3 \n\t" /* unpack High bytes of c */
3610 /* pbv = p - b = (a + b - c) - b = a - c */
3611 "movq %%mm1, %%mm5 \n\t"
3612 "psubw %%mm3, %%mm4 \n\t"
3613 "pxor %%mm7, %%mm7 \n\t"
3614 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
3615 "movq %%mm4, %%mm6 \n\t"
3616 "psubw %%mm3, %%mm5 \n\t"
3618 /* pa = abs(p-a) = abs(pav) */
3619 /* pb = abs(p-b) = abs(pbv) */
3620 /* pc = abs(p-c) = abs(pcv) */
3621 "pcmpgtw %%mm4, %%mm0 \n\t" /* create mask pav bytes < 0 */
3622 "paddw %%mm5, %%mm6 \n\t"
3623 "pand %%mm4, %%mm0 \n\t" /* only pav bytes < 0 in mm7 */
3624 "pcmpgtw %%mm5, %%mm7 \n\t" /* create mask pbv bytes < 0 */
3625 "psubw %%mm0, %%mm4 \n\t"
3626 "pand %%mm5, %%mm7 \n\t" /* only pbv bytes < 0 in mm0 */
3627 "psubw %%mm0, %%mm4 \n\t"
3628 "psubw %%mm7, %%mm5 \n\t"
3629 "pxor %%mm0, %%mm0 \n\t"
3630 "pcmpgtw %%mm6, %%mm0 \n\t" /* create mask pcv bytes < 0 */
3631 "pand %%mm6, %%mm0 \n\t" /* only pav bytes < 0 in mm7 */
3632 "psubw %%mm7, %%mm5 \n\t"
3633 "psubw %%mm0, %%mm6 \n\t"
3635 "movq %%mm4, %%mm7 \n\t"
3636 "psubw %%mm0, %%mm6 \n\t"
3637 "pcmpgtw %%mm5, %%mm7 \n\t" /* pa > pb? */
3638 "movq %%mm7, %%mm0 \n\t"
3639 /* use mm7 mask to merge pa & pb */
3640 "pand %%mm7, %%mm5 \n\t"
3641 /* use mm0 mask copy to merge a & b */
3642 "pand %%mm0, %%mm2 \n\t"
3643 "pandn %%mm4, %%mm7 \n\t"
3644 "pandn %%mm1, %%mm0 \n\t"
3645 "paddw %%mm5, %%mm7 \n\t"
3646 "paddw %%mm2, %%mm0 \n\t"
3647 /* test ((pa <= pb)? pa:pb) <= pc */
3648 "pcmpgtw %%mm6, %%mm7 \n\t" /* pab > pc? */
3649 "pxor %%mm1, %%mm1 \n\t"
3650 "pand %%mm7, %%mm3 \n\t"
3651 "pandn %%mm0, %%mm7 \n\t"
3652 "paddw %%mm3, %%mm7 \n\t"
3653 "pxor %%mm0, %%mm0 \n\t"
3654 "packuswb %%mm1, %%mm7 \n\t"
3655 "movq (%%esi,%%ecx,), %%mm3 \n\t" /* load c=Prior(x-bpp) */
3656 "pand _ActiveMask, %%mm7 \n\t"
3657 "movq %%mm3, %%mm2 \n\t" /* load b=Prior(x) step 1 */
3658 "paddb (%%edi,%%ecx,), %%mm7 \n\t" /* add Paeth predictor with Raw(x) */
3659 "punpcklbw %%mm0, %%mm3 \n\t" /* unpack High bytes of c */
3660 "movq %%mm7, (%%edi,%%ecx,) \n\t" /* write back updated value */
3661 "movq %%mm7, %%mm1 \n\t" /* now mm1 will be used as */
3663 /* now do Paeth for 2nd set of bytes (3-5) */
3664 "psrlq _ShiftBpp, %%mm2 \n\t" /* load b=Prior(x) step 2 */
3665 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack High bytes of a */
3666 "pxor %%mm7, %%mm7 \n\t"
3667 "punpcklbw %%mm0, %%mm2 \n\t" /* unpack High bytes of b */
3668 /* pbv = p - b = (a + b - c) - b = a - c */
3669 "movq %%mm1, %%mm5 \n\t"
3670 /* pav = p - a = (a + b - c) - a = b - c */
3671 "movq %%mm2, %%mm4 \n\t"
3672 "psubw %%mm3, %%mm5 \n\t"
3673 "psubw %%mm3, %%mm4 \n\t"
3674 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = */
3675 /* pav + pbv = pbv + pav */
3676 "movq %%mm5, %%mm6 \n\t"
3677 "paddw %%mm4, %%mm6 \n\t"
3679 /* pa = abs(p-a) = abs(pav) */
3680 /* pb = abs(p-b) = abs(pbv) */
3681 /* pc = abs(p-c) = abs(pcv) */
3682 "pcmpgtw %%mm5, %%mm0 \n\t" /* create mask pbv bytes < 0 */
3683 "pcmpgtw %%mm4, %%mm7 \n\t" /* create mask pav bytes < 0 */
3684 "pand %%mm5, %%mm0 \n\t" /* only pbv bytes < 0 in mm0 */
3685 "pand %%mm4, %%mm7 \n\t" /* only pav bytes < 0 in mm7 */
3686 "psubw %%mm0, %%mm5 \n\t"
3687 "psubw %%mm7, %%mm4 \n\t"
3688 "psubw %%mm0, %%mm5 \n\t"
3689 "psubw %%mm7, %%mm4 \n\t"
3690 "pxor %%mm0, %%mm0 \n\t"
3691 "pcmpgtw %%mm6, %%mm0 \n\t" /* create mask pcv bytes < 0 */
3692 "pand %%mm6, %%mm0 \n\t" /* only pav bytes < 0 in mm7 */
3693 "psubw %%mm0, %%mm6 \n\t"
3695 "movq %%mm4, %%mm7 \n\t"
3696 "psubw %%mm0, %%mm6 \n\t"
3697 "pcmpgtw %%mm5, %%mm7 \n\t" /* pa > pb? */
3698 "movq %%mm7, %%mm0 \n\t"
3699 /* use mm7 mask to merge pa & pb */
3700 "pand %%mm7, %%mm5 \n\t"
3701 /* use mm0 mask copy to merge a & b */
3702 "pand %%mm0, %%mm2 \n\t"
3703 "pandn %%mm4, %%mm7 \n\t"
3704 "pandn %%mm1, %%mm0 \n\t"
3705 "paddw %%mm5, %%mm7 \n\t"
3706 "paddw %%mm2, %%mm0 \n\t"
3707 /* test ((pa <= pb)? pa:pb) <= pc */
3708 "pcmpgtw %%mm6, %%mm7 \n\t" /* pab > pc? */
3709 "movq (%%esi,%%ecx,), %%mm2 \n\t" /* load b=Prior(x) */
3710 "pand %%mm7, %%mm3 \n\t"
3711 "pandn %%mm0, %%mm7 \n\t"
3712 "pxor %%mm1, %%mm1 \n\t"
3713 "paddw %%mm3, %%mm7 \n\t"
3714 "pxor %%mm0, %%mm0 \n\t"
3715 "packuswb %%mm1, %%mm7 \n\t"
3716 "movq %%mm2, %%mm3 \n\t" /* load c=Prior(x-bpp) step 1 */
3717 "pand _ActiveMask, %%mm7 \n\t"
3718 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack High bytes of b */
3719 "psllq _ShiftBpp, %%mm7 \n\t" /* shift bytes to 2nd group of */
3721 /* pav = p - a = (a + b - c) - a = b - c */
3722 "movq %%mm2, %%mm4 \n\t"
3723 "paddb (%%edi,%%ecx,), %%mm7 \n\t" /* add Paeth predictor with Raw(x) */
3724 "psllq _ShiftBpp, %%mm3 \n\t" /* load c=Prior(x-bpp) step 2 */
3725 "movq %%mm7, (%%edi,%%ecx,) \n\t" /* write back updated value */
3726 "movq %%mm7, %%mm1 \n\t"
3727 "punpckhbw %%mm0, %%mm3 \n\t" /* unpack High bytes of c */
3728 "psllq _ShiftBpp, %%mm1 \n\t" /* shift bytes */
3729 /* now mm1 will be used as Raw(x-bpp) */
3730 /* now do Paeth for 3rd, and final, set of bytes (6-7) */
3731 "pxor %%mm7, %%mm7 \n\t"
3732 "punpckhbw %%mm0, %%mm1 \n\t" /* unpack High bytes of a */
3733 "psubw %%mm3, %%mm4 \n\t"
3734 /* pbv = p - b = (a + b - c) - b = a - c */
3735 "movq %%mm1, %%mm5 \n\t"
3736 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
3737 "movq %%mm4, %%mm6 \n\t"
3738 "psubw %%mm3, %%mm5 \n\t"
3739 "pxor %%mm0, %%mm0 \n\t"
3740 "paddw %%mm5, %%mm6 \n\t"
3742 /* pa = abs(p-a) = abs(pav) */
3743 /* pb = abs(p-b) = abs(pbv) */
3744 /* pc = abs(p-c) = abs(pcv) */
3745 "pcmpgtw %%mm4, %%mm0 \n\t" /* create mask pav bytes < 0 */
3746 "pcmpgtw %%mm5, %%mm7 \n\t" /* create mask pbv bytes < 0 */
3747 "pand %%mm4, %%mm0 \n\t" /* only pav bytes < 0 in mm7 */
3748 "pand %%mm5, %%mm7 \n\t" /* only pbv bytes < 0 in mm0 */
3749 "psubw %%mm0, %%mm4 \n\t"
3750 "psubw %%mm7, %%mm5 \n\t"
3751 "psubw %%mm0, %%mm4 \n\t"
3752 "psubw %%mm7, %%mm5 \n\t"
3753 "pxor %%mm0, %%mm0 \n\t"
3754 "pcmpgtw %%mm6, %%mm0 \n\t" /* create mask pcv bytes < 0 */
3755 "pand %%mm6, %%mm0 \n\t" /* only pav bytes < 0 in mm7 */
3756 "psubw %%mm0, %%mm6 \n\t"
3758 "movq %%mm4, %%mm7 \n\t"
3759 "psubw %%mm0, %%mm6 \n\t"
3760 "pcmpgtw %%mm5, %%mm7 \n\t" /* pa > pb? */
3761 "movq %%mm7, %%mm0 \n\t"
3762 /* use mm0 mask copy to merge a & b */
3763 "pand %%mm0, %%mm2 \n\t"
3764 /* use mm7 mask to merge pa & pb */
3765 "pand %%mm7, %%mm5 \n\t"
3766 "pandn %%mm1, %%mm0 \n\t"
3767 "pandn %%mm4, %%mm7 \n\t"
3768 "paddw %%mm2, %%mm0 \n\t"
3769 "paddw %%mm5, %%mm7 \n\t"
3770 /* test ((pa <= pb)? pa:pb) <= pc */
3771 "pcmpgtw %%mm6, %%mm7 \n\t" /* pab > pc? */
3772 "pand %%mm7, %%mm3 \n\t"
3773 "pandn %%mm0, %%mm7 \n\t"
3774 "paddw %%mm3, %%mm7 \n\t"
3775 "pxor %%mm1, %%mm1 \n\t"
3776 "packuswb %%mm7, %%mm1 \n\t"
3777 /* step ecx to next set of 8 bytes and repeat loop til done */
3778 "addl $8, %%ecx \n\t"
3779 "pand _ActiveMaskEnd, %%mm1 \n\t"
3780 "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" /* add Paeth predictor with */
3783 "cmpl _MMXLength, %%ecx \n\t"
3784 "pxor %%mm0, %%mm0 \n\t" /* pxor does not affect flags */
3785 "movq %%mm1, -8(%%edi,%%ecx,) \n\t" /* write back updated value */
3786 /* mm1 will be used as Raw(x-bpp) next loop */
3787 /* mm3 ready to be used as Prior(x-bpp) next loop */
3790 : "=S" (dummy_value_S
), /* output regs (dummy) */
3791 "=D" (dummy_value_D
)
3793 : "0" (prev_row
), /* esi // input regs */
3796 : "%ecx" /* clobber list */
3797 #if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
3798 , "%mm0", "%mm1", "%mm2", "%mm3"
3799 , "%mm4", "%mm5", "%mm6", "%mm7"
3803 break; /* end 3 bpp */
3806 //case 7: /* GRR BOGUS */
3807 //case 5: /* GRR BOGUS */
3809 _ActiveMask
.use
= 0x00000000ffffffffLL
;
3810 _ActiveMask2
.use
= 0xffffffff00000000LL
;
3811 _ShiftBpp
.use
= bpp
<< 3; /* == bpp * 8 */
3812 _ShiftRem
.use
= 64 - _ShiftBpp
.use
;
3814 __asm__
__volatile__ (
3815 "movl _dif, %%ecx \n\t"
3816 /* preload "movl row, %%edi \n\t" */
3817 /* preload "movl prev_row, %%esi \n\t" */
3818 /* prime the pump: load the first Raw(x-bpp) data set */
3819 "movq -8(%%edi,%%ecx,), %%mm1 \n\t"
3820 "pxor %%mm0, %%mm0 \n\t"
3823 /* must shift to position Raw(x-bpp) data */
3824 "psrlq _ShiftRem, %%mm1 \n\t"
3825 /* do first set of 4 bytes */
3826 "movq -8(%%esi,%%ecx,), %%mm3 \n\t" /* read c=Prior(x-bpp) bytes */
3827 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack Low bytes of a */
3828 "movq (%%esi,%%ecx,), %%mm2 \n\t" /* load b=Prior(x) */
3829 "punpcklbw %%mm0, %%mm2 \n\t" /* unpack Low bytes of b */
3830 /* must shift to position Prior(x-bpp) data */
3831 "psrlq _ShiftRem, %%mm3 \n\t"
3832 /* pav = p - a = (a + b - c) - a = b - c */
3833 "movq %%mm2, %%mm4 \n\t"
3834 "punpcklbw %%mm0, %%mm3 \n\t" /* unpack Low bytes of c */
3835 /* pbv = p - b = (a + b - c) - b = a - c */
3836 "movq %%mm1, %%mm5 \n\t"
3837 "psubw %%mm3, %%mm4 \n\t"
3838 "pxor %%mm7, %%mm7 \n\t"
3839 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
3840 "movq %%mm4, %%mm6 \n\t"
3841 "psubw %%mm3, %%mm5 \n\t"
3842 /* pa = abs(p-a) = abs(pav) */
3843 /* pb = abs(p-b) = abs(pbv) */
3844 /* pc = abs(p-c) = abs(pcv) */
3845 "pcmpgtw %%mm4, %%mm0 \n\t" /* create mask pav bytes < 0 */
3846 "paddw %%mm5, %%mm6 \n\t"
3847 "pand %%mm4, %%mm0 \n\t" /* only pav bytes < 0 in mm7 */
3848 "pcmpgtw %%mm5, %%mm7 \n\t" /* create mask pbv bytes < 0 */
3849 "psubw %%mm0, %%mm4 \n\t"
3850 "pand %%mm5, %%mm7 \n\t" /* only pbv bytes < 0 in mm0 */
3851 "psubw %%mm0, %%mm4 \n\t"
3852 "psubw %%mm7, %%mm5 \n\t"
3853 "pxor %%mm0, %%mm0 \n\t"
3854 "pcmpgtw %%mm6, %%mm0 \n\t" /* create mask pcv bytes < 0 */
3855 "pand %%mm6, %%mm0 \n\t" /* only pav bytes < 0 in mm7 */
3856 "psubw %%mm7, %%mm5 \n\t"
3857 "psubw %%mm0, %%mm6 \n\t"
3859 "movq %%mm4, %%mm7 \n\t"
3860 "psubw %%mm0, %%mm6 \n\t"
3861 "pcmpgtw %%mm5, %%mm7 \n\t" /* pa > pb? */
3862 "movq %%mm7, %%mm0 \n\t"
3863 /* use mm7 mask to merge pa & pb */
3864 "pand %%mm7, %%mm5 \n\t"
3865 /* use mm0 mask copy to merge a & b */
3866 "pand %%mm0, %%mm2 \n\t"
3867 "pandn %%mm4, %%mm7 \n\t"
3868 "pandn %%mm1, %%mm0 \n\t"
3869 "paddw %%mm5, %%mm7 \n\t"
3870 "paddw %%mm2, %%mm0 \n\t"
3871 /* test ((pa <= pb)? pa:pb) <= pc */
3872 "pcmpgtw %%mm6, %%mm7 \n\t" /* pab > pc? */
3873 "pxor %%mm1, %%mm1 \n\t"
3874 "pand %%mm7, %%mm3 \n\t"
3875 "pandn %%mm0, %%mm7 \n\t"
3876 "paddw %%mm3, %%mm7 \n\t"
3877 "pxor %%mm0, %%mm0 \n\t"
3878 "packuswb %%mm1, %%mm7 \n\t"
3879 "movq -8(%%esi,%%ecx,), %%mm3 \n\t" /* load c=Prior(x-bpp) */
3880 "pand _ActiveMask, %%mm7 \n\t"
3881 "psrlq _ShiftRem, %%mm3 \n\t"
3882 "movq (%%esi,%%ecx,), %%mm2 \n\t" /* load b=Prior(x) step 1 */
3883 "paddb (%%edi,%%ecx,), %%mm7 \n\t" /* add Paeth predictor and Raw(x) */
3884 "movq %%mm2, %%mm6 \n\t"
3885 "movq %%mm7, (%%edi,%%ecx,) \n\t" /* write back updated value */
3886 "movq -8(%%edi,%%ecx,), %%mm1 \n\t"
3887 "psllq _ShiftBpp, %%mm6 \n\t"
3888 "movq %%mm7, %%mm5 \n\t"
3889 "psrlq _ShiftRem, %%mm1 \n\t"
3890 "por %%mm6, %%mm3 \n\t"
3891 "psllq _ShiftBpp, %%mm5 \n\t"
3892 "punpckhbw %%mm0, %%mm3 \n\t" /* unpack High bytes of c */
3893 "por %%mm5, %%mm1 \n\t"
3894 /* do second set of 4 bytes */
3895 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack High bytes of b */
3896 "punpckhbw %%mm0, %%mm1 \n\t" /* unpack High bytes of a */
3897 /* pav = p - a = (a + b - c) - a = b - c */
3898 "movq %%mm2, %%mm4 \n\t"
3899 /* pbv = p - b = (a + b - c) - b = a - c */
3900 "movq %%mm1, %%mm5 \n\t"
3901 "psubw %%mm3, %%mm4 \n\t"
3902 "pxor %%mm7, %%mm7 \n\t"
3903 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
3904 "movq %%mm4, %%mm6 \n\t"
3905 "psubw %%mm3, %%mm5 \n\t"
3906 /* pa = abs(p-a) = abs(pav) */
3907 /* pb = abs(p-b) = abs(pbv) */
3908 /* pc = abs(p-c) = abs(pcv) */
3909 "pcmpgtw %%mm4, %%mm0 \n\t" /* create mask pav bytes < 0 */
3910 "paddw %%mm5, %%mm6 \n\t"
3911 "pand %%mm4, %%mm0 \n\t" /* only pav bytes < 0 in mm7 */
3912 "pcmpgtw %%mm5, %%mm7 \n\t" /* create mask pbv bytes < 0 */
3913 "psubw %%mm0, %%mm4 \n\t"
3914 "pand %%mm5, %%mm7 \n\t" /* only pbv bytes < 0 in mm0 */
3915 "psubw %%mm0, %%mm4 \n\t"
3916 "psubw %%mm7, %%mm5 \n\t"
3917 "pxor %%mm0, %%mm0 \n\t"
3918 "pcmpgtw %%mm6, %%mm0 \n\t" /* create mask pcv bytes < 0 */
3919 "pand %%mm6, %%mm0 \n\t" /* only pav bytes < 0 in mm7 */
3920 "psubw %%mm7, %%mm5 \n\t"
3921 "psubw %%mm0, %%mm6 \n\t"
3923 "movq %%mm4, %%mm7 \n\t"
3924 "psubw %%mm0, %%mm6 \n\t"
3925 "pcmpgtw %%mm5, %%mm7 \n\t" /* pa > pb? */
3926 "movq %%mm7, %%mm0 \n\t"
3927 /* use mm7 mask to merge pa & pb */
3928 "pand %%mm7, %%mm5 \n\t"
3929 /* use mm0 mask copy to merge a & b */
3930 "pand %%mm0, %%mm2 \n\t"
3931 "pandn %%mm4, %%mm7 \n\t"
3932 "pandn %%mm1, %%mm0 \n\t"
3933 "paddw %%mm5, %%mm7 \n\t"
3934 "paddw %%mm2, %%mm0 \n\t"
3935 /* test ((pa <= pb)? pa:pb) <= pc */
3936 "pcmpgtw %%mm6, %%mm7 \n\t" /* pab > pc? */
3937 "pxor %%mm1, %%mm1 \n\t"
3938 "pand %%mm7, %%mm3 \n\t"
3939 "pandn %%mm0, %%mm7 \n\t"
3940 "pxor %%mm1, %%mm1 \n\t"
3941 "paddw %%mm3, %%mm7 \n\t"
3942 "pxor %%mm0, %%mm0 \n\t"
3943 /* step ecx to next set of 8 bytes and repeat loop til done */
3944 "addl $8, %%ecx \n\t"
3945 "packuswb %%mm7, %%mm1 \n\t"
3946 "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" /* add Paeth predictor with Raw(x) */
3947 "cmpl _MMXLength, %%ecx \n\t"
3948 "movq %%mm1, -8(%%edi,%%ecx,) \n\t" /* write back updated value */
3949 /* mm1 will be used as Raw(x-bpp) next loop */
3952 : "=S" (dummy_value_S
), /* output regs (dummy) */
3953 "=D" (dummy_value_D
)
3955 : "0" (prev_row
), /* esi // input regs */
3958 : "%ecx" /* clobber list */
3959 #if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
3960 , "%mm0", "%mm1", "%mm2", "%mm3"
3961 , "%mm4", "%mm5", "%mm6", "%mm7"
3965 break; /* end 6 bpp */
3969 _ActiveMask
.use
= 0x00000000ffffffffLL
;
3971 __asm__
__volatile__ (
3972 "movl _dif, %%ecx \n\t"
3973 /* preload "movl row, %%edi \n\t" */
3974 /* preload "movl prev_row, %%esi \n\t" */
3975 "pxor %%mm0, %%mm0 \n\t"
3976 /* prime the pump: load the first Raw(x-bpp) data set */
3977 "movq -8(%%edi,%%ecx,), %%mm1 \n\t" /* only time should need to read */
3978 /* a=Raw(x-bpp) bytes */
3980 /* do first set of 4 bytes */
3981 "movq -8(%%esi,%%ecx,), %%mm3 \n\t" /* read c=Prior(x-bpp) bytes */
3982 "punpckhbw %%mm0, %%mm1 \n\t" /* unpack Low bytes of a */
3983 "movq (%%esi,%%ecx,), %%mm2 \n\t" /* load b=Prior(x) */
3984 "punpcklbw %%mm0, %%mm2 \n\t" /* unpack High bytes of b */
3985 /* pav = p - a = (a + b - c) - a = b - c */
3986 "movq %%mm2, %%mm4 \n\t"
3987 "punpckhbw %%mm0, %%mm3 \n\t" /* unpack High bytes of c */
3988 /* pbv = p - b = (a + b - c) - b = a - c */
3989 "movq %%mm1, %%mm5 \n\t"
3990 "psubw %%mm3, %%mm4 \n\t"
3991 "pxor %%mm7, %%mm7 \n\t"
3992 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
3993 "movq %%mm4, %%mm6 \n\t"
3994 "psubw %%mm3, %%mm5 \n\t"
3995 /* pa = abs(p-a) = abs(pav) */
3996 /* pb = abs(p-b) = abs(pbv) */
3997 /* pc = abs(p-c) = abs(pcv) */
3998 "pcmpgtw %%mm4, %%mm0 \n\t" /* create mask pav bytes < 0 */
3999 "paddw %%mm5, %%mm6 \n\t"
4000 "pand %%mm4, %%mm0 \n\t" /* only pav bytes < 0 in mm7 */
4001 "pcmpgtw %%mm5, %%mm7 \n\t" /* create mask pbv bytes < 0 */
4002 "psubw %%mm0, %%mm4 \n\t"
4003 "pand %%mm5, %%mm7 \n\t" /* only pbv bytes < 0 in mm0 */
4004 "psubw %%mm0, %%mm4 \n\t"
4005 "psubw %%mm7, %%mm5 \n\t"
4006 "pxor %%mm0, %%mm0 \n\t"
4007 "pcmpgtw %%mm6, %%mm0 \n\t" /* create mask pcv bytes < 0 */
4008 "pand %%mm6, %%mm0 \n\t" /* only pav bytes < 0 in mm7 */
4009 "psubw %%mm7, %%mm5 \n\t"
4010 "psubw %%mm0, %%mm6 \n\t"
4012 "movq %%mm4, %%mm7 \n\t"
4013 "psubw %%mm0, %%mm6 \n\t"
4014 "pcmpgtw %%mm5, %%mm7 \n\t" /* pa > pb? */
4015 "movq %%mm7, %%mm0 \n\t"
4016 /* use mm7 mask to merge pa & pb */
4017 "pand %%mm7, %%mm5 \n\t"
4018 /* use mm0 mask copy to merge a & b */
4019 "pand %%mm0, %%mm2 \n\t"
4020 "pandn %%mm4, %%mm7 \n\t"
4021 "pandn %%mm1, %%mm0 \n\t"
4022 "paddw %%mm5, %%mm7 \n\t"
4023 "paddw %%mm2, %%mm0 \n\t"
4024 /* test ((pa <= pb)? pa:pb) <= pc */
4025 "pcmpgtw %%mm6, %%mm7 \n\t" /* pab > pc? */
4026 "pxor %%mm1, %%mm1 \n\t"
4027 "pand %%mm7, %%mm3 \n\t"
4028 "pandn %%mm0, %%mm7 \n\t"
4029 "paddw %%mm3, %%mm7 \n\t"
4030 "pxor %%mm0, %%mm0 \n\t"
4031 "packuswb %%mm1, %%mm7 \n\t"
4032 "movq (%%esi,%%ecx,), %%mm3 \n\t" /* load c=Prior(x-bpp) */
4033 "pand _ActiveMask, %%mm7 \n\t"
4034 "movq %%mm3, %%mm2 \n\t" /* load b=Prior(x) step 1 */
4035 "paddb (%%edi,%%ecx,), %%mm7 \n\t" /* add Paeth predictor with Raw(x) */
4036 "punpcklbw %%mm0, %%mm3 \n\t" /* unpack High bytes of c */
4037 "movq %%mm7, (%%edi,%%ecx,) \n\t" /* write back updated value */
4038 "movq %%mm7, %%mm1 \n\t" /* now mm1 will be used as Raw(x-bpp) */
4039 /* do second set of 4 bytes */
4040 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack Low bytes of b */
4041 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack Low bytes of a */
4042 /* pav = p - a = (a + b - c) - a = b - c */
4043 "movq %%mm2, %%mm4 \n\t"
4044 /* pbv = p - b = (a + b - c) - b = a - c */
4045 "movq %%mm1, %%mm5 \n\t"
4046 "psubw %%mm3, %%mm4 \n\t"
4047 "pxor %%mm7, %%mm7 \n\t"
4048 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
4049 "movq %%mm4, %%mm6 \n\t"
4050 "psubw %%mm3, %%mm5 \n\t"
4051 /* pa = abs(p-a) = abs(pav) */
4052 /* pb = abs(p-b) = abs(pbv) */
4053 /* pc = abs(p-c) = abs(pcv) */
4054 "pcmpgtw %%mm4, %%mm0 \n\t" /* create mask pav bytes < 0 */
4055 "paddw %%mm5, %%mm6 \n\t"
4056 "pand %%mm4, %%mm0 \n\t" /* only pav bytes < 0 in mm7 */
4057 "pcmpgtw %%mm5, %%mm7 \n\t" /* create mask pbv bytes < 0 */
4058 "psubw %%mm0, %%mm4 \n\t"
4059 "pand %%mm5, %%mm7 \n\t" /* only pbv bytes < 0 in mm0 */
4060 "psubw %%mm0, %%mm4 \n\t"
4061 "psubw %%mm7, %%mm5 \n\t"
4062 "pxor %%mm0, %%mm0 \n\t"
4063 "pcmpgtw %%mm6, %%mm0 \n\t" /* create mask pcv bytes < 0 */
4064 "pand %%mm6, %%mm0 \n\t" /* only pav bytes < 0 in mm7 */
4065 "psubw %%mm7, %%mm5 \n\t"
4066 "psubw %%mm0, %%mm6 \n\t"
4068 "movq %%mm4, %%mm7 \n\t"
4069 "psubw %%mm0, %%mm6 \n\t"
4070 "pcmpgtw %%mm5, %%mm7 \n\t" /* pa > pb? */
4071 "movq %%mm7, %%mm0 \n\t"
4072 /* use mm7 mask to merge pa & pb */
4073 "pand %%mm7, %%mm5 \n\t"
4074 /* use mm0 mask copy to merge a & b */
4075 "pand %%mm0, %%mm2 \n\t"
4076 "pandn %%mm4, %%mm7 \n\t"
4077 "pandn %%mm1, %%mm0 \n\t"
4078 "paddw %%mm5, %%mm7 \n\t"
4079 "paddw %%mm2, %%mm0 \n\t"
4080 /* test ((pa <= pb)? pa:pb) <= pc */
4081 "pcmpgtw %%mm6, %%mm7 \n\t" /* pab > pc? */
4082 "pxor %%mm1, %%mm1 \n\t"
4083 "pand %%mm7, %%mm3 \n\t"
4084 "pandn %%mm0, %%mm7 \n\t"
4085 "pxor %%mm1, %%mm1 \n\t"
4086 "paddw %%mm3, %%mm7 \n\t"
4087 "pxor %%mm0, %%mm0 \n\t"
4088 /* step ecx to next set of 8 bytes and repeat loop til done */
4089 "addl $8, %%ecx \n\t"
4090 "packuswb %%mm7, %%mm1 \n\t"
4091 "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" /* add predictor with Raw(x) */
4092 "cmpl _MMXLength, %%ecx \n\t"
4093 "movq %%mm1, -8(%%edi,%%ecx,) \n\t" /* write back updated value */
4094 /* mm1 will be used as Raw(x-bpp) next loop */
4097 : "=S" (dummy_value_S
), /* output regs (dummy) */
4098 "=D" (dummy_value_D
)
4100 : "0" (prev_row
), /* esi // input regs */
4103 : "%ecx" /* clobber list */
4104 #if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
4105 , "%mm0", "%mm1", "%mm2", "%mm3"
4106 , "%mm4", "%mm5", "%mm6", "%mm7"
4110 break; /* end 4 bpp */
4112 case 8: /* bpp == 8 */
4114 _ActiveMask
.use
= 0x00000000ffffffffLL
;
4116 __asm__
__volatile__ (
4117 "movl _dif, %%ecx \n\t"
4118 /* preload "movl row, %%edi \n\t" */
4119 /* preload "movl prev_row, %%esi \n\t" */
4120 "pxor %%mm0, %%mm0 \n\t"
4121 /* prime the pump: load the first Raw(x-bpp) data set */
4122 "movq -8(%%edi,%%ecx,), %%mm1 \n\t" /* only time should need to read */
4123 /* a=Raw(x-bpp) bytes */
4125 /* do first set of 4 bytes */
4126 "movq -8(%%esi,%%ecx,), %%mm3 \n\t" /* read c=Prior(x-bpp) bytes */
4127 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack Low bytes of a */
4128 "movq (%%esi,%%ecx,), %%mm2 \n\t" /* load b=Prior(x) */
4129 "punpcklbw %%mm0, %%mm2 \n\t" /* unpack Low bytes of b */
4130 /* pav = p - a = (a + b - c) - a = b - c */
4131 "movq %%mm2, %%mm4 \n\t"
4132 "punpcklbw %%mm0, %%mm3 \n\t" /* unpack Low bytes of c */
4133 /* pbv = p - b = (a + b - c) - b = a - c */
4134 "movq %%mm1, %%mm5 \n\t"
4135 "psubw %%mm3, %%mm4 \n\t"
4136 "pxor %%mm7, %%mm7 \n\t"
4137 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
4138 "movq %%mm4, %%mm6 \n\t"
4139 "psubw %%mm3, %%mm5 \n\t"
4140 /* pa = abs(p-a) = abs(pav) */
4141 /* pb = abs(p-b) = abs(pbv) */
4142 /* pc = abs(p-c) = abs(pcv) */
4143 "pcmpgtw %%mm4, %%mm0 \n\t" /* create mask pav bytes < 0 */
4144 "paddw %%mm5, %%mm6 \n\t"
4145 "pand %%mm4, %%mm0 \n\t" /* only pav bytes < 0 in mm7 */
4146 "pcmpgtw %%mm5, %%mm7 \n\t" /* create mask pbv bytes < 0 */
4147 "psubw %%mm0, %%mm4 \n\t"
4148 "pand %%mm5, %%mm7 \n\t" /* only pbv bytes < 0 in mm0 */
4149 "psubw %%mm0, %%mm4 \n\t"
4150 "psubw %%mm7, %%mm5 \n\t"
4151 "pxor %%mm0, %%mm0 \n\t"
4152 "pcmpgtw %%mm6, %%mm0 \n\t" /* create mask pcv bytes < 0 */
4153 "pand %%mm6, %%mm0 \n\t" /* only pav bytes < 0 in mm7 */
4154 "psubw %%mm7, %%mm5 \n\t"
4155 "psubw %%mm0, %%mm6 \n\t"
4157 "movq %%mm4, %%mm7 \n\t"
4158 "psubw %%mm0, %%mm6 \n\t"
4159 "pcmpgtw %%mm5, %%mm7 \n\t" /* pa > pb? */
4160 "movq %%mm7, %%mm0 \n\t"
4161 /* use mm7 mask to merge pa & pb */
4162 "pand %%mm7, %%mm5 \n\t"
4163 /* use mm0 mask copy to merge a & b */
4164 "pand %%mm0, %%mm2 \n\t"
4165 "pandn %%mm4, %%mm7 \n\t"
4166 "pandn %%mm1, %%mm0 \n\t"
4167 "paddw %%mm5, %%mm7 \n\t"
4168 "paddw %%mm2, %%mm0 \n\t"
4169 /* test ((pa <= pb)? pa:pb) <= pc */
4170 "pcmpgtw %%mm6, %%mm7 \n\t" /* pab > pc? */
4171 "pxor %%mm1, %%mm1 \n\t"
4172 "pand %%mm7, %%mm3 \n\t"
4173 "pandn %%mm0, %%mm7 \n\t"
4174 "paddw %%mm3, %%mm7 \n\t"
4175 "pxor %%mm0, %%mm0 \n\t"
4176 "packuswb %%mm1, %%mm7 \n\t"
4177 "movq -8(%%esi,%%ecx,), %%mm3 \n\t" /* read c=Prior(x-bpp) bytes */
4178 "pand _ActiveMask, %%mm7 \n\t"
4179 "movq (%%esi,%%ecx,), %%mm2 \n\t" /* load b=Prior(x) */
4180 "paddb (%%edi,%%ecx,), %%mm7 \n\t" /* add Paeth predictor with Raw(x) */
4181 "punpckhbw %%mm0, %%mm3 \n\t" /* unpack High bytes of c */
4182 "movq %%mm7, (%%edi,%%ecx,) \n\t" /* write back updated value */
4183 "movq -8(%%edi,%%ecx,), %%mm1 \n\t" /* read a=Raw(x-bpp) bytes */
4185 /* do second set of 4 bytes */
4186 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack High bytes of b */
4187 "punpckhbw %%mm0, %%mm1 \n\t" /* unpack High bytes of a */
4188 /* pav = p - a = (a + b - c) - a = b - c */
4189 "movq %%mm2, %%mm4 \n\t"
4190 /* pbv = p - b = (a + b - c) - b = a - c */
4191 "movq %%mm1, %%mm5 \n\t"
4192 "psubw %%mm3, %%mm4 \n\t"
4193 "pxor %%mm7, %%mm7 \n\t"
4194 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
4195 "movq %%mm4, %%mm6 \n\t"
4196 "psubw %%mm3, %%mm5 \n\t"
4197 /* pa = abs(p-a) = abs(pav) */
4198 /* pb = abs(p-b) = abs(pbv) */
4199 /* pc = abs(p-c) = abs(pcv) */
4200 "pcmpgtw %%mm4, %%mm0 \n\t" /* create mask pav bytes < 0 */
4201 "paddw %%mm5, %%mm6 \n\t"
4202 "pand %%mm4, %%mm0 \n\t" /* only pav bytes < 0 in mm7 */
4203 "pcmpgtw %%mm5, %%mm7 \n\t" /* create mask pbv bytes < 0 */
4204 "psubw %%mm0, %%mm4 \n\t"
4205 "pand %%mm5, %%mm7 \n\t" /* only pbv bytes < 0 in mm0 */
4206 "psubw %%mm0, %%mm4 \n\t"
4207 "psubw %%mm7, %%mm5 \n\t"
4208 "pxor %%mm0, %%mm0 \n\t"
4209 "pcmpgtw %%mm6, %%mm0 \n\t" /* create mask pcv bytes < 0 */
4210 "pand %%mm6, %%mm0 \n\t" /* only pav bytes < 0 in mm7 */
4211 "psubw %%mm7, %%mm5 \n\t"
4212 "psubw %%mm0, %%mm6 \n\t"
4214 "movq %%mm4, %%mm7 \n\t"
4215 "psubw %%mm0, %%mm6 \n\t"
4216 "pcmpgtw %%mm5, %%mm7 \n\t" /* pa > pb? */
4217 "movq %%mm7, %%mm0 \n\t"
4218 /* use mm7 mask to merge pa & pb */
4219 "pand %%mm7, %%mm5 \n\t"
4220 /* use mm0 mask copy to merge a & b */
4221 "pand %%mm0, %%mm2 \n\t"
4222 "pandn %%mm4, %%mm7 \n\t"
4223 "pandn %%mm1, %%mm0 \n\t"
4224 "paddw %%mm5, %%mm7 \n\t"
4225 "paddw %%mm2, %%mm0 \n\t"
4226 /* test ((pa <= pb)? pa:pb) <= pc */
4227 "pcmpgtw %%mm6, %%mm7 \n\t" /* pab > pc? */
4228 "pxor %%mm1, %%mm1 \n\t"
4229 "pand %%mm7, %%mm3 \n\t"
4230 "pandn %%mm0, %%mm7 \n\t"
4231 "pxor %%mm1, %%mm1 \n\t"
4232 "paddw %%mm3, %%mm7 \n\t"
4233 "pxor %%mm0, %%mm0 \n\t"
4234 /* step ecx to next set of 8 bytes and repeat loop til done */
4235 "addl $8, %%ecx \n\t"
4236 "packuswb %%mm7, %%mm1 \n\t"
4237 "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" /* add Paeth predictor with Raw(x) */
4238 "cmpl _MMXLength, %%ecx \n\t"
4239 "movq %%mm1, -8(%%edi,%%ecx,) \n\t" /* write back updated value */
4240 /* mm1 will be used as Raw(x-bpp) next loop */
4243 : "=S" (dummy_value_S
), /* output regs (dummy) */
4244 "=D" (dummy_value_D
)
4246 : "0" (prev_row
), /* esi // input regs */
4249 : "%ecx" /* clobber list */
4250 #if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
4251 , "%mm0", "%mm1", "%mm2", "%mm3"
4252 , "%mm4", "%mm5", "%mm6", "%mm7"
4256 break; /* end 8 bpp */
4258 case 1: /* bpp = 1 */
4259 case 2: /* bpp = 2 */
4260 default: /* bpp > 8 */
4262 __asm__
__volatile__ (
4264 "pushl %%ebx \n\t" /* save Global Offset Table index */
4266 "movl _dif, %%ebx \n\t"
4267 "cmpl _FullLength, %%ebx \n\t"
4268 "jnb paeth_dend \n\t"
4270 /* preload "movl row, %%edi \n\t" */
4271 /* preload "movl prev_row, %%esi \n\t" */
4272 /* do Paeth decode for remaining bytes */
4273 "movl %%ebx, %%edx \n\t"
4274 /* preload "subl bpp, %%edx \n\t" // (bpp is preloaded into ecx) */
4275 "subl %%ecx, %%edx \n\t" /* edx = ebx - bpp */
4276 "xorl %%ecx, %%ecx \n\t" /* zero ecx before using cl & cx */
4279 "xorl %%eax, %%eax \n\t"
4280 /* pav = p - a = (a + b - c) - a = b - c */
4281 "movb (%%esi,%%ebx,), %%al \n\t" /* load Prior(x) into al */
4282 "movb (%%esi,%%edx,), %%cl \n\t" /* load Prior(x-bpp) into cl */
4283 "subl %%ecx, %%eax \n\t" /* subtract Prior(x-bpp) */
4284 "movl %%eax, _patemp \n\t" /* Save pav for later use */
4285 "xorl %%eax, %%eax \n\t"
4286 /* pbv = p - b = (a + b - c) - b = a - c */
4287 "movb (%%edi,%%edx,), %%al \n\t" /* load Raw(x-bpp) into al */
4288 "subl %%ecx, %%eax \n\t" /* subtract Prior(x-bpp) */
4289 "movl %%eax, %%ecx \n\t"
4290 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
4291 "addl _patemp, %%eax \n\t" /* pcv = pav + pbv */
4293 "testl $0x80000000, %%eax \n\t"
4294 "jz paeth_dpca \n\t"
4295 "negl %%eax \n\t" /* reverse sign of neg values */
4298 "movl %%eax, _pctemp \n\t" /* save pc for later use */
4300 "testl $0x80000000, %%ecx \n\t"
4301 "jz paeth_dpba \n\t"
4302 "negl %%ecx \n\t" /* reverse sign of neg values */
4305 "movl %%ecx, _pbtemp \n\t" /* save pb for later use */
4307 "movl _patemp, %%eax \n\t"
4308 "testl $0x80000000, %%eax \n\t"
4309 "jz paeth_dpaa \n\t"
4310 "negl %%eax \n\t" /* reverse sign of neg values */
4313 "movl %%eax, _patemp \n\t" /* save pa for later use */
4314 /* test if pa <= pb */
4315 "cmpl %%ecx, %%eax \n\t"
4316 "jna paeth_dabb \n\t"
4317 /* pa > pb; now test if pb <= pc */
4318 "cmpl _pctemp, %%ecx \n\t"
4319 "jna paeth_dbbc \n\t"
4320 /* pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp) */
4321 "movb (%%esi,%%edx,), %%cl \n\t" /* load Prior(x-bpp) into cl */
4322 "jmp paeth_dpaeth \n\t"
4325 /* pb <= pc; Raw(x) = Paeth(x) + Prior(x) */
4326 "movb (%%esi,%%ebx,), %%cl \n\t" /* load Prior(x) into cl */
4327 "jmp paeth_dpaeth \n\t"
4330 /* pa <= pb; now test if pa <= pc */
4331 "cmpl _pctemp, %%eax \n\t"
4332 "jna paeth_dabc \n\t"
4333 /* pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp) */
4334 "movb (%%esi,%%edx,), %%cl \n\t" /* load Prior(x-bpp) into cl */
4335 "jmp paeth_dpaeth \n\t"
4338 /* pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp) */
4339 "movb (%%edi,%%edx,), %%cl \n\t" /* load Raw(x-bpp) into cl */
4341 "paeth_dpaeth: \n\t"
4344 /* Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256 */
4345 "addb %%cl, -1(%%edi,%%ebx,) \n\t"
4346 "cmpl _FullLength, %%ebx \n\t"
4351 "popl %%ebx \n\t" /* index to Global Offset Table */
4354 : "=c" (dummy_value_c
), /* output regs (dummy) */
4355 "=S" (dummy_value_S
),
4356 "=D" (dummy_value_D
)
4358 : "0" (bpp
), /* ecx // input regs */
4359 "1" (prev_row
), /* esi */
4362 : "%eax", "%edx" /* clobber list */
4368 return; /* No need to go further with this one */
4370 } /* end switch (bpp) */
4372 __asm__
__volatile__ (
4373 /* MMX acceleration complete; now do clean-up */
4374 /* check if any remaining bytes left to decode */
4376 "pushl %%ebx \n\t" /* save index to Global Offset Table */
4378 "movl _MMXLength, %%ebx \n\t"
4379 "cmpl _FullLength, %%ebx \n\t"
4380 "jnb paeth_end \n\t"
4381 /*pre "movl row, %%edi \n\t" */
4382 /*pre "movl prev_row, %%esi \n\t" */
4383 /* do Paeth decode for remaining bytes */
4384 "movl %%ebx, %%edx \n\t"
4385 /*pre "subl bpp, %%edx \n\t" */ /* (bpp is preloaded into ecx) */
4386 "subl %%ecx, %%edx \n\t" /* edx = ebx - bpp */
4387 "xorl %%ecx, %%ecx \n\t" /* zero ecx before using cl & cx below */
4390 "xorl %%eax, %%eax \n\t"
4391 /* pav = p - a = (a + b - c) - a = b - c */
4392 "movb (%%esi,%%ebx,), %%al \n\t" /* load Prior(x) into al */
4393 "movb (%%esi,%%edx,), %%cl \n\t" /* load Prior(x-bpp) into cl */
4394 "subl %%ecx, %%eax \n\t" /* subtract Prior(x-bpp) */
4395 "movl %%eax, _patemp \n\t" /* Save pav for later use */
4396 "xorl %%eax, %%eax \n\t"
4397 /* pbv = p - b = (a + b - c) - b = a - c */
4398 "movb (%%edi,%%edx,), %%al \n\t" /* load Raw(x-bpp) into al */
4399 "subl %%ecx, %%eax \n\t" /* subtract Prior(x-bpp) */
4400 "movl %%eax, %%ecx \n\t"
4401 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
4402 "addl _patemp, %%eax \n\t" /* pcv = pav + pbv */
4404 "testl $0x80000000, %%eax \n\t"
4405 "jz paeth_pca2 \n\t"
4406 "negl %%eax \n\t" /* reverse sign of neg values */
4409 "movl %%eax, _pctemp \n\t" /* save pc for later use */
4411 "testl $0x80000000, %%ecx \n\t"
4412 "jz paeth_pba2 \n\t"
4413 "negl %%ecx \n\t" /* reverse sign of neg values */
4416 "movl %%ecx, _pbtemp \n\t" /* save pb for later use */
4418 "movl _patemp, %%eax \n\t"
4419 "testl $0x80000000, %%eax \n\t"
4420 "jz paeth_paa2 \n\t"
4421 "negl %%eax \n\t" /* reverse sign of neg values */
4424 "movl %%eax, _patemp \n\t" /* save pa for later use */
4425 /* test if pa <= pb */
4426 "cmpl %%ecx, %%eax \n\t"
4427 "jna paeth_abb2 \n\t"
4428 /* pa > pb; now test if pb <= pc */
4429 "cmpl _pctemp, %%ecx \n\t"
4430 "jna paeth_bbc2 \n\t"
4431 /* pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp) */
4432 "movb (%%esi,%%edx,), %%cl \n\t" /* load Prior(x-bpp) into cl */
4433 "jmp paeth_paeth2 \n\t"
4436 /* pb <= pc; Raw(x) = Paeth(x) + Prior(x) */
4437 "movb (%%esi,%%ebx,), %%cl \n\t" /* load Prior(x) into cl */
4438 "jmp paeth_paeth2 \n\t"
4441 /* pa <= pb; now test if pa <= pc */
4442 "cmpl _pctemp, %%eax \n\t"
4443 "jna paeth_abc2 \n\t"
4444 /* pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp) */
4445 "movb (%%esi,%%edx,), %%cl \n\t" /* load Prior(x-bpp) into cl */
4446 "jmp paeth_paeth2 \n\t"
4449 /* pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp) */
4450 "movb (%%edi,%%edx,), %%cl \n\t" /* load Raw(x-bpp) into cl */
4452 "paeth_paeth2: \n\t"
4455 /* Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256 */
4456 "addb %%cl, -1(%%edi,%%ebx,) \n\t"
4457 "cmpl _FullLength, %%ebx \n\t"
4461 "EMMS \n\t" /* end MMX; prep for poss. FP instrs. */
4463 "popl %%ebx \n\t" /* restore index to Global Offset Table */
4466 : "=c" (dummy_value_c
), /* output regs (dummy) */
4467 "=S" (dummy_value_S
),
4468 "=D" (dummy_value_D
)
4470 : "0" (bpp
), /* ecx // input regs */
4471 "1" (prev_row
), /* esi */
4474 : "%eax", "%edx" /* clobber list (no input regs!) */
4480 } /* end png_read_filter_row_mmx_paeth() */
4486 #ifdef PNG_THREAD_UNSAFE_OK
4487 /*===========================================================================*/
4489 /* P N G _ R E A D _ F I L T E R _ R O W _ M M X _ S U B */
4491 /*===========================================================================*/
4493 /* Optimized code for PNG Sub filter decoder */
/*
 * Decode a PNG "Sub"-filtered row in place: every byte from offset bpp
 * onward has the byte bpp positions to its left (lp = row) added to it
 * (rp = row + bpp), modulo 256.
 *
 * row_info - supplies pixel_depth (used to derive bpp, the number of bytes
 *            per pixel) and rowbytes (the row length in bytes).
 * row      - the filtered row; it is overwritten with the decoded bytes.
 *
 * The asm blocks communicate through the symbols _dif, _FullLength,
 * _MMXLength, _ActiveMask, _ShiftBpp and _ShiftRem, which are written here
 * and referenced by name from the assembler.
 * NOTE(review): these look like file-scope variables shared with the other
 * filter routines, which would explain the PNG_THREAD_UNSAFE_OK guard around
 * this function -- confirm against their declarations.
 */
4495 static void /* PRIVATE */
4496 png_read_filter_row_mmx_sub(png_row_infop row_info
, png_bytep row
)
4502 bpp
= (row_info
->pixel_depth
+ 7) >> 3; /* calc number of bytes per pixel */
4503 _FullLength
= row_info
->rowbytes
- bpp
; /* number of bytes to filter */
/* Stage 1: compute _dif = ((rp + 0xf) & ~7) - rp, decode single bytes in
 * the sub_lp1 alignment loop, then derive _MMXLength by dropping the bytes
 * of _FullLength that do not fill a whole 8-byte group. */
4505 __asm__
__volatile__ (
4506 /*pre "movl row, %%edi \n\t" */
4507 "movl %%edi, %%esi \n\t" /* lp = row */
4508 /*pre "movl bpp, %%eax \n\t" */
4509 "addl %%eax, %%edi \n\t" /* rp = row + bpp */
4510 /*irr "xorl %%eax, %%eax \n\t" */
4511 /* get # of bytes to alignment */
4512 "movl %%edi, _dif \n\t" /* take start of row */
4513 "addl $0xf, _dif \n\t" /* add 7 + 8 to incr past */
4514 /* alignment boundary */
4515 "xorl %%ecx, %%ecx \n\t"
4516 "andl $0xfffffff8, _dif \n\t" /* mask to alignment boundary */
4517 "subl %%edi, _dif \n\t" /* subtract from start ==> value */
4518 "jz sub_go \n\t" /* ecx at alignment */
4520 "sub_lp1: \n\t" /* fix alignment */
4521 "movb (%%esi,%%ecx,), %%al \n\t"
4522 "addb %%al, (%%edi,%%ecx,) \n\t"
4524 "cmpl _dif, %%ecx \n\t"
4528 "movl _FullLength, %%eax \n\t"
4529 "movl %%eax, %%edx \n\t"
4530 "subl %%ecx, %%edx \n\t" /* subtract alignment fix */
4531 "andl $0x00000007, %%edx \n\t" /* calc bytes over mult of 8 */
4532 "subl %%edx, %%eax \n\t" /* drop over bytes from length */
4533 "movl %%eax, _MMXLength \n\t"
4535 : "=a" (dummy_value_a
), /* 0 // output regs (dummy) */
4536 "=D" (dummy_value_D
) /* 1 */
4538 : "0" (bpp
), /* eax // input regs */
4541 : "%esi", "%ecx", "%edx" // clobber list
4543 #if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4544 , "%mm0", "%mm1", "%mm2", "%mm3"
4545 , "%mm4", "%mm5", "%mm6", "%mm7"
4549 /* now do the math for the rest of the row */
/* Stage 2: one MMX loop per pixel size.
 * NOTE(review): the switch/case lines are not visible here; the variant
 * below shifts by 24 bits (3 * 8), so it is presumably the bpp == 3 case.
 * Each 8-byte quadword is completed in three adds, one per active group. */
4554 _ActiveMask
.use
= 0x0000ffffff000000LL
;
4555 _ShiftBpp
.use
= 24; /* == 3 * 8 */
4556 _ShiftRem
.use
= 40; /* == 64 - 24 */
4558 __asm__
__volatile__ (
4559 /* preload "movl row, %%edi \n\t" */
4560 "movq _ActiveMask, %%mm7 \n\t" /* load _ActiveMask for 2nd */
4561 /* active byte group */
4562 "movl %%edi, %%esi \n\t" /* lp = row */
4563 /* preload "movl bpp, %%eax \n\t" */
4564 "addl %%eax, %%edi \n\t" /* rp = row + bpp */
4565 "movq %%mm7, %%mm6 \n\t"
4566 "movl _dif, %%edx \n\t"
4567 "psllq _ShiftBpp, %%mm6 \n\t" /* move mask in mm6 to cover */
4568 /* 3rd active byte group */
4569 /* prime the pump: load the first Raw(x-bpp) data set */
4570 "movq -8(%%edi,%%edx,), %%mm1 \n\t"
4572 "sub_3lp: \n\t" /* shift data for adding first */
4573 "psrlq _ShiftRem, %%mm1 \n\t" /* bpp bytes (no need for mask; */
4574 /* shift clears inactive bytes) */
4575 /* add 1st active group */
4576 "movq (%%edi,%%edx,), %%mm0 \n\t"
4577 "paddb %%mm1, %%mm0 \n\t"
4579 /* add 2nd active group */
4580 "movq %%mm0, %%mm1 \n\t" /* mov updated Raws to mm1 */
4581 "psllq _ShiftBpp, %%mm1 \n\t" /* shift data to pos. correctly */
4582 "pand %%mm7, %%mm1 \n\t" /* mask to use 2nd active group */
4583 "paddb %%mm1, %%mm0 \n\t"
4585 /* add 3rd active group */
4586 "movq %%mm0, %%mm1 \n\t" /* mov updated Raws to mm1 */
4587 "psllq _ShiftBpp, %%mm1 \n\t" /* shift data to pos. correctly */
4588 "pand %%mm6, %%mm1 \n\t" /* mask to use 3rd active group */
4589 "addl $8, %%edx \n\t"
4590 "paddb %%mm1, %%mm0 \n\t"
4592 "cmpl _MMXLength, %%edx \n\t"
4593 "movq %%mm0, -8(%%edi,%%edx,) \n\t" /* write updated Raws to array */
4594 "movq %%mm0, %%mm1 \n\t" /* prep 1st add at top of loop */
4597 : "=a" (dummy_value_a
), /* 0 // output regs (dummy) */
4598 "=D" (dummy_value_D
) /* 1 */
4600 : "0" (bpp
), /* eax // input regs */
4603 : "%edx", "%esi" /* clobber list */
4604 #if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4605 , "%mm0", "%mm1", "%mm6", "%mm7"
/* Byte-at-a-time variant that uses no MMX instructions and runs from _dif
 * up to _FullLength; presumably the bpp == 1 case (label not visible). */
4613 __asm__
__volatile__ (
4614 "movl _dif, %%edx \n\t"
4615 /* preload "movl row, %%edi \n\t" */
4616 "cmpl _FullLength, %%edx \n\t"
4618 "movl %%edi, %%esi \n\t" /* lp = row */
/* NOTE(review): eax (preloaded with bpp) is zeroed by the xorl below before
 * the addl uses it, so rp ends up equal to row rather than row + bpp.  The
 * final clean-up block of this function performs the addl first and the
 * xorl second -- confirm which order is intended here. */
4619 "xorl %%eax, %%eax \n\t"
4620 /* preload "movl bpp, %%eax \n\t" */
4621 "addl %%eax, %%edi \n\t" /* rp = row + bpp */
4624 "movb (%%esi,%%edx,), %%al \n\t"
4625 "addb %%al, (%%edi,%%edx,) \n\t"
4627 "cmpl _FullLength, %%edx \n\t"
4632 : "=a" (dummy_value_a
), /* 0 // output regs (dummy) */
4633 "=D" (dummy_value_D
) /* 1 */
4635 : "0" (bpp
), /* eax // input regs */
4638 : "%edx", "%esi" /* clobber list */
4645 //case 7: /* GRR BOGUS */
4646 //case 5: /* GRR BOGUS */
/* Two-group variant: the shift counts are derived from bpp itself
 * (_ShiftBpp = bpp * 8 bits, _ShiftRem = 64 - _ShiftBpp), and each quadword
 * is completed with two adds. */
4648 _ShiftBpp
.use
= bpp
<< 3;
4649 _ShiftRem
.use
= 64 - _ShiftBpp
.use
;
4651 __asm__
__volatile__ (
4652 /* preload "movl row, %%edi \n\t" */
4653 "movl _dif, %%edx \n\t"
4654 "movl %%edi, %%esi \n\t" /* lp = row */
4655 /* preload "movl bpp, %%eax \n\t" */
4656 "addl %%eax, %%edi \n\t" /* rp = row + bpp */
4658 /* prime the pump: load the first Raw(x-bpp) data set */
4659 "movq -8(%%edi,%%edx,), %%mm1 \n\t"
4661 "sub_4lp: \n\t" /* shift data for adding first */
4662 "psrlq _ShiftRem, %%mm1 \n\t" /* bpp bytes (no need for mask; */
4663 /* shift clears inactive bytes) */
4664 "movq (%%edi,%%edx,), %%mm0 \n\t"
4665 "paddb %%mm1, %%mm0 \n\t"
4667 /* add 2nd active group */
4668 "movq %%mm0, %%mm1 \n\t" /* mov updated Raws to mm1 */
4669 "psllq _ShiftBpp, %%mm1 \n\t" /* shift data to pos. correctly */
4670 "addl $8, %%edx \n\t"
4671 "paddb %%mm1, %%mm0 \n\t"
4673 "cmpl _MMXLength, %%edx \n\t"
4674 "movq %%mm0, -8(%%edi,%%edx,) \n\t"
4675 "movq %%mm0, %%mm1 \n\t" /* prep 1st add at top of loop */
4678 : "=a" (dummy_value_a
), /* 0 // output regs (dummy) */
4679 "=D" (dummy_value_D
) /* 1 */
4681 : "0" (bpp
), /* eax // input regs */
4684 : "%edx", "%esi" /* clobber list */
4685 #if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
/* Four-group variant for 2-byte pixels (16-bit shift): mm7, mm6 and mm5 hold
 * the masks for the 2nd, 3rd and 4th active byte groups. */
4694 _ActiveMask
.use
= 0x00000000ffff0000LL
;
4695 _ShiftBpp
.use
= 16; /* == 2 * 8 */
4696 _ShiftRem
.use
= 48; /* == 64 - 16 */
4698 __asm__
__volatile__ (
4699 "movq _ActiveMask, %%mm7 \n\t" /* load _ActiveMask for 2nd */
4700 /* active byte group */
4701 "movl _dif, %%edx \n\t"
4702 "movq %%mm7, %%mm6 \n\t"
4703 /* preload "movl row, %%edi \n\t" */
4704 "psllq _ShiftBpp, %%mm6 \n\t" /* move mask in mm6 to cover */
4705 /* 3rd active byte group */
4706 "movl %%edi, %%esi \n\t" /* lp = row */
4707 "movq %%mm6, %%mm5 \n\t"
4708 /* preload "movl bpp, %%eax \n\t" */
4709 "addl %%eax, %%edi \n\t" /* rp = row + bpp */
4710 "psllq _ShiftBpp, %%mm5 \n\t" /* move mask in mm5 to cover */
4711 /* 4th active byte group */
4712 /* prime the pump: load the first Raw(x-bpp) data set */
4713 "movq -8(%%edi,%%edx,), %%mm1 \n\t"
4715 "sub_2lp: \n\t" /* shift data for adding first */
4716 "psrlq _ShiftRem, %%mm1 \n\t" /* bpp bytes (no need for mask; */
4717 /* shift clears inactive bytes) */
4718 /* add 1st active group */
4719 "movq (%%edi,%%edx,), %%mm0 \n\t"
4720 "paddb %%mm1, %%mm0 \n\t"
4722 /* add 2nd active group */
4723 "movq %%mm0, %%mm1 \n\t" /* mov updated Raws to mm1 */
4724 "psllq _ShiftBpp, %%mm1 \n\t" /* shift data to pos. correctly */
4725 "pand %%mm7, %%mm1 \n\t" /* mask to use 2nd active group */
4726 "paddb %%mm1, %%mm0 \n\t"
4728 /* add 3rd active group */
4729 "movq %%mm0, %%mm1 \n\t" /* mov updated Raws to mm1 */
4730 "psllq _ShiftBpp, %%mm1 \n\t" /* shift data to pos. correctly */
4731 "pand %%mm6, %%mm1 \n\t" /* mask to use 3rd active group */
4732 "paddb %%mm1, %%mm0 \n\t"
4734 /* add 4th active group */
4735 "movq %%mm0, %%mm1 \n\t" /* mov updated Raws to mm1 */
4736 "psllq _ShiftBpp, %%mm1 \n\t" /* shift data to pos. correctly */
4737 "pand %%mm5, %%mm1 \n\t" /* mask to use 4th active group */
4738 "addl $8, %%edx \n\t"
4739 "paddb %%mm1, %%mm0 \n\t"
4740 "cmpl _MMXLength, %%edx \n\t"
4741 "movq %%mm0, -8(%%edi,%%edx,) \n\t" /* write updated Raws to array */
4742 "movq %%mm0, %%mm1 \n\t" /* prep 1st add at top of loop */
4745 : "=a" (dummy_value_a
), /* 0 // output regs (dummy) */
4746 "=D" (dummy_value_D
) /* 1 */
4748 : "0" (bpp
), /* eax // input regs */
4751 : "%edx", "%esi" /* clobber list */
4752 #if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4753 , "%mm0", "%mm1", "%mm5", "%mm6", "%mm7"
/* Variant in which Raw(x-bpp) is the whole preceding quadword (loaded from
 * rp - 8), so presumably the bpp == 8 case: an unrolled loop handling 64
 * bytes per pass, followed by a loop handling 8 bytes per pass. */
4761 __asm__
__volatile__ (
4762 /* preload "movl row, %%edi \n\t" */
4763 "movl _dif, %%edx \n\t"
4764 "movl %%edi, %%esi \n\t" /* lp = row */
4765 /* preload "movl bpp, %%eax \n\t" */
4766 "addl %%eax, %%edi \n\t" /* rp = row + bpp */
4767 "movl _MMXLength, %%ecx \n\t"
4769 /* prime the pump: load the first Raw(x-bpp) data set */
4770 "movq -8(%%edi,%%edx,), %%mm7 \n\t"
4771 "andl $0x0000003f, %%ecx \n\t" /* calc bytes over mult of 64 */
4774 "movq (%%edi,%%edx,), %%mm0 \n\t" /* load Sub(x) for 1st 8 bytes */
4775 "paddb %%mm7, %%mm0 \n\t"
4776 "movq 8(%%edi,%%edx,), %%mm1 \n\t" /* load Sub(x) for 2nd 8 bytes */
4777 "movq %%mm0, (%%edi,%%edx,) \n\t" /* write Raw(x) for 1st 8 bytes */
4779 /* Now mm0 will be used as Raw(x-bpp) for the 2nd group of 8 bytes. */
4780 /* This will be repeated for each group of 8 bytes with the 8th */
4781 /* group being used as the Raw(x-bpp) for the 1st group of the */
4784 "paddb %%mm0, %%mm1 \n\t"
4785 "movq 16(%%edi,%%edx,), %%mm2 \n\t" /* load Sub(x) for 3rd 8 bytes */
4786 "movq %%mm1, 8(%%edi,%%edx,) \n\t" /* write Raw(x) for 2nd 8 bytes */
4787 "paddb %%mm1, %%mm2 \n\t"
4788 "movq 24(%%edi,%%edx,), %%mm3 \n\t" /* load Sub(x) for 4th 8 bytes */
4789 "movq %%mm2, 16(%%edi,%%edx,) \n\t" /* write Raw(x) for 3rd 8 bytes */
4790 "paddb %%mm2, %%mm3 \n\t"
4791 "movq 32(%%edi,%%edx,), %%mm4 \n\t" /* load Sub(x) for 5th 8 bytes */
4792 "movq %%mm3, 24(%%edi,%%edx,) \n\t" /* write Raw(x) for 4th 8 bytes */
4793 "paddb %%mm3, %%mm4 \n\t"
4794 "movq 40(%%edi,%%edx,), %%mm5 \n\t" /* load Sub(x) for 6th 8 bytes */
4795 "movq %%mm4, 32(%%edi,%%edx,) \n\t" /* write Raw(x) for 5th 8 bytes */
4796 "paddb %%mm4, %%mm5 \n\t"
4797 "movq 48(%%edi,%%edx,), %%mm6 \n\t" /* load Sub(x) for 7th 8 bytes */
4798 "movq %%mm5, 40(%%edi,%%edx,) \n\t" /* write Raw(x) for 6th 8 bytes */
4799 "paddb %%mm5, %%mm6 \n\t"
4800 "movq 56(%%edi,%%edx,), %%mm7 \n\t" /* load Sub(x) for 8th 8 bytes */
4801 "movq %%mm6, 48(%%edi,%%edx,) \n\t" /* write Raw(x) for 7th 8 bytes */
4802 "addl $64, %%edx \n\t"
4803 "paddb %%mm6, %%mm7 \n\t"
4804 "cmpl %%ecx, %%edx \n\t"
4805 "movq %%mm7, -8(%%edi,%%edx,) \n\t" /* write Raw(x) for 8th 8 bytes */
4808 "cmpl _MMXLength, %%edx \n\t"
4812 "movq (%%edi,%%edx,), %%mm0 \n\t"
4813 "addl $8, %%edx \n\t"
4814 "paddb %%mm7, %%mm0 \n\t"
4815 "cmpl _MMXLength, %%edx \n\t"
4816 "movq %%mm0, -8(%%edi,%%edx,) \n\t" /* -8 to offset early addl edx */
4817 "movq %%mm0, %%mm7 \n\t" /* move calculated Raw(x) data */
4818 /* to mm7 to be new Raw(x-bpp) */
4824 : "=a" (dummy_value_a
), /* 0 // output regs (dummy) */
4825 "=D" (dummy_value_D
) /* 1 */
4827 : "0" (bpp
), /* eax // input regs */
4830 : "%ecx", "%edx", "%esi" /* clobber list */
4831 #if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4832 , "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7"
/* Fallback variant: each pass adds the quadword at lp + edx to the quadword
 * at rp + edx and stores the sum back at rp, 8 bytes per pass. */
4838 default: /* bpp greater than 8 bytes GRR BOGUS */
4840 __asm__
__volatile__ (
4841 "movl _dif, %%edx \n\t"
4842 /* preload "movl row, %%edi \n\t" */
4843 "movl %%edi, %%esi \n\t" /* lp = row */
4844 /* preload "movl bpp, %%eax \n\t" */
4845 "addl %%eax, %%edi \n\t" /* rp = row + bpp */
4848 "movq (%%edi,%%edx,), %%mm0 \n\t"
4849 "movq (%%esi,%%edx,), %%mm1 \n\t"
4850 "addl $8, %%edx \n\t"
4851 "paddb %%mm1, %%mm0 \n\t"
4852 "cmpl _MMXLength, %%edx \n\t"
4853 "movq %%mm0, -8(%%edi,%%edx,) \n\t" /* mov does not affect flags; */
4854 /* -8 to offset addl edx */
4857 : "=a" (dummy_value_a
), /* 0 // output regs (dummy) */
4858 "=D" (dummy_value_D
) /* 1 */
4860 : "0" (bpp
), /* eax // input regs */
4863 : "%edx", "%esi" /* clobber list */
4864 #if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4871 } /* end switch (bpp) */
/* Stage 3: clean-up.  Starting at _MMXLength, decode the bytes the MMX loop
 * left over one at a time until _FullLength is reached, then issue EMMS. */
4873 __asm__
__volatile__ (
4874 "movl _MMXLength, %%edx \n\t"
4875 /* pre "movl row, %%edi \n\t" */
4876 "cmpl _FullLength, %%edx \n\t"
4879 "movl %%edi, %%esi \n\t" /* lp = row */
4880 /* pre "movl bpp, %%eax \n\t" */
4881 "addl %%eax, %%edi \n\t" /* rp = row + bpp */
4882 "xorl %%eax, %%eax \n\t"
4885 "movb (%%esi,%%edx,), %%al \n\t"
4886 "addb %%al, (%%edi,%%edx,) \n\t"
4888 "cmpl _FullLength, %%edx \n\t"
4892 "EMMS \n\t" /* end MMX instructions */
4894 : "=a" (dummy_value_a
), /* 0 // output regs (dummy) */
4895 "=D" (dummy_value_D
) /* 1 */
4897 : "0" (bpp
), /* eax // input regs */
4900 : "%edx", "%esi" /* clobber list */
4903 } /* end of png_read_filter_row_mmx_sub() */
4909 /*===========================================================================*/
4911 /* P N G _ R E A D _ F I L T E R _ R O W _ M M X _ U P */
4913 /*===========================================================================*/
4915 /* Optimized code for PNG Up filter decoder */
/*
 * Decode a PNG "Up"-filtered row in place: each byte of row has the byte at
 * the same offset in prev_row added to it, modulo 256.
 *
 * row_info - supplies rowbytes, the number of bytes to process (len).
 * row      - the filtered row (addressed through edi); overwritten with the
 *            decoded bytes.
 * prev_row - the previous row (addressed through esi); only read.
 *
 * ebx is the running byte offset.  The row is processed in four passes:
 * single bytes until row reaches an 8-byte boundary, 64 bytes per pass using
 * all eight MMX registers, 8 bytes per pass, and finally single bytes for
 * whatever is left.  EMMS is issued at the end.
 * NOTE(review): ebx is used as the index, but no save/restore of ebx and no
 * "%ebx" clobber entry is visible in this excerpt -- confirm.
 */
4917 static void /* PRIVATE */
4918 png_read_filter_row_mmx_up(png_row_infop row_info
, png_bytep row
,
4922 int dummy_value_d
; /* fix 'forbidden register 3 (dx) was spilled' error */
4926 len
= row_info
->rowbytes
; /* number of bytes to filter */
4928 __asm__
__volatile__ (
4929 /* pre "movl row, %%edi \n\t" */
4930 /* get # of bytes to alignment */
/* ecx = ((row + 7) & ~7) - row: number of bytes before the next 8-byte
 * boundary of row. */
4934 "movl %%edi, %%ecx \n\t"
4935 "xorl %%ebx, %%ebx \n\t"
4936 "addl $0x7, %%ecx \n\t"
4937 "xorl %%eax, %%eax \n\t"
4938 "andl $0xfffffff8, %%ecx \n\t"
4939 /* pre "movl prev_row, %%esi \n\t" */
4940 "subl %%edi, %%ecx \n\t"
4943 "up_lp1: \n\t" /* fix alignment */
4944 "movb (%%edi,%%ebx,), %%al \n\t"
4945 "addb (%%esi,%%ebx,), %%al \n\t"
4947 "cmpl %%ecx, %%ebx \n\t"
4948 "movb %%al, -1(%%edi,%%ebx,) \n\t" /* mov does not affect flags; -1 to */
4949 "jb up_lp1 \n\t" /* offset incl ebx */
/* Split the length: edx keeps the bytes left over after whole 64-byte
 * groups, ecx becomes the offset at which the unrolled loop must stop. */
4952 /* pre "movl len, %%edx \n\t" */
4953 "movl %%edx, %%ecx \n\t"
4954 "subl %%ebx, %%edx \n\t" /* subtract alignment fix */
4955 "andl $0x0000003f, %%edx \n\t" /* calc bytes over mult of 64 */
4956 "subl %%edx, %%ecx \n\t" /* drop over bytes from length */
4958 /* unrolled loop - use all MMX registers and interleave to reduce */
4959 /* number of branch instructions (loops) and reduce partial stalls */
4961 "movq (%%esi,%%ebx,), %%mm1 \n\t"
4962 "movq (%%edi,%%ebx,), %%mm0 \n\t"
4963 "movq 8(%%esi,%%ebx,), %%mm3 \n\t"
4964 "paddb %%mm1, %%mm0 \n\t"
4965 "movq 8(%%edi,%%ebx,), %%mm2 \n\t"
4966 "movq %%mm0, (%%edi,%%ebx,) \n\t"
4967 "paddb %%mm3, %%mm2 \n\t"
4968 "movq 16(%%esi,%%ebx,), %%mm5 \n\t"
4969 "movq %%mm2, 8(%%edi,%%ebx,) \n\t"
4970 "movq 16(%%edi,%%ebx,), %%mm4 \n\t"
4971 "movq 24(%%esi,%%ebx,), %%mm7 \n\t"
4972 "paddb %%mm5, %%mm4 \n\t"
4973 "movq 24(%%edi,%%ebx,), %%mm6 \n\t"
4974 "movq %%mm4, 16(%%edi,%%ebx,) \n\t"
4975 "paddb %%mm7, %%mm6 \n\t"
4976 "movq 32(%%esi,%%ebx,), %%mm1 \n\t"
4977 "movq %%mm6, 24(%%edi,%%ebx,) \n\t"
4978 "movq 32(%%edi,%%ebx,), %%mm0 \n\t"
4979 "movq 40(%%esi,%%ebx,), %%mm3 \n\t"
4980 "paddb %%mm1, %%mm0 \n\t"
4981 "movq 40(%%edi,%%ebx,), %%mm2 \n\t"
4982 "movq %%mm0, 32(%%edi,%%ebx,) \n\t"
4983 "paddb %%mm3, %%mm2 \n\t"
4984 "movq 48(%%esi,%%ebx,), %%mm5 \n\t"
4985 "movq %%mm2, 40(%%edi,%%ebx,) \n\t"
4986 "movq 48(%%edi,%%ebx,), %%mm4 \n\t"
4987 "movq 56(%%esi,%%ebx,), %%mm7 \n\t"
4988 "paddb %%mm5, %%mm4 \n\t"
4989 "movq 56(%%edi,%%ebx,), %%mm6 \n\t"
4990 "movq %%mm4, 48(%%edi,%%ebx,) \n\t"
4991 "addl $64, %%ebx \n\t"
4992 "paddb %%mm7, %%mm6 \n\t"
4993 "cmpl %%ecx, %%ebx \n\t"
4994 "movq %%mm6, -8(%%edi,%%ebx,) \n\t" /* (+56)movq does not affect flags; */
4995 "jb up_loop \n\t" /* -8 to offset addl ebx */
4997 "cmpl $0, %%edx \n\t" /* test for bytes over mult of 64 */
5000 "cmpl $8, %%edx \n\t" /* test for less than 8 bytes */
5001 "jb up_lt8 \n\t" /* [added by lcreeve@netins.net] */
/* Extend the stop offset in ecx by the whole 8-byte groups contained in the
 * leftover count; edx keeps the final 0..7 bytes. */
5003 "addl %%edx, %%ecx \n\t"
5004 "andl $0x00000007, %%edx \n\t" /* calc bytes over mult of 8 */
5005 "subl %%edx, %%ecx \n\t" /* drop over bytes from length */
5008 "up_lpA: \n\t" /* use MMX regs to update 8 bytes sim. */
5009 "movq (%%esi,%%ebx,), %%mm1 \n\t"
5010 "movq (%%edi,%%ebx,), %%mm0 \n\t"
5011 "addl $8, %%ebx \n\t"
5012 "paddb %%mm1, %%mm0 \n\t"
5013 "cmpl %%ecx, %%ebx \n\t"
5014 "movq %%mm0, -8(%%edi,%%ebx,) \n\t" /* movq does not affect flags; -8 to */
5015 "jb up_lpA \n\t" /* offset add ebx */
5016 "cmpl $0, %%edx \n\t" /* test for bytes over mult of 8 */
5020 "xorl %%eax, %%eax \n\t"
5021 "addl %%edx, %%ecx \n\t" /* move over byte count into counter */
5023 "up_lp2: \n\t" /* use x86 regs for remaining bytes */
5024 "movb (%%edi,%%ebx,), %%al \n\t"
5025 "addb (%%esi,%%ebx,), %%al \n\t"
5027 "cmpl %%ecx, %%ebx \n\t"
5028 "movb %%al, -1(%%edi,%%ebx,) \n\t" /* mov does not affect flags; -1 to */
5029 "jb up_lp2 \n\t" /* offset inc ebx */
5032 "EMMS \n\t" /* conversion of filtered row complete */
5037 : "=d" (dummy_value_d
), /* 0 // output regs (dummy) */
5038 "=S" (dummy_value_S
), /* 1 */
5039 "=D" (dummy_value_D
) /* 2 */
5041 : "0" (len
), /* edx // input regs */
5042 "1" (prev_row
), /* esi */
5045 : "%eax", "%ecx" // clobber list (no input regs!)
5050 #if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
5051 , "%mm0", "%mm1", "%mm2", "%mm3"
5052 , "%mm4", "%mm5", "%mm6", "%mm7"
5056 } /* end of png_read_filter_row_mmx_up() */
5058 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
5063 /*===========================================================================*/
5065 /* P N G _ R E A D _ F I L T E R _ R O W */
5067 /*===========================================================================*/
5070 /* Optimized png_read_filter_row routines */
/*
 * Undo the filter applied to one row of image data, in place.
 *
 * png_ptr  - read structure; its asm_flags, mmx_bitdepth_threshold and
 *            mmx_rowbytes_threshold fields decide whether an MMX routine is
 *            called, and it is the target of png_warning().
 * row_info - supplies pixel_depth and rowbytes for the row.
 * row      - the filtered row; the decoded bytes are written back into it.
 * prev_row - the previous row, read by the Up, Avg and Paeth filters.
 * filter   - filter type, matched against the PNG_FILTER_VALUE_* constants.
 *
 * For Sub, Up, Avg and Paeth the MMX routine is called only when the
 * matching PNG_ASM_FLAG_MMX_READ_FILTER_* bit is set in asm_flags AND
 * pixel_depth >= mmx_bitdepth_threshold AND rowbytes >=
 * mmx_rowbytes_threshold.  A plain C implementation of each filter follows
 * the MMX call.
 * NOTE(review): the "Ignoring bad row-filter type" warning at the end looks
 * like the default branch of the switch -- the label itself is not visible
 * in this excerpt.
 */
5073 png_read_filter_row(png_structp png_ptr
, png_row_infop row_info
, png_bytep
5074 row
, png_bytep prev_row
, int filter
)
5080 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
5081 /* GRR: these are superseded by png_ptr->asm_flags: */
5082 #define UseMMX_sub 1 /* GRR: converted 20000730 */
5083 #define UseMMX_up 1 /* GRR: converted 20000729 */
5084 #define UseMMX_avg 1 /* GRR: converted 20000828 (+ 16-bit bugfix 20000916) */
5085 #define UseMMX_paeth 1 /* GRR: converted 20000828 */
5087 if (_mmx_supported
== 2) {
5088 /* this should have happened in png_init_mmx_flags() already */
5089 #if !defined(PNG_1_0_X)
5090 png_warning(png_ptr
, "asm_flags may not have been initialized");
5094 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
/* Debug output: put a short name for the filter (with an "MMX" suffix when
 * the corresponding asm flag is set) into filnm, then log the row number,
 * row address, pixel depth, bytes per pixel and rowbytes. */
5097 png_debug(1, "in png_read_filter_row (pnggccrd.c)\n");
5100 case 0: sprintf(filnm
, "none");
5102 case 1: sprintf(filnm
, "sub-%s",
5103 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
5104 #if !defined(PNG_1_0_X)
5105 (png_ptr
->asm_flags
& PNG_ASM_FLAG_MMX_READ_FILTER_SUB
)? "MMX" :
5110 case 2: sprintf(filnm
, "up-%s",
5111 #ifdef PNG_ASSEMBLER_CODE_SUPPORTED
5112 #if !defined(PNG_1_0_X)
5113 (png_ptr
->asm_flags
& PNG_ASM_FLAG_MMX_READ_FILTER_UP
)? "MMX" :
5118 case 3: sprintf(filnm
, "avg-%s",
5119 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
5120 #if !defined(PNG_1_0_X)
5121 (png_ptr
->asm_flags
& PNG_ASM_FLAG_MMX_READ_FILTER_AVG
)? "MMX" :
5126 case 4: sprintf(filnm
, "Paeth-%s",
5127 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
5128 #if !defined(PNG_1_0_X)
5129 (png_ptr
->asm_flags
& PNG_ASM_FLAG_MMX_READ_FILTER_PAETH
)? "MMX":
5134 default: sprintf(filnm
, "unknw");
5137 png_debug2(0, "row_number=%5ld, %5s, ", png_ptr
->row_number
, filnm
);
5138 png_debug1(0, "row=0x%08lx, ", (unsigned long)row
);
5139 png_debug2(0, "pixdepth=%2d, bytes=%d, ", (int)row_info
->pixel_depth
,
5140 (int)((row_info
->pixel_depth
+ 7) >> 3));
5141 png_debug1(0,"rowbytes=%8ld\n", row_info
->rowbytes
);
5142 #endif /* PNG_DEBUG */
5146 case PNG_FILTER_VALUE_NONE
:
/* Sub: the MMX routine exists only when PNG_THREAD_UNSAFE_OK is defined.
 * The C version starts at rp = row + bpp and adds *lp to *rp (mod 256) for
 * i = bpp .. rowbytes - 1. */
5149 case PNG_FILTER_VALUE_SUB
:
5150 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
5151 #if !defined(PNG_1_0_X)
5152 if ((png_ptr
->asm_flags
& PNG_ASM_FLAG_MMX_READ_FILTER_SUB
) &&
5153 (row_info
->pixel_depth
>= png_ptr
->mmx_bitdepth_threshold
) &&
5154 (row_info
->rowbytes
>= png_ptr
->mmx_rowbytes_threshold
))
5159 png_read_filter_row_mmx_sub(row_info
, row
);
5162 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
5165 png_uint_32 istop
= row_info
->rowbytes
;
5166 png_uint_32 bpp
= (row_info
->pixel_depth
+ 7) >> 3;
5167 png_bytep rp
= row
+ bpp
;
5170 for (i
= bpp
; i
< istop
; i
++)
5172 *rp
= (png_byte
)(((int)(*rp
) + (int)(*lp
++)) & 0xff);
5175 } /* end !UseMMX_sub */
/* Up: the C version adds the byte from prev_row (pp) to every one of the
 * rowbytes bytes of the row (mod 256). */
5178 case PNG_FILTER_VALUE_UP
:
5179 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
5180 #if !defined(PNG_1_0_X)
5181 if ((png_ptr
->asm_flags
& PNG_ASM_FLAG_MMX_READ_FILTER_UP
) &&
5182 (row_info
->pixel_depth
>= png_ptr
->mmx_bitdepth_threshold
) &&
5183 (row_info
->rowbytes
>= png_ptr
->mmx_rowbytes_threshold
))
5188 png_read_filter_row_mmx_up(row_info
, row
, prev_row
);
5191 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
5194 png_uint_32 istop
= row_info
->rowbytes
;
5196 png_bytep pp
= prev_row
;
5198 for (i
= 0; i
< istop
; ++i
)
5200 *rp
= (png_byte
)(((int)(*rp
) + (int)(*pp
++)) & 0xff);
5203 } /* end !UseMMX_up */
/* Avg: in the C version the first bpp bytes add half of the byte above
 * (*pp >> 1); the remaining rowbytes - bpp bytes add (*pp + *lp) >> 1. */
5206 case PNG_FILTER_VALUE_AVG
:
5207 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
5208 #if !defined(PNG_1_0_X)
5209 if ((png_ptr
->asm_flags
& PNG_ASM_FLAG_MMX_READ_FILTER_AVG
) &&
5210 (row_info
->pixel_depth
>= png_ptr
->mmx_bitdepth_threshold
) &&
5211 (row_info
->rowbytes
>= png_ptr
->mmx_rowbytes_threshold
))
5216 png_read_filter_row_mmx_avg(row_info
, row
, prev_row
);
5219 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
5223 png_bytep pp
= prev_row
;
5225 png_uint_32 bpp
= (row_info
->pixel_depth
+ 7) >> 3;
5226 png_uint_32 istop
= row_info
->rowbytes
- bpp
;
5228 for (i
= 0; i
< bpp
; i
++)
5230 *rp
= (png_byte
)(((int)(*rp
) +
5231 ((int)(*pp
++) >> 1)) & 0xff);
5235 for (i
= 0; i
< istop
; i
++)
5237 *rp
= (png_byte
)(((int)(*rp
) +
5238 ((int)(*pp
++ + *lp
++) >> 1)) & 0xff);
5241 } /* end !UseMMX_avg */
/* Paeth: in the C version the first bpp bytes simply add the byte above.
 * For the rest, pa, pb and pc are the absolute values of p, pc and p + pc,
 * and the predictor is a if pa is the smallest, else b if pb <= pc, else c. */
5244 case PNG_FILTER_VALUE_PAETH
:
5245 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
5246 #if !defined(PNG_1_0_X)
5247 if ((png_ptr
->asm_flags
& PNG_ASM_FLAG_MMX_READ_FILTER_PAETH
) &&
5248 (row_info
->pixel_depth
>= png_ptr
->mmx_bitdepth_threshold
) &&
5249 (row_info
->rowbytes
>= png_ptr
->mmx_rowbytes_threshold
))
5254 png_read_filter_row_mmx_paeth(row_info
, row
, prev_row
);
5257 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
5261 png_bytep pp
= prev_row
;
5263 png_bytep cp
= prev_row
;
5264 png_uint_32 bpp
= (row_info
->pixel_depth
+ 7) >> 3;
5265 png_uint_32 istop
= row_info
->rowbytes
- bpp
;
5267 for (i
= 0; i
< bpp
; i
++)
5269 *rp
= (png_byte
)(((int)(*rp
) + (int)(*pp
++)) & 0xff);
5273 for (i
= 0; i
< istop
; i
++) /* use leftover rp,pp */
5275 int a
, b
, c
, pa
, pb
, pc
, p
;
5289 pa
= p
< 0 ? -p
: p
;
5290 pb
= pc
< 0 ? -pc
: pc
;
5291 pc
= (p
+ pc
) < 0 ? -(p
+ pc
) : p
+ pc
;
/* NOTE(review): the `if` on the next line repeats the first condition of the
 * conditional expression that follows; it looks like the remnant of a
 * commented-out if/else form of the same selection -- confirm. */
5295 if (pa <= pb && pa <= pc)
5303 p
= (pa
<= pb
&& pa
<= pc
) ? a
: (pb
<= pc
) ? b
: c
;
5305 *rp
= (png_byte
)(((int)(*rp
) + p
) & 0xff);
5308 } /* end !UseMMX_paeth */
5312 png_warning(png_ptr
, "Ignoring bad row-filter type");
5318 #endif /* PNG_HAVE_ASSEMBLER_READ_FILTER_ROW */
5321 /*===========================================================================*/
5323 /* P N G _ M M X _ S U P P O R T */
5325 /*===========================================================================*/
5327 /* GRR NOTES: (1) the following code assumes 386 or better (pushfl/popfl)
5328 * (2) all instructions compile with gcc 2.7.2.3 and later
5329 * (3) the function is moved down here to prevent gcc from
5330 * inlining it in multiple places and then barfing be-
5331 * cause the ".NOT_SUPPORTED" label is multiply defined
5332 * [is there a way to signal that a *single* function should
5333 * not be inlined? is there a way to modify the label for
5334 * each inlined instance, e.g., by appending _1, _2, etc.?
5335 * maybe if we don't use a leading "." in the label name? (nope...sigh)]
/*
 * Detect MMX support at run time and return the result.
 *
 * When PNG_MMX_CODE_SUPPORTED is defined, the asm block:
 *   1. toggles the ID bit (bit 21) of EFLAGS and reads it back to find out
 *      whether the CPUID instruction exists;
 *   2. executes CPUID with eax = 0 and requires the returned eax to be >= 1;
 *   3. executes CPUID with eax = 1 and tests bit 23 (MMX) of edx.
 * The outcome (1 = MMX available, 0 = not) is stored in _mmx_supported.
 * In every configuration the function returns the current value of
 * _mmx_supported.
 *
 * ebx, ecx and edx are saved and restored by hand inside the asm block, so
 * only eax appears in the clobber list.
 * NOTE(review): the asm also stores to _mmx_supported and modifies the
 * flags, yet "memory" and "cc" are commented out of the clobber list --
 * confirm the compiler cannot cache _mmx_supported across this statement.
 */
5339 png_mmx_support(void)
5341 #if defined(PNG_MMX_CODE_SUPPORTED)
5342 __asm__
__volatile__ (
5343 "pushl %%ebx \n\t" // ebx gets clobbered by CPUID instruction
5344 "pushl %%ecx \n\t" // so does ecx...
5345 "pushl %%edx \n\t" // ...and edx (but ecx & edx safe on Linux)
5346 // ".byte 0x66 \n\t" // convert 16-bit pushf to 32-bit pushfd
5347 // "pushf \n\t" // 16-bit pushf
5348 "pushfl \n\t" // save Eflag to stack
5349 "popl %%eax \n\t" // get Eflag from stack into eax
5350 "movl %%eax, %%ecx \n\t" // make another copy of Eflag in ecx
5351 "xorl $0x200000, %%eax \n\t" // toggle ID bit in Eflag (i.e., bit 21)
5352 "pushl %%eax \n\t" // save modified Eflag back to stack
5353 // ".byte 0x66 \n\t" // convert 16-bit popf to 32-bit popfd
5354 // "popf \n\t" // 16-bit popf
5355 "popfl \n\t" // restore modified value to Eflag reg
5356 "pushfl \n\t" // save Eflag to stack
5357 "popl %%eax \n\t" // get Eflag from stack
5358 "pushl %%ecx \n\t" // save original Eflag to stack
5359 "popfl \n\t" // restore original Eflag
5360 "xorl %%ecx, %%eax \n\t" // compare new Eflag with original Eflag
5361 "jz 0f \n\t" // if same, CPUID instr. is not supported
5363 "xorl %%eax, %%eax \n\t" // set eax to zero
5364 // ".byte 0x0f, 0xa2 \n\t" // CPUID instruction (two-byte opcode)
5365 "cpuid \n\t" // get the CPU identification info
5366 "cmpl $1, %%eax \n\t" // make sure eax returned a non-zero value
5367 "jl 0f \n\t" // if eax is zero, MMX is not supported
5369 "xorl %%eax, %%eax \n\t" // set eax to zero and...
5370 "incl %%eax \n\t" // ...increment eax to 1. This pair is
5371 // faster than the instruction "mov eax, 1"
5372 "cpuid \n\t" // get the CPU identification info again
5373 "andl $0x800000, %%edx \n\t" // mask out all bits but MMX bit (23)
5374 "cmpl $0, %%edx \n\t" // 0 = MMX not supported
5375 "jz 0f \n\t" // non-zero = yes, MMX IS supported
5377 "movl $1, %%eax \n\t" // set return value to 1
5378 "jmp 1f \n\t" // DONE: have MMX support
5380 "0: \n\t" // .NOT_SUPPORTED: target label for jump instructions
5381 "movl $0, %%eax \n\t" // set return value to 0
5382 "1: \n\t" // .RETURN: target label for jump instructions
5383 "movl %%eax, _mmx_supported \n\t" // save in global static variable, too
5384 "popl %%edx \n\t" // restore edx
5385 "popl %%ecx \n\t" // restore ecx
5386 "popl %%ebx \n\t" // restore ebx
5388 // "ret \n\t" // DONE: no MMX support
5389 // (fall through to standard C "ret")
5391 : // output list (none)
5393 : // any variables used on input (none)
5395 : "%eax" // clobber list
5396 // , "%ebx", "%ecx", "%edx" // GRR: we handle these manually
5397 // , "memory" // if write to a variable gcc thought was in a reg
5398 // , "cc" // "condition codes" (flag bits)
5402 #endif /* PNG_MMX_CODE_SUPPORTED */
5404 return _mmx_supported
;
5408 #endif /* PNG_USE_PNGGCCRD */