1/* pnggccrd.c - mixed C/assembler version of utilities to read a PNG file
2 *
3 * For Intel x86 CPU (Pentium-MMX or later) and GNU C compiler.
4 *
5 * See http://www.intel.com/drg/pentiumII/appnotes/916/916.htm
6 * and http://www.intel.com/drg/pentiumII/appnotes/923/923.htm
7 * for Intel's performance analysis of the MMX vs. non-MMX code.
8 *
9 * libpng version 1.2.7 - September 12, 2004
10 * For conditions of distribution and use, see copyright notice in png.h
11 * Copyright (c) 1998-2004 Glenn Randers-Pehrson
12 * Copyright (c) 1998, Intel Corporation
13 *
14 * Based on MSVC code contributed by Nirav Chhatrapati, Intel Corp., 1998.
15 * Interface to libpng contributed by Gilles Vollant, 1999.
16 * GNU C port by Greg Roelofs, 1999-2001.
17 *
18 * Lines 2350-4300 converted in place with intel2gas 1.3.1:
19 *
20 * intel2gas -mdI pnggccrd.c.partially-msvc -o pnggccrd.c
21 *
22 * and then cleaned up by hand. See http://hermes.terminal.at/intel2gas/ .
23 *
24 * NOTE: A sufficiently recent version of GNU as (or as.exe under DOS/Windows)
25 * is required to assemble the newer MMX instructions such as movq.
26 * For djgpp, see
27 *
28 * ftp://ftp.simtel.net/pub/simtelnet/gnu/djgpp/v2gnu/bnu281b.zip
29 *
30 * (or a later version in the same directory). For Linux, check your
31 * distribution's web site(s) or try these links:
32 *
33 * http://rufus.w3.org/linux/RPM/binutils.html
34 * http://www.debian.org/Packages/stable/devel/binutils.html
35 * ftp://ftp.slackware.com/pub/linux/slackware/slackware/slakware/d1/
36 * binutils.tgz
37 *
38 * For other platforms, see the main GNU site:
39 *
40 * ftp://ftp.gnu.org/pub/gnu/binutils/
41 *
42 * Version 2.5.2l.15 is definitely too old...
43 */
44
45/*
46 * TEMPORARY PORTING NOTES AND CHANGELOG (mostly by Greg Roelofs)
47 * =====================================
48 *
49 * 19991006:
50 * - fixed sign error in post-MMX cleanup code (16- & 32-bit cases)
51 *
52 * 19991007:
53 * - additional optimizations (possible or definite):
54 * x [DONE] write MMX code for 64-bit case (pixel_bytes == 8) [not tested]
55 * - write MMX code for 48-bit case (pixel_bytes == 6)
56 * - figure out what's up with 24-bit case (pixel_bytes == 3):
57 * why subtract 8 from width_mmx in the pass 4/5 case?
58 * (only width_mmx case) (near line 1606)
59 * x [DONE] replace pixel_bytes within each block with the true
60 * constant value (or are compilers smart enough to do that?)
61 * - rewrite all MMX interlacing code so it's aligned with
62 * the *beginning* of the row buffer, not the end. This
63 * would not only allow one to eliminate half of the memory
64 * writes for odd passes (that is, pass == odd), it may also
65 * eliminate some unaligned-data-access exceptions (assuming
66 * there's a penalty for not aligning 64-bit accesses on
67 * 64-bit boundaries). The only catch is that the "leftover"
68 * pixel(s) at the end of the row would have to be saved,
69 * but there are enough unused MMX registers in every case,
70 * so this is not a problem. A further benefit is that the
71 * post-MMX cleanup code (C code) in at least some of the
72 * cases could be done within the assembler block.
73 * x [DONE] the "v3 v2 v1 v0 v7 v6 v5 v4" comments are confusing,
74 * inconsistent, and don't match the MMX Programmer's Reference
75 * Manual conventions anyway. They should be changed to
76 * "b7 b6 b5 b4 b3 b2 b1 b0," where b0 indicates the byte that
77 * was lowest in memory (e.g., corresponding to a left pixel)
78 * and b7 is the byte that was highest (e.g., a right pixel).
79 *
80 * 19991016:
81 * - Brennan's Guide notwithstanding, gcc under Linux does *not*
82 * want globals prefixed by underscores when referencing them--
83 * i.e., if the variable is const4, then refer to it as const4,
84 * not _const4. This seems to be a djgpp-specific requirement.
85 * Also, such variables apparently *must* be declared outside
86 * of functions; neither static nor automatic variables work if
87 * defined within the scope of a single function, but both
88 * static and truly global (multi-module) variables work fine.
89 *
90 * 19991023:
91 * - fixed png_combine_row() non-MMX replication bug (odd passes only?)
92 * - switched from string-concatenation-with-macros to cleaner method of
93 * renaming global variables for djgpp--i.e., always use prefixes in
94 * inlined assembler code (== strings) and conditionally rename the
95 * variables, not the other way around. Hence _const4, _mask8_0, etc.
96 *
97 * 19991024:
98 * - fixed mmxsupport()/png_do_read_interlace() first-row bug
99 * This one was severely weird: even though mmxsupport() doesn't touch
100 * ebx (where "row" pointer was stored), it nevertheless managed to zero
101 * the register (even in static/non-fPIC code--see below), which in turn
102 * caused png_do_read_interlace() to return prematurely on the first row of
103 * interlaced images (i.e., without expanding the interlaced pixels).
104 * Inspection of the generated assembly code didn't turn up any clues,
105 * although it did point at a minor optimization (i.e., get rid of
106 * mmx_supported_local variable and just use eax). Possibly the CPUID
107 * instruction is more destructive than it looks? (Not yet checked.)
108 * - "info gcc" was next to useless, so compared fPIC and non-fPIC assembly
109 * listings... Apparently register spillage has to do with ebx, since
110 * it's used to index the global offset table. Commenting it out of the
111 * input-reg lists in png_combine_row() eliminated compiler barfage, so
112 * ifdef'd with __PIC__ macro: if defined, use a global for unmask
113 *
114 * 19991107:
115 * - verified CPUID clobberage: 12-char string constant ("GenuineIntel",
116 * "AuthenticAMD", etc.) placed in ebx:ecx:edx. Still need to polish.
117 *
118 * 19991120:
119 * - made "diff" variable (now "_dif") global to simplify conversion of
120 * filtering routines (running out of regs, sigh). "diff" is still used
121 * in interlacing routines, however.
122 * - fixed up both versions of mmxsupport() (ORIG_THAT_USED_TO_CLOBBER_EBX
123 * macro determines which is used); original not yet tested.
124 *
125 * 20000213:
126 * - when compiling with gcc, be sure to use -fomit-frame-pointer
127 *
128 * 20000319:
129 * - fixed a register-name typo in png_do_read_interlace(), default (MMX) case,
130 * pass == 4 or 5, that caused visible corruption of interlaced images
131 *
132 * 20000623:
133 * - Various problems were reported with gcc 2.95.2 in the Cygwin environment,
134 * many of the form "forbidden register 0 (ax) was spilled for class AREG."
135 * This is explained at http://gcc.gnu.org/fom_serv/cache/23.html, and
136 * Chuck Wilson supplied a patch involving dummy output registers. See
137 * http://sourceforge.net/bugs/?func=detailbug&bug_id=108741&group_id=5624
138 * for the original (anonymous) SourceForge bug report.
139 *
140 * 20000706:
141 * - Chuck Wilson passed along these remaining gcc 2.95.2 errors:
142 * pnggccrd.c: In function `png_combine_row':
143 * pnggccrd.c:525: more than 10 operands in `asm'
144 * pnggccrd.c:669: more than 10 operands in `asm'
145 * pnggccrd.c:828: more than 10 operands in `asm'
146 * pnggccrd.c:994: more than 10 operands in `asm'
147 * pnggccrd.c:1177: more than 10 operands in `asm'
148 * They are all the same problem and can be worked around by using the
149 * global _unmask variable unconditionally, not just in the -fPIC case.
150 * Reportedly earlier versions of gcc also have the problem with more than
151 * 10 operands; they just don't report it. Much strangeness ensues, etc.
152 *
153 * 20000729:
154 * - enabled png_read_filter_row_mmx_up() (shortest remaining unconverted
155 * MMX routine); began converting png_read_filter_row_mmx_sub()
156 * - to finish remaining sections:
157 * - clean up indentation and comments
158 * - preload local variables
159 * - add output and input regs (order of former determines numerical
160 * mapping of latter)
161 * - avoid all usage of ebx (including bx, bh, bl) register [20000823]
162 * - remove "$" from addressing of Shift and Mask variables [20000823]
163 *
164 * 20000731:
165 * - global union vars causing segfaults in png_read_filter_row_mmx_sub()?
166 *
167 * 20000822:
168 * - ARGH, stupid png_read_filter_row_mmx_sub() segfault only happens with
169 * shared-library (-fPIC) version! Code works just fine as part of static
170 * library. Damn damn damn damn damn, should have tested that sooner.
171 * ebx is getting clobbered again (explicitly this time); need to save it
172 * on stack or rewrite asm code to avoid using it altogether. Blargh!
173 *
174 * 20000823:
175 * - first section was trickiest; all remaining sections have ebx -> edx now.
176 * (-fPIC works again.) Also added missing underscores to various Shift*
177 * and *Mask* globals and got rid of leading "$" signs.
178 *
179 * 20000826:
180 * - added visual separators to help navigate microscopic printed copies
181 * (http://pobox.com/~newt/code/gpr-latest.zip, mode 10); started working
182 * on png_read_filter_row_mmx_avg()
183 *
184 * 20000828:
185 * - finished png_read_filter_row_mmx_avg(): only Paeth left! (930 lines...)
186 * What the hell, did png_read_filter_row_mmx_paeth(), too. Comments not
187 * cleaned up/shortened in either routine, but functionality is complete
188 * and seems to be working fine.
189 *
190 * 20000829:
191 * - ahhh, figured out last(?) bit of gcc/gas asm-fu: if register is listed
192 * as an input reg (with dummy output variables, etc.), then it *cannot*
193 * also appear in the clobber list or gcc 2.95.2 will barf. The solution
194 * is simple enough...
195 *
196 * 20000914:
197 * - bug in png_read_filter_row_mmx_avg(): 16-bit grayscale not handled
198 * correctly (but 48-bit RGB just fine)
199 *
200 * 20000916:
201 * - fixed bug in png_read_filter_row_mmx_avg(), bpp == 2 case; three errors:
202 * - "_ShiftBpp.use = 24;" should have been "_ShiftBpp.use = 16;"
203 * - "_ShiftRem.use = 40;" should have been "_ShiftRem.use = 48;"
204 * - "psllq _ShiftRem, %%mm2" should have been "psrlq _ShiftRem, %%mm2"
205 *
206 * 20010101:
207 * - added new png_init_mmx_flags() function (here only because it needs to
208 * call mmxsupport(), which should probably become global png_mmxsupport());
209 * modified other MMX routines to run conditionally (png_ptr->asm_flags)
210 *
211 * 20010103:
212 * - renamed mmxsupport() to png_mmx_support(), with auto-set of mmx_supported,
213 * and made it public; moved png_init_mmx_flags() to png.c as internal func
214 *
215 * 20010104:
216 * - removed dependency on png_read_filter_row_c() (C code already duplicated
217 * within MMX version of png_read_filter_row()) so no longer necessary to
218 * compile it into pngrutil.o
219 *
220 * 20010310:
221 * - fixed buffer-overrun bug in png_combine_row() C code (non-MMX)
222 *
223 * 20020304:
224 * - eliminated incorrect use of width_mmx in pixel_bytes == 8 case
225 *
226 * 20040724:
227 * - more tinkering with clobber list at lines 4529 and 5033, to get
228 * it to compile on gcc-3.4.
229 *
230 * STILL TO DO:
231 * - test png_do_read_interlace() 64-bit case (pixel_bytes == 8)
232 * - write MMX code for 48-bit case (pixel_bytes == 6)
233 * - figure out what's up with 24-bit case (pixel_bytes == 3):
234 * why subtract 8 from width_mmx in the pass 4/5 case?
235 * (only width_mmx case) (near line 1606)
236 * - rewrite all MMX interlacing code so it's aligned with beginning
237 * of the row buffer, not the end (see 19991007 for details)
238 * x pick one version of mmxsupport() and get rid of the other
239 * - add error messages to any remaining bogus default cases
240 * - enable pixel_depth == 8 cases in png_read_filter_row()? (test speed)
241 * x add support for runtime enable/disable/query of various MMX routines
242 */
243
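/* Illustration only (never compiled; names are hypothetical): a minimal
 * sketch of the dummy-output-register workaround described in the 20000623
 * and 20000829 notes above.  Registers the asm modifies are tied to dummy
 * outputs ("0", "1", ...) instead of appearing both as inputs and in the
 * clobber list, which gcc 2.95.2 rejects:
 */
#if 0
static void
example_byte_copy(unsigned char *srcptr, unsigned char *dstptr, int len)
{
   int dummy_value_c;   /* tells gcc that %ecx is modified */
   int dummy_value_S;   /* ditto %esi */
   int dummy_value_D;   /* ditto %edi */

   __asm__ __volatile__ (
      "copyloop:               \n\t"
      "movb (%%esi), %%al      \n\t"   /* *srcptr */
      "movb %%al, (%%edi)      \n\t"   /* *dstptr */
      "incl %%esi              \n\t"
      "incl %%edi              \n\t"
      "decl %%ecx              \n\t"
      "jnz copyloop            \n\t"

      : "=c" (dummy_value_c),          /* output regs (dummy) */
        "=S" (dummy_value_S),
        "=D" (dummy_value_D)

      : "0" (len),                     /* ecx // input regs */
        "1" (srcptr),                  /* esi */
        "2" (dstptr)                   /* edi */

      : "%eax", "memory"               /* real clobbers only */
   );
}
#endif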
244#define PNG_INTERNAL
245#include "png.h"
246
247#if defined(PNG_USE_PNGGCCRD)
248
249int PNGAPI png_mmx_support(void);
250
251#ifdef PNG_USE_LOCAL_ARRAYS
252static const int FARDATA png_pass_start[7] = {0, 4, 0, 2, 0, 1, 0};
253static const int FARDATA png_pass_inc[7] = {8, 8, 4, 4, 2, 2, 1};
254static const int FARDATA png_pass_width[7] = {8, 4, 4, 2, 2, 1, 1};
255#endif
256
257#if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
258/* djgpp, Win32, and Cygwin add their own underscores to global variables,
259 * so define them without: */
260#if defined(__DJGPP__) || defined(WIN32) || defined(__CYGWIN__)
261# define _mmx_supported mmx_supported
262# define _const4 const4
263# define _const6 const6
264# define _mask8_0 mask8_0
265# define _mask16_1 mask16_1
266# define _mask16_0 mask16_0
267# define _mask24_2 mask24_2
268# define _mask24_1 mask24_1
269# define _mask24_0 mask24_0
270# define _mask32_3 mask32_3
271# define _mask32_2 mask32_2
272# define _mask32_1 mask32_1
273# define _mask32_0 mask32_0
274# define _mask48_5 mask48_5
275# define _mask48_4 mask48_4
276# define _mask48_3 mask48_3
277# define _mask48_2 mask48_2
278# define _mask48_1 mask48_1
279# define _mask48_0 mask48_0
280# define _LBCarryMask LBCarryMask
281# define _HBClearMask HBClearMask
282# define _ActiveMask ActiveMask
283# define _ActiveMask2 ActiveMask2
284# define _ActiveMaskEnd ActiveMaskEnd
285# define _ShiftBpp ShiftBpp
286# define _ShiftRem ShiftRem
287#ifdef PNG_THREAD_UNSAFE_OK
288# define _unmask unmask
289# define _FullLength FullLength
290# define _MMXLength MMXLength
291# define _dif dif
292# define _patemp patemp
293# define _pbtemp pbtemp
294# define _pctemp pctemp
295#endif
296#endif
297
298
299/* These constants are used in the inlined MMX assembly code.
300 Ignore gcc's "At top level: defined but not used" warnings. */
301
302/* GRR 20000706: originally _unmask was needed only when compiling with -fPIC,
303 * since that case uses the %ebx register for indexing the Global Offset Table
304 * and there were no other registers available. But gcc 2.95 and later emit
305 * "more than 10 operands in `asm'" errors when %ebx is used to preload unmask
306 * in the non-PIC case, so we'll just use the global unconditionally now.
307 */
308#ifdef PNG_THREAD_UNSAFE_OK
309static int _unmask;
310#endif
311
312static unsigned long long _mask8_0 = 0x0102040810204080LL;
313
314static unsigned long long _mask16_1 = 0x0101020204040808LL;
315static unsigned long long _mask16_0 = 0x1010202040408080LL;
316
317static unsigned long long _mask24_2 = 0x0101010202020404LL;
318static unsigned long long _mask24_1 = 0x0408080810101020LL;
319static unsigned long long _mask24_0 = 0x2020404040808080LL;
320
321static unsigned long long _mask32_3 = 0x0101010102020202LL;
322static unsigned long long _mask32_2 = 0x0404040408080808LL;
323static unsigned long long _mask32_1 = 0x1010101020202020LL;
324static unsigned long long _mask32_0 = 0x4040404080808080LL;
325
326static unsigned long long _mask48_5 = 0x0101010101010202LL;
327static unsigned long long _mask48_4 = 0x0202020204040404LL;
328static unsigned long long _mask48_3 = 0x0404080808080808LL;
329static unsigned long long _mask48_2 = 0x1010101010102020LL;
330static unsigned long long _mask48_1 = 0x2020202040404040LL;
331static unsigned long long _mask48_0 = 0x4040808080808080LL;
332
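/* Worked example (illustration): these constants pair with the mask-
 * replication sequence in png_combine_row() below: ~mask is replicated
 * into all eight bytes of an MMX register, ANDed with the constant, and
 * compared against zero with pcmpeqb.  Byte i of _mask8_0 holds only
 * interlace bit i (0x80 in the lowest byte down to 0x01 in the highest),
 * so for mask == 0x80, ~mask == 0x7f zeroes byte 0 alone, pcmpeqb turns
 * that byte into 0xff, and byte 0 of each 8-byte group is taken from the
 * source row while the other seven keep the destination bytes. */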
333static unsigned long long _const4 = 0x0000000000FFFFFFLL;
334/* static unsigned long long _const5 = 0x000000FFFFFF0000LL; */ /* NOT USED */
335static unsigned long long _const6 = 0x00000000000000FFLL;
336
337/* These are used in the row-filter routines and should/would be local */
338/* variables if not for gcc addressing limitations. */
339/* WARNING: Their presence probably defeats the thread safety of libpng. */
340
341#ifdef PNG_THREAD_UNSAFE_OK
342static png_uint_32 _FullLength;
343static png_uint_32 _MMXLength;
344static int _dif;
345static int _patemp; /* temp variables for Paeth routine */
346static int _pbtemp;
347static int _pctemp;
348#endif
349
350void /* PRIVATE */
351png_squelch_warnings(void)
352{
353#ifdef PNG_THREAD_UNSAFE_OK
354 _dif = _dif;
355 _patemp = _patemp;
356 _pbtemp = _pbtemp;
357 _pctemp = _pctemp;
358 _MMXLength = _MMXLength;
359#endif
360 _const4 = _const4;
361 _const6 = _const6;
362 _mask8_0 = _mask8_0;
363 _mask16_1 = _mask16_1;
364 _mask16_0 = _mask16_0;
365 _mask24_2 = _mask24_2;
366 _mask24_1 = _mask24_1;
367 _mask24_0 = _mask24_0;
368 _mask32_3 = _mask32_3;
369 _mask32_2 = _mask32_2;
370 _mask32_1 = _mask32_1;
371 _mask32_0 = _mask32_0;
372 _mask48_5 = _mask48_5;
373 _mask48_4 = _mask48_4;
374 _mask48_3 = _mask48_3;
375 _mask48_2 = _mask48_2;
376 _mask48_1 = _mask48_1;
377 _mask48_0 = _mask48_0;
378}
379#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
380
381
382static int _mmx_supported = 2;
383
384/*===========================================================================*/
385/* */
386/* P N G _ C O M B I N E _ R O W */
387/* */
388/*===========================================================================*/
389
390#if defined(PNG_HAVE_ASSEMBLER_COMBINE_ROW)
391
392#define BPP2 2
393#define BPP3 3 /* bytes per pixel (a.k.a. pixel_bytes) */
394#define BPP4 4
395#define BPP6 6 /* (defined only to help avoid cut-and-paste errors) */
396#define BPP8 8
397
398/* Combines the row recently read in with the previous row.
399 This routine takes care of alpha and transparency if requested.
400 This routine also handles the two methods of progressive display
401 of interlaced images, depending on the mask value.
402 The mask value describes which pixels are to be combined with
403 the row. The pattern always repeats every 8 pixels, so just 8
404 bits are needed. A one indicates the pixel is to be combined; a
405 zero indicates the pixel is to be skipped. This is in addition
406 to any alpha or transparency value associated with the pixel.
407 If you want all pixels to be combined, pass 0xff (255) in mask. */
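/* Worked example (illustration): for Adam7 pass 3, png_pass_start[3] == 2
   and png_pass_inc[3] == 4, so pixels 2 and 6 of every 8-pixel group are
   combined; with pixel 0 mapped to bit 0x80, that corresponds to a mask
   value of 0x22. */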
408
409/* Use this routine for the x86 platform - it uses a faster MMX routine
410 if the machine supports MMX. */
411
412void /* PRIVATE */
413png_combine_row(png_structp png_ptr, png_bytep row, int mask)
414{
415 png_debug(1, "in png_combine_row (pnggccrd.c)\n");
416
417#if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
418 if (_mmx_supported == 2) {
419#if !defined(PNG_1_0_X)
420 /* this should have happened in png_init_mmx_flags() already */
421 png_warning(png_ptr, "asm_flags may not have been initialized");
422#endif
423 png_mmx_support();
424 }
425#endif
426
427 if (mask == 0xff)
428 {
429 png_debug(2,"mask == 0xff: doing single png_memcpy()\n");
430 png_memcpy(row, png_ptr->row_buf + 1,
431 (png_size_t)PNG_ROWBYTES(png_ptr->row_info.pixel_depth,png_ptr->width));
432 }
433 else /* (png_combine_row() is never called with mask == 0) */
434 {
435 switch (png_ptr->row_info.pixel_depth)
436 {
437 case 1: /* png_ptr->row_info.pixel_depth */
438 {
439 png_bytep sp;
440 png_bytep dp;
441 int s_inc, s_start, s_end;
442 int m;
443 int shift;
444 png_uint_32 i;
445
446 sp = png_ptr->row_buf + 1;
447 dp = row;
448 m = 0x80;
449#if defined(PNG_READ_PACKSWAP_SUPPORTED)
450 if (png_ptr->transformations & PNG_PACKSWAP)
451 {
452 s_start = 0;
453 s_end = 7;
454 s_inc = 1;
455 }
456 else
457#endif
458 {
459 s_start = 7;
460 s_end = 0;
461 s_inc = -1;
462 }
463
464 shift = s_start;
465
466 for (i = 0; i < png_ptr->width; i++)
467 {
468 if (m & mask)
469 {
470 int value;
471
472 value = (*sp >> shift) & 0x1;
473 *dp &= (png_byte)((0x7f7f >> (7 - shift)) & 0xff);
474 *dp |= (png_byte)(value << shift);
475 }
476
477 if (shift == s_end)
478 {
479 shift = s_start;
480 sp++;
481 dp++;
482 }
483 else
484 shift += s_inc;
485
486 if (m == 1)
487 m = 0x80;
488 else
489 m >>= 1;
490 }
491 break;
492 }
493
494 case 2: /* png_ptr->row_info.pixel_depth */
495 {
496 png_bytep sp;
497 png_bytep dp;
498 int s_start, s_end, s_inc;
499 int m;
500 int shift;
501 png_uint_32 i;
502 int value;
503
504 sp = png_ptr->row_buf + 1;
505 dp = row;
506 m = 0x80;
507#if defined(PNG_READ_PACKSWAP_SUPPORTED)
508 if (png_ptr->transformations & PNG_PACKSWAP)
509 {
510 s_start = 0;
511 s_end = 6;
512 s_inc = 2;
513 }
514 else
515#endif
516 {
517 s_start = 6;
518 s_end = 0;
519 s_inc = -2;
520 }
521
522 shift = s_start;
523
524 for (i = 0; i < png_ptr->width; i++)
525 {
526 if (m & mask)
527 {
528 value = (*sp >> shift) & 0x3;
529 *dp &= (png_byte)((0x3f3f >> (6 - shift)) & 0xff);
530 *dp |= (png_byte)(value << shift);
531 }
532
533 if (shift == s_end)
534 {
535 shift = s_start;
536 sp++;
537 dp++;
538 }
539 else
540 shift += s_inc;
541 if (m == 1)
542 m = 0x80;
543 else
544 m >>= 1;
545 }
546 break;
547 }
548
549 case 4: /* png_ptr->row_info.pixel_depth */
550 {
551 png_bytep sp;
552 png_bytep dp;
553 int s_start, s_end, s_inc;
554 int m;
555 int shift;
556 png_uint_32 i;
557 int value;
558
559 sp = png_ptr->row_buf + 1;
560 dp = row;
561 m = 0x80;
562#if defined(PNG_READ_PACKSWAP_SUPPORTED)
563 if (png_ptr->transformations & PNG_PACKSWAP)
564 {
565 s_start = 0;
566 s_end = 4;
567 s_inc = 4;
568 }
569 else
570#endif
571 {
572 s_start = 4;
573 s_end = 0;
574 s_inc = -4;
575 }
576 shift = s_start;
577
578 for (i = 0; i < png_ptr->width; i++)
579 {
580 if (m & mask)
581 {
582 value = (*sp >> shift) & 0xf;
583 *dp &= (png_byte)((0xf0f >> (4 - shift)) & 0xff);
584 *dp |= (png_byte)(value << shift);
585 }
586
587 if (shift == s_end)
588 {
589 shift = s_start;
590 sp++;
591 dp++;
592 }
593 else
594 shift += s_inc;
595 if (m == 1)
596 m = 0x80;
597 else
598 m >>= 1;
599 }
600 break;
601 }
602
603 case 8: /* png_ptr->row_info.pixel_depth */
604 {
605 png_bytep srcptr;
606 png_bytep dstptr;
607
608#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
609#if !defined(PNG_1_0_X)
610 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
611 /* && _mmx_supported */ )
612#else
613 if (_mmx_supported)
614#endif
615 {
616 png_uint_32 len;
617 int diff;
618 int dummy_value_a; /* fix 'forbidden register spilled' error */
619 int dummy_value_d;
620 int dummy_value_c;
621 int dummy_value_S;
622 int dummy_value_D;
623 _unmask = ~mask; /* global variable for -fPIC version */
624 srcptr = png_ptr->row_buf + 1;
625 dstptr = row;
626 len = png_ptr->width &~7; /* reduce to multiple of 8 */
627 diff = (int) (png_ptr->width & 7); /* amount lost */
628
629 __asm__ __volatile__ (
630 "movd _unmask, %%mm7 \n\t" /* load bit pattern */
631 "psubb %%mm6, %%mm6 \n\t" /* zero mm6 */
632 "punpcklbw %%mm7, %%mm7 \n\t"
633 "punpcklwd %%mm7, %%mm7 \n\t"
634 "punpckldq %%mm7, %%mm7 \n\t" /* fill reg with 8 masks */
635
636 "movq _mask8_0, %%mm0 \n\t"
637 "pand %%mm7, %%mm0 \n\t" /* nonzero if keep byte */
638 "pcmpeqb %%mm6, %%mm0 \n\t" /* zeros->1s, v versa */
639
640/* preload "movl len, %%ecx \n\t" // load length of line */
641/* preload "movl srcptr, %%esi \n\t" // load source */
642/* preload "movl dstptr, %%edi \n\t" // load dest */
643
644 "cmpl $0, %%ecx \n\t" /* len == 0 ? */
645 "je mainloop8end \n\t"
646
647 "mainloop8: \n\t"
648 "movq (%%esi), %%mm4 \n\t" /* *srcptr */
649 "pand %%mm0, %%mm4 \n\t"
650 "movq %%mm0, %%mm6 \n\t"
651 "pandn (%%edi), %%mm6 \n\t" /* *dstptr */
652 "por %%mm6, %%mm4 \n\t"
653 "movq %%mm4, (%%edi) \n\t"
654 "addl $8, %%esi \n\t" /* inc by 8 bytes processed */
655 "addl $8, %%edi \n\t"
656 "subl $8, %%ecx \n\t" /* dec by 8 pixels processed */
657 "ja mainloop8 \n\t"
658
659 "mainloop8end: \n\t"
660/* preload "movl diff, %%ecx \n\t" // (diff is in eax) */
661 "movl %%eax, %%ecx \n\t"
662 "cmpl $0, %%ecx \n\t"
663 "jz end8 \n\t"
664/* preload "movl mask, %%edx \n\t" */
665 "sall $24, %%edx \n\t" /* make low byte the high byte */
666
667 "secondloop8: \n\t"
668 "sall %%edx \n\t" /* move high bit to CF */
669 "jnc skip8 \n\t" /* if CF = 0 */
670 "movb (%%esi), %%al \n\t"
671 "movb %%al, (%%edi) \n\t"
672
673 "skip8: \n\t"
674 "incl %%esi \n\t"
675 "incl %%edi \n\t"
676 "decl %%ecx \n\t"
677 "jnz secondloop8 \n\t"
678
679 "end8: \n\t"
680 "EMMS \n\t" /* DONE */
681
682 : "=a" (dummy_value_a), /* output regs (dummy) */
683 "=d" (dummy_value_d),
684 "=c" (dummy_value_c),
685 "=S" (dummy_value_S),
686 "=D" (dummy_value_D)
687
688 : "3" (srcptr), /* esi // input regs */
689 "4" (dstptr), /* edi */
690 "0" (diff), /* eax */
691/* was (unmask) "b" RESERVED // ebx // Global Offset Table idx */
692 "2" (len), /* ecx */
693 "1" (mask) /* edx */
694
695#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
696 : "%mm0", "%mm4", "%mm6", "%mm7" /* clobber list */
697#endif
698 );
699 }
700 else /* mmx not supported - Use modified C routine */
701#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
702 {
703 register png_uint_32 i;
704 png_uint_32 initial_val = png_pass_start[png_ptr->pass];
705 /* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
706 register int stride = png_pass_inc[png_ptr->pass];
707 /* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
708 register int rep_bytes = png_pass_width[png_ptr->pass];
709 /* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
710 png_uint_32 len = png_ptr->width &~7; /* reduce to mult. of 8 */
711 int diff = (int) (png_ptr->width & 7); /* amount lost */
712 register png_uint_32 final_val = len; /* GRR bugfix */
713
714 srcptr = png_ptr->row_buf + 1 + initial_val;
715 dstptr = row + initial_val;
716
717 for (i = initial_val; i < final_val; i += stride)
718 {
719 png_memcpy(dstptr, srcptr, rep_bytes);
720 srcptr += stride;
721 dstptr += stride;
722 }
723 if (diff) /* number of leftover pixels: 3 for pngtest */
724 {
725 final_val+=diff /* *BPP1 */ ;
726 for (; i < final_val; i += stride)
727 {
728 if (rep_bytes > (int)(final_val-i))
729 rep_bytes = (int)(final_val-i);
730 png_memcpy(dstptr, srcptr, rep_bytes);
731 srcptr += stride;
732 dstptr += stride;
733 }
734 }
735
736 } /* end of else (_mmx_supported) */
737
738 break;
739 } /* end 8 bpp */
740
741 case 16: /* png_ptr->row_info.pixel_depth */
742 {
743 png_bytep srcptr;
744 png_bytep dstptr;
745
746#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
747#if !defined(PNG_1_0_X)
748 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
749 /* && _mmx_supported */ )
750#else
751 if (_mmx_supported)
752#endif
753 {
754 png_uint_32 len;
755 int diff;
756 int dummy_value_a; /* fix 'forbidden register spilled' error */
757 int dummy_value_d;
758 int dummy_value_c;
759 int dummy_value_S;
760 int dummy_value_D;
761 _unmask = ~mask; /* global variable for -fPIC version */
762 srcptr = png_ptr->row_buf + 1;
763 dstptr = row;
764 len = png_ptr->width &~7; /* reduce to multiple of 8 */
765 diff = (int) (png_ptr->width & 7); /* amount lost */
766
767 __asm__ __volatile__ (
768 "movd _unmask, %%mm7 \n\t" /* load bit pattern */
769 "psubb %%mm6, %%mm6 \n\t" /* zero mm6 */
770 "punpcklbw %%mm7, %%mm7 \n\t"
771 "punpcklwd %%mm7, %%mm7 \n\t"
772 "punpckldq %%mm7, %%mm7 \n\t" /* fill reg with 8 masks */
773
774 "movq _mask16_0, %%mm0 \n\t"
775 "movq _mask16_1, %%mm1 \n\t"
776
777 "pand %%mm7, %%mm0 \n\t"
778 "pand %%mm7, %%mm1 \n\t"
779
780 "pcmpeqb %%mm6, %%mm0 \n\t"
781 "pcmpeqb %%mm6, %%mm1 \n\t"
782
783/* preload "movl len, %%ecx \n\t" // load length of line */
784/* preload "movl srcptr, %%esi \n\t" // load source */
785/* preload "movl dstptr, %%edi \n\t" // load dest */
786
787 "cmpl $0, %%ecx \n\t"
788 "jz mainloop16end \n\t"
789
790 "mainloop16: \n\t"
791 "movq (%%esi), %%mm4 \n\t"
792 "pand %%mm0, %%mm4 \n\t"
793 "movq %%mm0, %%mm6 \n\t"
794 "movq (%%edi), %%mm7 \n\t"
795 "pandn %%mm7, %%mm6 \n\t"
796 "por %%mm6, %%mm4 \n\t"
797 "movq %%mm4, (%%edi) \n\t"
798
799 "movq 8(%%esi), %%mm5 \n\t"
800 "pand %%mm1, %%mm5 \n\t"
801 "movq %%mm1, %%mm7 \n\t"
802 "movq 8(%%edi), %%mm6 \n\t"
803 "pandn %%mm6, %%mm7 \n\t"
804 "por %%mm7, %%mm5 \n\t"
805 "movq %%mm5, 8(%%edi) \n\t"
806
807 "addl $16, %%esi \n\t" /* inc by 16 bytes processed */
808 "addl $16, %%edi \n\t"
809 "subl $8, %%ecx \n\t" /* dec by 8 pixels processed */
810 "ja mainloop16 \n\t"
811
812 "mainloop16end: \n\t"
813/* preload "movl diff, %%ecx \n\t" // (diff is in eax) */
814 "movl %%eax, %%ecx \n\t"
815 "cmpl $0, %%ecx \n\t"
816 "jz end16 \n\t"
817/* preload "movl mask, %%edx \n\t" */
818 "sall $24, %%edx \n\t" /* make low byte the high byte */
819
820 "secondloop16: \n\t"
821 "sall %%edx \n\t" /* move high bit to CF */
822 "jnc skip16 \n\t" /* if CF = 0 */
823 "movw (%%esi), %%ax \n\t"
824 "movw %%ax, (%%edi) \n\t"
825
826 "skip16: \n\t"
827 "addl $2, %%esi \n\t"
828 "addl $2, %%edi \n\t"
829 "decl %%ecx \n\t"
830 "jnz secondloop16 \n\t"
831
832 "end16: \n\t"
833 "EMMS \n\t" /* DONE */
834
835 : "=a" (dummy_value_a), /* output regs (dummy) */
836 "=c" (dummy_value_c),
837 "=d" (dummy_value_d),
838 "=S" (dummy_value_S),
839 "=D" (dummy_value_D)
840
841 : "0" (diff), /* eax // input regs */
842/* was (unmask) " " RESERVED // ebx // Global Offset Table idx */
843 "1" (len), /* ecx */
844 "2" (mask), /* edx */
845 "3" (srcptr), /* esi */
846 "4" (dstptr) /* edi */
847
848#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
849 : "%mm0", "%mm1", "%mm4" /* clobber list */
850 , "%mm5", "%mm6", "%mm7"
851#endif
852 );
853 }
854 else /* mmx not supported - Use modified C routine */
855#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
856 {
857 register png_uint_32 i;
858 png_uint_32 initial_val = BPP2 * png_pass_start[png_ptr->pass];
859 /* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
860 register int stride = BPP2 * png_pass_inc[png_ptr->pass];
861 /* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
862 register int rep_bytes = BPP2 * png_pass_width[png_ptr->pass];
863 /* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
864 png_uint_32 len = png_ptr->width &~7; /* reduce to mult. of 8 */
865 int diff = (int) (png_ptr->width & 7); /* amount lost */
866 register png_uint_32 final_val = BPP2 * len; /* GRR bugfix */
867
868 srcptr = png_ptr->row_buf + 1 + initial_val;
869 dstptr = row + initial_val;
870
871 for (i = initial_val; i < final_val; i += stride)
872 {
873 png_memcpy(dstptr, srcptr, rep_bytes);
874 srcptr += stride;
875 dstptr += stride;
876 }
877 if (diff) /* number of leftover pixels: 3 for pngtest */
878 {
879 final_val+=diff*BPP2;
880 for (; i < final_val; i += stride)
881 {
882 if (rep_bytes > (int)(final_val-i))
883 rep_bytes = (int)(final_val-i);
884 png_memcpy(dstptr, srcptr, rep_bytes);
885 srcptr += stride;
886 dstptr += stride;
887 }
888 }
889 } /* end of else (_mmx_supported) */
890
891 break;
892 } /* end 16 bpp */
893
894 case 24: /* png_ptr->row_info.pixel_depth */
895 {
896 png_bytep srcptr;
897 png_bytep dstptr;
898
899#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
900#if !defined(PNG_1_0_X)
901 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
902 /* && _mmx_supported */ )
903#else
904 if (_mmx_supported)
905#endif
906 {
907 png_uint_32 len;
908 int diff;
909 int dummy_value_a; /* fix 'forbidden register spilled' error */
910 int dummy_value_d;
911 int dummy_value_c;
912 int dummy_value_S;
913 int dummy_value_D;
914 _unmask = ~mask; /* global variable for -fPIC version */
915 srcptr = png_ptr->row_buf + 1;
916 dstptr = row;
917 len = png_ptr->width &~7; /* reduce to multiple of 8 */
918 diff = (int) (png_ptr->width & 7); /* amount lost */
919
920 __asm__ __volatile__ (
921 "movd _unmask, %%mm7 \n\t" /* load bit pattern */
922 "psubb %%mm6, %%mm6 \n\t" /* zero mm6 */
923 "punpcklbw %%mm7, %%mm7 \n\t"
924 "punpcklwd %%mm7, %%mm7 \n\t"
925 "punpckldq %%mm7, %%mm7 \n\t" /* fill reg with 8 masks */
926
927 "movq _mask24_0, %%mm0 \n\t"
928 "movq _mask24_1, %%mm1 \n\t"
929 "movq _mask24_2, %%mm2 \n\t"
930
931 "pand %%mm7, %%mm0 \n\t"
932 "pand %%mm7, %%mm1 \n\t"
933 "pand %%mm7, %%mm2 \n\t"
934
935 "pcmpeqb %%mm6, %%mm0 \n\t"
936 "pcmpeqb %%mm6, %%mm1 \n\t"
937 "pcmpeqb %%mm6, %%mm2 \n\t"
938
939/* preload "movl len, %%ecx \n\t" // load length of line */
940/* preload "movl srcptr, %%esi \n\t" // load source */
941/* preload "movl dstptr, %%edi \n\t" // load dest */
942
943 "cmpl $0, %%ecx \n\t"
944 "jz mainloop24end \n\t"
945
946 "mainloop24: \n\t"
947 "movq (%%esi), %%mm4 \n\t"
948 "pand %%mm0, %%mm4 \n\t"
949 "movq %%mm0, %%mm6 \n\t"
950 "movq (%%edi), %%mm7 \n\t"
951 "pandn %%mm7, %%mm6 \n\t"
952 "por %%mm6, %%mm4 \n\t"
953 "movq %%mm4, (%%edi) \n\t"
954
955 "movq 8(%%esi), %%mm5 \n\t"
956 "pand %%mm1, %%mm5 \n\t"
957 "movq %%mm1, %%mm7 \n\t"
958 "movq 8(%%edi), %%mm6 \n\t"
959 "pandn %%mm6, %%mm7 \n\t"
960 "por %%mm7, %%mm5 \n\t"
961 "movq %%mm5, 8(%%edi) \n\t"
962
963 "movq 16(%%esi), %%mm6 \n\t"
964 "pand %%mm2, %%mm6 \n\t"
965 "movq %%mm2, %%mm4 \n\t"
966 "movq 16(%%edi), %%mm7 \n\t"
967 "pandn %%mm7, %%mm4 \n\t"
968 "por %%mm4, %%mm6 \n\t"
969 "movq %%mm6, 16(%%edi) \n\t"
970
971 "addl $24, %%esi \n\t" /* inc by 24 bytes processed */
972 "addl $24, %%edi \n\t"
973 "subl $8, %%ecx \n\t" /* dec by 8 pixels processed */
974
975 "ja mainloop24 \n\t"
976
977 "mainloop24end: \n\t"
978/* preload "movl diff, %%ecx \n\t" // (diff is in eax) */
979 "movl %%eax, %%ecx \n\t"
980 "cmpl $0, %%ecx \n\t"
981 "jz end24 \n\t"
982/* preload "movl mask, %%edx \n\t" */
983 "sall $24, %%edx \n\t" /* make low byte the high byte */
984
985 "secondloop24: \n\t"
986 "sall %%edx \n\t" /* move high bit to CF */
987 "jnc skip24 \n\t" /* if CF = 0 */
988 "movw (%%esi), %%ax \n\t"
989 "movw %%ax, (%%edi) \n\t"
990 "xorl %%eax, %%eax \n\t"
991 "movb 2(%%esi), %%al \n\t"
992 "movb %%al, 2(%%edi) \n\t"
993
994 "skip24: \n\t"
995 "addl $3, %%esi \n\t"
996 "addl $3, %%edi \n\t"
997 "decl %%ecx \n\t"
998 "jnz secondloop24 \n\t"
999
1000 "end24: \n\t"
1001 "EMMS \n\t" /* DONE */
1002
1003 : "=a" (dummy_value_a), /* output regs (dummy) */
1004 "=d" (dummy_value_d),
1005 "=c" (dummy_value_c),
1006 "=S" (dummy_value_S),
1007 "=D" (dummy_value_D)
1008
1009 : "3" (srcptr), /* esi // input regs */
1010 "4" (dstptr), /* edi */
1011 "0" (diff), /* eax */
1012/* was (unmask) "b" RESERVED // ebx // Global Offset Table idx */
1013 "2" (len), /* ecx */
1014 "1" (mask) /* edx */
1015
1016#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
1017 : "%mm0", "%mm1", "%mm2" /* clobber list */
1018 , "%mm4", "%mm5", "%mm6", "%mm7"
1019#endif
1020 );
1021 }
1022 else /* mmx not supported - Use modified C routine */
1023#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
1024 {
1025 register png_uint_32 i;
1026 png_uint_32 initial_val = BPP3 * png_pass_start[png_ptr->pass];
1027 /* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
1028 register int stride = BPP3 * png_pass_inc[png_ptr->pass];
1029 /* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
1030 register int rep_bytes = BPP3 * png_pass_width[png_ptr->pass];
1031 /* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
1032 png_uint_32 len = png_ptr->width &~7; /* reduce to mult. of 8 */
1033 int diff = (int) (png_ptr->width & 7); /* amount lost */
1034 register png_uint_32 final_val = BPP3 * len; /* GRR bugfix */
1035
1036 srcptr = png_ptr->row_buf + 1 + initial_val;
1037 dstptr = row + initial_val;
1038
1039 for (i = initial_val; i < final_val; i += stride)
1040 {
1041 png_memcpy(dstptr, srcptr, rep_bytes);
1042 srcptr += stride;
1043 dstptr += stride;
1044 }
1045 if (diff) /* number of leftover pixels: 3 for pngtest */
1046 {
1047 final_val+=diff*BPP3;
1048 for (; i < final_val; i += stride)
1049 {
1050 if (rep_bytes > (int)(final_val-i))
1051 rep_bytes = (int)(final_val-i);
1052 png_memcpy(dstptr, srcptr, rep_bytes);
1053 srcptr += stride;
1054 dstptr += stride;
1055 }
1056 }
1057 } /* end of else (_mmx_supported) */
1058
1059 break;
1060 } /* end 24 bpp */
1061
1062 case 32: /* png_ptr->row_info.pixel_depth */
1063 {
1064 png_bytep srcptr;
1065 png_bytep dstptr;
1066
1067#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
1068#if !defined(PNG_1_0_X)
1069 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
1070 /* && _mmx_supported */ )
1071#else
1072 if (_mmx_supported)
1073#endif
1074 {
1075 png_uint_32 len;
1076 int diff;
1077 int dummy_value_a; /* fix 'forbidden register spilled' error */
1078 int dummy_value_d;
1079 int dummy_value_c;
1080 int dummy_value_S;
1081 int dummy_value_D;
1082 _unmask = ~mask; /* global variable for -fPIC version */
1083 srcptr = png_ptr->row_buf + 1;
1084 dstptr = row;
1085 len = png_ptr->width &~7; /* reduce to multiple of 8 */
1086 diff = (int) (png_ptr->width & 7); /* amount lost */
1087
1088 __asm__ __volatile__ (
1089 "movd _unmask, %%mm7 \n\t" /* load bit pattern */
1090 "psubb %%mm6, %%mm6 \n\t" /* zero mm6 */
1091 "punpcklbw %%mm7, %%mm7 \n\t"
1092 "punpcklwd %%mm7, %%mm7 \n\t"
1093 "punpckldq %%mm7, %%mm7 \n\t" /* fill reg with 8 masks */
1094
1095 "movq _mask32_0, %%mm0 \n\t"
1096 "movq _mask32_1, %%mm1 \n\t"
1097 "movq _mask32_2, %%mm2 \n\t"
1098 "movq _mask32_3, %%mm3 \n\t"
1099
1100 "pand %%mm7, %%mm0 \n\t"
1101 "pand %%mm7, %%mm1 \n\t"
1102 "pand %%mm7, %%mm2 \n\t"
1103 "pand %%mm7, %%mm3 \n\t"
1104
1105 "pcmpeqb %%mm6, %%mm0 \n\t"
1106 "pcmpeqb %%mm6, %%mm1 \n\t"
1107 "pcmpeqb %%mm6, %%mm2 \n\t"
1108 "pcmpeqb %%mm6, %%mm3 \n\t"
1109
1110/* preload "movl len, %%ecx \n\t" // load length of line */
1111/* preload "movl srcptr, %%esi \n\t" // load source */
1112/* preload "movl dstptr, %%edi \n\t" // load dest */
1113
1114 "cmpl $0, %%ecx \n\t" /* lcr */
1115 "jz mainloop32end \n\t"
1116
1117 "mainloop32: \n\t"
1118 "movq (%%esi), %%mm4 \n\t"
1119 "pand %%mm0, %%mm4 \n\t"
1120 "movq %%mm0, %%mm6 \n\t"
1121 "movq (%%edi), %%mm7 \n\t"
1122 "pandn %%mm7, %%mm6 \n\t"
1123 "por %%mm6, %%mm4 \n\t"
1124 "movq %%mm4, (%%edi) \n\t"
1125
1126 "movq 8(%%esi), %%mm5 \n\t"
1127 "pand %%mm1, %%mm5 \n\t"
1128 "movq %%mm1, %%mm7 \n\t"
1129 "movq 8(%%edi), %%mm6 \n\t"
1130 "pandn %%mm6, %%mm7 \n\t"
1131 "por %%mm7, %%mm5 \n\t"
1132 "movq %%mm5, 8(%%edi) \n\t"
1133
1134 "movq 16(%%esi), %%mm6 \n\t"
1135 "pand %%mm2, %%mm6 \n\t"
1136 "movq %%mm2, %%mm4 \n\t"
1137 "movq 16(%%edi), %%mm7 \n\t"
1138 "pandn %%mm7, %%mm4 \n\t"
1139 "por %%mm4, %%mm6 \n\t"
1140 "movq %%mm6, 16(%%edi) \n\t"
1141
1142 "movq 24(%%esi), %%mm7 \n\t"
1143 "pand %%mm3, %%mm7 \n\t"
1144 "movq %%mm3, %%mm5 \n\t"
1145 "movq 24(%%edi), %%mm4 \n\t"
1146 "pandn %%mm4, %%mm5 \n\t"
1147 "por %%mm5, %%mm7 \n\t"
1148 "movq %%mm7, 24(%%edi) \n\t"
1149
1150 "addl $32, %%esi \n\t" /* inc by 32 bytes processed */
1151 "addl $32, %%edi \n\t"
1152 "subl $8, %%ecx \n\t" /* dec by 8 pixels processed */
1153 "ja mainloop32 \n\t"
1154
1155 "mainloop32end: \n\t"
1156/* preload "movl diff, %%ecx \n\t" // (diff is in eax) */
1157 "movl %%eax, %%ecx \n\t"
1158 "cmpl $0, %%ecx \n\t"
1159 "jz end32 \n\t"
1160/* preload "movl mask, %%edx \n\t" */
1161 "sall $24, %%edx \n\t" /* low byte => high byte */
1162
1163 "secondloop32: \n\t"
1164 "sall %%edx \n\t" /* move high bit to CF */
1165 "jnc skip32 \n\t" /* if CF = 0 */
1166 "movl (%%esi), %%eax \n\t"
1167 "movl %%eax, (%%edi) \n\t"
1168
1169 "skip32: \n\t"
1170 "addl $4, %%esi \n\t"
1171 "addl $4, %%edi \n\t"
1172 "decl %%ecx \n\t"
1173 "jnz secondloop32 \n\t"
1174
1175 "end32: \n\t"
1176 "EMMS \n\t" /* DONE */
1177
1178 : "=a" (dummy_value_a), /* output regs (dummy) */
1179 "=d" (dummy_value_d),
1180 "=c" (dummy_value_c),
1181 "=S" (dummy_value_S),
1182 "=D" (dummy_value_D)
1183
1184 : "3" (srcptr), /* esi // input regs */
1185 "4" (dstptr), /* edi */
1186 "0" (diff), /* eax */
1187/* was (unmask) "b" RESERVED // ebx // Global Offset Table idx */
1188 "2" (len), /* ecx */
1189 "1" (mask) /* edx */
1190
1191#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
1192 : "%mm0", "%mm1", "%mm2", "%mm3" /* clobber list */
1193 , "%mm4", "%mm5", "%mm6", "%mm7"
1194#endif
1195 );
1196 }
1197 else /* mmx not supported - Use modified C routine */
1198#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
1199 {
1200 register png_uint_32 i;
1201 png_uint_32 initial_val = BPP4 * png_pass_start[png_ptr->pass];
1202 /* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
1203 register int stride = BPP4 * png_pass_inc[png_ptr->pass];
1204 /* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
1205 register int rep_bytes = BPP4 * png_pass_width[png_ptr->pass];
1206 /* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
1207 png_uint_32 len = png_ptr->width &~7; /* reduce to mult. of 8 */
1208 int diff = (int) (png_ptr->width & 7); /* amount lost */
1209 register png_uint_32 final_val = BPP4 * len; /* GRR bugfix */
1210
1211 srcptr = png_ptr->row_buf + 1 + initial_val;
1212 dstptr = row + initial_val;
1213
1214 for (i = initial_val; i < final_val; i += stride)
1215 {
1216 png_memcpy(dstptr, srcptr, rep_bytes);
1217 srcptr += stride;
1218 dstptr += stride;
1219 }
1220 if (diff) /* number of leftover pixels: 3 for pngtest */
1221 {
1222 final_val+=diff*BPP4;
1223 for (; i < final_val; i += stride)
1224 {
1225 if (rep_bytes > (int)(final_val-i))
1226 rep_bytes = (int)(final_val-i);
1227 png_memcpy(dstptr, srcptr, rep_bytes);
1228 srcptr += stride;
1229 dstptr += stride;
1230 }
1231 }
1232 } /* end of else (_mmx_supported) */
1233
1234 break;
1235 } /* end 32 bpp */
1236
1237 case 48: /* png_ptr->row_info.pixel_depth */
1238 {
1239 png_bytep srcptr;
1240 png_bytep dstptr;
1241
1242#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
1243#if !defined(PNG_1_0_X)
1244 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
1245 /* && _mmx_supported */ )
1246#else
1247 if (_mmx_supported)
1248#endif
1249 {
1250 png_uint_32 len;
1251 int diff;
1252 int dummy_value_a; /* fix 'forbidden register spilled' error */
1253 int dummy_value_d;
1254 int dummy_value_c;
1255 int dummy_value_S;
1256 int dummy_value_D;
1257 _unmask = ~mask; /* global variable for -fPIC version */
1258 srcptr = png_ptr->row_buf + 1;
1259 dstptr = row;
1260 len = png_ptr->width &~7; /* reduce to multiple of 8 */
1261 diff = (int) (png_ptr->width & 7); /* amount lost */
1262
1263 __asm__ __volatile__ (
1264 "movd _unmask, %%mm7 \n\t" /* load bit pattern */
1265 "psubb %%mm6, %%mm6 \n\t" /* zero mm6 */
1266 "punpcklbw %%mm7, %%mm7 \n\t"
1267 "punpcklwd %%mm7, %%mm7 \n\t"
1268 "punpckldq %%mm7, %%mm7 \n\t" /* fill reg with 8 masks */
1269
1270 "movq _mask48_0, %%mm0 \n\t"
1271 "movq _mask48_1, %%mm1 \n\t"
1272 "movq _mask48_2, %%mm2 \n\t"
1273 "movq _mask48_3, %%mm3 \n\t"
1274 "movq _mask48_4, %%mm4 \n\t"
1275 "movq _mask48_5, %%mm5 \n\t"
1276
1277 "pand %%mm7, %%mm0 \n\t"
1278 "pand %%mm7, %%mm1 \n\t"
1279 "pand %%mm7, %%mm2 \n\t"
1280 "pand %%mm7, %%mm3 \n\t"
1281 "pand %%mm7, %%mm4 \n\t"
1282 "pand %%mm7, %%mm5 \n\t"
1283
1284 "pcmpeqb %%mm6, %%mm0 \n\t"
1285 "pcmpeqb %%mm6, %%mm1 \n\t"
1286 "pcmpeqb %%mm6, %%mm2 \n\t"
1287 "pcmpeqb %%mm6, %%mm3 \n\t"
1288 "pcmpeqb %%mm6, %%mm4 \n\t"
1289 "pcmpeqb %%mm6, %%mm5 \n\t"
1290
1291/* preload "movl len, %%ecx \n\t" // load length of line */
1292/* preload "movl srcptr, %%esi \n\t" // load source */
1293/* preload "movl dstptr, %%edi \n\t" // load dest */
1294
1295 "cmpl $0, %%ecx \n\t"
1296 "jz mainloop48end \n\t"
1297
1298 "mainloop48: \n\t"
1299 "movq (%%esi), %%mm7 \n\t"
1300 "pand %%mm0, %%mm7 \n\t"
1301 "movq %%mm0, %%mm6 \n\t"
1302 "pandn (%%edi), %%mm6 \n\t"
1303 "por %%mm6, %%mm7 \n\t"
1304 "movq %%mm7, (%%edi) \n\t"
1305
1306 "movq 8(%%esi), %%mm6 \n\t"
1307 "pand %%mm1, %%mm6 \n\t"
1308 "movq %%mm1, %%mm7 \n\t"
1309 "pandn 8(%%edi), %%mm7 \n\t"
1310 "por %%mm7, %%mm6 \n\t"
1311 "movq %%mm6, 8(%%edi) \n\t"
1312
1313 "movq 16(%%esi), %%mm6 \n\t"
1314 "pand %%mm2, %%mm6 \n\t"
1315 "movq %%mm2, %%mm7 \n\t"
1316 "pandn 16(%%edi), %%mm7 \n\t"
1317 "por %%mm7, %%mm6 \n\t"
1318 "movq %%mm6, 16(%%edi) \n\t"
1319
1320 "movq 24(%%esi), %%mm7 \n\t"
1321 "pand %%mm3, %%mm7 \n\t"
1322 "movq %%mm3, %%mm6 \n\t"
1323 "pandn 24(%%edi), %%mm6 \n\t"
1324 "por %%mm6, %%mm7 \n\t"
1325 "movq %%mm7, 24(%%edi) \n\t"
1326
1327 "movq 32(%%esi), %%mm6 \n\t"
1328 "pand %%mm4, %%mm6 \n\t"
1329 "movq %%mm4, %%mm7 \n\t"
1330 "pandn 32(%%edi), %%mm7 \n\t"
1331 "por %%mm7, %%mm6 \n\t"
1332 "movq %%mm6, 32(%%edi) \n\t"
1333
1334 "movq 40(%%esi), %%mm7 \n\t"
1335 "pand %%mm5, %%mm7 \n\t"
1336 "movq %%mm5, %%mm6 \n\t"
1337 "pandn 40(%%edi), %%mm6 \n\t"
1338 "por %%mm6, %%mm7 \n\t"
1339 "movq %%mm7, 40(%%edi) \n\t"
1340
1341 "addl $48, %%esi \n\t" /* inc by 48 bytes processed */
1342 "addl $48, %%edi \n\t"
1343 "subl $8, %%ecx \n\t" /* dec by 8 pixels processed */
1344
1345 "ja mainloop48 \n\t"
1346
1347 "mainloop48end: \n\t"
1348/* preload "movl diff, %%ecx \n\t" // (diff is in eax) */
1349 "movl %%eax, %%ecx \n\t"
1350 "cmpl $0, %%ecx \n\t"
1351 "jz end48 \n\t"
1352/* preload "movl mask, %%edx \n\t" */
1353 "sall $24, %%edx \n\t" /* make low byte the high byte */
1354
1355 "secondloop48: \n\t"
1356 "sall %%edx \n\t" /* move high bit to CF */
1357 "jnc skip48 \n\t" /* if CF = 0 */
1358 "movl (%%esi), %%eax \n\t"
1359 "movl %%eax, (%%edi) \n\t"
1360
1361 "skip48: \n\t"
1362 "addl $4, %%esi \n\t"
1363 "addl $4, %%edi \n\t"
1364 "decl %%ecx \n\t"
1365 "jnz secondloop48 \n\t"
1366
1367 "end48: \n\t"
1368 "EMMS \n\t" /* DONE */
1369
1370 : "=a" (dummy_value_a), /* output regs (dummy) */
1371 "=d" (dummy_value_d),
1372 "=c" (dummy_value_c),
1373 "=S" (dummy_value_S),
1374 "=D" (dummy_value_D)
1375
1376 : "3" (srcptr), /* esi // input regs */
1377 "4" (dstptr), /* edi */
1378 "0" (diff), /* eax */
1379/* was (unmask) "b" RESERVED // ebx // Global Offset Table idx */
1380 "2" (len), /* ecx */
1381 "1" (mask) /* edx */
1382
1383#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
1384 : "%mm0", "%mm1", "%mm2", "%mm3" /* clobber list */
1385 , "%mm4", "%mm5", "%mm6", "%mm7"
1386#endif
1387 );
1388 }
1389 else /* mmx not supported - Use modified C routine */
1390#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
1391 {
1392 register png_uint_32 i;
1393 png_uint_32 initial_val = BPP6 * png_pass_start[png_ptr->pass];
1394 /* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
1395 register int stride = BPP6 * png_pass_inc[png_ptr->pass];
1396 /* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
1397 register int rep_bytes = BPP6 * png_pass_width[png_ptr->pass];
1398 /* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
1399 png_uint_32 len = png_ptr->width &~7; /* reduce to mult. of 8 */
1400 int diff = (int) (png_ptr->width & 7); /* amount lost */
1401 register png_uint_32 final_val = BPP6 * len; /* GRR bugfix */
1402
1403 srcptr = png_ptr->row_buf + 1 + initial_val;
1404 dstptr = row + initial_val;
1405
1406 for (i = initial_val; i < final_val; i += stride)
1407 {
1408 png_memcpy(dstptr, srcptr, rep_bytes);
1409 srcptr += stride;
1410 dstptr += stride;
1411 }
1412 if (diff) /* number of leftover pixels: 3 for pngtest */
1413 {
1414 final_val+=diff*BPP6;
1415 for (; i < final_val; i += stride)
1416 {
1417 if (rep_bytes > (int)(final_val-i))
1418 rep_bytes = (int)(final_val-i);
1419 png_memcpy(dstptr, srcptr, rep_bytes);
1420 srcptr += stride;
1421 dstptr += stride;
1422 }
1423 }
1424 } /* end of else (_mmx_supported) */
1425
1426 break;
1427 } /* end 48 bpp */
1428
1429 case 64: /* png_ptr->row_info.pixel_depth */
1430 {
1431 png_bytep srcptr;
1432 png_bytep dstptr;
1433 register png_uint_32 i;
1434 png_uint_32 initial_val = BPP8 * png_pass_start[png_ptr->pass];
1435 /* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
1436 register int stride = BPP8 * png_pass_inc[png_ptr->pass];
1437 /* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
1438 register int rep_bytes = BPP8 * png_pass_width[png_ptr->pass];
1439 /* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
1440 png_uint_32 len = png_ptr->width &~7; /* reduce to mult. of 8 */
1441 int diff = (int) (png_ptr->width & 7); /* amount lost */
1442 register png_uint_32 final_val = BPP8 * len; /* GRR bugfix */
1443
1444 srcptr = png_ptr->row_buf + 1 + initial_val;
1445 dstptr = row + initial_val;
1446
1447 for (i = initial_val; i < final_val; i += stride)
1448 {
1449 png_memcpy(dstptr, srcptr, rep_bytes);
1450 srcptr += stride;
1451 dstptr += stride;
1452 }
1453 if (diff) /* number of leftover pixels: 3 for pngtest */
1454 {
1455 final_val+=diff*BPP8;
1456 for (; i < final_val; i += stride)
1457 {
1458 if (rep_bytes > (int)(final_val-i))
1459 rep_bytes = (int)(final_val-i);
1460 png_memcpy(dstptr, srcptr, rep_bytes);
1461 srcptr += stride;
1462 dstptr += stride;
1463 }
1464 }
1465
1466 break;
1467 } /* end 64 bpp */
1468
1469 default: /* png_ptr->row_info.pixel_depth != 1,2,4,8,16,24,32,48,64 */
1470 {
1471 /* this should never happen */
1472 png_warning(png_ptr, "Invalid row_info.pixel_depth in pnggccrd");
1473 break;
1474 }
1475 } /* end switch (png_ptr->row_info.pixel_depth) */
1476
1477 } /* end if (non-trivial mask) */
1478
1479} /* end png_combine_row() */
1480
1481#endif /* PNG_HAVE_ASSEMBLER_COMBINE_ROW */
1482
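/* Illustration only (hypothetical, never compiled): a plain-C sketch of
 * what the 8-bpp MMX mainloop in png_combine_row() computes for each
 * 8-byte group: a destination byte is replaced by the source byte exactly
 * where the expanded mask byte is 0xff, and left untouched where it is
 * 0x00.
 */
#if 0
static void
combine8_c_equivalent(unsigned char *dstptr, const unsigned char *srcptr,
                      png_uint_32 width, int mask)
{
   png_uint_32 i;

   for (i = 0; i < width; i++)
   {
      /* pixel 0 of each 8-pixel group corresponds to mask bit 0x80 */
      if (mask & (0x80 >> (int)(i & 7)))
         dstptr[i] = srcptr[i];
   }
}
#endif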
1483
1484
1485
1486/*===========================================================================*/
1487/* */
1488/* P N G _ D O _ R E A D _ I N T E R L A C E */
1489/* */
1490/*===========================================================================*/
1491
1492#if defined(PNG_READ_INTERLACING_SUPPORTED)
1493#if defined(PNG_HAVE_ASSEMBLER_READ_INTERLACE)
1494
1495/* png_do_read_interlace() is called after any 16-bit to 8-bit conversion
1496 * has taken place. [GRR: what other steps come before and/or after?]
1497 */
1498
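/* Illustration only (hypothetical, never compiled): a plain-C sketch of
 * the in-place pixel replication performed below for depths of 8 bits or
 * more: each source pixel is copied png_pass_inc[pass] times, working
 * from right to left so nothing is overwritten before it is read.
 */
#if 0
static void
expand_row_c_equivalent(png_bytep row, png_uint_32 width,
                        png_size_t pixel_bytes, int pass)
{
   /* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
   png_uint_32 final_width = width * png_pass_inc[pass];
   png_bytep sptr = row + (width - 1) * pixel_bytes;
   png_bytep dp = row + (final_width - 1) * pixel_bytes;
   png_uint_32 i;
   int j;

   for (i = width; i; i--)
   {
      for (j = 0; j < png_pass_inc[pass]; j++)
      {
         png_memcpy(dp, sptr, pixel_bytes);
         dp -= pixel_bytes;
      }
      sptr -= pixel_bytes;
   }
}
#endif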
1499void /* PRIVATE */
1500png_do_read_interlace(png_structp png_ptr)
1501{
1502 png_row_infop row_info = &(png_ptr->row_info);
1503 png_bytep row = png_ptr->row_buf + 1;
1504 int pass = png_ptr->pass;
1505#if defined(PNG_READ_PACKSWAP_SUPPORTED)
1506 png_uint_32 transformations = png_ptr->transformations;
1507#endif
1508
1509 png_debug(1, "in png_do_read_interlace (pnggccrd.c)\n");
1510
1511#if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
1512 if (_mmx_supported == 2) {
1513#if !defined(PNG_1_0_X)
1514 /* this should have happened in png_init_mmx_flags() already */
1515 png_warning(png_ptr, "asm_flags may not have been initialized");
1516#endif
1517 png_mmx_support();
1518 }
1519#endif
1520
1521 if (row != NULL && row_info != NULL)
1522 {
1523 png_uint_32 final_width;
1524
1525 final_width = row_info->width * png_pass_inc[pass];
1526
1527 switch (row_info->pixel_depth)
1528 {
1529 case 1:
1530 {
1531 png_bytep sp, dp;
1532 int sshift, dshift;
1533 int s_start, s_end, s_inc;
1534 png_byte v;
1535 png_uint_32 i;
1536 int j;
1537
1538 sp = row + (png_size_t)((row_info->width - 1) >> 3);
1539 dp = row + (png_size_t)((final_width - 1) >> 3);
1540#if defined(PNG_READ_PACKSWAP_SUPPORTED)
1541 if (transformations & PNG_PACKSWAP)
1542 {
1543 sshift = (int)((row_info->width + 7) & 7);
1544 dshift = (int)((final_width + 7) & 7);
1545 s_start = 7;
1546 s_end = 0;
1547 s_inc = -1;
1548 }
1549 else
1550#endif
1551 {
1552 sshift = 7 - (int)((row_info->width + 7) & 7);
1553 dshift = 7 - (int)((final_width + 7) & 7);
1554 s_start = 0;
1555 s_end = 7;
1556 s_inc = 1;
1557 }
1558
1559 for (i = row_info->width; i; i--)
1560 {
1561 v = (png_byte)((*sp >> sshift) & 0x1);
1562 for (j = 0; j < png_pass_inc[pass]; j++)
1563 {
1564 *dp &= (png_byte)((0x7f7f >> (7 - dshift)) & 0xff);
1565 *dp |= (png_byte)(v << dshift);
1566 if (dshift == s_end)
1567 {
1568 dshift = s_start;
1569 dp--;
1570 }
1571 else
1572 dshift += s_inc;
1573 }
1574 if (sshift == s_end)
1575 {
1576 sshift = s_start;
1577 sp--;
1578 }
1579 else
1580 sshift += s_inc;
1581 }
1582 break;
1583 }
1584
1585 case 2:
1586 {
1587 png_bytep sp, dp;
1588 int sshift, dshift;
1589 int s_start, s_end, s_inc;
1590 png_uint_32 i;
1591
1592 sp = row + (png_size_t)((row_info->width - 1) >> 2);
1593 dp = row + (png_size_t)((final_width - 1) >> 2);
1594#if defined(PNG_READ_PACKSWAP_SUPPORTED)
1595 if (transformations & PNG_PACKSWAP)
1596 {
1597 sshift = (png_size_t)(((row_info->width + 3) & 3) << 1);
1598 dshift = (png_size_t)(((final_width + 3) & 3) << 1);
1599 s_start = 6;
1600 s_end = 0;
1601 s_inc = -2;
1602 }
1603 else
1604#endif
1605 {
1606 sshift = (png_size_t)((3 - ((row_info->width + 3) & 3)) << 1);
1607 dshift = (png_size_t)((3 - ((final_width + 3) & 3)) << 1);
1608 s_start = 0;
1609 s_end = 6;
1610 s_inc = 2;
1611 }
1612
1613 for (i = row_info->width; i; i--)
1614 {
1615 png_byte v;
1616 int j;
1617
1618 v = (png_byte)((*sp >> sshift) & 0x3);
1619 for (j = 0; j < png_pass_inc[pass]; j++)
1620 {
1621 *dp &= (png_byte)((0x3f3f >> (6 - dshift)) & 0xff);
1622 *dp |= (png_byte)(v << dshift);
1623 if (dshift == s_end)
1624 {
1625 dshift = s_start;
1626 dp--;
1627 }
1628 else
1629 dshift += s_inc;
1630 }
1631 if (sshift == s_end)
1632 {
1633 sshift = s_start;
1634 sp--;
1635 }
1636 else
1637 sshift += s_inc;
1638 }
1639 break;
1640 }
1641
1642 case 4:
1643 {
1644 png_bytep sp, dp;
1645 int sshift, dshift;
1646 int s_start, s_end, s_inc;
1647 png_uint_32 i;
1648
1649 sp = row + (png_size_t)((row_info->width - 1) >> 1);
1650 dp = row + (png_size_t)((final_width - 1) >> 1);
1651#if defined(PNG_READ_PACKSWAP_SUPPORTED)
1652 if (transformations & PNG_PACKSWAP)
1653 {
1654 sshift = (png_size_t)(((row_info->width + 1) & 1) << 2);
1655 dshift = (png_size_t)(((final_width + 1) & 1) << 2);
1656 s_start = 4;
1657 s_end = 0;
1658 s_inc = -4;
1659 }
1660 else
1661#endif
1662 {
1663 sshift = (png_size_t)((1 - ((row_info->width + 1) & 1)) << 2);
1664 dshift = (png_size_t)((1 - ((final_width + 1) & 1)) << 2);
1665 s_start = 0;
1666 s_end = 4;
1667 s_inc = 4;
1668 }
1669
1670 for (i = row_info->width; i; i--)
1671 {
1672 png_byte v;
1673 int j;
1674
1675 v = (png_byte)((*sp >> sshift) & 0xf);
1676 for (j = 0; j < png_pass_inc[pass]; j++)
1677 {
1678 *dp &= (png_byte)((0xf0f >> (4 - dshift)) & 0xff);
1679 *dp |= (png_byte)(v << dshift);
1680 if (dshift == s_end)
1681 {
1682 dshift = s_start;
1683 dp--;
1684 }
1685 else
1686 dshift += s_inc;
1687 }
1688 if (sshift == s_end)
1689 {
1690 sshift = s_start;
1691 sp--;
1692 }
1693 else
1694 sshift += s_inc;
1695 }
1696 break;
1697 }
1698
1699 /*====================================================================*/
1700
1701 default: /* 8-bit or larger (this is where the routine is modified) */
1702 {
1703#if 0
1704/* static unsigned long long _const4 = 0x0000000000FFFFFFLL; no good */
1705/* static unsigned long long const4 = 0x0000000000FFFFFFLL; no good */
1706/* unsigned long long _const4 = 0x0000000000FFFFFFLL; no good */
1707/* unsigned long long const4 = 0x0000000000FFFFFFLL; no good */
1708#endif
1709 png_bytep sptr, dp;
1710 png_uint_32 i;
1711 png_size_t pixel_bytes;
1712 int width = (int)row_info->width;
1713
1714 pixel_bytes = (row_info->pixel_depth >> 3);
1715
1716 /* point sptr at the last pixel in the pre-expanded row: */
1717 sptr = row + (width - 1) * pixel_bytes;
1718
1719 /* point dp at the last pixel position in the expanded row: */
1720 dp = row + (final_width - 1) * pixel_bytes;
1721
1722 /* New code by Nirav Chhatrapati - Intel Corporation */
1723
1724#if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
1725#if !defined(PNG_1_0_X)
1726 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_INTERLACE)
1727 /* && _mmx_supported */ )
1728#else
1729 if (_mmx_supported)
1730#endif
1731 {
1732 /*-------------------------------------------------------------*/
1733 if (pixel_bytes == 3)
1734 {
1735 if (((pass == 0) || (pass == 1)) && width)
1736 {
1737             int dummy_value_c;   /* fix 'forbidden register spilled' */
1738 int dummy_value_S;
1739 int dummy_value_D;
1740
1741 __asm__ __volatile__ (
1742 "subl $21, %%edi \n\t"
1743                      /* (png_pass_inc[pass] - 1)*pixel_bytes */
1744
1745 ".loop3_pass0: \n\t"
1746 "movd (%%esi), %%mm0 \n\t" /* x x x x x 2 1 0 */
1747 "pand _const4, %%mm0 \n\t" /* z z z z z 2 1 0 */
1748 "movq %%mm0, %%mm1 \n\t" /* z z z z z 2 1 0 */
1749 "psllq $16, %%mm0 \n\t" /* z z z 2 1 0 z z */
1750 "movq %%mm0, %%mm2 \n\t" /* z z z 2 1 0 z z */
1751 "psllq $24, %%mm0 \n\t" /* 2 1 0 z z z z z */
1752 "psrlq $8, %%mm1 \n\t" /* z z z z z z 2 1 */
1753 "por %%mm2, %%mm0 \n\t" /* 2 1 0 2 1 0 z z */
1754 "por %%mm1, %%mm0 \n\t" /* 2 1 0 2 1 0 2 1 */
1755 "movq %%mm0, %%mm3 \n\t" /* 2 1 0 2 1 0 2 1 */
1756 "psllq $16, %%mm0 \n\t" /* 0 2 1 0 2 1 z z */
1757 "movq %%mm3, %%mm4 \n\t" /* 2 1 0 2 1 0 2 1 */
1758 "punpckhdq %%mm0, %%mm3 \n\t" /* 0 2 1 0 2 1 0 2 */
1759                "movq %%mm4, 16(%%edi)        \n\t"
1760                "psrlq $32, %%mm0             \n\t" /* z z z z 0 2 1 0 */
1761                "movq %%mm3, 8(%%edi)         \n\t"
1762                "punpckldq %%mm4, %%mm0       \n\t" /* 1 0 2 1 0 2 1 0 */
1763 "subl $3, %%esi \n\t"
1764 "movq %%mm0, (%%edi) \n\t"
1765 "subl $24, %%edi \n\t"
1766 "decl %%ecx \n\t"
1767 "jnz .loop3_pass0 \n\t"
1768                "EMMS                         \n\t" /* DONE */
1769
1770                : "=c" (dummy_value_c),       /* output regs (dummy) */
1771 "=S" (dummy_value_S),
1772 "=D" (dummy_value_D)
1773
1774                : "1" (sptr),      /* esi // input regs */
1775                  "2" (dp),        /* edi */
1776                  "0" (width),     /* ecx */
1777                  "rim" (_const4)  /* %1(?) (0x0000000000FFFFFFLL) */
1778
1779#if 0 /* %mm0, ..., %mm4 not supported by gcc 2.7.2.3 or egcs 1.1 */
1780                : "%mm0", "%mm1", "%mm2"      /* clobber list */
1781 , "%mm3", "%mm4"
1782#endif
1783 );
1784 }
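            /* (note on the constraint idiom used throughout this file:
             *  the "=c"/"=S"/"=D" dummy outputs, tied to the inputs via
             *  the "0"/"1"/"2" matching constraints, tell gcc that
             *  ecx/esi/edi are consumed and modified by the asm; gcc does
             *  not allow an input's register to appear in the clobber
             *  list, hence the dummy_value_* locals above) */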
1785 else if (((pass == 2) || (pass == 3)) && width)
1786 {
1787             int dummy_value_c;   /* fix 'forbidden register spilled' */
1788 int dummy_value_S;
1789 int dummy_value_D;
1790
1791 __asm__ __volatile__ (
1792 "subl $9, %%edi \n\t"
1793                      /* (png_pass_inc[pass] - 1)*pixel_bytes */
1794
1795 ".loop3_pass2: \n\t"
1796 "movd (%%esi), %%mm0 \n\t" /* x x x x x 2 1 0 */
1797 "pand _const4, %%mm0 \n\t" /* z z z z z 2 1 0 */
1798 "movq %%mm0, %%mm1 \n\t" /* z z z z z 2 1 0 */
1799 "psllq $16, %%mm0 \n\t" /* z z z 2 1 0 z z */
1800 "movq %%mm0, %%mm2 \n\t" /* z z z 2 1 0 z z */
1801 "psllq $24, %%mm0 \n\t" /* 2 1 0 z z z z z */
1802 "psrlq $8, %%mm1 \n\t" /* z z z z z z 2 1 */
1803 "por %%mm2, %%mm0 \n\t" /* 2 1 0 2 1 0 z z */
1804 "por %%mm1, %%mm0 \n\t" /* 2 1 0 2 1 0 2 1 */
1805                "movq %%mm0, 4(%%edi)         \n\t"
1806                "psrlq $16, %%mm0             \n\t" /* z z 2 1 0 2 1 0 */
1807 "subl $3, %%esi \n\t"
1808 "movd %%mm0, (%%edi) \n\t"
1809 "subl $12, %%edi \n\t"
1810 "decl %%ecx \n\t"
1811 "jnz .loop3_pass2 \n\t"
1812                "EMMS                         \n\t" /* DONE */
1813
1814                : "=c" (dummy_value_c),       /* output regs (dummy) */
1815 "=S" (dummy_value_S),
1816 "=D" (dummy_value_D)
1817
1818                : "1" (sptr),      /* esi // input regs */
1819                  "2" (dp),        /* edi */
1820                  "0" (width),     /* ecx */
1821                  "rim" (_const4)  /* (0x0000000000FFFFFFLL) */
1822
1823#if 0 /* %mm0, ..., %mm2 not supported by gcc 2.7.2.3 or egcs 1.1 */
1824                : "%mm0", "%mm1", "%mm2"      /* clobber list */
1825#endif
1826 );
1827 }
1828 else if (width) /* && ((pass == 4) || (pass == 5)) */
1829 {
1830             int width_mmx = ((width >> 1) << 1) - 8;   /* GRR:  huh? */
1831 if (width_mmx < 0)
1832 width_mmx = 0;
1833             width -= width_mmx;        /* 8 or 9 pix, 24 or 27 bytes */
1834 if (width_mmx)
1835 {
1836 /* png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
1837 /* sptr points at last pixel in pre-expanded row */
1838 /* dp points at last pixel position in expanded row */
1839 int dummy_value_c; /* fix 'forbidden register spilled' */
c6b71bff
GD
1840 int dummy_value_S;
1841 int dummy_value_D;
1842
1843 __asm__ __volatile__ (
1844 "subl $3, %%esi \n\t"
1845 "subl $9, %%edi \n\t"
1846                      /* (png_pass_inc[pass] + 1)*pixel_bytes */
1847
1848 ".loop3_pass4: \n\t"
1849 "movq (%%esi), %%mm0 \n\t" /* x x 5 4 3 2 1 0 */
1850 "movq %%mm0, %%mm1 \n\t" /* x x 5 4 3 2 1 0 */
1851 "movq %%mm0, %%mm2 \n\t" /* x x 5 4 3 2 1 0 */
1852 "psllq $24, %%mm0 \n\t" /* 4 3 2 1 0 z z z */
1853 "pand _const4, %%mm1 \n\t" /* z z z z z 2 1 0 */
1854 "psrlq $24, %%mm2 \n\t" /* z z z x x 5 4 3 */
1855 "por %%mm1, %%mm0 \n\t" /* 4 3 2 1 0 2 1 0 */
1856 "movq %%mm2, %%mm3 \n\t" /* z z z x x 5 4 3 */
1857 "psllq $8, %%mm2 \n\t" /* z z x x 5 4 3 z */
1858                "movq %%mm0, (%%edi)          \n\t"
1859 "psrlq $16, %%mm3 \n\t" /* z z z z z x x 5 */
1860 "pand _const6, %%mm3 \n\t" /* z z z z z z z 5 */
1861 "por %%mm3, %%mm2 \n\t" /* z z x x 5 4 3 5 */
1862 "subl $6, %%esi \n\t"
1863 "movd %%mm2, 8(%%edi) \n\t"
1864 "subl $12, %%edi \n\t"
1865 "subl $2, %%ecx \n\t"
1866 "jnz .loop3_pass4 \n\t"
1867                "EMMS                         \n\t" /* DONE */
1868
1869                : "=c" (dummy_value_c),       /* output regs (dummy) */
1870 "=S" (dummy_value_S),
1871 "=D" (dummy_value_D)
1872
1873                : "1" (sptr),      /* esi // input regs */
1874                  "2" (dp),        /* edi */
1875                  "0" (width_mmx), /* ecx */
1876                  "rim" (_const4), /* 0x0000000000FFFFFFLL */
1877                  "rim" (_const6)  /* 0x00000000000000FFLL */
1878
1879#if 0 /* %mm0, ..., %mm3 not supported by gcc 2.7.2.3 or egcs 1.1 */
1880                : "%mm0", "%mm1"              /* clobber list */
1881 , "%mm2", "%mm3"
1882#endif
1883 );
1884 }
1885
1886 sptr -= width_mmx*3;
1887 dp -= width_mmx*6;
1888 for (i = width; i; i--)
1889 {
1890 png_byte v[8];
1891 int j;
1892
1893 png_memcpy(v, sptr, 3);
1894 for (j = 0; j < png_pass_inc[pass]; j++)
1895 {
1896 png_memcpy(dp, v, 3);
1897 dp -= 3;
1898 }
1899 sptr -= 3;
1900 }
1901 }
1902 } /* end of pixel_bytes == 3 */
1903
1904          /*------------------------------------------------------------*/
1905 else if (pixel_bytes == 1)
1906 {
1907 if (((pass == 0) || (pass == 1)) && width)
1908 {
1909 int width_mmx = ((width >> 2) << 2);
1910             width -= width_mmx;        /* 0-3 pixels => 0-3 bytes */
1911 if (width_mmx)
1912 {
1913                int dummy_value_c;  /* fix 'forbidden register spilled' */
1914 int dummy_value_S;
1915 int dummy_value_D;
1916
1917 __asm__ __volatile__ (
1918 "subl $3, %%esi \n\t"
1919 "subl $31, %%edi \n\t"
1920
1921 ".loop1_pass0: \n\t"
1922 "movd (%%esi), %%mm0 \n\t" /* x x x x 3 2 1 0 */
1923 "movq %%mm0, %%mm1 \n\t" /* x x x x 3 2 1 0 */
1924 "punpcklbw %%mm0, %%mm0 \n\t" /* 3 3 2 2 1 1 0 0 */
1925 "movq %%mm0, %%mm2 \n\t" /* 3 3 2 2 1 1 0 0 */
1926 "punpcklwd %%mm0, %%mm0 \n\t" /* 1 1 1 1 0 0 0 0 */
1927 "movq %%mm0, %%mm3 \n\t" /* 1 1 1 1 0 0 0 0 */
1928 "punpckldq %%mm0, %%mm0 \n\t" /* 0 0 0 0 0 0 0 0 */
1929 "punpckhdq %%mm3, %%mm3 \n\t" /* 1 1 1 1 1 1 1 1 */
1930                "movq %%mm0, (%%edi)          \n\t"
1931                "punpckhwd %%mm2, %%mm2       \n\t" /* 3 3 3 3 2 2 2 2 */
1932                "movq %%mm3, 8(%%edi)         \n\t"
1933 "movq %%mm2, %%mm4 \n\t" /* 3 3 3 3 2 2 2 2 */
1934 "punpckldq %%mm2, %%mm2 \n\t" /* 2 2 2 2 2 2 2 2 */
1935 "punpckhdq %%mm4, %%mm4 \n\t" /* 3 3 3 3 3 3 3 3 */
1936 "movq %%mm2, 16(%%edi) \n\t"
1937 "subl $4, %%esi \n\t"
1938 "movq %%mm4, 24(%%edi) \n\t"
1939 "subl $32, %%edi \n\t"
1940 "subl $4, %%ecx \n\t"
1941 "jnz .loop1_pass0 \n\t"
1942                "EMMS                         \n\t" /* DONE */
1943
1944                : "=c" (dummy_value_c),       /* output regs (dummy) */
1945 "=S" (dummy_value_S),
1946 "=D" (dummy_value_D)
1947
1948 : "1" (sptr), /* esi // input regs */
1949 "2" (dp), /* edi */
1950 "0" (width_mmx) /* ecx */
1951
1952#if 0 /* %mm0, ..., %mm4 not supported by gcc 2.7.2.3 or egcs 1.1 */
1953                : "%mm0", "%mm1", "%mm2"      /* clobber list */
1954 , "%mm3", "%mm4"
1955#endif
1956 );
1957 }
1958
1959 sptr -= width_mmx;
1960 dp -= width_mmx*8;
1961 for (i = width; i; i--)
1962 {
1963 int j;
1964
1965 /* I simplified this part in version 1.0.4e
1966 * here and in several other instances where
1967 * pixel_bytes == 1 -- GR-P
1968 *
1969 * Original code:
1970 *
1971 * png_byte v[8];
1972 * png_memcpy(v, sptr, pixel_bytes);
1973 * for (j = 0; j < png_pass_inc[pass]; j++)
1974 * {
1975 * png_memcpy(dp, v, pixel_bytes);
1976 * dp -= pixel_bytes;
1977 * }
1978 * sptr -= pixel_bytes;
1979 *
1980 * Replacement code is in the next three lines:
1981 */
1982
1983 for (j = 0; j < png_pass_inc[pass]; j++)
1984 {
1985 *dp-- = *sptr;
1986 }
1987 --sptr;
1988 }
1989 }
1990 else if (((pass == 2) || (pass == 3)) && width)
1991 {
1992 int width_mmx = ((width >> 2) << 2);
1993             width -= width_mmx;        /* 0-3 pixels => 0-3 bytes */
1994 if (width_mmx)
1995 {
1996                int dummy_value_c;  /* fix 'forbidden register spilled' */
1997 int dummy_value_S;
1998 int dummy_value_D;
1999
2000 __asm__ __volatile__ (
2001 "subl $3, %%esi \n\t"
2002 "subl $15, %%edi \n\t"
2003
2004 ".loop1_pass2: \n\t"
2005 "movd (%%esi), %%mm0 \n\t" /* x x x x 3 2 1 0 */
2006 "punpcklbw %%mm0, %%mm0 \n\t" /* 3 3 2 2 1 1 0 0 */
2007 "movq %%mm0, %%mm1 \n\t" /* 3 3 2 2 1 1 0 0 */
2008 "punpcklwd %%mm0, %%mm0 \n\t" /* 1 1 1 1 0 0 0 0 */
2009 "punpckhwd %%mm1, %%mm1 \n\t" /* 3 3 3 3 2 2 2 2 */
2010 "movq %%mm0, (%%edi) \n\t"
2011 "subl $4, %%esi \n\t"
2012 "movq %%mm1, 8(%%edi) \n\t"
2013 "subl $16, %%edi \n\t"
2014 "subl $4, %%ecx \n\t"
2015 "jnz .loop1_pass2 \n\t"
2016                "EMMS                         \n\t" /* DONE */
2017
2018                : "=c" (dummy_value_c),       /* output regs (dummy) */
2019 "=S" (dummy_value_S),
2020 "=D" (dummy_value_D)
2021
2022 : "1" (sptr), /* esi // input regs */
2023 "2" (dp), /* edi */
2024 "0" (width_mmx) /* ecx */
2025
2026#if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2027                : "%mm0", "%mm1"              /* clobber list */
2028#endif
2029 );
2030 }
2031
2032 sptr -= width_mmx;
2033 dp -= width_mmx*4;
2034 for (i = width; i; i--)
2035 {
2036 int j;
2037
2038 for (j = 0; j < png_pass_inc[pass]; j++)
2039 {
2040 *dp-- = *sptr;
2041 }
2042 --sptr;
2043 }
2044 }
2045 else if (width) /* && ((pass == 4) || (pass == 5)) */
2046 {
2047 int width_mmx = ((width >> 3) << 3);
2048             width -= width_mmx;        /* 0-7 pixels => 0-7 bytes */
2049 if (width_mmx)
2050 {
2051                int dummy_value_c;  /* fix 'forbidden register spilled' */
2052 int dummy_value_S;
2053 int dummy_value_D;
2054
2055 __asm__ __volatile__ (
2056 "subl $7, %%esi \n\t"
2057 "subl $15, %%edi \n\t"
2058
2059 ".loop1_pass4: \n\t"
2060 "movq (%%esi), %%mm0 \n\t" /* 7 6 5 4 3 2 1 0 */
2061 "movq %%mm0, %%mm1 \n\t" /* 7 6 5 4 3 2 1 0 */
2062 "punpcklbw %%mm0, %%mm0 \n\t" /* 3 3 2 2 1 1 0 0 */
2063 "punpckhbw %%mm1, %%mm1 \n\t" /* 7 7 6 6 5 5 4 4 */
2064 "movq %%mm1, 8(%%edi) \n\t"
2065 "subl $8, %%esi \n\t"
2066 "movq %%mm0, (%%edi) \n\t"
2067 "subl $16, %%edi \n\t"
2068 "subl $8, %%ecx \n\t"
2069 "jnz .loop1_pass4 \n\t"
2070                "EMMS                         \n\t" /* DONE */
2071
2072                : "=c" (dummy_value_c),       /* output regs (dummy) */
2073 "=S" (dummy_value_S),
2074 "=D" (dummy_value_D)
2075
2076 : "1" (sptr), /* esi // input regs */
2077 "2" (dp), /* edi */
2078 "0" (width_mmx) /* ecx */
2079
2080#if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2081                : "%mm0", "%mm1"              /* clobber list */
2082#endif
2083 );
2084 }
2085
2086 sptr -= width_mmx;
2087 dp -= width_mmx*2;
2088 for (i = width; i; i--)
2089 {
2090 int j;
2091
2092 for (j = 0; j < png_pass_inc[pass]; j++)
2093 {
2094 *dp-- = *sptr;
2095 }
2096 --sptr;
2097 }
2098 }
2099 } /* end of pixel_bytes == 1 */
2100
2101          /*------------------------------------------------------------*/
2102 else if (pixel_bytes == 2)
2103 {
2104 if (((pass == 0) || (pass == 1)) && width)
2105 {
2106 int width_mmx = ((width >> 1) << 1);
2107             width -= width_mmx;        /* 0,1 pixels => 0,2 bytes */
2108 if (width_mmx)
2109 {
2110                int dummy_value_c;  /* fix 'forbidden register spilled' */
2111 int dummy_value_S;
2112 int dummy_value_D;
2113
2114 __asm__ __volatile__ (
2115 "subl $2, %%esi \n\t"
2116 "subl $30, %%edi \n\t"
2117
2118 ".loop2_pass0: \n\t"
2119 "movd (%%esi), %%mm0 \n\t" /* x x x x 3 2 1 0 */
2120 "punpcklwd %%mm0, %%mm0 \n\t" /* 3 2 3 2 1 0 1 0 */
2121 "movq %%mm0, %%mm1 \n\t" /* 3 2 3 2 1 0 1 0 */
2122 "punpckldq %%mm0, %%mm0 \n\t" /* 1 0 1 0 1 0 1 0 */
2123 "punpckhdq %%mm1, %%mm1 \n\t" /* 3 2 3 2 3 2 3 2 */
2124 "movq %%mm0, (%%edi) \n\t"
2125 "movq %%mm0, 8(%%edi) \n\t"
2126 "movq %%mm1, 16(%%edi) \n\t"
2127 "subl $4, %%esi \n\t"
2128 "movq %%mm1, 24(%%edi) \n\t"
2129 "subl $32, %%edi \n\t"
2130 "subl $2, %%ecx \n\t"
2131 "jnz .loop2_pass0 \n\t"
2132                "EMMS                         \n\t" /* DONE */
2133
2134                : "=c" (dummy_value_c),       /* output regs (dummy) */
2135 "=S" (dummy_value_S),
2136 "=D" (dummy_value_D)
2137
2138 : "1" (sptr), /* esi // input regs */
2139 "2" (dp), /* edi */
2140 "0" (width_mmx) /* ecx */
2141
2142#if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2143                : "%mm0", "%mm1"              /* clobber list */
2144#endif
2145 );
2146 }
2147
2148             sptr -= (width_mmx*2 - 2); /* sign fixed */
2149             dp -= (width_mmx*16 - 2);  /* sign fixed */
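                 /* (the "+ 2" leaves sptr and dp one pixel PAST the
                  *  remaining data:  the cleanup loop below pre-decrements
                  *  before each copy, unlike the post-decrementing
                  *  pixel_bytes == 3 loops above) */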
2150 for (i = width; i; i--)
2151 {
2152 png_byte v[8];
2153 int j;
2154 sptr -= 2;
2155 png_memcpy(v, sptr, 2);
2156 for (j = 0; j < png_pass_inc[pass]; j++)
2157 {
2158 dp -= 2;
2159 png_memcpy(dp, v, 2);
2160 }
2161 }
2162 }
2163 else if (((pass == 2) || (pass == 3)) && width)
2164 {
2165 int width_mmx = ((width >> 1) << 1) ;
2166             width -= width_mmx;        /* 0,1 pixels => 0,2 bytes */
2167 if (width_mmx)
2168 {
2169                int dummy_value_c;  /* fix 'forbidden register spilled' */
2170 int dummy_value_S;
2171 int dummy_value_D;
2172
2173 __asm__ __volatile__ (
2174 "subl $2, %%esi \n\t"
2175 "subl $14, %%edi \n\t"
2176
2177 ".loop2_pass2: \n\t"
2178 "movd (%%esi), %%mm0 \n\t" /* x x x x 3 2 1 0 */
2179 "punpcklwd %%mm0, %%mm0 \n\t" /* 3 2 3 2 1 0 1 0 */
2180 "movq %%mm0, %%mm1 \n\t" /* 3 2 3 2 1 0 1 0 */
2181 "punpckldq %%mm0, %%mm0 \n\t" /* 1 0 1 0 1 0 1 0 */
2182 "punpckhdq %%mm1, %%mm1 \n\t" /* 3 2 3 2 3 2 3 2 */
2183 "movq %%mm0, (%%edi) \n\t"
2184 "subl $4, %%esi \n\t"
2185 "movq %%mm1, 8(%%edi) \n\t"
2186 "subl $16, %%edi \n\t"
2187 "subl $2, %%ecx \n\t"
2188 "jnz .loop2_pass2 \n\t"
2189                "EMMS                         \n\t" /* DONE */
2190
2191                : "=c" (dummy_value_c),       /* output regs (dummy) */
2192 "=S" (dummy_value_S),
2193 "=D" (dummy_value_D)
2194
2195 : "1" (sptr), /* esi // input regs */
2196 "2" (dp), /* edi */
2197 "0" (width_mmx) /* ecx */
2198
2199#if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2200                : "%mm0", "%mm1"              /* clobber list */
2201#endif
2202 );
2203 }
2204
2205             sptr -= (width_mmx*2 - 2); /* sign fixed */
2206             dp -= (width_mmx*8 - 2);   /* sign fixed */
2207 for (i = width; i; i--)
2208 {
2209 png_byte v[8];
2210 int j;
2211 sptr -= 2;
2212 png_memcpy(v, sptr, 2);
2213 for (j = 0; j < png_pass_inc[pass]; j++)
2214 {
2215 dp -= 2;
2216 png_memcpy(dp, v, 2);
2217 }
2218 }
2219 }
2220          else if (width)  /* pass == 4 or 5 */
2221 {
2222 int width_mmx = ((width >> 1) << 1) ;
2223             width -= width_mmx;        /* 0,1 pixels => 0,2 bytes */
2224 if (width_mmx)
2225 {
2226                int dummy_value_c;  /* fix 'forbidden register spilled' */
2227 int dummy_value_S;
2228 int dummy_value_D;
2229
2230 __asm__ __volatile__ (
2231 "subl $2, %%esi \n\t"
2232 "subl $6, %%edi \n\t"
2233
2234 ".loop2_pass4: \n\t"
2235 "movd (%%esi), %%mm0 \n\t" /* x x x x 3 2 1 0 */
2236 "punpcklwd %%mm0, %%mm0 \n\t" /* 3 2 3 2 1 0 1 0 */
2237 "subl $4, %%esi \n\t"
2238 "movq %%mm0, (%%edi) \n\t"
2239 "subl $8, %%edi \n\t"
2240 "subl $2, %%ecx \n\t"
2241 "jnz .loop2_pass4 \n\t"
2242                "EMMS                         \n\t" /* DONE */
2243
2244                : "=c" (dummy_value_c),       /* output regs (dummy) */
2245 "=S" (dummy_value_S),
2246 "=D" (dummy_value_D)
2247
2248 : "1" (sptr), /* esi // input regs */
2249 "2" (dp), /* edi */
2250 "0" (width_mmx) /* ecx */
2251
2252#if 0 /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
2253                : "%mm0"                      /* clobber list */
2254#endif
2255 );
2256 }
2257
2258             sptr -= (width_mmx*2 - 2); /* sign fixed */
2259             dp -= (width_mmx*4 - 2);   /* sign fixed */
2260 for (i = width; i; i--)
2261 {
2262 png_byte v[8];
2263 int j;
2264 sptr -= 2;
2265 png_memcpy(v, sptr, 2);
2266 for (j = 0; j < png_pass_inc[pass]; j++)
2267 {
2268 dp -= 2;
2269 png_memcpy(dp, v, 2);
2270 }
2271 }
2272 }
2273 } /* end of pixel_bytes == 2 */
2274
2275          /*------------------------------------------------------------*/
2276 else if (pixel_bytes == 4)
2277 {
2278 if (((pass == 0) || (pass == 1)) && width)
2279 {
2280 int width_mmx = ((width >> 1) << 1);
2281             width -= width_mmx;        /* 0,1 pixels => 0,4 bytes */
2282 if (width_mmx)
2283 {
2284                int dummy_value_c;  /* fix 'forbidden register spilled' */
2285 int dummy_value_S;
2286 int dummy_value_D;
2287
2288 __asm__ __volatile__ (
2289 "subl $4, %%esi \n\t"
2290 "subl $60, %%edi \n\t"
2291
2292 ".loop4_pass0: \n\t"
2293 "movq (%%esi), %%mm0 \n\t" /* 7 6 5 4 3 2 1 0 */
2294 "movq %%mm0, %%mm1 \n\t" /* 7 6 5 4 3 2 1 0 */
2295 "punpckldq %%mm0, %%mm0 \n\t" /* 3 2 1 0 3 2 1 0 */
2296 "punpckhdq %%mm1, %%mm1 \n\t" /* 7 6 5 4 7 6 5 4 */
2297 "movq %%mm0, (%%edi) \n\t"
2298 "movq %%mm0, 8(%%edi) \n\t"
2299 "movq %%mm0, 16(%%edi) \n\t"
2300 "movq %%mm0, 24(%%edi) \n\t"
2301 "movq %%mm1, 32(%%edi) \n\t"
2302 "movq %%mm1, 40(%%edi) \n\t"
2303 "movq %%mm1, 48(%%edi) \n\t"
2304 "subl $8, %%esi \n\t"
2305 "movq %%mm1, 56(%%edi) \n\t"
2306 "subl $64, %%edi \n\t"
2307 "subl $2, %%ecx \n\t"
2308 "jnz .loop4_pass0 \n\t"
2309                "EMMS                         \n\t" /* DONE */
2310
2311                : "=c" (dummy_value_c),       /* output regs (dummy) */
2312 "=S" (dummy_value_S),
2313 "=D" (dummy_value_D)
2314
2315 : "1" (sptr), /* esi // input regs */
2316 "2" (dp), /* edi */
2317 "0" (width_mmx) /* ecx */
2318
2319#if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2320                : "%mm0", "%mm1"              /* clobber list */
2321#endif
2322 );
2323 }
2324
2325             sptr -= (width_mmx*4 - 4); /* sign fixed */
2326             dp -= (width_mmx*32 - 4);  /* sign fixed */
2327 for (i = width; i; i--)
2328 {
2329 png_byte v[8];
2330 int j;
2331 sptr -= 4;
2332 png_memcpy(v, sptr, 4);
2333 for (j = 0; j < png_pass_inc[pass]; j++)
2334 {
2335 dp -= 4;
2336 png_memcpy(dp, v, 4);
2337 }
2338 }
2339 }
2340 else if (((pass == 2) || (pass == 3)) && width)
2341 {
2342 int width_mmx = ((width >> 1) << 1);
2343             width -= width_mmx;        /* 0,1 pixels => 0,4 bytes */
2344 if (width_mmx)
2345 {
2346                int dummy_value_c;  /* fix 'forbidden register spilled' */
2347 int dummy_value_S;
2348 int dummy_value_D;
2349
2350 __asm__ __volatile__ (
2351 "subl $4, %%esi \n\t"
2352 "subl $28, %%edi \n\t"
2353
2354 ".loop4_pass2: \n\t"
2355 "movq (%%esi), %%mm0 \n\t" /* 7 6 5 4 3 2 1 0 */
2356 "movq %%mm0, %%mm1 \n\t" /* 7 6 5 4 3 2 1 0 */
2357 "punpckldq %%mm0, %%mm0 \n\t" /* 3 2 1 0 3 2 1 0 */
2358 "punpckhdq %%mm1, %%mm1 \n\t" /* 7 6 5 4 7 6 5 4 */
2359 "movq %%mm0, (%%edi) \n\t"
2360 "movq %%mm0, 8(%%edi) \n\t"
2361 "movq %%mm1, 16(%%edi) \n\t"
2362 "movq %%mm1, 24(%%edi) \n\t"
2363 "subl $8, %%esi \n\t"
2364 "subl $32, %%edi \n\t"
2365 "subl $2, %%ecx \n\t"
2366 "jnz .loop4_pass2 \n\t"
2367                "EMMS                         \n\t" /* DONE */
2368
2369                : "=c" (dummy_value_c),       /* output regs (dummy) */
2370 "=S" (dummy_value_S),
2371 "=D" (dummy_value_D)
2372
2373 : "1" (sptr), /* esi // input regs */
2374 "2" (dp), /* edi */
2375 "0" (width_mmx) /* ecx */
2376
2377#if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2378                : "%mm0", "%mm1"              /* clobber list */
2379#endif
2380 );
2381 }
2382
2383             sptr -= (width_mmx*4 - 4); /* sign fixed */
2384             dp -= (width_mmx*16 - 4);  /* sign fixed */
2385 for (i = width; i; i--)
2386 {
2387 png_byte v[8];
2388 int j;
2389 sptr -= 4;
2390 png_memcpy(v, sptr, 4);
2391 for (j = 0; j < png_pass_inc[pass]; j++)
2392 {
2393 dp -= 4;
2394 png_memcpy(dp, v, 4);
2395 }
2396 }
2397 }
2398          else if (width)  /* pass == 4 or 5 */
2399 {
2400 int width_mmx = ((width >> 1) << 1) ;
2401             width -= width_mmx;        /* 0,1 pixels => 0,4 bytes */
2402 if (width_mmx)
2403 {
2404                int dummy_value_c;  /* fix 'forbidden register spilled' */
2405 int dummy_value_S;
2406 int dummy_value_D;
2407
2408 __asm__ __volatile__ (
2409 "subl $4, %%esi \n\t"
2410 "subl $12, %%edi \n\t"
2411
2412 ".loop4_pass4: \n\t"
2413 "movq (%%esi), %%mm0 \n\t" /* 7 6 5 4 3 2 1 0 */
2414 "movq %%mm0, %%mm1 \n\t" /* 7 6 5 4 3 2 1 0 */
2415 "punpckldq %%mm0, %%mm0 \n\t" /* 3 2 1 0 3 2 1 0 */
2416 "punpckhdq %%mm1, %%mm1 \n\t" /* 7 6 5 4 7 6 5 4 */
2417 "movq %%mm0, (%%edi) \n\t"
2418 "subl $8, %%esi \n\t"
2419 "movq %%mm1, 8(%%edi) \n\t"
2420 "subl $16, %%edi \n\t"
2421 "subl $2, %%ecx \n\t"
2422 "jnz .loop4_pass4 \n\t"
2423                "EMMS                         \n\t" /* DONE */
2424
2425                : "=c" (dummy_value_c),       /* output regs (dummy) */
2426 "=S" (dummy_value_S),
2427 "=D" (dummy_value_D)
2428
2429 : "1" (sptr), /* esi // input regs */
2430 "2" (dp), /* edi */
2431 "0" (width_mmx) /* ecx */
2432
2433#if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2434                : "%mm0", "%mm1"              /* clobber list */
2435#endif
2436 );
2437 }
2438
2439             sptr -= (width_mmx*4 - 4); /* sign fixed */
2440             dp -= (width_mmx*8 - 4);   /* sign fixed */
2441 for (i = width; i; i--)
2442 {
2443 png_byte v[8];
2444 int j;
2445 sptr -= 4;
2446 png_memcpy(v, sptr, 4);
2447 for (j = 0; j < png_pass_inc[pass]; j++)
2448 {
2449 dp -= 4;
2450 png_memcpy(dp, v, 4);
2451 }
2452 }
2453 }
2454 } /* end of pixel_bytes == 4 */
2455
2456          /*------------------------------------------------------------*/
2457 else if (pixel_bytes == 8)
2458 {
2459/* GRR TEST:  should work, but needs testing (special 64-bit version of rpng2?) */
2460          /* GRR NOTE:  no need to combine passes here! */
2461 if (((pass == 0) || (pass == 1)) && width)
2462 {
2463             int dummy_value_c;  /* fix 'forbidden register spilled' */
2464 int dummy_value_S;
2465 int dummy_value_D;
2466
2467             /* source is 8-byte RRGGBBAA */
2468             /* dest is 64-byte RRGGBBAA RRGGBBAA RRGGBBAA RRGGBBAA ... */
2469             __asm__ __volatile__ (
2470                "subl $56, %%edi              \n\t" /* start of last block */
2471
2472 ".loop8_pass0: \n\t"
2473                "movq (%%esi), %%mm0          \n\t" /* 7 6 5 4 3 2 1 0 */
2474 "movq %%mm0, (%%edi) \n\t"
2475 "movq %%mm0, 8(%%edi) \n\t"
2476 "movq %%mm0, 16(%%edi) \n\t"
2477 "movq %%mm0, 24(%%edi) \n\t"
2478 "movq %%mm0, 32(%%edi) \n\t"
2479 "movq %%mm0, 40(%%edi) \n\t"
2480 "movq %%mm0, 48(%%edi) \n\t"
2481 "subl $8, %%esi \n\t"
2482 "movq %%mm0, 56(%%edi) \n\t"
2483 "subl $64, %%edi \n\t"
2484 "decl %%ecx \n\t"
2485 "jnz .loop8_pass0 \n\t"
2486                "EMMS                         \n\t" /* DONE */
2487
2488                : "=c" (dummy_value_c),       /* output regs (dummy) */
2489 "=S" (dummy_value_S),
2490 "=D" (dummy_value_D)
2491
2492 : "1" (sptr), /* esi // input regs */
2493 "2" (dp), /* edi */
2494 "0" (width) /* ecx */
2495
2496#if 0 /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
2497                : "%mm0"                      /* clobber list */
2498#endif
2499 );
2500 }
2501 else if (((pass == 2) || (pass == 3)) && width)
2502 {
2503             /* source is 8-byte RRGGBBAA */
2504             /* dest is 32-byte RRGGBBAA RRGGBBAA RRGGBBAA RRGGBBAA */
2505             /* (recall that expansion is _in place_:  sptr and dp */
2506             /*  both point at locations within same row buffer) */
2507             {
2508                int dummy_value_c;  /* fix 'forbidden register spilled' */
2509 int dummy_value_S;
2510 int dummy_value_D;
2511
2512 __asm__ __volatile__ (
2513                "subl $24, %%edi              \n\t" /* start of last block */
2514
2515 ".loop8_pass2: \n\t"
2516                "movq (%%esi), %%mm0          \n\t" /* 7 6 5 4 3 2 1 0 */
2517 "movq %%mm0, (%%edi) \n\t"
2518 "movq %%mm0, 8(%%edi) \n\t"
2519 "movq %%mm0, 16(%%edi) \n\t"
2520 "subl $8, %%esi \n\t"
2521 "movq %%mm0, 24(%%edi) \n\t"
2522 "subl $32, %%edi \n\t"
2523 "decl %%ecx \n\t"
2524 "jnz .loop8_pass2 \n\t"
2525                "EMMS                         \n\t" /* DONE */
2526
2527                : "=c" (dummy_value_c),       /* output regs (dummy) */
2528 "=S" (dummy_value_S),
2529 "=D" (dummy_value_D)
2530
2531 : "1" (sptr), /* esi // input regs */
2532 "2" (dp), /* edi */
2533 "0" (width) /* ecx */
2534
2535#if 0 /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
2536                : "%mm0"                      /* clobber list */
2537#endif
2538 );
2539 }
2540 }
2541          else if (width)  /* pass == 4 or 5 */
2542          {
2543             /* source is 8-byte RRGGBBAA */
2544             /* dest is 16-byte RRGGBBAA RRGGBBAA */
2545             {
2546                int dummy_value_c;  /* fix 'forbidden register spilled' */
2547 int dummy_value_S;
2548 int dummy_value_D;
2549
2550 __asm__ __volatile__ (
2551                "subl $8, %%edi               \n\t" /* start of last block */
2552
2553 ".loop8_pass4: \n\t"
2554                "movq (%%esi), %%mm0          \n\t" /* 7 6 5 4 3 2 1 0 */
2555 "movq %%mm0, (%%edi) \n\t"
2556 "subl $8, %%esi \n\t"
2557 "movq %%mm0, 8(%%edi) \n\t"
2558 "subl $16, %%edi \n\t"
2559 "decl %%ecx \n\t"
2560 "jnz .loop8_pass4 \n\t"
2561                "EMMS                         \n\t" /* DONE */
2562
2563                : "=c" (dummy_value_c),       /* output regs (dummy) */
2564 "=S" (dummy_value_S),
2565 "=D" (dummy_value_D)
2566
2567 : "1" (sptr), /* esi // input regs */
2568 "2" (dp), /* edi */
2569 "0" (width) /* ecx */
2570
2571#if 0 /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
2572                : "%mm0"                      /* clobber list */
2573#endif
2574 );
2575 }
2576 }
2577
2578 } /* end of pixel_bytes == 8 */
2579
2580          /*------------------------------------------------------------*/
2581 else if (pixel_bytes == 6)
2582 {
2583 for (i = width; i; i--)
2584 {
2585 png_byte v[8];
2586 int j;
2587 png_memcpy(v, sptr, 6);
2588 for (j = 0; j < png_pass_inc[pass]; j++)
2589 {
2590 png_memcpy(dp, v, 6);
2591 dp -= 6;
2592 }
2593 sptr -= 6;
2594 }
2595 } /* end of pixel_bytes == 6 */
2596
2597          /*------------------------------------------------------------*/
2598 else
2599 {
2600 for (i = width; i; i--)
2601 {
2602 png_byte v[8];
2603 int j;
2604 png_memcpy(v, sptr, pixel_bytes);
2605 for (j = 0; j < png_pass_inc[pass]; j++)
2606 {
2607 png_memcpy(dp, v, pixel_bytes);
2608 dp -= pixel_bytes;
2609 }
2610 sptr-= pixel_bytes;
2611 }
2612 }
2613      } /* end of _mmx_supported ======================================== */
2614
2615 else /* MMX not supported: use modified C code - takes advantage
2616 * of inlining of png_memcpy for a constant */
2617 /* GRR 19991007: does it? or should pixel_bytes in each
2618 * block be replaced with immediate value (e.g., 1)? */
2619 /* GRR 19991017: replaced with constants in each case */
2620#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
2621 {
2622 if (pixel_bytes == 1)
2623 {
2624 for (i = width; i; i--)
2625 {
2626 int j;
2627 for (j = 0; j < png_pass_inc[pass]; j++)
2628 {
2629 *dp-- = *sptr;
2630 }
2631 --sptr;
2632 }
2633 }
2634 else if (pixel_bytes == 3)
2635 {
2636 for (i = width; i; i--)
2637 {
2638 png_byte v[8];
2639 int j;
2640 png_memcpy(v, sptr, 3);
2641 for (j = 0; j < png_pass_inc[pass]; j++)
2642 {
2643 png_memcpy(dp, v, 3);
2644 dp -= 3;
2645 }
2646 sptr -= 3;
2647 }
2648 }
2649 else if (pixel_bytes == 2)
2650 {
2651 for (i = width; i; i--)
2652 {
2653 png_byte v[8];
2654 int j;
2655 png_memcpy(v, sptr, 2);
2656 for (j = 0; j < png_pass_inc[pass]; j++)
2657 {
2658 png_memcpy(dp, v, 2);
2659 dp -= 2;
2660 }
2661 sptr -= 2;
2662 }
2663 }
2664 else if (pixel_bytes == 4)
2665 {
2666 for (i = width; i; i--)
2667 {
2668 png_byte v[8];
2669 int j;
2670 png_memcpy(v, sptr, 4);
2671 for (j = 0; j < png_pass_inc[pass]; j++)
2672 {
2673#ifdef PNG_DEBUG
2674 if (dp < row || dp+3 > row+png_ptr->row_buf_size)
2675 {
2676 printf("dp out of bounds: row=%d, dp=%d, rp=%d\n",
2677 row, dp, row+png_ptr->row_buf_size);
2678 printf("row_buf=%d\n",png_ptr->row_buf_size);
2679 }
2680#endif
2681 png_memcpy(dp, v, 4);
2682 dp -= 4;
2683 }
2684 sptr -= 4;
2685 }
2686 }
2687 else if (pixel_bytes == 6)
2688 {
2689 for (i = width; i; i--)
2690 {
2691 png_byte v[8];
2692 int j;
2693 png_memcpy(v, sptr, 6);
2694 for (j = 0; j < png_pass_inc[pass]; j++)
2695 {
2696 png_memcpy(dp, v, 6);
2697 dp -= 6;
2698 }
2699 sptr -= 6;
2700 }
2701 }
2702 else if (pixel_bytes == 8)
2703 {
2704 for (i = width; i; i--)
2705 {
2706 png_byte v[8];
2707 int j;
2708 png_memcpy(v, sptr, 8);
2709 for (j = 0; j < png_pass_inc[pass]; j++)
2710 {
2711 png_memcpy(dp, v, 8);
2712 dp -= 8;
2713 }
2714 sptr -= 8;
2715 }
2716 }
2717 else /* GRR: should never be reached */
2718 {
2719 for (i = width; i; i--)
2720 {
2721 png_byte v[8];
2722 int j;
2723 png_memcpy(v, sptr, pixel_bytes);
2724 for (j = 0; j < png_pass_inc[pass]; j++)
2725 {
2726 png_memcpy(dp, v, pixel_bytes);
2727 dp -= pixel_bytes;
2728 }
2729 sptr -= pixel_bytes;
2730 }
2731 }
2732
2733 } /* end if (MMX not supported) */
2734 break;
2735 }
2736 } /* end switch (row_info->pixel_depth) */
2737
2738 row_info->width = final_width;
2739
2740 row_info->rowbytes = PNG_ROWBYTES(row_info->pixel_depth,final_width);
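      /* (PNG_ROWBYTES(pixel_depth, width) is libpng's macro for the byte
       *  count of a packed row, i.e., (width * pixel_depth + 7) >> 3) */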
2741 }
2742
2743} /* end png_do_read_interlace() */
2744
2745#endif /* PNG_HAVE_ASSEMBLER_READ_INTERLACE */
2746#endif /* PNG_READ_INTERLACING_SUPPORTED */
2747
2748
2749
2750#if defined(PNG_HAVE_ASSEMBLER_READ_FILTER_ROW)
2751#if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
2752
2753/* These variables are utilized in the functions below.  They are declared */
2754/* globally here to ensure alignment on 8-byte boundaries.                 */
2755
2756union uAll {
2757 long long use;
2758 double align;
2759} _LBCarryMask = {0x0101010101010101LL},
2760 _HBClearMask = {0x7f7f7f7f7f7f7f7fLL},
2761 _ActiveMask, _ActiveMask2, _ActiveMaskEnd, _ShiftBpp, _ShiftRem;
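/* (The "double align" member is what encourages the 8-byte alignment:  most
 * x86 compilers align a double on an 8-byte boundary, so the movq loads of
 * these masks should never straddle a quadword.  The Avg code below also
 * leans on the per-byte halving identity
 *      (a + b)/2  ==  (a>>1) + (b>>1) + (a & b & 1) :
 * after a 64-bit psrlq, _HBClearMask strips the bit shifted in from the
 * neighboring byte, and _LBCarryMask recovers each byte's lost carry.) */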
2762
2763#ifdef PNG_THREAD_UNSAFE_OK
2764/*===========================================================================*/
2765/*                                                                           */
2766/*           P N G _ R E A D _ F I L T E R _ R O W _ M M X _ A V G           */
2767/*                                                                           */
2768/*===========================================================================*/
2769
2770/* Optimized code for PNG Average filter decoder */
2771
2772static void /* PRIVATE */
2773png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row,
2774 png_bytep prev_row)
2775{
2776 int bpp;
2777   int dummy_value_c;   /* fix 'forbidden register 2 (cx) was spilled' error */
2778 int dummy_value_S;
2779 int dummy_value_D;
2780
2781   bpp = (row_info->pixel_depth + 7) >> 3;  /* get # bytes per pixel */
2782   _FullLength  = row_info->rowbytes;       /* # of bytes to filter */
2783
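   /* For reference, a plain-C sketch of the Avg decode that the asm below
    * implements (never compiled -- illustration only; note that the first
    * bpp bytes of a row have no Raw(x-bpp) term):
    */
#if 0
   {
      png_uint_32 x;

      for (x = 0; x < (png_uint_32)bpp; x++)       /* Raw(x) = Avg(x) + Prior(x)/2 */
         row[x] = (png_byte)(row[x] + (prev_row[x] >> 1));
      for (x = (png_uint_32)bpp; x < row_info->rowbytes; x++)
         row[x] = (png_byte)(row[x] +              /* Avg(x) + (Raw(x-bpp) + */
             ((row[x - bpp] + prev_row[x]) >> 1)); /*  Prior(x))/2, mod 256 */
   }
#endif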
2784 __asm__ __volatile__ (
2785      /* initialize address pointers and offset */
2786#ifdef __PIC__
2787      "pushl %%ebx                 \n\t" /* save index to Global Offset Table */
2788#endif
2789/*pre "movl row, %%edi             \n\t" */ /* edi:  Avg(x) */
2790      "xorl %%ebx, %%ebx           \n\t" /* ebx:  x */
2791      "movl %%edi, %%edx           \n\t"
2792/*pre "movl prev_row, %%esi        \n\t" */ /* esi:  Prior(x) */
2793/*pre "subl bpp, %%edx             \n\t" */ /* (bpp is preloaded into ecx) */
2794      "subl %%ecx, %%edx           \n\t" /* edx:  Raw(x-bpp) */
2795
2796 "xorl %%eax,%%eax \n\t"
2797
2798      /* Compute the Raw value for the first bpp bytes */
2799      /* Raw(x) = Avg(x) + (Prior(x)/2) */
2800   "avg_rlp:                       \n\t"
2801      "movb (%%esi,%%ebx,),%%al    \n\t" /* load al with Prior(x) */
2802      "incl %%ebx                  \n\t"
2803      "shrb %%al                   \n\t" /* divide by 2 */
2804      "addb -1(%%edi,%%ebx,),%%al  \n\t" /* add Avg(x); -1 to offset inc ebx */
2805/*pre "cmpl bpp, %%ebx             \n\t" */ /* (bpp is preloaded into ecx) */
2806      "cmpl %%ecx, %%ebx           \n\t"
2807      "movb %%al,-1(%%edi,%%ebx,)  \n\t" /* write Raw(x); -1 to offset inc ebx */
2808      "jb avg_rlp                  \n\t" /* mov does not affect flags */
2809
2810      /* get # of bytes to alignment */
2811      "movl %%edi, _dif            \n\t" /* take start of row */
2812      "addl %%ebx, _dif            \n\t" /* add bpp */
2813      "addl $0xf, _dif             \n\t" /* add 7+8 to incr past alignment bdry */
2814      "andl $0xfffffff8, _dif      \n\t" /* mask to alignment boundary */
2815      "subl %%edi, _dif            \n\t" /* subtract from start => value ebx at */
2816      "jz avg_go                   \n\t" /*  alignment */
2817
2818      /* fix alignment */
2819      /* Compute the Raw value for the bytes up to the alignment boundary */
2820      /* Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2) */
2821 "xorl %%ecx, %%ecx \n\t"
2822
2823 "avg_lp1: \n\t"
2824 "xorl %%eax, %%eax \n\t"
2825      "movb (%%esi,%%ebx,), %%cl   \n\t" /* load cl with Prior(x) */
2826      "movb (%%edx,%%ebx,), %%al   \n\t" /* load al with Raw(x-bpp) */
2827 "addw %%cx, %%ax \n\t"
2828 "incl %%ebx \n\t"
2829      "shrw %%ax                   \n\t" /* divide by 2 */
2830      "addb -1(%%edi,%%ebx,), %%al \n\t" /* add Avg(x); -1 to offset inc ebx */
2831      "cmpl _dif, %%ebx            \n\t" /* check if at alignment boundary */
2832      "movb %%al, -1(%%edi,%%ebx,) \n\t" /* write Raw(x); -1 to offset inc ebx */
2833      "jb avg_lp1                  \n\t" /* repeat until at alignment boundary */
2834
2835 "avg_go: \n\t"
2836 "movl _FullLength, %%eax \n\t"
2837 "movl %%eax, %%ecx \n\t"
2838      "subl %%ebx, %%eax           \n\t" /* subtract alignment fix */
2839      "andl $0x00000007, %%eax     \n\t" /* calc bytes over mult of 8 */
2840      "subl %%eax, %%ecx           \n\t" /* drop over bytes from original length */
2841 "movl %%ecx, _MMXLength \n\t"
2842#ifdef __PIC__
2843      "popl %%ebx                  \n\t" /* restore index to Global Offset Table */
2844#endif
2845
2846      : "=c" (dummy_value_c),            /* output regs (dummy) */
2847 "=S" (dummy_value_S),
2848 "=D" (dummy_value_D)
2849
2850      : "0" (bpp),                       /* ecx // input regs */
2851        "1" (prev_row),                  /* esi */
2852        "2" (row)                        /* edi */
2853
2854      : "%eax", "%edx"                   /* clobber list */
2855#ifndef __PIC__
2856 , "%ebx"
2857#endif
2858      /* GRR: INCLUDE "memory" as clobbered? (_dif, _MMXLength) */
2859      /* (seems to work fine without...) */
2860 );
2861
2862   /* now do the math for the rest of the row */
2863 switch (bpp)
2864 {
2865 case 3:
2866 {
2867 _ActiveMask.use = 0x0000000000ffffffLL;
2868         _ShiftBpp.use = 24;    /* == 3 * 8 */
2869         _ShiftRem.use = 40;    /* == 64 - 24 */
2870
2871 __asm__ __volatile__ (
2872            /* re-init address pointers and offset */
2873            "movq _ActiveMask, %%mm7      \n\t"
2874            "movl _dif, %%ecx             \n\t" /* ecx:  x = offset to */
2875            "movq _LBCarryMask, %%mm5     \n\t" /*  alignment boundary */
2876/* preload     "movl row, %%edi              \n\t" // edi:  Avg(x) */
2877            "movq _HBClearMask, %%mm4     \n\t"
2878/* preload     "movl prev_row, %%esi         \n\t" // esi:  Prior(x) */
2879
2880            /* prime the pump:  load the first Raw(x-bpp) data set */
2881            "movq -8(%%edi,%%ecx,), %%mm2 \n\t" /* load previous aligned 8 bytes */
2882                                                /* (correct pos. in loop below) */
2883         "avg_3lp:                        \n\t"
2884            "movq (%%edi,%%ecx,), %%mm0   \n\t" /* load mm0 with Avg(x) */
2885            "movq %%mm5, %%mm3            \n\t"
2886            "psrlq _ShiftRem, %%mm2       \n\t" /* correct position Raw(x-bpp) */
2887                                                /*  data */
2888            "movq (%%esi,%%ecx,), %%mm1   \n\t" /* load mm1 with Prior(x) */
2889            "movq %%mm7, %%mm6            \n\t"
2890 "pand %%mm1, %%mm3 \n\t" /* get lsb for each prev_row byte */
2891 "psrlq $1, %%mm1 \n\t" /* divide prev_row bytes by 2 */
2892 "pand %%mm4, %%mm1 \n\t" /* clear invalid bit 7 of each */
2893 /* byte */
2894 "paddb %%mm1, %%mm0 \n\t" /* add (Prev_row/2) to Avg for */
2895 /* each byte */
2896 /* add 1st active group (Raw(x-bpp)/2) to average with LBCarry */
2897 "movq %%mm3, %%mm1 \n\t" /* now use mm1 for getting */
2898 /* LBCarrys */
2899 "pand %%mm2, %%mm1 \n\t" /* get LBCarrys for each byte */
2900 /* where both */
2901 /* lsb's were == 1 (only valid for active group) */
2902 "psrlq $1, %%mm2 \n\t" /* divide raw bytes by 2 */
2903 "pand %%mm4, %%mm2 \n\t" /* clear invalid bit 7 of each */
2904 /* byte */
2905 "paddb %%mm1, %%mm2 \n\t" /* add LBCarrys to (Raw(x-bpp)/2) */
2906 /* for each byte */
2907 "pand %%mm6, %%mm2 \n\t" /* leave only Active Group 1 */
2908 /* bytes to add to Avg */
2909 "paddb %%mm2, %%mm0 \n\t" /* add (Raw/2) + LBCarrys to */
2910 /* Avg for each Active */
2911 /* byte */
2912 /* add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry */
2913 "psllq _ShiftBpp, %%mm6 \n\t" /* shift the mm6 mask to cover */
2914 /* bytes 3-5 */
2915 "movq %%mm0, %%mm2 \n\t" /* mov updated Raws to mm2 */
2916 "psllq _ShiftBpp, %%mm2 \n\t" /* shift data to pos. correctly */
2917 "movq %%mm3, %%mm1 \n\t" /* now use mm1 for getting */
2918 /* LBCarrys */
2919 "pand %%mm2, %%mm1 \n\t" /* get LBCarrys for each byte */
2920 /* where both */
2921 /* lsb's were == 1 (only valid for active group) */
2922 "psrlq $1, %%mm2 \n\t" /* divide raw bytes by 2 */
2923 "pand %%mm4, %%mm2 \n\t" /* clear invalid bit 7 of each */
2924 /* byte */
2925 "paddb %%mm1, %%mm2 \n\t" /* add LBCarrys to (Raw(x-bpp)/2) */
2926 /* for each byte */
2927 "pand %%mm6, %%mm2 \n\t" /* leave only Active Group 2 */
2928 /* bytes to add to Avg */
2929 "paddb %%mm2, %%mm0 \n\t" /* add (Raw/2) + LBCarrys to */
2930 /* Avg for each Active */
2931 /* byte */
2932
2933 /* add 3rd active group (Raw(x-bpp)/2) to average with _LBCarry */
2934 "psllq _ShiftBpp, %%mm6 \n\t" /* shift mm6 mask to cover last */
2935 /* two */
2936 /* bytes */
2937 "movq %%mm0, %%mm2 \n\t" /* mov updated Raws to mm2 */
2938 "psllq _ShiftBpp, %%mm2 \n\t" /* shift data to pos. correctly */
2939 /* Data only needs to be shifted once here to */
2940 /* get the correct x-bpp offset. */
2941 "movq %%mm3, %%mm1 \n\t" /* now use mm1 for getting */
2942 /* LBCarrys */
2943 "pand %%mm2, %%mm1 \n\t" /* get LBCarrys for each byte */
2944 /* where both */
2945 /* lsb's were == 1 (only valid for active group) */
2946 "psrlq $1, %%mm2 \n\t" /* divide raw bytes by 2 */
2947 "pand %%mm4, %%mm2 \n\t" /* clear invalid bit 7 of each */
2948 /* byte */
2949 "paddb %%mm1, %%mm2 \n\t" /* add LBCarrys to (Raw(x-bpp)/2) */
2950 /* for each byte */
2951 "pand %%mm6, %%mm2 \n\t" /* leave only Active Group 2 */
2952 /* bytes to add to Avg */
2953            "addl $8, %%ecx               \n\t"
2954            "paddb %%mm2, %%mm0           \n\t" /* add (Raw/2) + LBCarrys to */
2955                                                /*  Avg for each Active */
2956                                                /*  byte */
2957            /* now ready to write back to memory */
2958            "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
2959            /* move updated Raw(x) to use as Raw(x-bpp) for next loop */
2960            "cmpl _MMXLength, %%ecx       \n\t"
2961            "movq %%mm0, %%mm2            \n\t" /* mov updated Raw(x) to mm2 */
2962 "jb avg_3lp \n\t"
2963
2964            : "=S" (dummy_value_S),       /* output regs (dummy) */
2965              "=D" (dummy_value_D)
2966
2967            : "0" (prev_row),             /* esi // input regs */
2968              "1" (row)                   /* edi */
2969
2970            : "%ecx"                      /* clobber list */
2971#if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
2972 , "%mm0", "%mm1", "%mm2", "%mm3"
2973 , "%mm4", "%mm5", "%mm6", "%mm7"
2974#endif
2975 );
2976 }
2977      break;  /* end 3 bpp */
2978
2979 case 6:
2980 case 4:
2981      /* case 7: */   /* who wrote this?  PNG doesn't support 5 or 7 bytes/pixel */
2982      /* case 5: */   /* GRR BOGUS */
2983      {
2984         _ActiveMask.use  = 0xffffffffffffffffLL; /* use shift below to clear */
2985                                                  /*  appropriate inactive bytes */
2986 _ShiftBpp.use = bpp << 3;
2987 _ShiftRem.use = 64 - _ShiftBpp.use;
2988
2989 __asm__ __volatile__ (
2990 "movq _HBClearMask, %%mm4 \n\t"
2991
2992            /* re-init address pointers and offset */
2993            "movl _dif, %%ecx             \n\t" /* ecx:  x = offset to */
2994                                                /*  alignment boundary */
2995
2996            /* load _ActiveMask and clear all bytes except for 1st active group */
2997            "movq _ActiveMask, %%mm7      \n\t"
2998/* preload     "movl row, %%edi              \n\t" // edi:  Avg(x) */
2999            "psrlq _ShiftRem, %%mm7       \n\t"
3000/* preload     "movl prev_row, %%esi         \n\t" // esi:  Prior(x) */
3001 "movq %%mm7, %%mm6 \n\t"
3002 "movq _LBCarryMask, %%mm5 \n\t"
3003            "psllq _ShiftBpp, %%mm6       \n\t" /* create mask for 2nd active */
3004                                                /*  group */
3005
3006            /* prime the pump:  load the first Raw(x-bpp) data set */
3007            "movq -8(%%edi,%%ecx,), %%mm2 \n\t" /* load previous aligned 8 bytes */
3008                                                /* (we correct pos. in loop below) */
3009 "avg_4lp: \n\t"
3010 "movq (%%edi,%%ecx,), %%mm0 \n\t"
3011            "psrlq _ShiftRem, %%mm2       \n\t" /* shift data to pos. correctly */
3012            "movq (%%esi,%%ecx,), %%mm1   \n\t"
3013            /* add (Prev_row/2) to average */
3014            "movq %%mm5, %%mm3            \n\t"
3015 "pand %%mm1, %%mm3 \n\t" /* get lsb for each prev_row byte */
3016 "psrlq $1, %%mm1 \n\t" /* divide prev_row bytes by 2 */
3017 "pand %%mm4, %%mm1 \n\t" /* clear invalid bit 7 of each */
3018 /* byte */
3019 "paddb %%mm1, %%mm0 \n\t" /* add (Prev_row/2) to Avg for */
3020 /* each byte */
3021 /* add 1st active group (Raw(x-bpp)/2) to average with _LBCarry */
3022 "movq %%mm3, %%mm1 \n\t" /* now use mm1 for getting */
3023 /* LBCarrys */
3024 "pand %%mm2, %%mm1 \n\t" /* get LBCarrys for each byte */
3025 /* where both */
3026 /* lsb's were == 1 (only valid for active group) */
3027 "psrlq $1, %%mm2 \n\t" /* divide raw bytes by 2 */
3028 "pand %%mm4, %%mm2 \n\t" /* clear invalid bit 7 of each */
3029 /* byte */
3030 "paddb %%mm1, %%mm2 \n\t" /* add LBCarrys to (Raw(x-bpp)/2) */
3031 /* for each byte */
3032 "pand %%mm7, %%mm2 \n\t" /* leave only Active Group 1 */
3033 /* bytes to add to Avg */
3034 "paddb %%mm2, %%mm0 \n\t" /* add (Raw/2) + LBCarrys to Avg */
3035 /* for each Active */
3036 /* byte */
3037 /* add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry */
3038 "movq %%mm0, %%mm2 \n\t" /* mov updated Raws to mm2 */
3039 "psllq _ShiftBpp, %%mm2 \n\t" /* shift data to pos. correctly */
3040            "addl $8, %%ecx               \n\t"
3041 "movq %%mm3, %%mm1 \n\t" /* now use mm1 for getting */
3042 /* LBCarrys */
3043 "pand %%mm2, %%mm1 \n\t" /* get LBCarrys for each byte */
3044 /* where both */
3045 /* lsb's were == 1 (only valid for active group) */
3046 "psrlq $1, %%mm2 \n\t" /* divide raw bytes by 2 */
3047 "pand %%mm4, %%mm2 \n\t" /* clear invalid bit 7 of each */
3048 /* byte */
3049 "paddb %%mm1, %%mm2 \n\t" /* add LBCarrys to (Raw(x-bpp)/2) */
3050 /* for each byte */
3051 "pand %%mm6, %%mm2 \n\t" /* leave only Active Group 2 */
3052 /* bytes to add to Avg */
3053 "paddb %%mm2, %%mm0 \n\t" /* add (Raw/2) + LBCarrys to */
3054 /* Avg for each Active */
3055 /* byte */
3056            "cmpl _MMXLength, %%ecx       \n\t"
3057            /* now ready to write back to memory */
3058            "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
3059            /* prep Raw(x-bpp) for next loop */
3060            "movq %%mm0, %%mm2            \n\t" /* mov updated Raws to mm2 */
3061 "jb avg_4lp \n\t"
3062
3063            : "=S" (dummy_value_S),       /* output regs (dummy) */
3064 "=D" (dummy_value_D)
3065
3066            : "0" (prev_row),             /* esi // input regs */
3067              "1" (row)                   /* edi */
3068
3069            : "%ecx"                      /* clobber list */
3070#if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
3071 , "%mm0", "%mm1", "%mm2", "%mm3"
3072 , "%mm4", "%mm5", "%mm6", "%mm7"
3073#endif
3074 );
3075 }
3076      break;  /* end 4,6 bpp */
3077
3078 case 2:
3079 {
3080 _ActiveMask.use = 0x000000000000ffffLL;
3081         _ShiftBpp.use = 16;    /* == 2 * 8 */
3082         _ShiftRem.use = 48;    /* == 64 - 16 */
3083
3084 __asm__ __volatile__ (
3085            /* load _ActiveMask */
3086            "movq _ActiveMask, %%mm7      \n\t"
3087            /* re-init address pointers and offset */
3088            "movl _dif, %%ecx             \n\t" /* ecx:  x = offset to alignment */
3089                                                /*  boundary */
3090            "movq _LBCarryMask, %%mm5     \n\t"
3091/* preload     "movl row, %%edi              \n\t" // edi:  Avg(x) */
3092            "movq _HBClearMask, %%mm4     \n\t"
3093/* preload     "movl prev_row, %%esi         \n\t" // esi:  Prior(x) */
3094
3095            /* prime the pump:  load the first Raw(x-bpp) data set */
3096            "movq -8(%%edi,%%ecx,), %%mm2 \n\t" /* load previous aligned 8 bytes */
3097                                                /* (we correct pos. in loop below) */
3098 "avg_2lp: \n\t"
3099 "movq (%%edi,%%ecx,), %%mm0 \n\t"
3100            "psrlq _ShiftRem, %%mm2       \n\t" /* shift data to pos. correctly */
3101            "movq (%%esi,%%ecx,), %%mm1   \n\t" /* (GRR BUGFIX:  was psllq) */
3102            /* add (Prev_row/2) to average */
3103            "movq %%mm5, %%mm3            \n\t"
3104            "pand %%mm1, %%mm3            \n\t" /* get lsb for each prev_row byte */
3105            "psrlq $1, %%mm1              \n\t" /* divide prev_row bytes by 2 */
3106            "pand %%mm4, %%mm1            \n\t" /* clear invalid bit 7 of each */
3107                                                /*  byte */
3108            "movq %%mm7, %%mm6            \n\t"
3109 "paddb %%mm1, %%mm0 \n\t" /* add (Prev_row/2) to Avg for */
3110 /* each byte */
3111
3112 /* add 1st active group (Raw(x-bpp)/2) to average with _LBCarry */
3113 "movq %%mm3, %%mm1 \n\t" /* now use mm1 for getting */
3114 /* LBCarrys */
3115 "pand %%mm2, %%mm1 \n\t" /* get LBCarrys for each byte */
3116 /* where both */
3117 /* lsb's were == 1 (only valid */
3118 /* for active group) */
3119 "psrlq $1, %%mm2 \n\t" /* divide raw bytes by 2 */
3120 "pand %%mm4, %%mm2 \n\t" /* clear invalid bit 7 of each */
3121 /* byte */
3122 "paddb %%mm1, %%mm2 \n\t" /* add LBCarrys to (Raw(x-bpp)/2) */
3123 /* for each byte */
3124 "pand %%mm6, %%mm2 \n\t" /* leave only Active Group 1 */
3125 /* bytes to add to Avg */
3126 "paddb %%mm2, %%mm0 \n\t" /* add (Raw/2) + LBCarrys to Avg */
3127 /* for each Active byte */
3128
3129 /* add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry */
3130 "psllq _ShiftBpp, %%mm6 \n\t" /* shift the mm6 mask to cover */
3131 /* bytes 2 & 3 */
3132 "movq %%mm0, %%mm2 \n\t" /* mov updated Raws to mm2 */
3133 "psllq _ShiftBpp, %%mm2 \n\t" /* shift data to pos. correctly */
3134 "movq %%mm3, %%mm1 \n\t" /* now use mm1 for getting */
3135 /* LBCarrys */
3136 "pand %%mm2, %%mm1 \n\t" /* get LBCarrys for each byte */
3137 /* where both */
3138 /* lsb's were == 1 (only valid */
3139 /* for active group) */
3140 "psrlq $1, %%mm2 \n\t" /* divide raw bytes by 2 */
3141 "pand %%mm4, %%mm2 \n\t" /* clear invalid bit 7 of each */
3142 /* byte */
3143 "paddb %%mm1, %%mm2 \n\t" /* add LBCarrys to (Raw(x-bpp)/2) */
3144 /* for each byte */
3145 "pand %%mm6, %%mm2 \n\t" /* leave only Active Group 2 */
3146 /* bytes to add to Avg */
3147 "paddb %%mm2, %%mm0 \n\t" /* add (Raw/2) + LBCarrys to */
3148 /* Avg for each Active byte */
3149
3150 /* add 3rd active group (Raw(x-bpp)/2) to average with _LBCarry */
3151 "psllq _ShiftBpp, %%mm6 \n\t" /* shift the mm6 mask to cover */
3152 /* bytes 4 & 5 */
3153 "movq %%mm0, %%mm2 \n\t" /* mov updated Raws to mm2 */
3154 "psllq _ShiftBpp, %%mm2 \n\t" /* shift data to pos. correctly */
3155 "movq %%mm3, %%mm1 \n\t" /* now use mm1 for getting */
3156 /* LBCarrys */
3157 "pand %%mm2, %%mm1 \n\t" /* get LBCarrys for each byte */
3158 /* where both lsb's were == 1 */
3159 /* (only valid for active group) */
3160 "psrlq $1, %%mm2 \n\t" /* divide raw bytes by 2 */
3161 "pand %%mm4, %%mm2 \n\t" /* clear invalid bit 7 of each */
3162 /* byte */
3163 "paddb %%mm1, %%mm2 \n\t" /* add LBCarrys to (Raw(x-bpp)/2) */
3164 /* for each byte */
3165 "pand %%mm6, %%mm2 \n\t" /* leave only Active Group 2 */
3166 /* bytes to add to Avg */
3167 "paddb %%mm2, %%mm0 \n\t" /* add (Raw/2) + LBCarrys to */
3168 /* Avg for each Active byte */
3169
3170 /* add 4th active group (Raw(x-bpp)/2) to average with _LBCarry */
3171 "psllq _ShiftBpp, %%mm6 \n\t" /* shift the mm6 mask to cover */
3172 /* bytes 6 & 7 */
3173 "movq %%mm0, %%mm2 \n\t" /* mov updated Raws to mm2 */
3174 "psllq _ShiftBpp, %%mm2 \n\t" /* shift data to pos. correctly */
3175            "addl $8, %%ecx               \n\t"
3176 "movq %%mm3, %%mm1 \n\t" /* now use mm1 for getting */
3177 /* LBCarrys */
3178 "pand %%mm2, %%mm1 \n\t" /* get LBCarrys for each byte */
3179 /* where both */
3180 /* lsb's were == 1 (only valid */
3181 /* for active group) */
3182 "psrlq $1, %%mm2 \n\t" /* divide raw bytes by 2 */
3183 "pand %%mm4, %%mm2 \n\t" /* clear invalid bit 7 of each */
3184 /* byte */
3185 "paddb %%mm1, %%mm2 \n\t" /* add LBCarrys to (Raw(x-bpp)/2) */
3186 /* for each byte */
3187 "pand %%mm6, %%mm2 \n\t" /* leave only Active Group 2 */
3188 /* bytes to add to Avg */
3189 "paddb %%mm2, %%mm0 \n\t" /* add (Raw/2) + LBCarrys to */
3190 /* Avg for each Active byte */
3191
3192 "cmpl _MMXLength, %%ecx \n\t"
3193            /* now ready to write back to memory */
3194            "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
3195            /* prep Raw(x-bpp) for next loop */
3196            "movq %%mm0, %%mm2            \n\t" /* mov updated Raws to mm2 */
3197 "jb avg_2lp \n\t"
3198
3199            : "=S" (dummy_value_S),       /* output regs (dummy) */
3200 "=D" (dummy_value_D)
3201
3202            : "0" (prev_row),             /* esi // input regs */
3203              "1" (row)                   /* edi */
3204
3205            : "%ecx"                      /* clobber list */
3206#if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
3207 , "%mm0", "%mm1", "%mm2", "%mm3"
3208 , "%mm4", "%mm5", "%mm6", "%mm7"
3209#endif
3210 );
3211 }
3212      break;  /* end 2 bpp */
3213
3214 case 1:
3215 {
3216 __asm__ __volatile__ (
3217            /* re-init address pointers and offset */
3218#ifdef __PIC__
3219            "pushl %%ebx                  \n\t" /* save Global Offset Table index */
3220#endif
3221            "movl _dif, %%ebx             \n\t" /* ebx:  x = offset to alignment */
3222                                                /*  boundary */
3223/* preload     "movl row, %%edi              \n\t" // edi:  Avg(x) */
3224            "cmpl _FullLength, %%ebx      \n\t" /* test if offset at end of array */
3225            "jnb avg_1end                 \n\t"
3226            /* do Avg decode for remaining bytes */
3227/* preload     "movl prev_row, %%esi         \n\t" // esi:  Prior(x) */
3228            "movl %%edi, %%edx            \n\t"
3229/* preload     "subl bpp, %%edx              \n\t" // (bpp is preloaded into ecx) */
3230            "subl %%ecx, %%edx            \n\t" /* edx:  Raw(x-bpp) */
3231            "xorl %%ecx, %%ecx            \n\t" /* zero ecx before using cl & cx */
3232                                                /*  in loop below */
3233         "avg_1lp:                        \n\t"
3234            /* Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2) */
3235            "xorl %%eax, %%eax            \n\t"
3236            "movb (%%esi,%%ebx,), %%cl    \n\t" /* load cl with Prior(x) */
3237            "movb (%%edx,%%ebx,), %%al    \n\t" /* load al with Raw(x-bpp) */
3238 "addw %%cx, %%ax \n\t"
3239 "incl %%ebx \n\t"
3240            "shrw %%ax                    \n\t" /* divide by 2 */
3241            "addb -1(%%edi,%%ebx,), %%al  \n\t" /* add Avg(x); -1 to offset */
3242                                                /*  inc ebx */
3243            "cmpl _FullLength, %%ebx      \n\t" /* check if at end of array */
3244            "movb %%al, -1(%%edi,%%ebx,)  \n\t" /* write back Raw(x); */
3245                       /* mov does not affect flags; -1 to offset inc ebx */
3246 "jb avg_1lp \n\t"
3247
3248 "avg_1end: \n\t"
3249#ifdef __PIC__
3250            "popl %%ebx                   \n\t" /* Global Offset Table index */
3251#endif
3252
3253            : "=c" (dummy_value_c),       /* output regs (dummy) */
3254 "=S" (dummy_value_S),
3255 "=D" (dummy_value_D)
3256
3257            : "0" (bpp),                  /* ecx // input regs */
3258              "1" (prev_row),             /* esi */
3259              "2" (row)                   /* edi */
3260
3261            : "%eax", "%edx"              /* clobber list */
3262#ifndef __PIC__
3263 , "%ebx"
3264#endif
3265 );
3266 }
3267      return;  /* end 1 bpp */
3268
3269 case 8:
3270 {
3271 __asm__ __volatile__ (
3272            /* re-init address pointers and offset */
3273            "movl _dif, %%ecx             \n\t" /* ecx:  x == offset to alignment */
3274            "movq _LBCarryMask, %%mm5     \n\t" /*            boundary */
3275/* preload     "movl row, %%edi              \n\t" // edi:  Avg(x) */
3276            "movq _HBClearMask, %%mm4     \n\t"
3277/* preload     "movl prev_row, %%esi         \n\t" // esi:  Prior(x) */
3278
3279            /* prime the pump:  load the first Raw(x-bpp) data set */
3280            "movq -8(%%edi,%%ecx,), %%mm2 \n\t" /* load previous aligned 8 bytes */
3281                                          /* (NO NEED to correct pos. in loop below) */
3282
3283 "avg_8lp: \n\t"
3284 "movq (%%edi,%%ecx,), %%mm0 \n\t"
3285 "movq %%mm5, %%mm3 \n\t"
3286 "movq (%%esi,%%ecx,), %%mm1 \n\t"
3287 "addl $8, %%ecx \n\t"
3288            "pand %%mm1, %%mm3            \n\t" /* get lsb for each prev_row byte */
3289            "psrlq $1, %%mm1              \n\t" /* divide prev_row bytes by 2 */
3290            "pand %%mm2, %%mm3            \n\t" /* get LBCarrys for each byte */
3291                                                /*  where both lsb's were == 1 */
3292            "psrlq $1, %%mm2              \n\t" /* divide raw bytes by 2 */
3293            "pand %%mm4, %%mm1            \n\t" /* clear invalid bit 7, each byte */
3294            "paddb %%mm3, %%mm0           \n\t" /* add LBCarrys to Avg, each byte */
3295            "pand %%mm4, %%mm2            \n\t" /* clear invalid bit 7, each byte */
3296            "paddb %%mm1, %%mm0           \n\t" /* add (Prev_row/2) to Avg, each */
3297            "paddb %%mm2, %%mm0           \n\t" /* add (Raw/2) to Avg for each */
3298 "cmpl _MMXLength, %%ecx \n\t"
3299 "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
3300            "movq %%mm0, %%mm2            \n\t" /* reuse as Raw(x-bpp) */
3301 "jb avg_8lp \n\t"
3302
3303            : "=S" (dummy_value_S),       /* output regs (dummy) */
3304 "=D" (dummy_value_D)
3305
3306            : "0" (prev_row),             /* esi // input regs */
3307              "1" (row)                   /* edi */
3308
3309            : "%ecx"                      /* clobber list */
3310#if 0 /* %mm0, ..., %mm5 not supported by gcc 2.7.2.3 or egcs 1.1 */
3311 , "%mm0", "%mm1", "%mm2"
3312 , "%mm3", "%mm4", "%mm5"
3313#endif
3314 );
3315 }
3316      break;  /* end 8 bpp */
3317
3318      default:                  /* bpp greater than 8 (!= 1,2,3,4,[5],6,[7],8) */
3319 {
3320
3321#ifdef PNG_DEBUG
3322         /* GRR:  PRINT ERROR HERE:  SHOULD NEVER BE REACHED */
3323 png_debug(1,
3324 "Internal logic error in pnggccrd (png_read_filter_row_mmx_avg())\n");
3325#endif
3326
3327#if 0
3328 __asm__ __volatile__ (
3329 "movq _LBCarryMask, %%mm5 \n\t"
3330            /* re-init address pointers and offset */
3331            "movl _dif, %%ebx             \n\t" /* ebx:  x = offset to */
3332                                                /*  alignment boundary */
3333            "movl row, %%edi              \n\t" /* edi:  Avg(x) */
3334 "movq _HBClearMask, %%mm4 \n\t"
3335 "movl %%edi, %%edx \n\t"
3336            "movl prev_row, %%esi         \n\t" /* esi:  Prior(x) */
3337            "subl bpp, %%edx              \n\t" /* edx:  Raw(x-bpp) */
3338 "avg_Alp: \n\t"
3339 "movq (%%edi,%%ebx,), %%mm0 \n\t"
3340 "movq %%mm5, %%mm3 \n\t"
3341 "movq (%%esi,%%ebx,), %%mm1 \n\t"
3342            "pand %%mm1, %%mm3            \n\t" /* get lsb for each prev_row byte */
3343            "movq (%%edx,%%ebx,), %%mm2   \n\t"
3344            "psrlq $1, %%mm1              \n\t" /* divide prev_row bytes by 2 */
3345            "pand %%mm2, %%mm3            \n\t" /* get LBCarrys for each byte */
3346                                                /*  where both lsb's were == 1 */
3347            "psrlq $1, %%mm2              \n\t" /* divide raw bytes by 2 */
3348            "pand %%mm4, %%mm1            \n\t" /* clear invalid bit 7 of each */
3349                                                /*  byte */
3350            "paddb %%mm3, %%mm0           \n\t" /* add LBCarrys to Avg for each */
3351                                                /*  byte */
3352            "pand %%mm4, %%mm2            \n\t" /* clear invalid bit 7 of each */
3353                                                /*  byte */
3354            "paddb %%mm1, %%mm0           \n\t" /* add (Prev_row/2) to Avg for */
3355                                                /*  each byte */
3356            "addl $8, %%ebx               \n\t"
3357            "paddb %%mm2, %%mm0           \n\t" /* add (Raw/2) to Avg for each */
3358                                                /*  byte */
3359 "cmpl _MMXLength, %%ebx \n\t"
3360 "movq %%mm0, -8(%%edi,%%ebx,) \n\t"
3361 "jb avg_Alp \n\t"
3362
7f88f624 3363 : /* FIXASM: output regs/vars go here, e.g.: "=m" (memory_var) */
c6b71bff 3364
7f88f624 3365 : /* FIXASM: input regs, e.g.: "c" (count), "S" (src), "D" (dest) */
c6b71bff 3366
7f88f624 3367 : "%ebx", "%edx", "%edi", "%esi" /* CHECKASM: clobber list */
c6b71bff
GD
3368 );
3369#endif /* 0 - NEVER REACHED */
3370 }
3371 break;
3372
7f88f624 3373 } /* end switch (bpp) */
c6b71bff
GD
3374
3375 __asm__ __volatile__ (
7f88f624
VZ
3376 /* MMX acceleration complete; now do clean-up */
3377 /* check if any remaining bytes left to decode */
c6b71bff 3378#ifdef __PIC__
7f88f624 3379 "pushl %%ebx \n\t" /* save index to Global Offset Table */
c6b71bff 3380#endif
7f88f624
VZ
3381 "movl _MMXLength, %%ebx \n\t" /* ebx: x == offset bytes after MMX */
3382/* pre "movl row, %%edi \n\t" */ /* edi: Avg(x) */
3383 "cmpl _FullLength, %%ebx \n\t" /* test if offset at end of array */
c6b71bff
GD
3384 "jnb avg_end \n\t"
3385
7f88f624
VZ
3386 /* do Avg decode for remaining bytes */
3387/*pre "movl prev_row, %%esi \n\t" */ /* esi: Prior(x) */
c6b71bff 3388 "movl %%edi, %%edx \n\t"
7f88f624
VZ
3389/*pre "subl bpp, %%edx \n\t" */ /* (bpp is preloaded into ecx) */
3390 "subl %%ecx, %%edx \n\t" /* edx: Raw(x-bpp) */
3391 "xorl %%ecx, %%ecx \n\t" /* zero ecx before using cl & cx below */
c6b71bff
GD
3392
3393 "avg_lp2: \n\t"
7f88f624 3394 /* Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2) */
c6b71bff 3395 "xorl %%eax, %%eax \n\t"
7f88f624
VZ
3396 "movb (%%esi,%%ebx,), %%cl \n\t" /* load cl with Prior(x) */
3397 "movb (%%edx,%%ebx,), %%al \n\t" /* load al with Raw(x-bpp) */
c6b71bff
GD
3398 "addw %%cx, %%ax \n\t"
3399 "incl %%ebx \n\t"
7f88f624
VZ
3400 "shrw %%ax \n\t" /* divide by 2 */
3401 "addb -1(%%edi,%%ebx,), %%al \n\t" /* add Avg(x); -1 to offset inc ebx */
3402 "cmpl _FullLength, %%ebx \n\t" /* check if at end of array */
3403 "movb %%al, -1(%%edi,%%ebx,) \n\t" /* write back Raw(x) [mov does not */
3404 "jb avg_lp2 \n\t" /* affect flags; -1 to offset inc ebx] */
c6b71bff
GD
3405
3406 "avg_end: \n\t"
7f88f624 3407 "EMMS \n\t" /* end MMX; prep for poss. FP instrs. */
c6b71bff 3408#ifdef __PIC__
7f88f624 3409 "popl %%ebx \n\t" /* restore index to Global Offset Table */
c6b71bff
GD
3410#endif
3411
7f88f624 3412 : "=c" (dummy_value_c), /* output regs (dummy) */
c6b71bff
GD
3413 "=S" (dummy_value_S),
3414 "=D" (dummy_value_D)
3415
7f88f624
VZ
3416 : "0" (bpp), /* ecx // input regs */
3417 "1" (prev_row), /* esi */
3418 "2" (row) /* edi */
c6b71bff 3419
7f88f624 3420 : "%eax", "%edx" /* clobber list */
c6b71bff
GD
3421#ifndef __PIC__
3422 , "%ebx"
3423#endif
3424 );
3425
3426} /* end png_read_filter_row_mmx_avg() */
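/* For reference, a straight-C sketch of the whole Avg decode that the MMX
 * cases above accelerate; row, prev_row, rowbytes and bpp are hypothetical
 * locals here, and for the first bpp bytes Raw(x-bpp) is 0.  Illustrative
 * only, never compiled:
 */
#if 0
static void
avg_row_sketch(unsigned char *row, const unsigned char *prev_row,
   png_uint_32 rowbytes, int bpp)
{
   png_uint_32 x;

   for (x = 0; x < rowbytes; x++)
   {
      int raw_prev = (x < (png_uint_32)bpp)? 0 : row[x - bpp];

      /* Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2), truncated */
      row[x] = (unsigned char)(row[x] + ((raw_prev + prev_row[x]) >> 1));
   }
}
#endif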
3427#endif
3428
3429
3430
3431#ifdef PNG_THREAD_UNSAFE_OK
7f88f624
VZ
3432/*===========================================================================*/
3433/* */
3434/* P N G _ R E A D _ F I L T E R _ R O W _ M M X _ P A E T H */
3435/* */
3436/*===========================================================================*/
c6b71bff 3437
7f88f624 3438/* Optimized code for PNG Paeth filter decoder */
c6b71bff
GD
3439
3440static void /* PRIVATE */
3441png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
3442 png_bytep prev_row)
3443{
3444 int bpp;
7f88f624 3445 int dummy_value_c; /* fix 'forbidden register 2 (cx) was spilled' error */
c6b71bff
GD
3446 int dummy_value_S;
3447 int dummy_value_D;
3448
7f88f624
VZ
3449 bpp = (row_info->pixel_depth + 7) >> 3; /* Get # bytes per pixel */
3450 _FullLength = row_info->rowbytes; /* # of bytes to filter */
c6b71bff
GD
3451
3452 __asm__ __volatile__ (
3453#ifdef __PIC__
7f88f624 3454 "pushl %%ebx \n\t" /* save index to Global Offset Table */
c6b71bff 3455#endif
7f88f624
VZ
3456 "xorl %%ebx, %%ebx \n\t" /* ebx: x offset */
3457/*pre "movl row, %%edi \n\t" */
3458 "xorl %%edx, %%edx \n\t" /* edx: x-bpp offset */
3459/*pre "movl prev_row, %%esi \n\t" */
c6b71bff
GD
3460 "xorl %%eax, %%eax \n\t"
3461
7f88f624
VZ
3462 /* Compute the Raw value for the first bpp bytes */
3463 /* Note: the formula works out to be always */
3464 /* Raw(x) = Paeth(x) + Prior(x) where x < bpp */
c6b71bff
GD
3465 "paeth_rlp: \n\t"
3466 "movb (%%edi,%%ebx,), %%al \n\t"
3467 "addb (%%esi,%%ebx,), %%al \n\t"
3468 "incl %%ebx \n\t"
7f88f624 3469/*pre "cmpl bpp, %%ebx \n\t" (bpp is preloaded into ecx) */
c6b71bff
GD
3470 "cmpl %%ecx, %%ebx \n\t"
3471 "movb %%al, -1(%%edi,%%ebx,) \n\t"
3472 "jb paeth_rlp \n\t"
7f88f624
VZ
3473 /* get # of bytes to alignment */
3474 "movl %%edi, _dif \n\t" /* take start of row */
3475 "addl %%ebx, _dif \n\t" /* add bpp */
c6b71bff 3476 "xorl %%ecx, %%ecx \n\t"
7f88f624
VZ
3477 "addl $0xf, _dif \n\t" /* add 7 + 8 to incr past alignment */
3478 /* boundary */
3479 "andl $0xfffffff8, _dif \n\t" /* mask to alignment boundary */
3480 "subl %%edi, _dif \n\t" /* subtract from start ==> value ebx */
3481 /* at alignment */
c6b71bff 3482 "jz paeth_go \n\t"
7f88f624 3483 /* fix alignment */
c6b71bff
GD
3484
3485 "paeth_lp1: \n\t"
3486 "xorl %%eax, %%eax \n\t"
7f88f624
VZ
3487 /* pav = p - a = (a + b - c) - a = b - c */
3488 "movb (%%esi,%%ebx,), %%al \n\t" /* load Prior(x) into al */
3489 "movb (%%esi,%%edx,), %%cl \n\t" /* load Prior(x-bpp) into cl */
3490 "subl %%ecx, %%eax \n\t" /* subtract Prior(x-bpp) */
3491 "movl %%eax, _patemp \n\t" /* Save pav for later use */
c6b71bff 3492 "xorl %%eax, %%eax \n\t"
7f88f624
VZ
3493 /* pbv = p - b = (a + b - c) - b = a - c */
3494 "movb (%%edi,%%edx,), %%al \n\t" /* load Raw(x-bpp) into al */
3495 "subl %%ecx, %%eax \n\t" /* subtract Prior(x-bpp) */
c6b71bff 3496 "movl %%eax, %%ecx \n\t"
7f88f624
VZ
3497 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
3498 "addl _patemp, %%eax \n\t" /* pcv = pav + pbv */
3499 /* pc = abs(pcv) */
c6b71bff
GD
3500 "testl $0x80000000, %%eax \n\t"
3501 "jz paeth_pca \n\t"
7f88f624 3502 "negl %%eax \n\t" /* reverse sign of neg values */
c6b71bff
GD
3503
3504 "paeth_pca: \n\t"
7f88f624
VZ
3505 "movl %%eax, _pctemp \n\t" /* save pc for later use */
3506 /* pb = abs(pbv) */
c6b71bff
GD
3507 "testl $0x80000000, %%ecx \n\t"
3508 "jz paeth_pba \n\t"
7f88f624 3509 "negl %%ecx \n\t" /* reverse sign of neg values */
c6b71bff
GD
3510
3511 "paeth_pba: \n\t"
7f88f624
VZ
3512 "movl %%ecx, _pbtemp \n\t" /* save pb for later use */
3513 /* pa = abs(pav) */
c6b71bff
GD
3514 "movl _patemp, %%eax \n\t"
3515 "testl $0x80000000, %%eax \n\t"
3516 "jz paeth_paa \n\t"
7f88f624 3517 "negl %%eax \n\t" /* reverse sign of neg values */
c6b71bff
GD
3518
3519 "paeth_paa: \n\t"
7f88f624
VZ
3520 "movl %%eax, _patemp \n\t" /* save pa for later use */
3521 /* test if pa <= pb */
c6b71bff
GD
3522 "cmpl %%ecx, %%eax \n\t"
3523 "jna paeth_abb \n\t"
7f88f624 3524 /* pa > pb; now test if pb <= pc */
c6b71bff
GD
3525 "cmpl _pctemp, %%ecx \n\t"
3526 "jna paeth_bbc \n\t"
7f88f624
VZ
3527 /* pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp) */
3528 "movb (%%esi,%%edx,), %%cl \n\t" /* load Prior(x-bpp) into cl */
c6b71bff
GD
3529 "jmp paeth_paeth \n\t"
3530
3531 "paeth_bbc: \n\t"
7f88f624
VZ
3532 /* pb <= pc; Raw(x) = Paeth(x) + Prior(x) */
3533 "movb (%%esi,%%ebx,), %%cl \n\t" /* load Prior(x) into cl */
c6b71bff
GD
3534 "jmp paeth_paeth \n\t"
3535
3536 "paeth_abb: \n\t"
7f88f624 3537 /* pa <= pb; now test if pa <= pc */
c6b71bff
GD
3538 "cmpl _pctemp, %%eax \n\t"
3539 "jna paeth_abc \n\t"
7f88f624
VZ
3540 /* pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp) */
3541 "movb (%%esi,%%edx,), %%cl \n\t" /* load Prior(x-bpp) into cl */
c6b71bff
GD
3542 "jmp paeth_paeth \n\t"
3543
3544 "paeth_abc: \n\t"
7f88f624
VZ
3545 /* pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp) */
3546 "movb (%%edi,%%edx,), %%cl \n\t" /* load Raw(x-bpp) into cl */
c6b71bff
GD
3547
3548 "paeth_paeth: \n\t"
3549 "incl %%ebx \n\t"
3550 "incl %%edx \n\t"
7f88f624 3551 /* Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256 */
c6b71bff
GD
3552 "addb %%cl, -1(%%edi,%%ebx,) \n\t"
3553 "cmpl _dif, %%ebx \n\t"
3554 "jb paeth_lp1 \n\t"
3555
3556 "paeth_go: \n\t"
3557 "movl _FullLength, %%ecx \n\t"
3558 "movl %%ecx, %%eax \n\t"
7f88f624
VZ
3559 "subl %%ebx, %%eax \n\t" /* subtract alignment fix */
3560 "andl $0x00000007, %%eax \n\t" /* calc bytes over mult of 8 */
3561 "subl %%eax, %%ecx \n\t" /* drop over bytes from original length */
c6b71bff
GD
3562 "movl %%ecx, _MMXLength \n\t"
3563#ifdef __PIC__
7f88f624 3564 "popl %%ebx \n\t" /* restore index to Global Offset Table */
c6b71bff
GD
3565#endif
3566
7f88f624 3567 : "=c" (dummy_value_c), /* output regs (dummy) */
c6b71bff
GD
3568 "=S" (dummy_value_S),
3569 "=D" (dummy_value_D)
3570
7f88f624
VZ
3571 : "0" (bpp), /* ecx // input regs */
3572 "1" (prev_row), /* esi */
3573 "2" (row) /* edi */
c6b71bff 3574
7f88f624 3575 : "%eax", "%edx" /* clobber list */
c6b71bff
GD
3576#ifndef __PIC__
3577 , "%ebx"
3578#endif
3579 );
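   /* Why the first-bpp loop above may always add Prior(x): for x < bpp,
    * a = Raw(x-bpp) and c = Prior(x-bpp) are both 0, so pa = |b-c| = b,
    * pb = |a-c| = 0 and pc = |a+b-2c| = b; pb <= pa and pb <= pc, so the
    * Paeth predictor always selects b = Prior(x).  Sketch, with b a
    * hypothetical Prior(x) byte (illustrative only, never compiled):
    */
#if 0
   {
      int a = 0, c = 0;
      int pa = b, pb = 0, pc = b;   /* the absolute differences above */
      int pred = (pa <= pb && pa <= pc)? a : (pb <= pc)? b : c;
      /* b > 0 gives pred == b; b == 0 gives pred == a == 0 == b */
   }
#endif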
3580
7f88f624 3581 /* now do the math for the rest of the row */
c6b71bff
GD
3582 switch (bpp)
3583 {
3584 case 3:
3585 {
3586 _ActiveMask.use = 0x0000000000ffffffLL;
3587 _ActiveMaskEnd.use = 0xffff000000000000LL;
7f88f624
VZ
3588 _ShiftBpp.use = 24; /* == bpp(3) * 8 */
3589 _ShiftRem.use = 40; /* == 64 - 24 */
c6b71bff
GD
3590
3591 __asm__ __volatile__ (
3592 "movl _dif, %%ecx \n\t"
7f88f624
VZ
3593/* preload "movl row, %%edi \n\t" */
3594/* preload "movl prev_row, %%esi \n\t" */
c6b71bff 3595 "pxor %%mm0, %%mm0 \n\t"
7f88f624 3596 /* prime the pump: load the first Raw(x-bpp) data set */
c6b71bff
GD
3597 "movq -8(%%edi,%%ecx,), %%mm1 \n\t"
3598 "paeth_3lp: \n\t"
7f88f624
VZ
3599 "psrlq _ShiftRem, %%mm1 \n\t" /* shift last 3 bytes to 1st */
3600 /* 3 bytes */
3601 "movq (%%esi,%%ecx,), %%mm2 \n\t" /* load b=Prior(x) */
3602 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack High bytes of a */
3603 "movq -8(%%esi,%%ecx,), %%mm3 \n\t" /* prep c=Prior(x-bpp) bytes */
3604 "punpcklbw %%mm0, %%mm2 \n\t" /* unpack High bytes of b */
3605 "psrlq _ShiftRem, %%mm3 \n\t" /* shift last 3 bytes to 1st */
3606 /* 3 bytes */
3607 /* pav = p - a = (a + b - c) - a = b - c */
c6b71bff 3608 "movq %%mm2, %%mm4 \n\t"
7f88f624
VZ
3609 "punpcklbw %%mm0, %%mm3 \n\t" /* unpack High bytes of c */
3610 /* pbv = p - b = (a + b - c) - b = a - c */
c6b71bff
GD
3611 "movq %%mm1, %%mm5 \n\t"
3612 "psubw %%mm3, %%mm4 \n\t"
3613 "pxor %%mm7, %%mm7 \n\t"
7f88f624 3614 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
c6b71bff
GD
3615 "movq %%mm4, %%mm6 \n\t"
3616 "psubw %%mm3, %%mm5 \n\t"
3617
7f88f624
VZ
3618 /* pa = abs(p-a) = abs(pav) */
3619 /* pb = abs(p-b) = abs(pbv) */
3620 /* pc = abs(p-c) = abs(pcv) */
3621 "pcmpgtw %%mm4, %%mm0 \n\t" /* create mask pav bytes < 0 */
c6b71bff 3622 "paddw %%mm5, %%mm6 \n\t"
7f88f624
VZ
3623 "pand %%mm4, %%mm0 \n\t" /* only pav bytes < 0 in mm7 */
3624 "pcmpgtw %%mm5, %%mm7 \n\t" /* create mask pbv bytes < 0 */
c6b71bff 3625 "psubw %%mm0, %%mm4 \n\t"
7f88f624 3626 "pand %%mm5, %%mm7 \n\t" /* only pbv bytes < 0 in mm0 */
c6b71bff
GD
3627 "psubw %%mm0, %%mm4 \n\t"
3628 "psubw %%mm7, %%mm5 \n\t"
3629 "pxor %%mm0, %%mm0 \n\t"
7f88f624
VZ
3630 "pcmpgtw %%mm6, %%mm0 \n\t" /* create mask pcv bytes < 0 */
3631 "pand %%mm6, %%mm0 \n\t" /* only pav bytes < 0 in mm7 */
c6b71bff
GD
3632 "psubw %%mm7, %%mm5 \n\t"
3633 "psubw %%mm0, %%mm6 \n\t"
7f88f624 3634 /* test pa <= pb */
c6b71bff
GD
3635 "movq %%mm4, %%mm7 \n\t"
3636 "psubw %%mm0, %%mm6 \n\t"
7f88f624 3637 "pcmpgtw %%mm5, %%mm7 \n\t" /* pa > pb? */
c6b71bff 3638 "movq %%mm7, %%mm0 \n\t"
7f88f624 3639 /* use mm7 mask to merge pa & pb */
c6b71bff 3640 "pand %%mm7, %%mm5 \n\t"
7f88f624 3641 /* use mm0 mask copy to merge a & b */
c6b71bff
GD
3642 "pand %%mm0, %%mm2 \n\t"
3643 "pandn %%mm4, %%mm7 \n\t"
3644 "pandn %%mm1, %%mm0 \n\t"
3645 "paddw %%mm5, %%mm7 \n\t"
3646 "paddw %%mm2, %%mm0 \n\t"
7f88f624
VZ
3647 /* test ((pa <= pb)? pa:pb) <= pc */
3648 "pcmpgtw %%mm6, %%mm7 \n\t" /* pab > pc? */
c6b71bff
GD
3649 "pxor %%mm1, %%mm1 \n\t"
3650 "pand %%mm7, %%mm3 \n\t"
3651 "pandn %%mm0, %%mm7 \n\t"
3652 "paddw %%mm3, %%mm7 \n\t"
3653 "pxor %%mm0, %%mm0 \n\t"
3654 "packuswb %%mm1, %%mm7 \n\t"
7f88f624 3655 "movq (%%esi,%%ecx,), %%mm3 \n\t" /* load c=Prior(x-bpp) */
c6b71bff 3656 "pand _ActiveMask, %%mm7 \n\t"
7f88f624
VZ
3657 "movq %%mm3, %%mm2 \n\t" /* load b=Prior(x) step 1 */
3658 "paddb (%%edi,%%ecx,), %%mm7 \n\t" /* add Paeth predictor with Raw(x) */
3659 "punpcklbw %%mm0, %%mm3 \n\t" /* unpack High bytes of c */
3660 "movq %%mm7, (%%edi,%%ecx,) \n\t" /* write back updated value */
3661 "movq %%mm7, %%mm1 \n\t" /* now mm1 will be used as */
3662 /* Raw(x-bpp) */
3663 /* now do Paeth for 2nd set of bytes (3-5) */
3664 "psrlq _ShiftBpp, %%mm2 \n\t" /* load b=Prior(x) step 2 */
3665 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack High bytes of a */
c6b71bff 3666 "pxor %%mm7, %%mm7 \n\t"
7f88f624
VZ
3667 "punpcklbw %%mm0, %%mm2 \n\t" /* unpack High bytes of b */
3668 /* pbv = p - b = (a + b - c) - b = a - c */
c6b71bff 3669 "movq %%mm1, %%mm5 \n\t"
7f88f624 3670 /* pav = p - a = (a + b - c) - a = b - c */
c6b71bff
GD
3671 "movq %%mm2, %%mm4 \n\t"
3672 "psubw %%mm3, %%mm5 \n\t"
3673 "psubw %%mm3, %%mm4 \n\t"
7f88f624
VZ
3674 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = */
3675 /* pav + pbv = pbv + pav */
c6b71bff
GD
3676 "movq %%mm5, %%mm6 \n\t"
3677 "paddw %%mm4, %%mm6 \n\t"
3678
7f88f624
VZ
3679 /* pa = abs(p-a) = abs(pav) */
3680 /* pb = abs(p-b) = abs(pbv) */
3681 /* pc = abs(p-c) = abs(pcv) */
3682 "pcmpgtw %%mm5, %%mm0 \n\t" /* create mask pbv bytes < 0 */
3683 "pcmpgtw %%mm4, %%mm7 \n\t" /* create mask pav bytes < 0 */
3684 "pand %%mm5, %%mm0 \n\t" /* only pbv bytes < 0 in mm0 */
3685 "pand %%mm4, %%mm7 \n\t" /* only pav bytes < 0 in mm7 */
c6b71bff
GD
3686 "psubw %%mm0, %%mm5 \n\t"
3687 "psubw %%mm7, %%mm4 \n\t"
3688 "psubw %%mm0, %%mm5 \n\t"
3689 "psubw %%mm7, %%mm4 \n\t"
3690 "pxor %%mm0, %%mm0 \n\t"
7f88f624
VZ
3691 "pcmpgtw %%mm6, %%mm0 \n\t" /* create mask pcv bytes < 0 */
3692 "pand %%mm6, %%mm0 \n\t" /* only pav bytes < 0 in mm7 */
c6b71bff 3693 "psubw %%mm0, %%mm6 \n\t"
7f88f624 3694 /* test pa <= pb */
c6b71bff
GD
3695 "movq %%mm4, %%mm7 \n\t"
3696 "psubw %%mm0, %%mm6 \n\t"
7f88f624 3697 "pcmpgtw %%mm5, %%mm7 \n\t" /* pa > pb? */
c6b71bff 3698 "movq %%mm7, %%mm0 \n\t"
7f88f624 3699 /* use mm7 mask to merge pa & pb */
c6b71bff 3700 "pand %%mm7, %%mm5 \n\t"
7f88f624 3701 /* use mm0 mask copy to merge a & b */
c6b71bff
GD
3702 "pand %%mm0, %%mm2 \n\t"
3703 "pandn %%mm4, %%mm7 \n\t"
3704 "pandn %%mm1, %%mm0 \n\t"
3705 "paddw %%mm5, %%mm7 \n\t"
3706 "paddw %%mm2, %%mm0 \n\t"
7f88f624
VZ
3707 /* test ((pa <= pb)? pa:pb) <= pc */
3708 "pcmpgtw %%mm6, %%mm7 \n\t" /* pab > pc? */
3709 "movq (%%esi,%%ecx,), %%mm2 \n\t" /* load b=Prior(x) */
c6b71bff
GD
3710 "pand %%mm7, %%mm3 \n\t"
3711 "pandn %%mm0, %%mm7 \n\t"
3712 "pxor %%mm1, %%mm1 \n\t"
3713 "paddw %%mm3, %%mm7 \n\t"
3714 "pxor %%mm0, %%mm0 \n\t"
3715 "packuswb %%mm1, %%mm7 \n\t"
7f88f624 3716 "movq %%mm2, %%mm3 \n\t" /* load c=Prior(x-bpp) step 1 */
c6b71bff 3717 "pand _ActiveMask, %%mm7 \n\t"
7f88f624
VZ
3718 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack High bytes of b */
3719 "psllq _ShiftBpp, %%mm7 \n\t" /* shift bytes to 2nd group of */
3720 /* 3 bytes */
3721 /* pav = p - a = (a + b - c) - a = b - c */
c6b71bff 3722 "movq %%mm2, %%mm4 \n\t"
7f88f624
VZ
3723 "paddb (%%edi,%%ecx,), %%mm7 \n\t" /* add Paeth predictor with Raw(x) */
3724 "psllq _ShiftBpp, %%mm3 \n\t" /* load c=Prior(x-bpp) step 2 */
3725 "movq %%mm7, (%%edi,%%ecx,) \n\t" /* write back updated value */
c6b71bff 3726 "movq %%mm7, %%mm1 \n\t"
7f88f624
VZ
3727 "punpckhbw %%mm0, %%mm3 \n\t" /* unpack High bytes of c */
3728 "psllq _ShiftBpp, %%mm1 \n\t" /* shift bytes */
3729 /* now mm1 will be used as Raw(x-bpp) */
3730 /* now do Paeth for 3rd, and final, set of bytes (6-7) */
c6b71bff 3731 "pxor %%mm7, %%mm7 \n\t"
7f88f624 3732 "punpckhbw %%mm0, %%mm1 \n\t" /* unpack High bytes of a */
c6b71bff 3733 "psubw %%mm3, %%mm4 \n\t"
7f88f624 3734 /* pbv = p - b = (a + b - c) - b = a - c */
c6b71bff 3735 "movq %%mm1, %%mm5 \n\t"
7f88f624 3736 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
c6b71bff
GD
3737 "movq %%mm4, %%mm6 \n\t"
3738 "psubw %%mm3, %%mm5 \n\t"
3739 "pxor %%mm0, %%mm0 \n\t"
3740 "paddw %%mm5, %%mm6 \n\t"
3741
7f88f624
VZ
3742 /* pa = abs(p-a) = abs(pav) */
3743 /* pb = abs(p-b) = abs(pbv) */
3744 /* pc = abs(p-c) = abs(pcv) */
3745 "pcmpgtw %%mm4, %%mm0 \n\t" /* create mask pav bytes < 0 */
3746 "pcmpgtw %%mm5, %%mm7 \n\t" /* create mask pbv bytes < 0 */
3747 "pand %%mm4, %%mm0 \n\t" /* only pav bytes < 0 in mm7 */
3748 "pand %%mm5, %%mm7 \n\t" /* only pbv bytes < 0 in mm0 */
c6b71bff
GD
3749 "psubw %%mm0, %%mm4 \n\t"
3750 "psubw %%mm7, %%mm5 \n\t"
3751 "psubw %%mm0, %%mm4 \n\t"
3752 "psubw %%mm7, %%mm5 \n\t"
3753 "pxor %%mm0, %%mm0 \n\t"
7f88f624
VZ
3754 "pcmpgtw %%mm6, %%mm0 \n\t" /* create mask pcv bytes < 0 */
3755 "pand %%mm6, %%mm0 \n\t" /* only pav bytes < 0 in mm7 */
c6b71bff 3756 "psubw %%mm0, %%mm6 \n\t"
7f88f624 3757 /* test pa <= pb */
c6b71bff
GD
3758 "movq %%mm4, %%mm7 \n\t"
3759 "psubw %%mm0, %%mm6 \n\t"
7f88f624 3760 "pcmpgtw %%mm5, %%mm7 \n\t" /* pa > pb? */
c6b71bff 3761 "movq %%mm7, %%mm0 \n\t"
7f88f624 3762 /* use mm0 mask copy to merge a & b */
c6b71bff 3763 "pand %%mm0, %%mm2 \n\t"
7f88f624 3764 /* use mm7 mask to merge pa & pb */
c6b71bff
GD
3765 "pand %%mm7, %%mm5 \n\t"
3766 "pandn %%mm1, %%mm0 \n\t"
3767 "pandn %%mm4, %%mm7 \n\t"
3768 "paddw %%mm2, %%mm0 \n\t"
3769 "paddw %%mm5, %%mm7 \n\t"
7f88f624
VZ
3770 /* test ((pa <= pb)? pa:pb) <= pc */
3771 "pcmpgtw %%mm6, %%mm7 \n\t" /* pab > pc? */
c6b71bff
GD
3772 "pand %%mm7, %%mm3 \n\t"
3773 "pandn %%mm0, %%mm7 \n\t"
3774 "paddw %%mm3, %%mm7 \n\t"
3775 "pxor %%mm1, %%mm1 \n\t"
3776 "packuswb %%mm7, %%mm1 \n\t"
7f88f624 3777 /* step ecx to next set of 8 bytes and repeat loop til done */
c6b71bff
GD
3778 "addl $8, %%ecx \n\t"
3779 "pand _ActiveMaskEnd, %%mm1 \n\t"
7f88f624
VZ
3780 "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" /* add Paeth predictor with */
3781 /* Raw(x) */
c6b71bff
GD
3782
3783 "cmpl _MMXLength, %%ecx \n\t"
7f88f624
VZ
3784 "pxor %%mm0, %%mm0 \n\t" /* pxor does not affect flags */
3785 "movq %%mm1, -8(%%edi,%%ecx,) \n\t" /* write back updated value */
3786 /* mm1 will be used as Raw(x-bpp) next loop */
3787 /* mm3 ready to be used as Prior(x-bpp) next loop */
c6b71bff
GD
3788 "jb paeth_3lp \n\t"
3789
7f88f624 3790 : "=S" (dummy_value_S), /* output regs (dummy) */
c6b71bff
GD
3791 "=D" (dummy_value_D)
3792
7f88f624
VZ
3793 : "0" (prev_row), /* esi // input regs */
3794 "1" (row) /* edi */
c6b71bff 3795
7f88f624 3796 : "%ecx" /* clobber list */
c6b71bff
GD
3797#if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
3798 , "%mm0", "%mm1", "%mm2", "%mm3"
3799 , "%mm4", "%mm5", "%mm6", "%mm7"
3800#endif
3801 );
3802 }
7f88f624 3803 break; /* end 3 bpp */
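      /* The word-wise tricks used throughout the cases above, in scalar
       * form: abs() via a pcmpgtw sign mask subtracted twice, and a
       * branchless select via pand/pandn.  v, x, y and sel are
       * hypothetical 16-bit values (illustrative only, never compiled):
       */
#if 0
      {
         short mask = (short)((0 > v)? -1 : 0); /* pcmpgtw: all-ones if v < 0 */
         short neg  = (short)(v & mask);        /* pand: v where negative */
         short av   = (short)(v - neg - neg);   /* two psubw's: abs(v) */
         short sel  = (short)((x & mask) | (y & ~mask)); /* pand/pandn merge */
      }
#endif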
c6b71bff
GD
3804
3805 case 6:
7f88f624
VZ
3806 /*case 7:*/ /* GRR BOGUS */
3807 /*case 5:*/ /* GRR BOGUS */
c6b71bff
GD
3808 {
3809 _ActiveMask.use = 0x00000000ffffffffLL;
3810 _ActiveMask2.use = 0xffffffff00000000LL;
7f88f624 3811 _ShiftBpp.use = bpp << 3; /* == bpp * 8 */
c6b71bff
GD
3812 _ShiftRem.use = 64 - _ShiftBpp.use;
3813
3814 __asm__ __volatile__ (
3815 "movl _dif, %%ecx \n\t"
7f88f624
VZ
3816/* preload "movl row, %%edi \n\t" */
3817/* preload "movl prev_row, %%esi \n\t" */
3818 /* prime the pump: load the first Raw(x-bpp) data set */
c6b71bff
GD
3819 "movq -8(%%edi,%%ecx,), %%mm1 \n\t"
3820 "pxor %%mm0, %%mm0 \n\t"
3821
3822 "paeth_6lp: \n\t"
7f88f624 3823 /* must shift to position Raw(x-bpp) data */
c6b71bff 3824 "psrlq _ShiftRem, %%mm1 \n\t"
7f88f624
VZ
3825 /* do first set of 4 bytes */
3826 "movq -8(%%esi,%%ecx,), %%mm3 \n\t" /* read c=Prior(x-bpp) bytes */
3827 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack Low bytes of a */
3828 "movq (%%esi,%%ecx,), %%mm2 \n\t" /* load b=Prior(x) */
3829 "punpcklbw %%mm0, %%mm2 \n\t" /* unpack Low bytes of b */
3830 /* must shift to position Prior(x-bpp) data */
c6b71bff 3831 "psrlq _ShiftRem, %%mm3 \n\t"
7f88f624 3832 /* pav = p - a = (a + b - c) - a = b - c */
c6b71bff 3833 "movq %%mm2, %%mm4 \n\t"
7f88f624
VZ
3834 "punpcklbw %%mm0, %%mm3 \n\t" /* unpack Low bytes of c */
3835 /* pbv = p - b = (a + b - c) - b = a - c */
c6b71bff
GD
3836 "movq %%mm1, %%mm5 \n\t"
3837 "psubw %%mm3, %%mm4 \n\t"
3838 "pxor %%mm7, %%mm7 \n\t"
7f88f624 3839 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
c6b71bff
GD
3840 "movq %%mm4, %%mm6 \n\t"
3841 "psubw %%mm3, %%mm5 \n\t"
7f88f624
VZ
3842 /* pa = abs(p-a) = abs(pav) */
3843 /* pb = abs(p-b) = abs(pbv) */
3844 /* pc = abs(p-c) = abs(pcv) */
3845 "pcmpgtw %%mm4, %%mm0 \n\t" /* create mask pav bytes < 0 */
c6b71bff 3846 "paddw %%mm5, %%mm6 \n\t"
7f88f624
VZ
3847 "pand %%mm4, %%mm0 \n\t" /* only pav bytes < 0 in mm7 */
3848 "pcmpgtw %%mm5, %%mm7 \n\t" /* create mask pbv bytes < 0 */
c6b71bff 3849 "psubw %%mm0, %%mm4 \n\t"
7f88f624 3850 "pand %%mm5, %%mm7 \n\t" /* only pbv bytes < 0 in mm0 */
c6b71bff
GD
3851 "psubw %%mm0, %%mm4 \n\t"
3852 "psubw %%mm7, %%mm5 \n\t"
3853 "pxor %%mm0, %%mm0 \n\t"
7f88f624
VZ
3854 "pcmpgtw %%mm6, %%mm0 \n\t" /* create mask pcv bytes < 0 */
3855 "pand %%mm6, %%mm0 \n\t" /* only pav bytes < 0 in mm7 */
c6b71bff
GD
3856 "psubw %%mm7, %%mm5 \n\t"
3857 "psubw %%mm0, %%mm6 \n\t"
7f88f624 3858 /* test pa <= pb */
c6b71bff
GD
3859 "movq %%mm4, %%mm7 \n\t"
3860 "psubw %%mm0, %%mm6 \n\t"
7f88f624 3861 "pcmpgtw %%mm5, %%mm7 \n\t" /* pa > pb? */
c6b71bff 3862 "movq %%mm7, %%mm0 \n\t"
7f88f624 3863 /* use mm7 mask to merge pa & pb */
c6b71bff 3864 "pand %%mm7, %%mm5 \n\t"
7f88f624 3865 /* use mm0 mask copy to merge a & b */
c6b71bff
GD
3866 "pand %%mm0, %%mm2 \n\t"
3867 "pandn %%mm4, %%mm7 \n\t"
3868 "pandn %%mm1, %%mm0 \n\t"
3869 "paddw %%mm5, %%mm7 \n\t"
3870 "paddw %%mm2, %%mm0 \n\t"
7f88f624
VZ
3871 /* test ((pa <= pb)? pa:pb) <= pc */
3872 "pcmpgtw %%mm6, %%mm7 \n\t" /* pab > pc? */
c6b71bff
GD
3873 "pxor %%mm1, %%mm1 \n\t"
3874 "pand %%mm7, %%mm3 \n\t"
3875 "pandn %%mm0, %%mm7 \n\t"
3876 "paddw %%mm3, %%mm7 \n\t"
3877 "pxor %%mm0, %%mm0 \n\t"
3878 "packuswb %%mm1, %%mm7 \n\t"
7f88f624 3879 "movq -8(%%esi,%%ecx,), %%mm3 \n\t" /* load c=Prior(x-bpp) */
c6b71bff
GD
3880 "pand _ActiveMask, %%mm7 \n\t"
3881 "psrlq _ShiftRem, %%mm3 \n\t"
7f88f624
VZ
3882 "movq (%%esi,%%ecx,), %%mm2 \n\t" /* load b=Prior(x) step 1 */
3883 "paddb (%%edi,%%ecx,), %%mm7 \n\t" /* add Paeth predictor and Raw(x) */
c6b71bff 3884 "movq %%mm2, %%mm6 \n\t"
7f88f624 3885 "movq %%mm7, (%%edi,%%ecx,) \n\t" /* write back updated value */
c6b71bff
GD
3886 "movq -8(%%edi,%%ecx,), %%mm1 \n\t"
3887 "psllq _ShiftBpp, %%mm6 \n\t"
3888 "movq %%mm7, %%mm5 \n\t"
3889 "psrlq _ShiftRem, %%mm1 \n\t"
3890 "por %%mm6, %%mm3 \n\t"
3891 "psllq _ShiftBpp, %%mm5 \n\t"
7f88f624 3892 "punpckhbw %%mm0, %%mm3 \n\t" /* unpack High bytes of c */
c6b71bff 3893 "por %%mm5, %%mm1 \n\t"
7f88f624
VZ
3894 /* do second set of 4 bytes */
3895 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack High bytes of b */
3896 "punpckhbw %%mm0, %%mm1 \n\t" /* unpack High bytes of a */
3897 /* pav = p - a = (a + b - c) - a = b - c */
c6b71bff 3898 "movq %%mm2, %%mm4 \n\t"
7f88f624 3899 /* pbv = p - b = (a + b - c) - b = a - c */
c6b71bff
GD
3900 "movq %%mm1, %%mm5 \n\t"
3901 "psubw %%mm3, %%mm4 \n\t"
3902 "pxor %%mm7, %%mm7 \n\t"
7f88f624 3903 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
c6b71bff
GD
3904 "movq %%mm4, %%mm6 \n\t"
3905 "psubw %%mm3, %%mm5 \n\t"
7f88f624
VZ
3906 /* pa = abs(p-a) = abs(pav) */
3907 /* pb = abs(p-b) = abs(pbv) */
3908 /* pc = abs(p-c) = abs(pcv) */
3909 "pcmpgtw %%mm4, %%mm0 \n\t" /* create mask pav bytes < 0 */
c6b71bff 3910 "paddw %%mm5, %%mm6 \n\t"
7f88f624
VZ
3911 "pand %%mm4, %%mm0 \n\t" /* only pav bytes < 0 in mm7 */
3912 "pcmpgtw %%mm5, %%mm7 \n\t" /* create mask pbv bytes < 0 */
c6b71bff 3913 "psubw %%mm0, %%mm4 \n\t"
7f88f624 3914 "pand %%mm5, %%mm7 \n\t" /* only pbv bytes < 0 in mm0 */
c6b71bff
GD
3915 "psubw %%mm0, %%mm4 \n\t"
3916 "psubw %%mm7, %%mm5 \n\t"
3917 "pxor %%mm0, %%mm0 \n\t"
7f88f624
VZ
3918 "pcmpgtw %%mm6, %%mm0 \n\t" /* create mask pcv bytes < 0 */
3919 "pand %%mm6, %%mm0 \n\t" /* only pav bytes < 0 in mm7 */
c6b71bff
GD
3920 "psubw %%mm7, %%mm5 \n\t"
3921 "psubw %%mm0, %%mm6 \n\t"
7f88f624 3922 /* test pa <= pb */
c6b71bff
GD
3923 "movq %%mm4, %%mm7 \n\t"
3924 "psubw %%mm0, %%mm6 \n\t"
7f88f624 3925 "pcmpgtw %%mm5, %%mm7 \n\t" /* pa > pb? */
c6b71bff 3926 "movq %%mm7, %%mm0 \n\t"
7f88f624 3927 /* use mm7 mask to merge pa & pb */
c6b71bff 3928 "pand %%mm7, %%mm5 \n\t"
7f88f624 3929 /* use mm0 mask copy to merge a & b */
c6b71bff
GD
3930 "pand %%mm0, %%mm2 \n\t"
3931 "pandn %%mm4, %%mm7 \n\t"
3932 "pandn %%mm1, %%mm0 \n\t"
3933 "paddw %%mm5, %%mm7 \n\t"
3934 "paddw %%mm2, %%mm0 \n\t"
7f88f624
VZ
3935 /* test ((pa <= pb)? pa:pb) <= pc */
3936 "pcmpgtw %%mm6, %%mm7 \n\t" /* pab > pc? */
c6b71bff
GD
3937 "pxor %%mm1, %%mm1 \n\t"
3938 "pand %%mm7, %%mm3 \n\t"
3939 "pandn %%mm0, %%mm7 \n\t"
3940 "pxor %%mm1, %%mm1 \n\t"
3941 "paddw %%mm3, %%mm7 \n\t"
3942 "pxor %%mm0, %%mm0 \n\t"
7f88f624 3943 /* step ecx to next set of 8 bytes and repeat loop til done */
c6b71bff
GD
3944 "addl $8, %%ecx \n\t"
3945 "packuswb %%mm7, %%mm1 \n\t"
7f88f624 3946 "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" /* add Paeth predictor with Raw(x) */
c6b71bff 3947 "cmpl _MMXLength, %%ecx \n\t"
7f88f624
VZ
3948 "movq %%mm1, -8(%%edi,%%ecx,) \n\t" /* write back updated value */
3949 /* mm1 will be used as Raw(x-bpp) next loop */
c6b71bff
GD
3950 "jb paeth_6lp \n\t"
3951
7f88f624 3952 : "=S" (dummy_value_S), /* output regs (dummy) */
c6b71bff
GD
3953 "=D" (dummy_value_D)
3954
7f88f624
VZ
3955 : "0" (prev_row), /* esi // input regs */
3956 "1" (row) /* edi */
c6b71bff 3957
7f88f624 3958 : "%ecx" /* clobber list */
c6b71bff
GD
3959#if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
3960 , "%mm0", "%mm1", "%mm2", "%mm3"
3961 , "%mm4", "%mm5", "%mm6", "%mm7"
3962#endif
3963 );
3964 }
7f88f624 3965 break; /* end 6 bpp */
c6b71bff
GD
3966
3967 case 4:
3968 {
3969 _ActiveMask.use = 0x00000000ffffffffLL;
3970
3971 __asm__ __volatile__ (
3972 "movl _dif, %%ecx \n\t"
7f88f624
VZ
3973/* preload "movl row, %%edi \n\t" */
3974/* preload "movl prev_row, %%esi \n\t" */
c6b71bff 3975 "pxor %%mm0, %%mm0 \n\t"
7f88f624
VZ
3976 /* prime the pump: load the first Raw(x-bpp) data set */
3977 "movq -8(%%edi,%%ecx,), %%mm1 \n\t" /* only time should need to read */
3978 /* a=Raw(x-bpp) bytes */
c6b71bff 3979 "paeth_4lp: \n\t"
7f88f624
VZ
3980 /* do first set of 4 bytes */
3981 "movq -8(%%esi,%%ecx,), %%mm3 \n\t" /* read c=Prior(x-bpp) bytes */
3982 "punpckhbw %%mm0, %%mm1 \n\t" /* unpack Low bytes of a */
3983 "movq (%%esi,%%ecx,), %%mm2 \n\t" /* load b=Prior(x) */
3984 "punpcklbw %%mm0, %%mm2 \n\t" /* unpack High bytes of b */
3985 /* pav = p - a = (a + b - c) - a = b - c */
c6b71bff 3986 "movq %%mm2, %%mm4 \n\t"
7f88f624
VZ
3987 "punpckhbw %%mm0, %%mm3 \n\t" /* unpack High bytes of c */
3988 /* pbv = p - b = (a + b - c) - b = a - c */
c6b71bff
GD
3989 "movq %%mm1, %%mm5 \n\t"
3990 "psubw %%mm3, %%mm4 \n\t"
3991 "pxor %%mm7, %%mm7 \n\t"
7f88f624 3992 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
c6b71bff
GD
3993 "movq %%mm4, %%mm6 \n\t"
3994 "psubw %%mm3, %%mm5 \n\t"
7f88f624
VZ
3995 /* pa = abs(p-a) = abs(pav) */
3996 /* pb = abs(p-b) = abs(pbv) */
3997 /* pc = abs(p-c) = abs(pcv) */
3998 "pcmpgtw %%mm4, %%mm0 \n\t" /* create mask pav bytes < 0 */
c6b71bff 3999 "paddw %%mm5, %%mm6 \n\t"
7f88f624
VZ
4000 "pand %%mm4, %%mm0 \n\t" /* only pav bytes < 0 in mm7 */
4001 "pcmpgtw %%mm5, %%mm7 \n\t" /* create mask pbv bytes < 0 */
c6b71bff 4002 "psubw %%mm0, %%mm4 \n\t"
7f88f624 4003 "pand %%mm5, %%mm7 \n\t" /* only pbv bytes < 0 in mm0 */
c6b71bff
GD
4004 "psubw %%mm0, %%mm4 \n\t"
4005 "psubw %%mm7, %%mm5 \n\t"
4006 "pxor %%mm0, %%mm0 \n\t"
7f88f624
VZ
4007 "pcmpgtw %%mm6, %%mm0 \n\t" /* create mask pcv bytes < 0 */
4008 "pand %%mm6, %%mm0 \n\t" /* only pav bytes < 0 in mm7 */
c6b71bff
GD
4009 "psubw %%mm7, %%mm5 \n\t"
4010 "psubw %%mm0, %%mm6 \n\t"
7f88f624 4011 /* test pa <= pb */
c6b71bff
GD
4012 "movq %%mm4, %%mm7 \n\t"
4013 "psubw %%mm0, %%mm6 \n\t"
7f88f624 4014 "pcmpgtw %%mm5, %%mm7 \n\t" /* pa > pb? */
c6b71bff 4015 "movq %%mm7, %%mm0 \n\t"
7f88f624 4016 /* use mm7 mask to merge pa & pb */
c6b71bff 4017 "pand %%mm7, %%mm5 \n\t"
7f88f624 4018 /* use mm0 mask copy to merge a & b */
c6b71bff
GD
4019 "pand %%mm0, %%mm2 \n\t"
4020 "pandn %%mm4, %%mm7 \n\t"
4021 "pandn %%mm1, %%mm0 \n\t"
4022 "paddw %%mm5, %%mm7 \n\t"
4023 "paddw %%mm2, %%mm0 \n\t"
7f88f624
VZ
4024 /* test ((pa <= pb)? pa:pb) <= pc */
4025 "pcmpgtw %%mm6, %%mm7 \n\t" /* pab > pc? */
c6b71bff
GD
4026 "pxor %%mm1, %%mm1 \n\t"
4027 "pand %%mm7, %%mm3 \n\t"
4028 "pandn %%mm0, %%mm7 \n\t"
4029 "paddw %%mm3, %%mm7 \n\t"
4030 "pxor %%mm0, %%mm0 \n\t"
4031 "packuswb %%mm1, %%mm7 \n\t"
7f88f624 4032 "movq (%%esi,%%ecx,), %%mm3 \n\t" /* load c=Prior(x-bpp) */
c6b71bff 4033 "pand _ActiveMask, %%mm7 \n\t"
7f88f624
VZ
4034 "movq %%mm3, %%mm2 \n\t" /* load b=Prior(x) step 1 */
4035 "paddb (%%edi,%%ecx,), %%mm7 \n\t" /* add Paeth predictor with Raw(x) */
4036 "punpcklbw %%mm0, %%mm3 \n\t" /* unpack High bytes of c */
4037 "movq %%mm7, (%%edi,%%ecx,) \n\t" /* write back updated value */
4038 "movq %%mm7, %%mm1 \n\t" /* now mm1 will be used as Raw(x-bpp) */
4039 /* do second set of 4 bytes */
4040 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack Low bytes of b */
4041 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack Low bytes of a */
4042 /* pav = p - a = (a + b - c) - a = b - c */
c6b71bff 4043 "movq %%mm2, %%mm4 \n\t"
7f88f624 4044 /* pbv = p - b = (a + b - c) - b = a - c */
c6b71bff
GD
4045 "movq %%mm1, %%mm5 \n\t"
4046 "psubw %%mm3, %%mm4 \n\t"
4047 "pxor %%mm7, %%mm7 \n\t"
7f88f624 4048 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
c6b71bff
GD
4049 "movq %%mm4, %%mm6 \n\t"
4050 "psubw %%mm3, %%mm5 \n\t"
7f88f624
VZ
4051 /* pa = abs(p-a) = abs(pav) */
4052 /* pb = abs(p-b) = abs(pbv) */
4053 /* pc = abs(p-c) = abs(pcv) */
4054 "pcmpgtw %%mm4, %%mm0 \n\t" /* create mask pav bytes < 0 */
c6b71bff 4055 "paddw %%mm5, %%mm6 \n\t"
7f88f624
VZ
4056 "pand %%mm4, %%mm0 \n\t" /* only pav bytes < 0 in mm7 */
4057 "pcmpgtw %%mm5, %%mm7 \n\t" /* create mask pbv bytes < 0 */
c6b71bff 4058 "psubw %%mm0, %%mm4 \n\t"
7f88f624 4059 "pand %%mm5, %%mm7 \n\t" /* only pbv bytes < 0 in mm0 */
c6b71bff
GD
4060 "psubw %%mm0, %%mm4 \n\t"
4061 "psubw %%mm7, %%mm5 \n\t"
4062 "pxor %%mm0, %%mm0 \n\t"
7f88f624
VZ
4063 "pcmpgtw %%mm6, %%mm0 \n\t" /* create mask pcv bytes < 0 */
4064 "pand %%mm6, %%mm0 \n\t" /* only pav bytes < 0 in mm7 */
c6b71bff
GD
4065 "psubw %%mm7, %%mm5 \n\t"
4066 "psubw %%mm0, %%mm6 \n\t"
7f88f624 4067 /* test pa <= pb */
c6b71bff
GD
4068 "movq %%mm4, %%mm7 \n\t"
4069 "psubw %%mm0, %%mm6 \n\t"
7f88f624 4070 "pcmpgtw %%mm5, %%mm7 \n\t" /* pa > pb? */
c6b71bff 4071 "movq %%mm7, %%mm0 \n\t"
7f88f624 4072 /* use mm7 mask to merge pa & pb */
c6b71bff 4073 "pand %%mm7, %%mm5 \n\t"
7f88f624 4074 /* use mm0 mask copy to merge a & b */
c6b71bff
GD
4075 "pand %%mm0, %%mm2 \n\t"
4076 "pandn %%mm4, %%mm7 \n\t"
4077 "pandn %%mm1, %%mm0 \n\t"
4078 "paddw %%mm5, %%mm7 \n\t"
4079 "paddw %%mm2, %%mm0 \n\t"
7f88f624
VZ
4080 /* test ((pa <= pb)? pa:pb) <= pc */
4081 "pcmpgtw %%mm6, %%mm7 \n\t" /* pab > pc? */
c6b71bff
GD
4082 "pxor %%mm1, %%mm1 \n\t"
4083 "pand %%mm7, %%mm3 \n\t"
4084 "pandn %%mm0, %%mm7 \n\t"
4085 "pxor %%mm1, %%mm1 \n\t"
4086 "paddw %%mm3, %%mm7 \n\t"
4087 "pxor %%mm0, %%mm0 \n\t"
7f88f624 4088 /* step ecx to next set of 8 bytes and repeat loop til done */
c6b71bff
GD
4089 "addl $8, %%ecx \n\t"
4090 "packuswb %%mm7, %%mm1 \n\t"
7f88f624 4091 "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" /* add predictor with Raw(x) */
c6b71bff 4092 "cmpl _MMXLength, %%ecx \n\t"
7f88f624
VZ
4093 "movq %%mm1, -8(%%edi,%%ecx,) \n\t" /* write back updated value */
4094 /* mm1 will be used as Raw(x-bpp) next loop */
c6b71bff
GD
4095 "jb paeth_4lp \n\t"
4096
7f88f624 4097 : "=S" (dummy_value_S), /* output regs (dummy) */
c6b71bff
GD
4098 "=D" (dummy_value_D)
4099
7f88f624
VZ
4100 : "0" (prev_row), /* esi // input regs */
4101 "1" (row) /* edi */
c6b71bff 4102
7f88f624 4103 : "%ecx" /* clobber list */
c6b71bff
GD
4104#if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
4105 , "%mm0", "%mm1", "%mm2", "%mm3"
4106 , "%mm4", "%mm5", "%mm6", "%mm7"
4107#endif
4108 );
4109 }
7f88f624 4110 break; /* end 4 bpp */
c6b71bff 4111
7f88f624 4112 case 8: /* bpp == 8 */
c6b71bff
GD
4113 {
4114 _ActiveMask.use = 0x00000000ffffffffLL;
4115
4116 __asm__ __volatile__ (
4117 "movl _dif, %%ecx \n\t"
7f88f624
VZ
4118/* preload "movl row, %%edi \n\t" */
4119/* preload "movl prev_row, %%esi \n\t" */
c6b71bff 4120 "pxor %%mm0, %%mm0 \n\t"
7f88f624
VZ
4121 /* prime the pump: load the first Raw(x-bpp) data set */
4122 "movq -8(%%edi,%%ecx,), %%mm1 \n\t" /* only time should need to read */
4123 /* a=Raw(x-bpp) bytes */
c6b71bff 4124 "paeth_8lp: \n\t"
7f88f624
VZ
4125 /* do first set of 4 bytes */
4126 "movq -8(%%esi,%%ecx,), %%mm3 \n\t" /* read c=Prior(x-bpp) bytes */
4127 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack Low bytes of a */
4128 "movq (%%esi,%%ecx,), %%mm2 \n\t" /* load b=Prior(x) */
4129 "punpcklbw %%mm0, %%mm2 \n\t" /* unpack Low bytes of b */
4130 /* pav = p - a = (a + b - c) - a = b - c */
c6b71bff 4131 "movq %%mm2, %%mm4 \n\t"
7f88f624
VZ
4132 "punpcklbw %%mm0, %%mm3 \n\t" /* unpack Low bytes of c */
4133 /* pbv = p - b = (a + b - c) - b = a - c */
c6b71bff
GD
4134 "movq %%mm1, %%mm5 \n\t"
4135 "psubw %%mm3, %%mm4 \n\t"
4136 "pxor %%mm7, %%mm7 \n\t"
7f88f624 4137 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
c6b71bff
GD
4138 "movq %%mm4, %%mm6 \n\t"
4139 "psubw %%mm3, %%mm5 \n\t"
7f88f624
VZ
4140 /* pa = abs(p-a) = abs(pav) */
4141 /* pb = abs(p-b) = abs(pbv) */
4142 /* pc = abs(p-c) = abs(pcv) */
4143 "pcmpgtw %%mm4, %%mm0 \n\t" /* create mask pav bytes < 0 */
c6b71bff 4144 "paddw %%mm5, %%mm6 \n\t"
7f88f624
VZ
4145 "pand %%mm4, %%mm0 \n\t" /* only pav bytes < 0 in mm7 */
4146 "pcmpgtw %%mm5, %%mm7 \n\t" /* create mask pbv bytes < 0 */
c6b71bff 4147 "psubw %%mm0, %%mm4 \n\t"
7f88f624 4148 "pand %%mm5, %%mm7 \n\t" /* only pbv bytes < 0 in mm0 */
c6b71bff
GD
4149 "psubw %%mm0, %%mm4 \n\t"
4150 "psubw %%mm7, %%mm5 \n\t"
4151 "pxor %%mm0, %%mm0 \n\t"
7f88f624
VZ
4152 "pcmpgtw %%mm6, %%mm0 \n\t" /* create mask pcv bytes < 0 */
4153 "pand %%mm6, %%mm0 \n\t" /* only pav bytes < 0 in mm7 */
c6b71bff
GD
4154 "psubw %%mm7, %%mm5 \n\t"
4155 "psubw %%mm0, %%mm6 \n\t"
7f88f624 4156 /* test pa <= pb */
c6b71bff
GD
4157 "movq %%mm4, %%mm7 \n\t"
4158 "psubw %%mm0, %%mm6 \n\t"
7f88f624 4159 "pcmpgtw %%mm5, %%mm7 \n\t" /* pa > pb? */
c6b71bff 4160 "movq %%mm7, %%mm0 \n\t"
7f88f624 4161 /* use mm7 mask to merge pa & pb */
c6b71bff 4162 "pand %%mm7, %%mm5 \n\t"
7f88f624 4163 /* use mm0 mask copy to merge a & b */
c6b71bff
GD
4164 "pand %%mm0, %%mm2 \n\t"
4165 "pandn %%mm4, %%mm7 \n\t"
4166 "pandn %%mm1, %%mm0 \n\t"
4167 "paddw %%mm5, %%mm7 \n\t"
4168 "paddw %%mm2, %%mm0 \n\t"
7f88f624
VZ
4169 /* test ((pa <= pb)? pa:pb) <= pc */
4170 "pcmpgtw %%mm6, %%mm7 \n\t" /* pab > pc? */
c6b71bff
GD
4171 "pxor %%mm1, %%mm1 \n\t"
4172 "pand %%mm7, %%mm3 \n\t"
4173 "pandn %%mm0, %%mm7 \n\t"
4174 "paddw %%mm3, %%mm7 \n\t"
4175 "pxor %%mm0, %%mm0 \n\t"
4176 "packuswb %%mm1, %%mm7 \n\t"
7f88f624 4177 "movq -8(%%esi,%%ecx,), %%mm3 \n\t" /* read c=Prior(x-bpp) bytes */
c6b71bff 4178 "pand _ActiveMask, %%mm7 \n\t"
7f88f624
VZ
4179 "movq (%%esi,%%ecx,), %%mm2 \n\t" /* load b=Prior(x) */
4180 "paddb (%%edi,%%ecx,), %%mm7 \n\t" /* add Paeth predictor with Raw(x) */
4181 "punpckhbw %%mm0, %%mm3 \n\t" /* unpack High bytes of c */
4182 "movq %%mm7, (%%edi,%%ecx,) \n\t" /* write back updated value */
4183 "movq -8(%%edi,%%ecx,), %%mm1 \n\t" /* read a=Raw(x-bpp) bytes */
4184
4185 /* do second set of 4 bytes */
4186 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack High bytes of b */
4187 "punpckhbw %%mm0, %%mm1 \n\t" /* unpack High bytes of a */
4188 /* pav = p - a = (a + b - c) - a = b - c */
c6b71bff 4189 "movq %%mm2, %%mm4 \n\t"
7f88f624 4190 /* pbv = p - b = (a + b - c) - b = a - c */
c6b71bff
GD
4191 "movq %%mm1, %%mm5 \n\t"
4192 "psubw %%mm3, %%mm4 \n\t"
4193 "pxor %%mm7, %%mm7 \n\t"
7f88f624 4194 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
c6b71bff
GD
4195 "movq %%mm4, %%mm6 \n\t"
4196 "psubw %%mm3, %%mm5 \n\t"
7f88f624
VZ
4197 /* pa = abs(p-a) = abs(pav) */
4198 /* pb = abs(p-b) = abs(pbv) */
4199 /* pc = abs(p-c) = abs(pcv) */
4200 "pcmpgtw %%mm4, %%mm0 \n\t" /* create mask pav bytes < 0 */
c6b71bff 4201 "paddw %%mm5, %%mm6 \n\t"
7f88f624
VZ
4202 "pand %%mm4, %%mm0 \n\t" /* only pav bytes < 0 in mm7 */
4203 "pcmpgtw %%mm5, %%mm7 \n\t" /* create mask pbv bytes < 0 */
c6b71bff 4204 "psubw %%mm0, %%mm4 \n\t"
7f88f624 4205 "pand %%mm5, %%mm7 \n\t" /* only pbv bytes < 0 in mm0 */
c6b71bff
GD
4206 "psubw %%mm0, %%mm4 \n\t"
4207 "psubw %%mm7, %%mm5 \n\t"
4208 "pxor %%mm0, %%mm0 \n\t"
7f88f624
VZ
4209 "pcmpgtw %%mm6, %%mm0 \n\t" /* create mask pcv bytes < 0 */
4210 "pand %%mm6, %%mm0 \n\t" /* only pav bytes < 0 in mm7 */
c6b71bff
GD
4211 "psubw %%mm7, %%mm5 \n\t"
4212 "psubw %%mm0, %%mm6 \n\t"
7f88f624 4213 /* test pa <= pb */
c6b71bff
GD
4214 "movq %%mm4, %%mm7 \n\t"
4215 "psubw %%mm0, %%mm6 \n\t"
7f88f624 4216 "pcmpgtw %%mm5, %%mm7 \n\t" /* pa > pb? */
c6b71bff 4217 "movq %%mm7, %%mm0 \n\t"
7f88f624 4218 /* use mm7 mask to merge pa & pb */
c6b71bff 4219 "pand %%mm7, %%mm5 \n\t"
7f88f624 4220 /* use mm0 mask copy to merge a & b */
c6b71bff
GD
4221 "pand %%mm0, %%mm2 \n\t"
4222 "pandn %%mm4, %%mm7 \n\t"
4223 "pandn %%mm1, %%mm0 \n\t"
4224 "paddw %%mm5, %%mm7 \n\t"
4225 "paddw %%mm2, %%mm0 \n\t"
7f88f624
VZ
4226 /* test ((pa <= pb)? pa:pb) <= pc */
4227 "pcmpgtw %%mm6, %%mm7 \n\t" /* pab > pc? */
c6b71bff
GD
4228 "pxor %%mm1, %%mm1 \n\t"
4229 "pand %%mm7, %%mm3 \n\t"
4230 "pandn %%mm0, %%mm7 \n\t"
4231 "pxor %%mm1, %%mm1 \n\t"
4232 "paddw %%mm3, %%mm7 \n\t"
4233 "pxor %%mm0, %%mm0 \n\t"
7f88f624 4234 /* step ecx to next set of 8 bytes and repeat loop til done */
c6b71bff
GD
4235 "addl $8, %%ecx \n\t"
4236 "packuswb %%mm7, %%mm1 \n\t"
7f88f624 4237 "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" /* add Paeth predictor with Raw(x) */
c6b71bff 4238 "cmpl _MMXLength, %%ecx \n\t"
7f88f624
VZ
4239 "movq %%mm1, -8(%%edi,%%ecx,) \n\t" /* write back updated value */
4240 /* mm1 will be used as Raw(x-bpp) next loop */
c6b71bff
GD
4241 "jb paeth_8lp \n\t"
4242
7f88f624 4243 : "=S" (dummy_value_S), /* output regs (dummy) */
c6b71bff
GD
4244 "=D" (dummy_value_D)
4245
7f88f624
VZ
4246 : "0" (prev_row), /* esi // input regs */
4247 "1" (row) /* edi */
c6b71bff 4248
7f88f624 4249 : "%ecx" /* clobber list */
c6b71bff
GD
4250#if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
4251 , "%mm0", "%mm1", "%mm2", "%mm3"
4252 , "%mm4", "%mm5", "%mm6", "%mm7"
4253#endif
4254 );
4255 }
7f88f624 4256 break; /* end 8 bpp */
c6b71bff 4257
7f88f624
VZ
4258 case 1: /* bpp = 1 */
4259 case 2: /* bpp = 2 */
4260 default: /* bpp > 8 */
c6b71bff
GD
4261 {
4262 __asm__ __volatile__ (
4263#ifdef __PIC__
7f88f624 4264 "pushl %%ebx \n\t" /* save Global Offset Table index */
c6b71bff
GD
4265#endif
4266 "movl _dif, %%ebx \n\t"
4267 "cmpl _FullLength, %%ebx \n\t"
4268 "jnb paeth_dend \n\t"
4269
7f88f624
VZ
4270/* preload "movl row, %%edi \n\t" */
4271/* preload "movl prev_row, %%esi \n\t" */
4272 /* do Paeth decode for remaining bytes */
c6b71bff 4273 "movl %%ebx, %%edx \n\t"
7f88f624
VZ
4274/* preload "subl bpp, %%edx \n\t" */ /* (bpp is preloaded into ecx) */
4275 "subl %%ecx, %%edx \n\t" /* edx = ebx - bpp */
4276 "xorl %%ecx, %%ecx \n\t" /* zero ecx before using cl & cx */
c6b71bff
GD
4277
4278 "paeth_dlp: \n\t"
4279 "xorl %%eax, %%eax \n\t"
7f88f624
VZ
4280 /* pav = p - a = (a + b - c) - a = b - c */
4281 "movb (%%esi,%%ebx,), %%al \n\t" /* load Prior(x) into al */
4282 "movb (%%esi,%%edx,), %%cl \n\t" /* load Prior(x-bpp) into cl */
4283 "subl %%ecx, %%eax \n\t" /* subtract Prior(x-bpp) */
4284 "movl %%eax, _patemp \n\t" /* Save pav for later use */
c6b71bff 4285 "xorl %%eax, %%eax \n\t"
7f88f624
VZ
4286 /* pbv = p - b = (a + b - c) - b = a - c */
4287 "movb (%%edi,%%edx,), %%al \n\t" /* load Raw(x-bpp) into al */
4288 "subl %%ecx, %%eax \n\t" /* subtract Prior(x-bpp) */
c6b71bff 4289 "movl %%eax, %%ecx \n\t"
7f88f624
VZ
4290 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
4291 "addl _patemp, %%eax \n\t" /* pcv = pav + pbv */
4292 /* pc = abs(pcv) */
c6b71bff
GD
4293 "testl $0x80000000, %%eax \n\t"
4294 "jz paeth_dpca \n\t"
7f88f624 4295 "negl %%eax \n\t" /* reverse sign of neg values */
c6b71bff
GD
4296
4297 "paeth_dpca: \n\t"
7f88f624
VZ
4298 "movl %%eax, _pctemp \n\t" /* save pc for later use */
4299 /* pb = abs(pbv) */
c6b71bff
GD
4300 "testl $0x80000000, %%ecx \n\t"
4301 "jz paeth_dpba \n\t"
7f88f624 4302 "negl %%ecx \n\t" /* reverse sign of neg values */
c6b71bff
GD
4303
4304 "paeth_dpba: \n\t"
7f88f624
VZ
4305 "movl %%ecx, _pbtemp \n\t" /* save pb for later use */
4306 /* pa = abs(pav) */
c6b71bff
GD
4307 "movl _patemp, %%eax \n\t"
4308 "testl $0x80000000, %%eax \n\t"
4309 "jz paeth_dpaa \n\t"
7f88f624 4310 "negl %%eax \n\t" /* reverse sign of neg values */
c6b71bff
GD
4311
4312 "paeth_dpaa: \n\t"
7f88f624
VZ
4313 "movl %%eax, _patemp \n\t" /* save pa for later use */
4314 /* test if pa <= pb */
c6b71bff
GD
4315 "cmpl %%ecx, %%eax \n\t"
4316 "jna paeth_dabb \n\t"
7f88f624 4317 /* pa > pb; now test if pb <= pc */
c6b71bff
GD
4318 "cmpl _pctemp, %%ecx \n\t"
4319 "jna paeth_dbbc \n\t"
7f88f624
VZ
4320 /* pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp) */
4321 "movb (%%esi,%%edx,), %%cl \n\t" /* load Prior(x-bpp) into cl */
c6b71bff
GD
4322 "jmp paeth_dpaeth \n\t"
4323
4324 "paeth_dbbc: \n\t"
7f88f624
VZ
4325 /* pb <= pc; Raw(x) = Paeth(x) + Prior(x) */
4326 "movb (%%esi,%%ebx,), %%cl \n\t" /* load Prior(x) into cl */
c6b71bff
GD
4327 "jmp paeth_dpaeth \n\t"
4328
4329 "paeth_dabb: \n\t"
7f88f624 4330 /* pa <= pb; now test if pa <= pc */
c6b71bff
GD
4331 "cmpl _pctemp, %%eax \n\t"
4332 "jna paeth_dabc \n\t"
7f88f624
VZ
4333 /* pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp) */
4334 "movb (%%esi,%%edx,), %%cl \n\t" /* load Prior(x-bpp) into cl */
c6b71bff
GD
4335 "jmp paeth_dpaeth \n\t"
4336
4337 "paeth_dabc: \n\t"
7f88f624
VZ
4338 /* pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp) */
4339 "movb (%%edi,%%edx,), %%cl \n\t" /* load Raw(x-bpp) into cl */
c6b71bff
GD
4340
4341 "paeth_dpaeth: \n\t"
4342 "incl %%ebx \n\t"
4343 "incl %%edx \n\t"
7f88f624 4344 /* Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256 */
c6b71bff
GD
4345 "addb %%cl, -1(%%edi,%%ebx,) \n\t"
4346 "cmpl _FullLength, %%ebx \n\t"
4347 "jb paeth_dlp \n\t"
4348
4349 "paeth_dend: \n\t"
4350#ifdef __PIC__
7f88f624 4351 "popl %%ebx \n\t" /* index to Global Offset Table */
c6b71bff
GD
4352#endif
4353
7f88f624 4354 : "=c" (dummy_value_c), /* output regs (dummy) */
c6b71bff
GD
4355 "=S" (dummy_value_S),
4356 "=D" (dummy_value_D)
4357
7f88f624
VZ
4358 : "0" (bpp), /* ecx // input regs */
4359 "1" (prev_row), /* esi */
4360 "2" (row) /* edi */
c6b71bff 4361
7f88f624 4362 : "%eax", "%edx" /* clobber list */
c6b71bff
GD
4363#ifndef __PIC__
4364 , "%ebx"
4365#endif
4366 );
4367 }
7f88f624 4368 return; /* No need to go further with this one */
c6b71bff 4369
7f88f624 4370 } /* end switch (bpp) */
c6b71bff
GD
4371
4372 __asm__ __volatile__ (
7f88f624
VZ
4373 /* MMX acceleration complete; now do clean-up */
4374 /* check if any remaining bytes left to decode */
c6b71bff 4375#ifdef __PIC__
7f88f624 4376 "pushl %%ebx \n\t" /* save index to Global Offset Table */
c6b71bff
GD
4377#endif
4378 "movl _MMXLength, %%ebx \n\t"
4379 "cmpl _FullLength, %%ebx \n\t"
4380 "jnb paeth_end \n\t"
7f88f624
VZ
4381/*pre "movl row, %%edi \n\t" */
4382/*pre "movl prev_row, %%esi \n\t" */
4383 /* do Paeth decode for remaining bytes */
c6b71bff 4384 "movl %%ebx, %%edx \n\t"
7f88f624
VZ
4385/*pre "subl bpp, %%edx \n\t" */ /* (bpp is preloaded into ecx) */
4386 "subl %%ecx, %%edx \n\t" /* edx = ebx - bpp */
4387 "xorl %%ecx, %%ecx \n\t" /* zero ecx before using cl & cx below */
c6b71bff
GD
4388
4389 "paeth_lp2: \n\t"
4390 "xorl %%eax, %%eax \n\t"
7f88f624
VZ
4391 /* pav = p - a = (a + b - c) - a = b - c */
4392 "movb (%%esi,%%ebx,), %%al \n\t" /* load Prior(x) into al */
4393 "movb (%%esi,%%edx,), %%cl \n\t" /* load Prior(x-bpp) into cl */
4394 "subl %%ecx, %%eax \n\t" /* subtract Prior(x-bpp) */
4395 "movl %%eax, _patemp \n\t" /* Save pav for later use */
c6b71bff 4396 "xorl %%eax, %%eax \n\t"
7f88f624
VZ
4397 /* pbv = p - b = (a + b - c) - b = a - c */
4398 "movb (%%edi,%%edx,), %%al \n\t" /* load Raw(x-bpp) into al */
4399 "subl %%ecx, %%eax \n\t" /* subtract Prior(x-bpp) */
c6b71bff 4400 "movl %%eax, %%ecx \n\t"
7f88f624
VZ
4401 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
4402 "addl _patemp, %%eax \n\t" /* pcv = pav + pbv */
4403 /* pc = abs(pcv) */
c6b71bff
GD
4404 "testl $0x80000000, %%eax \n\t"
4405 "jz paeth_pca2 \n\t"
7f88f624 4406 "negl %%eax \n\t" /* reverse sign of neg values */
c6b71bff
GD
4407
4408 "paeth_pca2: \n\t"
7f88f624
VZ
4409 "movl %%eax, _pctemp \n\t" /* save pc for later use */
4410 /* pb = abs(pbv) */
c6b71bff
GD
4411 "testl $0x80000000, %%ecx \n\t"
4412 "jz paeth_pba2 \n\t"
7f88f624 4413 "negl %%ecx \n\t" /* reverse sign of neg values */
c6b71bff
GD
4414
4415 "paeth_pba2: \n\t"
7f88f624
VZ
4416 "movl %%ecx, _pbtemp \n\t" /* save pb for later use */
4417 /* pa = abs(pav) */
c6b71bff
GD
4418 "movl _patemp, %%eax \n\t"
4419 "testl $0x80000000, %%eax \n\t"
4420 "jz paeth_paa2 \n\t"
7f88f624 4421 "negl %%eax \n\t" /* reverse sign of neg values */
c6b71bff
GD
4422
4423 "paeth_paa2: \n\t"
7f88f624
VZ
4424 "movl %%eax, _patemp \n\t" /* save pa for later use */
4425 /* test if pa <= pb */
c6b71bff
GD
4426 "cmpl %%ecx, %%eax \n\t"
4427 "jna paeth_abb2 \n\t"
7f88f624 4428 /* pa > pb; now test if pb <= pc */
c6b71bff
GD
4429 "cmpl _pctemp, %%ecx \n\t"
4430 "jna paeth_bbc2 \n\t"
7f88f624
VZ
4431 /* pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp) */
4432 "movb (%%esi,%%edx,), %%cl \n\t" /* load Prior(x-bpp) into cl */
c6b71bff
GD
4433 "jmp paeth_paeth2 \n\t"
4434
4435 "paeth_bbc2: \n\t"
7f88f624
VZ
4436 /* pb <= pc; Raw(x) = Paeth(x) + Prior(x) */
4437 "movb (%%esi,%%ebx,), %%cl \n\t" /* load Prior(x) into cl */
c6b71bff
GD
4438 "jmp paeth_paeth2 \n\t"
4439
4440 "paeth_abb2: \n\t"
7f88f624 4441 /* pa <= pb; now test if pa <= pc */
c6b71bff
GD
4442 "cmpl _pctemp, %%eax \n\t"
4443 "jna paeth_abc2 \n\t"
7f88f624
VZ
4444 /* pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp) */
4445 "movb (%%esi,%%edx,), %%cl \n\t" /* load Prior(x-bpp) into cl */
c6b71bff
GD
4446 "jmp paeth_paeth2 \n\t"
4447
4448 "paeth_abc2: \n\t"
7f88f624
VZ
4449 /* pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp) */
4450 "movb (%%edi,%%edx,), %%cl \n\t" /* load Raw(x-bpp) into cl */
c6b71bff
GD
4451
4452 "paeth_paeth2: \n\t"
4453 "incl %%ebx \n\t"
4454 "incl %%edx \n\t"
7f88f624 4455 /* Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256 */
c6b71bff
GD
4456 "addb %%cl, -1(%%edi,%%ebx,) \n\t"
4457 "cmpl _FullLength, %%ebx \n\t"
4458 "jb paeth_lp2 \n\t"
4459
4460 "paeth_end: \n\t"
7f88f624 4461 "EMMS \n\t" /* end MMX; prep for poss. FP instrs. */
c6b71bff 4462#ifdef __PIC__
7f88f624 4463 "popl %%ebx \n\t" /* restore index to Global Offset Table */
c6b71bff
GD
4464#endif
4465
7f88f624 4466 : "=c" (dummy_value_c), /* output regs (dummy) */
c6b71bff
GD
4467 "=S" (dummy_value_S),
4468 "=D" (dummy_value_D)
4469
7f88f624
VZ
4470 : "0" (bpp), /* ecx // input regs */
4471 "1" (prev_row), /* esi */
4472 "2" (row) /* edi */
c6b71bff 4473
7f88f624 4474 : "%eax", "%edx" /* clobber list (no input regs!) */
c6b71bff
GD
4475#ifndef __PIC__
4476 , "%ebx"
4477#endif
4478 );
4479
4480} /* end png_read_filter_row_mmx_paeth() */
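/* For reference, the scalar Paeth predictor that both the MMX cases and
 * the cleanup loops above implement; names are hypothetical (see the PNG
 * specification).  Illustrative only, never compiled:
 */
#if 0
static int
paeth_predictor_sketch(int a, int b, int c)  /* left, above, upper-left */
{
   int p  = a + b - c;
   int pa = (p > a)? p - a : a - p;   /* abs(p - a) */
   int pb = (p > b)? p - b : b - p;   /* abs(p - b) */
   int pc = (p > c)? p - c : c - p;   /* abs(p - c) */

   if (pa <= pb && pa <= pc)
      return a;
   return (pb <= pc)? b : c;
}
#endif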
4481#endif
4482
4483
4484
4485
4486#ifdef PNG_THREAD_UNSAFE_OK
7f88f624
VZ
4487/*===========================================================================*/
4488/* */
4489/* P N G _ R E A D _ F I L T E R _ R O W _ M M X _ S U B */
4490/* */
4491/*===========================================================================*/
c6b71bff 4492
7f88f624 4493/* Optimized code for PNG Sub filter decoder */
c6b71bff
GD
4494
4495static void /* PRIVATE */
4496png_read_filter_row_mmx_sub(png_row_infop row_info, png_bytep row)
4497{
4498 int bpp;
4499 int dummy_value_a;
4500 int dummy_value_D;
4501
7f88f624
VZ
4502 bpp = (row_info->pixel_depth + 7) >> 3; /* calc number of bytes per pixel */
4503 _FullLength = row_info->rowbytes - bpp; /* number of bytes to filter */
c6b71bff
GD
4504
4505 __asm__ __volatile__ (
7f88f624
VZ
4506/*pre "movl row, %%edi \n\t" */
4507 "movl %%edi, %%esi \n\t" /* lp = row */
4508/*pre "movl bpp, %%eax \n\t" */
4509 "addl %%eax, %%edi \n\t" /* rp = row + bpp */
4510/*irr "xorl %%eax, %%eax \n\t" */
4511 /* get # of bytes to alignment */
4512 "movl %%edi, _dif \n\t" /* take start of row */
4513 "addl $0xf, _dif \n\t" /* add 7 + 8 to incr past */
4514 /* alignment boundary */
c6b71bff 4515 "xorl %%ecx, %%ecx \n\t"
7f88f624
VZ
4516 "andl $0xfffffff8, _dif \n\t" /* mask to alignment boundary */
4517 "subl %%edi, _dif \n\t" /* subtract from start ==> value */
4518 "jz sub_go \n\t" /* ecx at alignment */
c6b71bff 4519
7f88f624 4520 "sub_lp1: \n\t" /* fix alignment */
c6b71bff
GD
4521 "movb (%%esi,%%ecx,), %%al \n\t"
4522 "addb %%al, (%%edi,%%ecx,) \n\t"
4523 "incl %%ecx \n\t"
4524 "cmpl _dif, %%ecx \n\t"
4525 "jb sub_lp1 \n\t"
4526
4527 "sub_go: \n\t"
4528 "movl _FullLength, %%eax \n\t"
4529 "movl %%eax, %%edx \n\t"
7f88f624
VZ
4530 "subl %%ecx, %%edx \n\t" /* subtract alignment fix */
4531 "andl $0x00000007, %%edx \n\t" /* calc bytes over mult of 8 */
4532 "subl %%edx, %%eax \n\t" /* drop over bytes from length */
c6b71bff
GD
4533 "movl %%eax, _MMXLength \n\t"
4534
7f88f624
VZ
4535 : "=a" (dummy_value_a), /* 0 // output regs (dummy) */
4536 "=D" (dummy_value_D) /* 1 */
c6b71bff 4537
7f88f624
VZ
4538 : "0" (bpp), /* eax // input regs */
4539 "1" (row) /* edi */
c6b71bff 4540
5b02c8a1 4541 : "%esi", "%ecx", "%edx" /* clobber list */
c6b71bff
GD
4542
4543#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4544 , "%mm0", "%mm1", "%mm2", "%mm3"
4545 , "%mm4", "%mm5", "%mm6", "%mm7"
4546#endif
4547 );
4548
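   /* In C terms, the setup block above computes the following (sketch
    * only; the pointer-to-integer arithmetic is glossed over and the
    * block is never compiled):
    */
#if 0
   {
      unsigned int start = (unsigned int)(row + bpp); /* first filtered byte */

      _dif = ((start + 15) & ~7U) - start;    /* bytes to 8-byte boundary */
      _MMXLength = _FullLength - ((_FullLength - _dif) & 7);
   }
#endif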
7f88f624 4549 /* now do the math for the rest of the row */
c6b71bff
GD
4550 switch (bpp)
4551 {
4552 case 3:
4553 {
4554 _ActiveMask.use = 0x0000ffffff000000LL;
7f88f624
VZ
4555 _ShiftBpp.use = 24; /* == 3 * 8 */
4556 _ShiftRem.use = 40; /* == 64 - 24 */
c6b71bff
GD
4557
4558 __asm__ __volatile__ (
7f88f624
VZ
4559/* preload "movl row, %%edi \n\t" */
4560 "movq _ActiveMask, %%mm7 \n\t" /* load _ActiveMask for 2nd */
4561 /* active byte group */
4562 "movl %%edi, %%esi \n\t" /* lp = row */
4563/* preload "movl bpp, %%eax \n\t" */
4564 "addl %%eax, %%edi \n\t" /* rp = row + bpp */
c6b71bff
GD
4565 "movq %%mm7, %%mm6 \n\t"
4566 "movl _dif, %%edx \n\t"
7f88f624
VZ
4567 "psllq _ShiftBpp, %%mm6 \n\t" /* move mask in mm6 to cover */
4568 /* 3rd active byte group */
4569 /* prime the pump: load the first Raw(x-bpp) data set */
c6b71bff
GD
4570 "movq -8(%%edi,%%edx,), %%mm1 \n\t"
4571
7f88f624
VZ
4572 "sub_3lp: \n\t" /* shift data for adding first */
4573 "psrlq _ShiftRem, %%mm1 \n\t" /* bpp bytes (no need for mask; */
4574 /* shift clears inactive bytes) */
4575 /* add 1st active group */
c6b71bff
GD
4576 "movq (%%edi,%%edx,), %%mm0 \n\t"
4577 "paddb %%mm1, %%mm0 \n\t"
4578
7f88f624
VZ
4579 /* add 2nd active group */
4580 "movq %%mm0, %%mm1 \n\t" /* mov updated Raws to mm1 */
4581 "psllq _ShiftBpp, %%mm1 \n\t" /* shift data to pos. correctly */
4582 "pand %%mm7, %%mm1 \n\t" /* mask to use 2nd active group */
c6b71bff
GD
4583 "paddb %%mm1, %%mm0 \n\t"
4584
7f88f624
VZ
4585 /* add 3rd active group */
4586 "movq %%mm0, %%mm1 \n\t" /* mov updated Raws to mm1 */
4587 "psllq _ShiftBpp, %%mm1 \n\t" /* shift data to pos. correctly */
4588 "pand %%mm6, %%mm1 \n\t" /* mask to use 3rd active group */
c6b71bff
GD
4589 "addl $8, %%edx \n\t"
4590 "paddb %%mm1, %%mm0 \n\t"
4591
4592 "cmpl _MMXLength, %%edx \n\t"
7f88f624
VZ
4593 "movq %%mm0, -8(%%edi,%%edx,) \n\t" /* write updated Raws to array */
4594 "movq %%mm0, %%mm1 \n\t" /* prep 1st add at top of loop */
c6b71bff
GD
4595 "jb sub_3lp \n\t"
4596
7f88f624
VZ
4597 : "=a" (dummy_value_a), /* 0 // output regs (dummy) */
4598 "=D" (dummy_value_D) /* 1 */
c6b71bff 4599
7f88f624
VZ
4600 : "0" (bpp), /* eax // input regs */
4601 "1" (row) /* edi */
c6b71bff 4602
7f88f624 4603 : "%edx", "%esi" /* clobber list */
c6b71bff
GD
4604#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4605 , "%mm0", "%mm1", "%mm6", "%mm7"
4606#endif
4607 );
4608 }
4609 break;
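      /* Within each 8-byte block the three masked adds above build the
       * running sums Raw(x) = Sub(x) + Raw(x-3) for bpp == 3: the block
       * is added to shifted copies of itself, one 3-byte "active group"
       * at a time.  Scalar sketch for one block, where block[] and
       * prev_tail[] (the last 3 raw bytes of the previous block) are
       * hypothetical (illustrative only, never compiled):
       */
#if 0
      {
         int i;

         for (i = 0; i < 8; i++)
            block[i] = (unsigned char)(block[i] +
               ((i < 3)? prev_tail[i] : block[i - 3]));
      }
#endif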
4610
4611 case 1:
4612 {
4613 __asm__ __volatile__ (
4614 "movl _dif, %%edx \n\t"
7f88f624 4615/* preload "movl row, %%edi \n\t" */
c6b71bff
GD
4616 "cmpl _FullLength, %%edx \n\t"
4617 "jnb sub_1end \n\t"
7f88f624 4618 "movl %%edi, %%esi \n\t" /* lp = row */
c6b71bff 4619 "xorl %%eax, %%eax \n\t"
7f88f624
VZ
4620/* preload "movl bpp, %%eax \n\t" */
4621 "addl %%eax, %%edi \n\t" /* rp = row + bpp */
c6b71bff
GD
4622
4623 "sub_1lp: \n\t"
4624 "movb (%%esi,%%edx,), %%al \n\t"
4625 "addb %%al, (%%edi,%%edx,) \n\t"
4626 "incl %%edx \n\t"
4627 "cmpl _FullLength, %%edx \n\t"
4628 "jb sub_1lp \n\t"
4629
4630 "sub_1end: \n\t"
4631
7f88f624
VZ
4632 : "=a" (dummy_value_a), /* 0 // output regs (dummy) */
4633 "=D" (dummy_value_D) /* 1 */
c6b71bff 4634
7f88f624
VZ
4635 : "0" (bpp), /* eax // input regs */
4636 "1" (row) /* edi */
c6b71bff 4637
7f88f624 4638 : "%edx", "%esi" /* clobber list */
c6b71bff
GD
4639 );
4640 }
4641 return;
4642
4643 case 6:
4644 case 4:
7f88f624
VZ
4645 /*case 7:*/ /* GRR BOGUS */
4646 /*case 5:*/ /* GRR BOGUS */
c6b71bff
GD
4647 {
4648 _ShiftBpp.use = bpp << 3;
4649 _ShiftRem.use = 64 - _ShiftBpp.use;
4650
4651 __asm__ __volatile__ (
7f88f624 4652/* preload "movl row, %%edi \n\t" */
c6b71bff 4653 "movl _dif, %%edx \n\t"
7f88f624
VZ
4654 "movl %%edi, %%esi \n\t" /* lp = row */
4655/* preload "movl bpp, %%eax \n\t" */
4656 "addl %%eax, %%edi \n\t" /* rp = row + bpp */
c6b71bff 4657
7f88f624 4658 /* prime the pump: load the first Raw(x-bpp) data set */
c6b71bff
GD
4659 "movq -8(%%edi,%%edx,), %%mm1 \n\t"
4660
7f88f624
VZ
4661 "sub_4lp: \n\t" /* shift data for adding first */
4662 "psrlq _ShiftRem, %%mm1 \n\t" /* bpp bytes (no need for mask; */
4663 /* shift clears inactive bytes) */
c6b71bff
GD
4664 "movq (%%edi,%%edx,), %%mm0 \n\t"
4665 "paddb %%mm1, %%mm0 \n\t"
4666
7f88f624
VZ
4667 /* add 2nd active group */
4668 "movq %%mm0, %%mm1 \n\t" /* mov updated Raws to mm1 */
4669 "psllq _ShiftBpp, %%mm1 \n\t" /* shift data to pos. correctly */
c6b71bff
GD
4670 "addl $8, %%edx \n\t"
4671 "paddb %%mm1, %%mm0 \n\t"
4672
4673 "cmpl _MMXLength, %%edx \n\t"
4674 "movq %%mm0, -8(%%edi,%%edx,) \n\t"
7f88f624 4675 "movq %%mm0, %%mm1 \n\t" /* prep 1st add at top of loop */
c6b71bff
GD
4676 "jb sub_4lp \n\t"
4677
7f88f624
VZ
4678 : "=a" (dummy_value_a), /* 0 // output regs (dummy) */
4679 "=D" (dummy_value_D) /* 1 */
c6b71bff 4680
7f88f624
VZ
4681 : "0" (bpp), /* eax // input regs */
4682 "1" (row) /* edi */
c6b71bff 4683
7f88f624 4684 : "%edx", "%esi" /* clobber list */
c6b71bff
GD
4685#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4686 , "%mm0", "%mm1"
4687#endif
4688 );
4689 }
4690 break;
4691
      case 2:
      {
         _ActiveMask.use = 0x00000000ffff0000LL;
         _ShiftBpp.use = 16;   /* == 2 * 8 */
         _ShiftRem.use = 48;   /* == 64 - 16 */

         __asm__ __volatile__ (
            "movq _ActiveMask, %%mm7      \n\t" /* load _ActiveMask for 2nd */
                                                /* active byte group */
            "movl _dif, %%edx             \n\t"
            "movq %%mm7, %%mm6            \n\t"
/* preload  "movl row, %%edi              \n\t" */
            "psllq _ShiftBpp, %%mm6       \n\t" /* move mask in mm6 to cover */
                                                /* 3rd active byte group */
            "movl %%edi, %%esi            \n\t" /* lp = row */
            "movq %%mm6, %%mm5            \n\t"
/* preload  "movl bpp, %%eax              \n\t" */
            "addl %%eax, %%edi            \n\t" /* rp = row + bpp */
            "psllq _ShiftBpp, %%mm5       \n\t" /* move mask in mm5 to cover */
                                                /* 4th active byte group */
            /* prime the pump:  load the first Raw(x-bpp) data set */
            "movq -8(%%edi,%%edx,), %%mm1 \n\t"

         "sub_2lp:                        \n\t" /* shift data for adding first */
            "psrlq _ShiftRem, %%mm1       \n\t" /* bpp bytes (no need for mask; */
                                                /* shift clears inactive bytes) */
            /* add 1st active group */
            "movq (%%edi,%%edx,), %%mm0   \n\t"
            "paddb %%mm1, %%mm0           \n\t"

            /* add 2nd active group */
            "movq %%mm0, %%mm1            \n\t" /* mov updated Raws to mm1 */
            "psllq _ShiftBpp, %%mm1       \n\t" /* shift data to pos. correctly */
            "pand %%mm7, %%mm1            \n\t" /* mask to use 2nd active group */
            "paddb %%mm1, %%mm0           \n\t"

            /* add 3rd active group */
            "movq %%mm0, %%mm1            \n\t" /* mov updated Raws to mm1 */
            "psllq _ShiftBpp, %%mm1       \n\t" /* shift data to pos. correctly */
            "pand %%mm6, %%mm1            \n\t" /* mask to use 3rd active group */
            "paddb %%mm1, %%mm0           \n\t"

            /* add 4th active group */
            "movq %%mm0, %%mm1            \n\t" /* mov updated Raws to mm1 */
            "psllq _ShiftBpp, %%mm1       \n\t" /* shift data to pos. correctly */
            "pand %%mm5, %%mm1            \n\t" /* mask to use 4th active group */
            "addl $8, %%edx               \n\t"
            "paddb %%mm1, %%mm0           \n\t"
            "cmpl _MMXLength, %%edx       \n\t"
            "movq %%mm0, -8(%%edi,%%edx,) \n\t" /* write updated Raws to array */
            "movq %%mm0, %%mm1            \n\t" /* prep 1st add at top of loop */
            "jb sub_2lp                   \n\t"

            : "=a" (dummy_value_a),   /* 0 // output regs (dummy) */
              "=D" (dummy_value_D)    /* 1 */

            : "0" (bpp),              /* eax // input regs */
              "1" (row)               /* edi */

            : "%edx", "%esi"          /* clobber list */
#if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
            , "%mm0", "%mm1", "%mm5", "%mm6", "%mm7"
#endif
         );
      }
      break;

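      /* In the bpp == 2 case above, masks are required where bpp == 4
       * needed none:  with four 2-byte groups per quadword, each psllq
       * also carries the still-stale upper groups upward, so mm7, mm6
       * and mm5 keep only the one group that has just become valid
       * before it is added in.
       */
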
      case 8:
      {
         __asm__ __volatile__ (
/* preload  "movl row, %%edi              \n\t" */
            "movl _dif, %%edx             \n\t"
            "movl %%edi, %%esi            \n\t" /* lp = row */
/* preload  "movl bpp, %%eax              \n\t" */
            "addl %%eax, %%edi            \n\t" /* rp = row + bpp */
            "movl _MMXLength, %%ecx       \n\t"

            /* prime the pump:  load the first Raw(x-bpp) data set */
            "movq -8(%%edi,%%edx,), %%mm7 \n\t"
            "andl $0x0000003f, %%ecx      \n\t" /* calc bytes over mult of 64 */

         "sub_8lp:                        \n\t"
            "movq (%%edi,%%edx,), %%mm0   \n\t" /* load Sub(x) for 1st 8 bytes */
            "paddb %%mm7, %%mm0           \n\t"
            "movq 8(%%edi,%%edx,), %%mm1  \n\t" /* load Sub(x) for 2nd 8 bytes */
            "movq %%mm0, (%%edi,%%edx,)   \n\t" /* write Raw(x) for 1st 8 bytes */

            /* Now mm0 will be used as Raw(x-bpp) for the 2nd group of 8 bytes. */
            /* This will be repeated for each group of 8 bytes with the 8th */
            /* group being used as the Raw(x-bpp) for the 1st group of the */
            /* next loop. */

            "paddb %%mm0, %%mm1           \n\t"
            "movq 16(%%edi,%%edx,), %%mm2 \n\t" /* load Sub(x) for 3rd 8 bytes */
            "movq %%mm1, 8(%%edi,%%edx,)  \n\t" /* write Raw(x) for 2nd 8 bytes */
            "paddb %%mm1, %%mm2           \n\t"
            "movq 24(%%edi,%%edx,), %%mm3 \n\t" /* load Sub(x) for 4th 8 bytes */
            "movq %%mm2, 16(%%edi,%%edx,) \n\t" /* write Raw(x) for 3rd 8 bytes */
            "paddb %%mm2, %%mm3           \n\t"
            "movq 32(%%edi,%%edx,), %%mm4 \n\t" /* load Sub(x) for 5th 8 bytes */
            "movq %%mm3, 24(%%edi,%%edx,) \n\t" /* write Raw(x) for 4th 8 bytes */
            "paddb %%mm3, %%mm4           \n\t"
            "movq 40(%%edi,%%edx,), %%mm5 \n\t" /* load Sub(x) for 6th 8 bytes */
            "movq %%mm4, 32(%%edi,%%edx,) \n\t" /* write Raw(x) for 5th 8 bytes */
            "paddb %%mm4, %%mm5           \n\t"
            "movq 48(%%edi,%%edx,), %%mm6 \n\t" /* load Sub(x) for 7th 8 bytes */
            "movq %%mm5, 40(%%edi,%%edx,) \n\t" /* write Raw(x) for 6th 8 bytes */
            "paddb %%mm5, %%mm6           \n\t"
            "movq 56(%%edi,%%edx,), %%mm7 \n\t" /* load Sub(x) for 8th 8 bytes */
            "movq %%mm6, 48(%%edi,%%edx,) \n\t" /* write Raw(x) for 7th 8 bytes */
            "addl $64, %%edx              \n\t"
            "paddb %%mm6, %%mm7           \n\t"
            "cmpl %%ecx, %%edx            \n\t"
            "movq %%mm7, -8(%%edi,%%edx,) \n\t" /* write Raw(x) for 8th 8 bytes */
            "jb sub_8lp                   \n\t"

            "cmpl _MMXLength, %%edx       \n\t"
            "jnb sub_8lt8                 \n\t"

         "sub_8lpA:                       \n\t"
            "movq (%%edi,%%edx,), %%mm0   \n\t"
            "addl $8, %%edx               \n\t"
            "paddb %%mm7, %%mm0           \n\t"
            "cmpl _MMXLength, %%edx       \n\t"
            "movq %%mm0, -8(%%edi,%%edx,) \n\t" /* -8 to offset early addl edx */
            "movq %%mm0, %%mm7            \n\t" /* move calculated Raw(x) data */
                                                /* to mm7 to be new Raw(x-bpp) */
                                                /* for next loop */
            "jb sub_8lpA                  \n\t"

         "sub_8lt8:                       \n\t"

            : "=a" (dummy_value_a),   /* 0 // output regs (dummy) */
              "=D" (dummy_value_D)    /* 1 */

            : "0" (bpp),              /* eax // input regs */
              "1" (row)               /* edi */

            : "%ecx", "%edx", "%esi"  /* clobber list */
#if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
            , "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7"
#endif
         );
      }
      break;

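      /* The bpp == 8 case needs no shifting or masking at all:  each
       * quadword is exactly one pixel, i.e., in scalar terms,
       *
       *    for (x = bpp; x < MMXLength; x++)
       *       row[x] = (png_byte)(row[x] + row[x - 8]);
       *
       * so Raw(x-bpp) is simply the previous quadword, and the loop is
       * unrolled eight-fold purely for speed.
       */
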
      default:   /* bpp greater than 8 bytes   GRR BOGUS */
      {
         __asm__ __volatile__ (
            "movl _dif, %%edx             \n\t"
/* preload  "movl row, %%edi              \n\t" */
            "movl %%edi, %%esi            \n\t" /* lp = row */
/* preload  "movl bpp, %%eax              \n\t" */
            "addl %%eax, %%edi            \n\t" /* rp = row + bpp */

         "sub_Alp:                        \n\t"
            "movq (%%edi,%%edx,), %%mm0   \n\t"
            "movq (%%esi,%%edx,), %%mm1   \n\t"
            "addl $8, %%edx               \n\t"
            "paddb %%mm1, %%mm0           \n\t"
            "cmpl _MMXLength, %%edx       \n\t"
            "movq %%mm0, -8(%%edi,%%edx,) \n\t" /* mov does not affect flags; */
                                                /* -8 to offset addl edx */
            "jb sub_Alp                   \n\t"

            : "=a" (dummy_value_a),   /* 0 // output regs (dummy) */
              "=D" (dummy_value_D)    /* 1 */

            : "0" (bpp),              /* eax // input regs */
              "1" (row)               /* edi */

            : "%edx", "%esi"          /* clobber list */
#if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
            , "%mm0", "%mm1"
#endif
         );
      }
      break;

   } /* end switch (bpp) */

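   /* finish the FullLength - MMXLength leftover bytes (row end not a */
   /* multiple of 8, if any) with a plain byte-at-a-time x86 loop:    */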
   __asm__ __volatile__ (
      "movl _MMXLength, %%edx       \n\t"
/* pre "movl row, %%edi              \n\t" */
      "cmpl _FullLength, %%edx      \n\t"
      "jnb sub_end                  \n\t"

      "movl %%edi, %%esi            \n\t" /* lp = row */
/* pre "movl bpp, %%eax              \n\t" */
      "addl %%eax, %%edi            \n\t" /* rp = row + bpp */
      "xorl %%eax, %%eax            \n\t"

   "sub_lp2:                        \n\t"
      "movb (%%esi,%%edx,), %%al    \n\t"
      "addb %%al, (%%edi,%%edx,)    \n\t"
      "incl %%edx                   \n\t"
      "cmpl _FullLength, %%edx      \n\t"
      "jb sub_lp2                   \n\t"

   "sub_end:                        \n\t"
      "EMMS                         \n\t" /* end MMX instructions */

      : "=a" (dummy_value_a),   /* 0 // output regs (dummy) */
        "=D" (dummy_value_D)    /* 1 */

      : "0" (bpp),              /* eax // input regs */
        "1" (row)               /* edi */

      : "%edx", "%esi"          /* clobber list */
   );

} /* end of png_read_filter_row_mmx_sub() */
#endif




/*===========================================================================*/
/*                                                                           */
/*            P N G _ R E A D _ F I L T E R _ R O W _ M M X _ U P            */
/*                                                                           */
/*===========================================================================*/

/* Optimized code for PNG Up filter decoder */

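/* The Up filter is byte-wise Raw(x) = Up(x) + Prior(x) (mod 256),
 * independent of pixel size, so once the row pointer is aligned the whole
 * row can be handled with paddb in 8-byte quadwords, 64 bytes per
 * unrolled iteration. */
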
static void /* PRIVATE */
png_read_filter_row_mmx_up(png_row_infop row_info, png_bytep row,
                           png_bytep prev_row)
{
   png_uint_32 len;
   int dummy_value_d;   /* fix 'forbidden register 3 (dx) was spilled' error */
   int dummy_value_S;
   int dummy_value_D;

   len = row_info->rowbytes;   /* number of bytes to filter */

   __asm__ __volatile__ (
/* pre "movl row, %%edi              \n\t" */
      /* get # of bytes to alignment */
#ifdef __PIC__
      "pushl %%ebx                  \n\t"
#endif
      "movl %%edi, %%ecx            \n\t"
      "xorl %%ebx, %%ebx            \n\t"
      "addl $0x7, %%ecx             \n\t"
      "xorl %%eax, %%eax            \n\t"
      "andl $0xfffffff8, %%ecx      \n\t"
/* pre "movl prev_row, %%esi         \n\t" */
      "subl %%edi, %%ecx            \n\t"
      "jz up_go                     \n\t"
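      /* here ecx = ((row + 7) & ~7) - row, i.e., the 0..7 bytes needed */
      /* to bring edi up to an 8-byte boundary before the movq loops    */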

   "up_lp1:                         \n\t" /* fix alignment */
      "movb (%%edi,%%ebx,), %%al    \n\t"
      "addb (%%esi,%%ebx,), %%al    \n\t"
      "incl %%ebx                   \n\t"
      "cmpl %%ecx, %%ebx            \n\t"
      "movb %%al, -1(%%edi,%%ebx,)  \n\t" /* mov does not affect flags; -1 to */
      "jb up_lp1                    \n\t" /* offset incl ebx */

   "up_go:                          \n\t"
/* pre "movl len, %%edx              \n\t" */
      "movl %%edx, %%ecx            \n\t"
      "subl %%ebx, %%edx            \n\t" /* subtract alignment fix */
      "andl $0x0000003f, %%edx      \n\t" /* calc bytes over mult of 64 */
      "subl %%edx, %%ecx            \n\t" /* drop over bytes from length */

      /* unrolled loop - use all MMX registers and interleave to reduce */
      /* number of branch instructions (loops) and reduce partial stalls */
   "up_loop:                        \n\t"
      "movq (%%esi,%%ebx,), %%mm1   \n\t"
      "movq (%%edi,%%ebx,), %%mm0   \n\t"
      "movq 8(%%esi,%%ebx,), %%mm3  \n\t"
      "paddb %%mm1, %%mm0           \n\t"
      "movq 8(%%edi,%%ebx,), %%mm2  \n\t"
      "movq %%mm0, (%%edi,%%ebx,)   \n\t"
      "paddb %%mm3, %%mm2           \n\t"
      "movq 16(%%esi,%%ebx,), %%mm5 \n\t"
      "movq %%mm2, 8(%%edi,%%ebx,)  \n\t"
      "movq 16(%%edi,%%ebx,), %%mm4 \n\t"
      "movq 24(%%esi,%%ebx,), %%mm7 \n\t"
      "paddb %%mm5, %%mm4           \n\t"
      "movq 24(%%edi,%%ebx,), %%mm6 \n\t"
      "movq %%mm4, 16(%%edi,%%ebx,) \n\t"
      "paddb %%mm7, %%mm6           \n\t"
      "movq 32(%%esi,%%ebx,), %%mm1 \n\t"
      "movq %%mm6, 24(%%edi,%%ebx,) \n\t"
      "movq 32(%%edi,%%ebx,), %%mm0 \n\t"
      "movq 40(%%esi,%%ebx,), %%mm3 \n\t"
      "paddb %%mm1, %%mm0           \n\t"
      "movq 40(%%edi,%%ebx,), %%mm2 \n\t"
      "movq %%mm0, 32(%%edi,%%ebx,) \n\t"
      "paddb %%mm3, %%mm2           \n\t"
      "movq 48(%%esi,%%ebx,), %%mm5 \n\t"
      "movq %%mm2, 40(%%edi,%%ebx,) \n\t"
      "movq 48(%%edi,%%ebx,), %%mm4 \n\t"
      "movq 56(%%esi,%%ebx,), %%mm7 \n\t"
      "paddb %%mm5, %%mm4           \n\t"
      "movq 56(%%edi,%%ebx,), %%mm6 \n\t"
      "movq %%mm4, 48(%%edi,%%ebx,) \n\t"
      "addl $64, %%ebx              \n\t"
      "paddb %%mm7, %%mm6           \n\t"
      "cmpl %%ecx, %%ebx            \n\t"
      "movq %%mm6, -8(%%edi,%%ebx,) \n\t" /* (+56)movq does not affect flags; */
      "jb up_loop                   \n\t" /* -8 to offset addl ebx */

      "cmpl $0, %%edx               \n\t" /* test for bytes over mult of 64 */
      "jz up_end                    \n\t"

      "cmpl $8, %%edx               \n\t" /* test for less than 8 bytes */
      "jb up_lt8                    \n\t" /* [added by lcreeve@netins.net] */

      "addl %%edx, %%ecx            \n\t"
      "andl $0x00000007, %%edx      \n\t" /* calc bytes over mult of 8 */
      "subl %%edx, %%ecx            \n\t" /* drop over bytes from length */
      "jz up_lt8                    \n\t"

   "up_lpA:                         \n\t" /* use MMX regs to update 8 bytes sim. */
      "movq (%%esi,%%ebx,), %%mm1   \n\t"
      "movq (%%edi,%%ebx,), %%mm0   \n\t"
      "addl $8, %%ebx               \n\t"
      "paddb %%mm1, %%mm0           \n\t"
      "cmpl %%ecx, %%ebx            \n\t"
      "movq %%mm0, -8(%%edi,%%ebx,) \n\t" /* movq does not affect flags; -8 to */
      "jb up_lpA                    \n\t" /* offset addl ebx */
      "cmpl $0, %%edx               \n\t" /* test for bytes over mult of 8 */
      "jz up_end                    \n\t"

   "up_lt8:                         \n\t"
      "xorl %%eax, %%eax            \n\t"
      "addl %%edx, %%ecx            \n\t" /* move over byte count into counter */

   "up_lp2:                         \n\t" /* use x86 regs for remaining bytes */
      "movb (%%edi,%%ebx,), %%al    \n\t"
      "addb (%%esi,%%ebx,), %%al    \n\t"
      "incl %%ebx                   \n\t"
      "cmpl %%ecx, %%ebx            \n\t"
      "movb %%al, -1(%%edi,%%ebx,)  \n\t" /* mov does not affect flags; -1 to */
      "jb up_lp2                    \n\t" /* offset incl ebx */

   "up_end:                         \n\t"
      "EMMS                         \n\t" /* conversion of filtered row complete */
#ifdef __PIC__
      "popl %%ebx                   \n\t"
#endif

      : "=d" (dummy_value_d),   /* 0 // output regs (dummy) */
        "=S" (dummy_value_S),   /* 1 */
        "=D" (dummy_value_D)    /* 2 */

      : "0" (len),              /* edx // input regs */
        "1" (prev_row),         /* esi */
        "2" (row)               /* edi */

      : "%eax", "%ecx"          /* clobber list (must not list input regs!) */
#ifndef __PIC__
      , "%ebx"
#endif

#if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
      , "%mm0", "%mm1", "%mm2", "%mm3"
      , "%mm4", "%mm5", "%mm6", "%mm7"
#endif
   );


} /* end of png_read_filter_row_mmx_up() */

#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */




/*===========================================================================*/
/*                                                                           */
/*                   P N G _ R E A D _ F I L T E R _ R O W                   */
/*                                                                           */
/*===========================================================================*/


/* Optimized png_read_filter_row routines */

void /* PRIVATE */
png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep
   row, png_bytep prev_row, int filter)
{
#ifdef PNG_DEBUG
   char filnm[10];
#endif

#if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
/* GRR:  these are superseded by png_ptr->asm_flags: */
#define UseMMX_sub    1   /* GRR:  converted 20000730 */
#define UseMMX_up     1   /* GRR:  converted 20000729 */
#define UseMMX_avg    1   /* GRR:  converted 20000828 (+ 16-bit bugfix 20000916) */
#define UseMMX_paeth  1   /* GRR:  converted 20000828 */

   if (_mmx_supported == 2) {
       /* this should have happened in png_init_mmx_flags() already */
#if !defined(PNG_1_0_X)
       png_warning(png_ptr, "asm_flags may not have been initialized");
#endif
       png_mmx_support();
   }
#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */

#ifdef PNG_DEBUG
   png_debug(1, "in png_read_filter_row (pnggccrd.c)\n");
   switch (filter)
   {
      case 0: sprintf(filnm, "none");
         break;
      case 1: sprintf(filnm, "sub-%s",
#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
#if !defined(PNG_1_0_X)
         (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB)? "MMX" :
#endif
#endif
         "x86");
         break;
      case 2: sprintf(filnm, "up-%s",
#ifdef PNG_ASSEMBLER_CODE_SUPPORTED
#if !defined(PNG_1_0_X)
         (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP)? "MMX" :
#endif
#endif
         "x86");
         break;
      case 3: sprintf(filnm, "avg-%s",
#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
#if !defined(PNG_1_0_X)
         (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG)? "MMX" :
#endif
#endif
         "x86");
         break;
      case 4: sprintf(filnm, "Paeth-%s",
#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
#if !defined(PNG_1_0_X)
         (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH)? "MMX" :
#endif
#endif
         "x86");
         break;
      default: sprintf(filnm, "unknw");
         break;
   }
   png_debug2(0, "row_number=%5ld, %5s, ", png_ptr->row_number, filnm);
   png_debug1(0, "row=0x%08lx, ", (unsigned long)row);
   png_debug2(0, "pixdepth=%2d, bytes=%d, ", (int)row_info->pixel_depth,
      (int)((row_info->pixel_depth + 7) >> 3));
   png_debug1(0, "rowbytes=%8ld\n", row_info->rowbytes);
#endif /* PNG_DEBUG */

   switch (filter)
   {
      case PNG_FILTER_VALUE_NONE:
         break;

      case PNG_FILTER_VALUE_SUB:
#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
#if !defined(PNG_1_0_X)
         if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB) &&
             (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
             (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
#else
         if (_mmx_supported)
#endif
         {
            png_read_filter_row_mmx_sub(row_info, row);
         }
         else
#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
         {
            png_uint_32 i;
            png_uint_32 istop = row_info->rowbytes;
            png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
            png_bytep rp = row + bpp;
            png_bytep lp = row;

            for (i = bpp; i < istop; i++)
            {
               *rp = (png_byte)(((int)(*rp) + (int)(*lp++)) & 0xff);
               rp++;
            }
         } /* end !UseMMX_sub */
         break;

      case PNG_FILTER_VALUE_UP:
#if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
#if !defined(PNG_1_0_X)
         if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP) &&
             (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
             (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
#else
         if (_mmx_supported)
#endif
         {
            png_read_filter_row_mmx_up(row_info, row, prev_row);
         }
         else
#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
         {
            png_uint_32 i;
            png_uint_32 istop = row_info->rowbytes;
            png_bytep rp = row;
            png_bytep pp = prev_row;

            for (i = 0; i < istop; ++i)
            {
               *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
               rp++;
            }
         } /* end !UseMMX_up */
         break;

      case PNG_FILTER_VALUE_AVG:
#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
#if !defined(PNG_1_0_X)
         if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG) &&
             (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
             (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
#else
         if (_mmx_supported)
#endif
         {
            png_read_filter_row_mmx_avg(row_info, row, prev_row);
         }
         else
#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
         {
            png_uint_32 i;
            png_bytep rp = row;
            png_bytep pp = prev_row;
            png_bytep lp = row;
            png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
            png_uint_32 istop = row_info->rowbytes - bpp;

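            /* the first bpp bytes have no Raw(x-bpp), so the spec's
             * formula reduces to Raw(x) = Avg(x) + floor(Prior(x)/2): */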
            for (i = 0; i < bpp; i++)
            {
               *rp = (png_byte)(((int)(*rp) +
                  ((int)(*pp++) >> 1)) & 0xff);
               rp++;
            }

            for (i = 0; i < istop; i++)
            {
               *rp = (png_byte)(((int)(*rp) +
                  ((int)(*pp++ + *lp++) >> 1)) & 0xff);
               rp++;
            }
         } /* end !UseMMX_avg */
         break;

      case PNG_FILTER_VALUE_PAETH:
#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
#if !defined(PNG_1_0_X)
         if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH) &&
             (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
             (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
#else
         if (_mmx_supported)
#endif
         {
            png_read_filter_row_mmx_paeth(row_info, row, prev_row);
         }
         else
#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
         {
            png_uint_32 i;
            png_bytep rp = row;
            png_bytep pp = prev_row;
            png_bytep lp = row;
            png_bytep cp = prev_row;
            png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
            png_uint_32 istop = row_info->rowbytes - bpp;

            for (i = 0; i < bpp; i++)
            {
               *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
               rp++;
            }

            for (i = 0; i < istop; i++)   /* use leftover rp,pp */
            {
               int a, b, c, pa, pb, pc, p;

               a = *lp++;
               b = *pp++;
               c = *cp++;

               p = b - c;
               pc = a - c;
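               /* Paeth's initial estimate is p0 = a + b - c; the spec's
                * distances then simplify to pa = |p0-a| = |b-c|,
                * pb = |p0-b| = |a-c|, and pc = |p0-c| = |(b-c) + (a-c)|,
                * which is why only the two differences above are kept: */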

#ifdef PNG_USE_ABS
               pa = abs(p);
               pb = abs(pc);
               pc = abs(p + pc);
#else
               pa = p < 0 ? -p : p;
               pb = pc < 0 ? -pc : pc;
               pc = (p + pc) < 0 ? -(p + pc) : p + pc;
#endif

               /*
                  if (pa <= pb && pa <= pc)
                     p = a;
                  else if (pb <= pc)
                     p = b;
                  else
                     p = c;
                */

               p = (pa <= pb && pa <= pc) ? a : (pb <= pc) ? b : c;

               *rp = (png_byte)(((int)(*rp) + p) & 0xff);
               rp++;
            }
         } /* end !UseMMX_paeth */
         break;

      default:
         png_warning(png_ptr, "Ignoring bad row-filter type");
         *row = 0;
         break;
   }
}

#endif /* PNG_HAVE_ASSEMBLER_READ_FILTER_ROW */


/*===========================================================================*/
/*                                                                           */
/*                      P N G _ M M X _ S U P P O R T                        */
/*                                                                           */
/*===========================================================================*/

/* GRR NOTES:  (1) the following code assumes 386 or better (pushfl/popfl)
 *             (2) all instructions compile with gcc 2.7.2.3 and later
 *             (3) the function is moved down here to prevent gcc from
 *                 inlining it in multiple places and then barfing be-
 *                 cause the ".NOT_SUPPORTED" label is multiply defined
 *                 [is there a way to signal that a *single* function should
 *                  not be inlined?  is there a way to modify the label for
 *                  each inlined instance, e.g., by appending _1, _2, etc.?
 *                  maybe if we don't use a leading "." in the label name?
 *                  (nope...sigh)]
 */

int PNGAPI
png_mmx_support(void)
{
#if defined(PNG_MMX_CODE_SUPPORTED)
    __asm__ __volatile__ (
        "pushl %%ebx           \n\t"  // ebx gets clobbered by CPUID instruction
        "pushl %%ecx           \n\t"  // so does ecx...
        "pushl %%edx           \n\t"  // ...and edx (but ecx & edx safe on Linux)
//      ".byte  0x66           \n\t"  // convert 16-bit pushf to 32-bit pushfd
//      "pushf                 \n\t"  // 16-bit pushf
        "pushfl                \n\t"  // save Eflag to stack
        "popl %%eax            \n\t"  // get Eflag from stack into eax
        "movl %%eax, %%ecx     \n\t"  // make another copy of Eflag in ecx
        "xorl $0x200000, %%eax \n\t"  // toggle ID bit in Eflag (i.e., bit 21)
        "pushl %%eax           \n\t"  // save modified Eflag back to stack
//      ".byte  0x66           \n\t"  // convert 16-bit popf to 32-bit popfd
//      "popf                  \n\t"  // 16-bit popf
        "popfl                 \n\t"  // restore modified value to Eflag reg
        "pushfl                \n\t"  // save Eflag to stack
        "popl %%eax            \n\t"  // get Eflag from stack
        "pushl %%ecx           \n\t"  // save original Eflag to stack
        "popfl                 \n\t"  // restore original Eflag
        "xorl %%ecx, %%eax     \n\t"  // compare new Eflag with original Eflag
        "jz 0f                 \n\t"  // if same, CPUID instr. is not supported

        "xorl %%eax, %%eax     \n\t"  // set eax to zero
//      ".byte  0x0f, 0xa2     \n\t"  // CPUID instruction (two-byte opcode)
        "cpuid                 \n\t"  // get the CPU identification info
        "cmpl $1, %%eax        \n\t"  // make sure eax returned a non-zero value
        "jl 0f                 \n\t"  // if eax is zero, MMX is not supported

        "xorl %%eax, %%eax     \n\t"  // set eax to zero and...
        "incl %%eax            \n\t"  // ...increment eax to 1.  This pair is
                                      // faster than the instruction "mov eax, 1"
        "cpuid                 \n\t"  // get the CPU identification info again
        "andl $0x800000, %%edx \n\t"  // mask out all bits but MMX bit (23)
        "cmpl $0, %%edx        \n\t"  // 0 = MMX not supported
        "jz 0f                 \n\t"  // (non-zero = yes, MMX IS supported)

        "movl $1, %%eax        \n\t"  // set return value to 1
        "jmp  1f               \n\t"  // DONE:  have MMX support

    "0:                        \n\t"  // .NOT_SUPPORTED: target label for jumps
        "movl $0, %%eax        \n\t"  // set return value to 0
    "1:                        \n\t"  // .RETURN: target label for jumps
        "movl %%eax, _mmx_supported \n\t"  // save in global static variable, too
        "popl %%edx            \n\t"  // restore edx
        "popl %%ecx            \n\t"  // restore ecx
        "popl %%ebx            \n\t"  // restore ebx

//      "ret                   \n\t"  // DONE:  no MMX support
                                      // (fall through to standard C "ret")

        :                             // output list (none)

        :                             // any variables used on input (none)

        : "%eax"                      // clobber list
//        , "%ebx", "%ecx", "%edx"    // GRR: we handle these manually
//        , "memory"                  // if write to a variable gcc thought was in a reg
//        , "cc"                      // "condition codes" (flag bits)
    );
#else
    _mmx_supported = 0;
#endif /* PNG_MMX_CODE_SUPPORTED */

    return _mmx_supported;
}
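
/* NOTE:  with gcc 4.8 or later the same runtime test could be written
 * without any inline assembly; a minimal sketch (not used here, since
 * this file must still build with gcc 2.7.2.3):
 *
 *     int png_mmx_support(void)
 *     {
 *         _mmx_supported = __builtin_cpu_supports("mmx") ? 1 : 0;
 *         return _mmx_supported;
 *     }
 */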


#endif /* PNG_USE_PNGGCCRD */