]> git.saurik.com Git - wxWidgets.git/blame - src/png/pnggccrd.c
fix for always terminating intermediate UniChar String for 4 bytes wchar_t
[wxWidgets.git] / src / png / pnggccrd.c
CommitLineData
c6b71bff
GD
1/* pnggccrd.c - mixed C/assembler version of utilities to read a PNG file
2 *
3 * For Intel x86 CPU (Pentium-MMX or later) and GNU C compiler.
4 *
5 * See http://www.intel.com/drg/pentiumII/appnotes/916/916.htm
6 * and http://www.intel.com/drg/pentiumII/appnotes/923/923.htm
7 * for Intel's performance analysis of the MMX vs. non-MMX code.
8 *
2b5f62a0 9 * libpng version 1.2.5rc3 - September 18, 2002
c6b71bff
GD
10 * For conditions of distribution and use, see copyright notice in png.h
11 * Copyright (c) 1998-2002 Glenn Randers-Pehrson
12 * Copyright (c) 1998, Intel Corporation
13 *
14 * Based on MSVC code contributed by Nirav Chhatrapati, Intel Corp., 1998.
15 * Interface to libpng contributed by Gilles Vollant, 1999.
16 * GNU C port by Greg Roelofs, 1999-2001.
17 *
18 * Lines 2350-4300 converted in place with intel2gas 1.3.1:
19 *
20 * intel2gas -mdI pnggccrd.c.partially-msvc -o pnggccrd.c
21 *
22 * and then cleaned up by hand. See http://hermes.terminal.at/intel2gas/ .
23 *
24 * NOTE: A sufficiently recent version of GNU as (or as.exe under DOS/Windows)
25 * is required to assemble the newer MMX instructions such as movq.
26 * For djgpp, see
27 *
28 * ftp://ftp.simtel.net/pub/simtelnet/gnu/djgpp/v2gnu/bnu281b.zip
29 *
30 * (or a later version in the same directory). For Linux, check your
31 * distribution's web site(s) or try these links:
32 *
33 * http://rufus.w3.org/linux/RPM/binutils.html
34 * http://www.debian.org/Packages/stable/devel/binutils.html
35 * ftp://ftp.slackware.com/pub/linux/slackware/slackware/slakware/d1/
36 * binutils.tgz
37 *
38 * For other platforms, see the main GNU site:
39 *
40 * ftp://ftp.gnu.org/pub/gnu/binutils/
41 *
42 * Version 2.5.2l.15 is definitely too old...
43 */
44
45/*
46 * TEMPORARY PORTING NOTES AND CHANGELOG (mostly by Greg Roelofs)
47 * =====================================
48 *
49 * 19991006:
50 * - fixed sign error in post-MMX cleanup code (16- & 32-bit cases)
51 *
52 * 19991007:
53 * - additional optimizations (possible or definite):
54 * x [DONE] write MMX code for 64-bit case (pixel_bytes == 8) [not tested]
55 * - write MMX code for 48-bit case (pixel_bytes == 6)
56 * - figure out what's up with 24-bit case (pixel_bytes == 3):
57 * why subtract 8 from width_mmx in the pass 4/5 case?
58 * (only width_mmx case) (near line 1606)
59 * x [DONE] replace pixel_bytes within each block with the true
60 * constant value (or are compilers smart enough to do that?)
61 * - rewrite all MMX interlacing code so it's aligned with
62 * the *beginning* of the row buffer, not the end. This
63 * would not only allow one to eliminate half of the memory
64 * writes for odd passes (that is, pass == odd), it may also
65 * eliminate some unaligned-data-access exceptions (assuming
66 * there's a penalty for not aligning 64-bit accesses on
67 * 64-bit boundaries). The only catch is that the "leftover"
68 * pixel(s) at the end of the row would have to be saved,
69 * but there are enough unused MMX registers in every case,
70 * so this is not a problem. A further benefit is that the
71 * post-MMX cleanup code (C code) in at least some of the
72 * cases could be done within the assembler block.
73 * x [DONE] the "v3 v2 v1 v0 v7 v6 v5 v4" comments are confusing,
74 * inconsistent, and don't match the MMX Programmer's Reference
75 * Manual conventions anyway. They should be changed to
76 * "b7 b6 b5 b4 b3 b2 b1 b0," where b0 indicates the byte that
77 * was lowest in memory (e.g., corresponding to a left pixel)
78 * and b7 is the byte that was highest (e.g., a right pixel).
79 *
80 * 19991016:
81 * - Brennan's Guide notwithstanding, gcc under Linux does *not*
82 * want globals prefixed by underscores when referencing them--
83 * i.e., if the variable is const4, then refer to it as const4,
84 * not _const4. This seems to be a djgpp-specific requirement.
85 * Also, such variables apparently *must* be declared outside
86 * of functions; neither static nor automatic variables work if
87 * defined within the scope of a single function, but both
88 * static and truly global (multi-module) variables work fine.
89 *
90 * 19991023:
91 * - fixed png_combine_row() non-MMX replication bug (odd passes only?)
92 * - switched from string-concatenation-with-macros to cleaner method of
93 * renaming global variables for djgpp--i.e., always use prefixes in
94 * inlined assembler code (== strings) and conditionally rename the
95 * variables, not the other way around. Hence _const4, _mask8_0, etc.
96 *
97 * 19991024:
98 * - fixed mmxsupport()/png_do_read_interlace() first-row bug
99 * This one was severely weird: even though mmxsupport() doesn't touch
100 * ebx (where "row" pointer was stored), it nevertheless managed to zero
101 * the register (even in static/non-fPIC code--see below), which in turn
102 * caused png_do_read_interlace() to return prematurely on the first row of
103 * interlaced images (i.e., without expanding the interlaced pixels).
104 * Inspection of the generated assembly code didn't turn up any clues,
105 * although it did point at a minor optimization (i.e., get rid of
106 * mmx_supported_local variable and just use eax). Possibly the CPUID
107 * instruction is more destructive than it looks? (Not yet checked.)
108 * - "info gcc" was next to useless, so compared fPIC and non-fPIC assembly
109 * listings... Apparently register spillage has to do with ebx, since
110 * it's used to index the global offset table. Commenting it out of the
111 * input-reg lists in png_combine_row() eliminated compiler barfage, so
112 * ifdef'd with __PIC__ macro: if defined, use a global for unmask
113 *
114 * 19991107:
115 * - verified CPUID clobberage: 12-char string constant ("GenuineIntel",
116 * "AuthenticAMD", etc.) placed in ebx:ecx:edx. Still need to polish.
117 *
118 * 19991120:
119 * - made "diff" variable (now "_dif") global to simplify conversion of
120 * filtering routines (running out of regs, sigh). "diff" is still used
121 * in interlacing routines, however.
122 * - fixed up both versions of mmxsupport() (ORIG_THAT_USED_TO_CLOBBER_EBX
123 * macro determines which is used); original not yet tested.
124 *
125 * 20000213:
126 * - when compiling with gcc, be sure to use -fomit-frame-pointer
127 *
128 * 20000319:
129 * - fixed a register-name typo in png_do_read_interlace(), default (MMX) case,
130 * pass == 4 or 5, that caused visible corruption of interlaced images
131 *
132 * 20000623:
133 * - Various problems were reported with gcc 2.95.2 in the Cygwin environment,
134 * many of the form "forbidden register 0 (ax) was spilled for class AREG."
135 * This is explained at http://gcc.gnu.org/fom_serv/cache/23.html, and
136 * Chuck Wilson supplied a patch involving dummy output registers. See
137 * http://sourceforge.net/bugs/?func=detailbug&bug_id=108741&group_id=5624
138 * for the original (anonymous) SourceForge bug report.
139 *
140 * 20000706:
141 * - Chuck Wilson passed along these remaining gcc 2.95.2 errors:
142 * pnggccrd.c: In function `png_combine_row':
143 * pnggccrd.c:525: more than 10 operands in `asm'
144 * pnggccrd.c:669: more than 10 operands in `asm'
145 * pnggccrd.c:828: more than 10 operands in `asm'
146 * pnggccrd.c:994: more than 10 operands in `asm'
147 * pnggccrd.c:1177: more than 10 operands in `asm'
148 * They are all the same problem and can be worked around by using the
149 * global _unmask variable unconditionally, not just in the -fPIC case.
150 * Reportedly earlier versions of gcc also have the problem with more than
151 * 10 operands; they just don't report it. Much strangeness ensues, etc.
152 *
153 * 20000729:
154 * - enabled png_read_filter_row_mmx_up() (shortest remaining unconverted
155 * MMX routine); began converting png_read_filter_row_mmx_sub()
156 * - to finish remaining sections:
157 * - clean up indentation and comments
158 * - preload local variables
159 * - add output and input regs (order of former determines numerical
160 * mapping of latter)
161 * - avoid all usage of ebx (including bx, bh, bl) register [20000823]
162 * - remove "$" from addressing of Shift and Mask variables [20000823]
163 *
164 * 20000731:
165 * - global union vars causing segfaults in png_read_filter_row_mmx_sub()?
166 *
167 * 20000822:
168 * - ARGH, stupid png_read_filter_row_mmx_sub() segfault only happens with
169 * shared-library (-fPIC) version! Code works just fine as part of static
170 * library. Damn damn damn damn damn, should have tested that sooner.
171 * ebx is getting clobbered again (explicitly this time); need to save it
172 * on stack or rewrite asm code to avoid using it altogether. Blargh!
173 *
174 * 20000823:
175 * - first section was trickiest; all remaining sections have ebx -> edx now.
176 * (-fPIC works again.) Also added missing underscores to various Shift*
177 * and *Mask* globals and got rid of leading "$" signs.
178 *
179 * 20000826:
180 * - added visual separators to help navigate microscopic printed copies
181 * (http://pobox.com/~newt/code/gpr-latest.zip, mode 10); started working
182 * on png_read_filter_row_mmx_avg()
183 *
184 * 20000828:
185 * - finished png_read_filter_row_mmx_avg(): only Paeth left! (930 lines...)
186 * What the hell, did png_read_filter_row_mmx_paeth(), too. Comments not
187 * cleaned up/shortened in either routine, but functionality is complete
188 * and seems to be working fine.
189 *
190 * 20000829:
191 * - ahhh, figured out last(?) bit of gcc/gas asm-fu: if register is listed
192 * as an input reg (with dummy output variables, etc.), then it *cannot*
193 * also appear in the clobber list or gcc 2.95.2 will barf. The solution
194 * is simple enough...
195 *
196 * 20000914:
197 * - bug in png_read_filter_row_mmx_avg(): 16-bit grayscale not handled
198 * correctly (but 48-bit RGB just fine)
199 *
200 * 20000916:
201 * - fixed bug in png_read_filter_row_mmx_avg(), bpp == 2 case; three errors:
202 * - "_ShiftBpp.use = 24;" should have been "_ShiftBpp.use = 16;"
203 * - "_ShiftRem.use = 40;" should have been "_ShiftRem.use = 48;"
204 * - "psllq _ShiftRem, %%mm2" should have been "psrlq _ShiftRem, %%mm2"
205 *
206 * 20010101:
207 * - added new png_init_mmx_flags() function (here only because it needs to
208 * call mmxsupport(), which should probably become global png_mmxsupport());
209 * modified other MMX routines to run conditionally (png_ptr->asm_flags)
210 *
211 * 20010103:
212 * - renamed mmxsupport() to png_mmx_support(), with auto-set of mmx_supported,
213 * and made it public; moved png_init_mmx_flags() to png.c as internal func
214 *
215 * 20010104:
216 * - removed dependency on png_read_filter_row_c() (C code already duplicated
217 * within MMX version of png_read_filter_row()) so no longer necessary to
218 * compile it into pngrutil.o
219 *
220 * 20010310:
221 * - fixed buffer-overrun bug in png_combine_row() C code (non-MMX)
222 *
223 * 20020304:
224 * - eliminated incorrect use of width_mmx in pixel_bytes == 8 case
225 *
226 * STILL TO DO:
227 * - test png_do_read_interlace() 64-bit case (pixel_bytes == 8)
228 * - write MMX code for 48-bit case (pixel_bytes == 6)
229 * - figure out what's up with 24-bit case (pixel_bytes == 3):
230 * why subtract 8 from width_mmx in the pass 4/5 case?
231 * (only width_mmx case) (near line 1606)
232 * - rewrite all MMX interlacing code so it's aligned with beginning
233 * of the row buffer, not the end (see 19991007 for details)
234 * x pick one version of mmxsupport() and get rid of the other
235 * - add error messages to any remaining bogus default cases
236 * - enable pixel_depth == 8 cases in png_read_filter_row()? (test speed)
237 * x add support for runtime enable/disable/query of various MMX routines
238 */
239
240#define PNG_INTERNAL
241#include "png.h"
242
243#if defined(PNG_USE_PNGGCCRD)
244
/* Forward declaration of the public MMX-detection routine.  This file calls
 * it from png_combine_row() when _mmx_supported still holds its initial
 * value. */
245int PNGAPI png_mmx_support(void);
246
247#ifdef PNG_USE_LOCAL_ARRAYS
/* Per-pass interlace tables, seven entries each, indexed by png_ptr->pass.
 * The non-MMX paths of png_combine_row() read them (scaled by the pixel size
 * in bytes) as: offset of the first pixel copied (start), distance between
 * successive copies (inc), and number of pixels copied at each step (width).
 * Comments at the points of use give the same values for the png.c tables. */
248static const int FARDATA png_pass_start[7] = {0, 4, 0, 2, 0, 1, 0};
249static const int FARDATA png_pass_inc[7] = {8, 8, 4, 4, 2, 2, 1};
250static const int FARDATA png_pass_width[7] = {8, 4, 4, 2, 2, 1, 1};
251#endif
252
253#if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
254/* djgpp, Win32, and Cygwin add their own underscores to global variables,
255 * so define them without: */
/* Rename table.  The inline assembly always spells these globals with a
 * leading underscore (changelog entry 19991023); on the toolchains tested
 * here the C-level identifiers are therefore mapped to the un-prefixed
 * spelling. */
256#if defined(__DJGPP__) || defined(WIN32) || defined(__CYGWIN__)
/* MMX-support flag, byte-extraction constants and png_combine_row() masks. */
257# define _mmx_supported mmx_supported
258# define _const4 const4
259# define _const6 const6
260# define _mask8_0 mask8_0
261# define _mask16_1 mask16_1
262# define _mask16_0 mask16_0
263# define _mask24_2 mask24_2
264# define _mask24_1 mask24_1
265# define _mask24_0 mask24_0
266# define _mask32_3 mask32_3
267# define _mask32_2 mask32_2
268# define _mask32_1 mask32_1
269# define _mask32_0 mask32_0
270# define _mask48_5 mask48_5
271# define _mask48_4 mask48_4
272# define _mask48_3 mask48_3
273# define _mask48_2 mask48_2
274# define _mask48_1 mask48_1
275# define _mask48_0 mask48_0
/* NOTE(review): the *Mask* and Shift* names below are not defined in the part
 * of the file visible here; presumably they are the row-filter globals that
 * changelog entry 20000823 mentions -- confirm. */
276# define _LBCarryMask LBCarryMask
277# define _HBClearMask HBClearMask
278# define _ActiveMask ActiveMask
279# define _ActiveMask2 ActiveMask2
280# define _ActiveMaskEnd ActiveMaskEnd
281# define _ShiftBpp ShiftBpp
282# define _ShiftRem ShiftRem
/* Globals that are declared only when PNG_THREAD_UNSAFE_OK is defined. */
283#ifdef PNG_THREAD_UNSAFE_OK
284# define _unmask unmask
285# define _FullLength FullLength
286# define _MMXLength MMXLength
287# define _dif dif
288# define _patemp patemp
289# define _pbtemp pbtemp
290# define _pctemp pctemp
291#endif
292#endif
293
294
295/* These constants are used in the inlined MMX assembly code.
296 Ignore gcc's "At top level: defined but not used" warnings. */
297
298/* GRR 20000706: originally _unmask was needed only when compiling with -fPIC,
299 * since that case uses the %ebx register for indexing the Global Offset Table
300 * and there were no other registers available. But gcc 2.95 and later emit
301 * "more than 10 operands in `asm'" errors when %ebx is used to preload unmask
302 * in the non-PIC case, so we'll just use the global unconditionally now.
303 */
#ifdef PNG_THREAD_UNSAFE_OK
/* png_combine_row() stores ~mask here before each MMX block so that the
 * inline assembly can load it by name rather than through a register. */
static int _unmask;
#endif

/* Byte-selection masks for the MMX path of png_combine_row().
 *
 * _mask<D>_<k> describes the k-th 8-byte quadword of a group of eight pixels
 * that are D bits wide.  Each byte of a constant holds exactly one bit: the
 * bit of the caller's 8-bit pixel mask that governs the pixel this byte
 * belongs to (0x80 for the first pixel of the group, 0x01 for the eighth).
 * The least significant byte of the constant is the lowest-addressed byte of
 * the quadword.  The assembly ANDs each constant with the replicated ~mask
 * and compares the result with zero to obtain a per-byte keep/replace mask.
 */

/* 8 bits per pixel: one quadword per eight pixels */
static unsigned long long _mask8_0  = 0x0102040810204080ULL;

/* 16 bits per pixel: two quadwords */
static unsigned long long _mask16_1 = 0x0101020204040808ULL;
static unsigned long long _mask16_0 = 0x1010202040408080ULL;

/* 24 bits per pixel: three quadwords */
static unsigned long long _mask24_2 = 0x0101010202020404ULL;
static unsigned long long _mask24_1 = 0x0408080810101020ULL;
static unsigned long long _mask24_0 = 0x2020404040808080ULL;

/* 32 bits per pixel: four quadwords */
static unsigned long long _mask32_3 = 0x0101010102020202ULL;
static unsigned long long _mask32_2 = 0x0404040408080808ULL;
static unsigned long long _mask32_1 = 0x1010101020202020ULL;
static unsigned long long _mask32_0 = 0x4040404080808080ULL;

/* 48 bits per pixel: six quadwords */
static unsigned long long _mask48_5 = 0x0101010101010202ULL;
static unsigned long long _mask48_4 = 0x0202020204040404ULL;
static unsigned long long _mask48_3 = 0x0404080808080808ULL;
static unsigned long long _mask48_2 = 0x1010101010102020ULL;
static unsigned long long _mask48_1 = 0x2020202040404040ULL;
static unsigned long long _mask48_0 = 0x4040808080808080ULL;

/* Constants with the low three bytes (_const4) or the low byte (_const6)
 * set.  A former _const5 (0x000000FFFFFF0000) is NOT USED and therefore not
 * defined. */
static unsigned long long _const4   = 0x0000000000FFFFFFULL;
static unsigned long long _const6   = 0x00000000000000FFULL;
332
7f88f624
VZ
333/* These are used in the row-filter routines and should/would be local */
334/* variables if not for gcc addressing limitations. */
335/* WARNING: Their presence probably defeats the thread safety of libpng. */
c6b71bff
GD
336
337#ifdef PNG_THREAD_UNSAFE_OK
/* File-scope scratch variables for the row-filter routines.  They are
 * declared only when the build opts in with PNG_THREAD_UNSAFE_OK; the file's
 * own warning says their presence probably defeats libpng's thread safety. */
/* NOTE(review): presumably the total row length and the portion handled by
 * the MMX code -- confirm against the filter routines further down. */
338static png_uint_32 _FullLength;
339static png_uint_32 _MMXLength;
/* Made global to simplify conversion of the filtering routines (changelog
 * entry 19991120). */
340static int _dif;
7f88f624 341static int _patemp; /* temp variables for Paeth routine */
c6b71bff
GD
342static int _pbtemp;
343static int _pctemp;
344#endif
345
346void /* PRIVATE */
347png_squelch_warnings(void)
348{
349#ifdef PNG_THREAD_UNSAFE_OK
350 _dif = _dif;
351 _patemp = _patemp;
352 _pbtemp = _pbtemp;
353 _pctemp = _pctemp;
354 _MMXLength = _MMXLength;
355#endif
356 _const4 = _const4;
357 _const6 = _const6;
358 _mask8_0 = _mask8_0;
359 _mask16_1 = _mask16_1;
360 _mask16_0 = _mask16_0;
361 _mask24_2 = _mask24_2;
362 _mask24_1 = _mask24_1;
363 _mask24_0 = _mask24_0;
364 _mask32_3 = _mask32_3;
365 _mask32_2 = _mask32_2;
366 _mask32_1 = _mask32_1;
367 _mask32_0 = _mask32_0;
368 _mask48_5 = _mask48_5;
369 _mask48_4 = _mask48_4;
370 _mask48_3 = _mask48_3;
371 _mask48_2 = _mask48_2;
372 _mask48_1 = _mask48_1;
373 _mask48_0 = _mask48_0;
374}
375#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
376
377
/* MMX-support flag.  The initial value 2 is a sentinel: png_combine_row()
 * treats it as "asm_flags may not have been initialized", issues a warning
 * and calls png_mmx_support().
 * NOTE(review): png_mmx_support() presumably overwrites this with the probe
 * result (changelog entry 20010103 mentions an "auto-set") -- its body is not
 * visible here, confirm. */
378static int _mmx_supported = 2;
379
380/*===========================================================================*/
381/* */
382/* P N G _ C O M B I N E _ R O W */
383/* */
384/*===========================================================================*/
385
386#if defined(PNG_HAVE_ASSEMBLER_COMBINE_ROW)
387
/* Pixel sizes in bytes.  The non-MMX copy loops of png_combine_row() multiply
 * the per-pass start/inc/width values and the row length by these (e.g. BPP2
 * in the 16-bit case) to turn pixel counts into byte counts. */
388#define BPP2 2
389#define BPP3 3 /* bytes per pixel (a.k.a. pixel_bytes) */
390#define BPP4 4
391#define BPP6 6 /* (defined only to help avoid cut-and-paste errors) */
392#define BPP8 8
393
394/* Combines the row recently read in with the previous row.
395 This routine takes care of alpha and transparency if requested.
396 This routine also handles the two methods of progressive display
397 of interlaced images, depending on the mask value.
398 The mask value describes which pixels are to be combined with
399 the row. The pattern always repeats every 8 pixels, so just 8
400 bits are needed. A one indicates the pixel is to be combined; a
401 zero indicates the pixel is to be skipped. This is in addition
402 to any alpha or transparency value associated with the pixel.
403 If you want all pixels to be combined, pass 0xff (255) in mask. */
404
405/* Use this routine for the x86 platform - it uses a faster MMX routine
406 if the machine supports MMX. */
407
408void /* PRIVATE */
409png_combine_row(png_structp png_ptr, png_bytep row, int mask)
410{
411 png_debug(1, "in png_combine_row (pnggccrd.c)\n");
412
413#if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
414 if (_mmx_supported == 2) {
415 /* this should have happened in png_init_mmx_flags() already */
416 png_warning(png_ptr, "asm_flags may not have been initialized");
417 png_mmx_support();
418 }
419#endif
420
421 if (mask == 0xff)
422 {
423 png_debug(2,"mask == 0xff: doing single png_memcpy()\n");
424 png_memcpy(row, png_ptr->row_buf + 1,
425 (png_size_t)((png_ptr->width * png_ptr->row_info.pixel_depth + 7) >> 3));
426 }
427 else /* (png_combine_row() is never called with mask == 0) */
428 {
429 switch (png_ptr->row_info.pixel_depth)
430 {
431 case 1: /* png_ptr->row_info.pixel_depth */
432 {
433 png_bytep sp;
434 png_bytep dp;
435 int s_inc, s_start, s_end;
436 int m;
437 int shift;
438 png_uint_32 i;
439
440 sp = png_ptr->row_buf + 1;
441 dp = row;
442 m = 0x80;
443#if defined(PNG_READ_PACKSWAP_SUPPORTED)
444 if (png_ptr->transformations & PNG_PACKSWAP)
445 {
446 s_start = 0;
447 s_end = 7;
448 s_inc = 1;
449 }
450 else
451#endif
452 {
453 s_start = 7;
454 s_end = 0;
455 s_inc = -1;
456 }
457
458 shift = s_start;
459
460 for (i = 0; i < png_ptr->width; i++)
461 {
462 if (m & mask)
463 {
464 int value;
465
466 value = (*sp >> shift) & 0x1;
467 *dp &= (png_byte)((0x7f7f >> (7 - shift)) & 0xff);
468 *dp |= (png_byte)(value << shift);
469 }
470
471 if (shift == s_end)
472 {
473 shift = s_start;
474 sp++;
475 dp++;
476 }
477 else
478 shift += s_inc;
479
480 if (m == 1)
481 m = 0x80;
482 else
483 m >>= 1;
484 }
485 break;
486 }
487
488 case 2: /* png_ptr->row_info.pixel_depth */
489 {
490 png_bytep sp;
491 png_bytep dp;
492 int s_start, s_end, s_inc;
493 int m;
494 int shift;
495 png_uint_32 i;
496 int value;
497
498 sp = png_ptr->row_buf + 1;
499 dp = row;
500 m = 0x80;
501#if defined(PNG_READ_PACKSWAP_SUPPORTED)
502 if (png_ptr->transformations & PNG_PACKSWAP)
503 {
504 s_start = 0;
505 s_end = 6;
506 s_inc = 2;
507 }
508 else
509#endif
510 {
511 s_start = 6;
512 s_end = 0;
513 s_inc = -2;
514 }
515
516 shift = s_start;
517
518 for (i = 0; i < png_ptr->width; i++)
519 {
520 if (m & mask)
521 {
522 value = (*sp >> shift) & 0x3;
523 *dp &= (png_byte)((0x3f3f >> (6 - shift)) & 0xff);
524 *dp |= (png_byte)(value << shift);
525 }
526
527 if (shift == s_end)
528 {
529 shift = s_start;
530 sp++;
531 dp++;
532 }
533 else
534 shift += s_inc;
535 if (m == 1)
536 m = 0x80;
537 else
538 m >>= 1;
539 }
540 break;
541 }
542
543 case 4: /* png_ptr->row_info.pixel_depth */
544 {
545 png_bytep sp;
546 png_bytep dp;
547 int s_start, s_end, s_inc;
548 int m;
549 int shift;
550 png_uint_32 i;
551 int value;
552
553 sp = png_ptr->row_buf + 1;
554 dp = row;
555 m = 0x80;
556#if defined(PNG_READ_PACKSWAP_SUPPORTED)
557 if (png_ptr->transformations & PNG_PACKSWAP)
558 {
559 s_start = 0;
560 s_end = 4;
561 s_inc = 4;
562 }
563 else
564#endif
565 {
566 s_start = 4;
567 s_end = 0;
568 s_inc = -4;
569 }
570 shift = s_start;
571
572 for (i = 0; i < png_ptr->width; i++)
573 {
574 if (m & mask)
575 {
576 value = (*sp >> shift) & 0xf;
577 *dp &= (png_byte)((0xf0f >> (4 - shift)) & 0xff);
578 *dp |= (png_byte)(value << shift);
579 }
580
581 if (shift == s_end)
582 {
583 shift = s_start;
584 sp++;
585 dp++;
586 }
587 else
588 shift += s_inc;
589 if (m == 1)
590 m = 0x80;
591 else
592 m >>= 1;
593 }
594 break;
595 }
596
597 case 8: /* png_ptr->row_info.pixel_depth */
598 {
599 png_bytep srcptr;
600 png_bytep dstptr;
601
602#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
603#if !defined(PNG_1_0_X)
604 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
605 /* && _mmx_supported */ )
606#else
607 if (_mmx_supported)
608#endif
609 {
610 png_uint_32 len;
611 int diff;
7f88f624 612 int dummy_value_a; /* fix 'forbidden register spilled' error */
c6b71bff
GD
613 int dummy_value_d;
614 int dummy_value_c;
615 int dummy_value_S;
616 int dummy_value_D;
7f88f624 617 _unmask = ~mask; /* global variable for -fPIC version */
c6b71bff
GD
618 srcptr = png_ptr->row_buf + 1;
619 dstptr = row;
7f88f624
VZ
620 len = png_ptr->width &~7; /* reduce to multiple of 8 */
621 diff = (int) (png_ptr->width & 7); /* amount lost */
c6b71bff
GD
622
623 __asm__ __volatile__ (
7f88f624
VZ
624 "movd _unmask, %%mm7 \n\t" /* load bit pattern */
625 "psubb %%mm6, %%mm6 \n\t" /* zero mm6 */
c6b71bff
GD
626 "punpcklbw %%mm7, %%mm7 \n\t"
627 "punpcklwd %%mm7, %%mm7 \n\t"
7f88f624 628 "punpckldq %%mm7, %%mm7 \n\t" /* fill reg with 8 masks */
c6b71bff
GD
629
630 "movq _mask8_0, %%mm0 \n\t"
7f88f624
VZ
631 "pand %%mm7, %%mm0 \n\t" /* nonzero if keep byte */
632 "pcmpeqb %%mm6, %%mm0 \n\t" /* zeros->1s, v versa */
c6b71bff 633
7f88f624
VZ
634/* preload "movl len, %%ecx \n\t" // load length of line */
635/* preload "movl srcptr, %%esi \n\t" // load source */
636/* preload "movl dstptr, %%edi \n\t" // load dest */
c6b71bff 637
7f88f624 638 "cmpl $0, %%ecx \n\t" /* len == 0 ? */
c6b71bff
GD
639 "je mainloop8end \n\t"
640
641 "mainloop8: \n\t"
7f88f624 642 "movq (%%esi), %%mm4 \n\t" /* *srcptr */
c6b71bff
GD
643 "pand %%mm0, %%mm4 \n\t"
644 "movq %%mm0, %%mm6 \n\t"
7f88f624 645 "pandn (%%edi), %%mm6 \n\t" /* *dstptr */
c6b71bff
GD
646 "por %%mm6, %%mm4 \n\t"
647 "movq %%mm4, (%%edi) \n\t"
7f88f624 648 "addl $8, %%esi \n\t" /* inc by 8 bytes processed */
c6b71bff 649 "addl $8, %%edi \n\t"
7f88f624 650 "subl $8, %%ecx \n\t" /* dec by 8 pixels processed */
c6b71bff
GD
651 "ja mainloop8 \n\t"
652
653 "mainloop8end: \n\t"
7f88f624 654/* preload "movl diff, %%ecx \n\t" // (diff is in eax) */
c6b71bff
GD
655 "movl %%eax, %%ecx \n\t"
656 "cmpl $0, %%ecx \n\t"
657 "jz end8 \n\t"
7f88f624
VZ
658/* preload "movl mask, %%edx \n\t" */
659 "sall $24, %%edx \n\t" /* make low byte, high byte */
c6b71bff
GD
660
661 "secondloop8: \n\t"
7f88f624
VZ
662 "sall %%edx \n\t" /* move high bit to CF */
663 "jnc skip8 \n\t" /* if CF = 0 */
c6b71bff
GD
664 "movb (%%esi), %%al \n\t"
665 "movb %%al, (%%edi) \n\t"
666
667 "skip8: \n\t"
668 "incl %%esi \n\t"
669 "incl %%edi \n\t"
670 "decl %%ecx \n\t"
671 "jnz secondloop8 \n\t"
672
673 "end8: \n\t"
7f88f624 674 "EMMS \n\t" /* DONE */
c6b71bff 675
7f88f624 676 : "=a" (dummy_value_a), /* output regs (dummy) */
c6b71bff
GD
677 "=d" (dummy_value_d),
678 "=c" (dummy_value_c),
679 "=S" (dummy_value_S),
680 "=D" (dummy_value_D)
681
7f88f624
VZ
682 : "3" (srcptr), /* esi // input regs */
683 "4" (dstptr), /* edi */
684 "0" (diff), /* eax */
685/* was (unmask) "b" RESERVED // ebx // Global Offset Table idx */
686 "2" (len), /* ecx */
687 "1" (mask) /* edx */
c6b71bff
GD
688
689#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
7f88f624 690 : "%mm0", "%mm4", "%mm6", "%mm7" /* clobber list */
c6b71bff
GD
691#endif
692 );
693 }
694 else /* mmx _not supported - Use modified C routine */
695#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
696 {
697 register png_uint_32 i;
698 png_uint_32 initial_val = png_pass_start[png_ptr->pass];
699 /* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
700 register int stride = png_pass_inc[png_ptr->pass];
701 /* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
702 register int rep_bytes = png_pass_width[png_ptr->pass];
703 /* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
704 png_uint_32 len = png_ptr->width &~7; /* reduce to mult. of 8 */
705 int diff = (int) (png_ptr->width & 7); /* amount lost */
706 register png_uint_32 final_val = len; /* GRR bugfix */
707
708 srcptr = png_ptr->row_buf + 1 + initial_val;
709 dstptr = row + initial_val;
710
711 for (i = initial_val; i < final_val; i += stride)
712 {
713 png_memcpy(dstptr, srcptr, rep_bytes);
714 srcptr += stride;
715 dstptr += stride;
716 }
717 if (diff) /* number of leftover pixels: 3 for pngtest */
718 {
719 final_val+=diff /* *BPP1 */ ;
720 for (; i < final_val; i += stride)
721 {
722 if (rep_bytes > (int)(final_val-i))
723 rep_bytes = (int)(final_val-i);
724 png_memcpy(dstptr, srcptr, rep_bytes);
725 srcptr += stride;
726 dstptr += stride;
727 }
728 }
729
730 } /* end of else (_mmx_supported) */
731
732 break;
733 } /* end 8 bpp */
734
735 case 16: /* png_ptr->row_info.pixel_depth */
736 {
737 png_bytep srcptr;
738 png_bytep dstptr;
739
740#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
741#if !defined(PNG_1_0_X)
742 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
743 /* && _mmx_supported */ )
744#else
745 if (_mmx_supported)
746#endif
747 {
748 png_uint_32 len;
749 int diff;
7f88f624 750 int dummy_value_a; /* fix 'forbidden register spilled' error */
c6b71bff
GD
751 int dummy_value_d;
752 int dummy_value_c;
753 int dummy_value_S;
754 int dummy_value_D;
7f88f624 755 _unmask = ~mask; /* global variable for -fPIC version */
c6b71bff
GD
756 srcptr = png_ptr->row_buf + 1;
757 dstptr = row;
7f88f624
VZ
758 len = png_ptr->width &~7; /* reduce to multiple of 8 */
759 diff = (int) (png_ptr->width & 7); /* amount lost // */
c6b71bff
GD
760
761 __asm__ __volatile__ (
7f88f624
VZ
762 "movd _unmask, %%mm7 \n\t" /* load bit pattern */
763 "psubb %%mm6, %%mm6 \n\t" /* zero mm6 */
c6b71bff
GD
764 "punpcklbw %%mm7, %%mm7 \n\t"
765 "punpcklwd %%mm7, %%mm7 \n\t"
7f88f624 766 "punpckldq %%mm7, %%mm7 \n\t" /* fill reg with 8 masks */
c6b71bff
GD
767
768 "movq _mask16_0, %%mm0 \n\t"
769 "movq _mask16_1, %%mm1 \n\t"
770
771 "pand %%mm7, %%mm0 \n\t"
772 "pand %%mm7, %%mm1 \n\t"
773
774 "pcmpeqb %%mm6, %%mm0 \n\t"
775 "pcmpeqb %%mm6, %%mm1 \n\t"
776
7f88f624
VZ
777/* preload "movl len, %%ecx \n\t" // load length of line */
778/* preload "movl srcptr, %%esi \n\t" // load source */
779/* preload "movl dstptr, %%edi \n\t" // load dest */
c6b71bff
GD
780
781 "cmpl $0, %%ecx \n\t"
782 "jz mainloop16end \n\t"
783
784 "mainloop16: \n\t"
785 "movq (%%esi), %%mm4 \n\t"
786 "pand %%mm0, %%mm4 \n\t"
787 "movq %%mm0, %%mm6 \n\t"
788 "movq (%%edi), %%mm7 \n\t"
789 "pandn %%mm7, %%mm6 \n\t"
790 "por %%mm6, %%mm4 \n\t"
791 "movq %%mm4, (%%edi) \n\t"
792
793 "movq 8(%%esi), %%mm5 \n\t"
794 "pand %%mm1, %%mm5 \n\t"
795 "movq %%mm1, %%mm7 \n\t"
796 "movq 8(%%edi), %%mm6 \n\t"
797 "pandn %%mm6, %%mm7 \n\t"
798 "por %%mm7, %%mm5 \n\t"
799 "movq %%mm5, 8(%%edi) \n\t"
800
7f88f624 801 "addl $16, %%esi \n\t" /* inc by 16 bytes processed */
c6b71bff 802 "addl $16, %%edi \n\t"
7f88f624 803 "subl $8, %%ecx \n\t" /* dec by 8 pixels processed */
c6b71bff
GD
804 "ja mainloop16 \n\t"
805
806 "mainloop16end: \n\t"
7f88f624 807/* preload "movl diff, %%ecx \n\t" // (diff is in eax) */
c6b71bff
GD
808 "movl %%eax, %%ecx \n\t"
809 "cmpl $0, %%ecx \n\t"
810 "jz end16 \n\t"
7f88f624
VZ
811/* preload "movl mask, %%edx \n\t" */
812 "sall $24, %%edx \n\t" /* make low byte, high byte */
c6b71bff
GD
813
814 "secondloop16: \n\t"
7f88f624
VZ
815 "sall %%edx \n\t" /* move high bit to CF */
816 "jnc skip16 \n\t" /* if CF = 0 */
c6b71bff
GD
817 "movw (%%esi), %%ax \n\t"
818 "movw %%ax, (%%edi) \n\t"
819
820 "skip16: \n\t"
821 "addl $2, %%esi \n\t"
822 "addl $2, %%edi \n\t"
823 "decl %%ecx \n\t"
824 "jnz secondloop16 \n\t"
825
826 "end16: \n\t"
7f88f624 827 "EMMS \n\t" /* DONE */
c6b71bff 828
7f88f624 829 : "=a" (dummy_value_a), /* output regs (dummy) */
c6b71bff
GD
830 "=c" (dummy_value_c),
831 "=d" (dummy_value_d),
832 "=S" (dummy_value_S),
833 "=D" (dummy_value_D)
834
7f88f624
VZ
835 : "0" (diff), /* eax // input regs */
836/* was (unmask) " " RESERVED // ebx // Global Offset Table idx */
837 "1" (len), /* ecx */
838 "2" (mask), /* edx */
839 "3" (srcptr), /* esi */
840 "4" (dstptr) /* edi */
c6b71bff
GD
841
842#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
7f88f624 843 : "%mm0", "%mm1", "%mm4" /* clobber list */
c6b71bff
GD
844 , "%mm5", "%mm6", "%mm7"
845#endif
846 );
847 }
848 else /* mmx _not supported - Use modified C routine */
849#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
850 {
851 register png_uint_32 i;
852 png_uint_32 initial_val = BPP2 * png_pass_start[png_ptr->pass];
853 /* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
854 register int stride = BPP2 * png_pass_inc[png_ptr->pass];
855 /* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
856 register int rep_bytes = BPP2 * png_pass_width[png_ptr->pass];
857 /* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
858 png_uint_32 len = png_ptr->width &~7; /* reduce to mult. of 8 */
859 int diff = (int) (png_ptr->width & 7); /* amount lost */
860 register png_uint_32 final_val = BPP2 * len; /* GRR bugfix */
861
862 srcptr = png_ptr->row_buf + 1 + initial_val;
863 dstptr = row + initial_val;
864
865 for (i = initial_val; i < final_val; i += stride)
866 {
867 png_memcpy(dstptr, srcptr, rep_bytes);
868 srcptr += stride;
869 dstptr += stride;
870 }
871 if (diff) /* number of leftover pixels: 3 for pngtest */
872 {
873 final_val+=diff*BPP2;
874 for (; i < final_val; i += stride)
875 {
876 if (rep_bytes > (int)(final_val-i))
877 rep_bytes = (int)(final_val-i);
878 png_memcpy(dstptr, srcptr, rep_bytes);
879 srcptr += stride;
880 dstptr += stride;
881 }
882 }
883 } /* end of else (_mmx_supported) */
884
885 break;
886 } /* end 16 bpp */
887
888 case 24: /* png_ptr->row_info.pixel_depth */
889 {
890 png_bytep srcptr;
891 png_bytep dstptr;
892
893#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
894#if !defined(PNG_1_0_X)
895 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
896 /* && _mmx_supported */ )
897#else
898 if (_mmx_supported)
899#endif
900 {
901 png_uint_32 len;
902 int diff;
7f88f624 903 int dummy_value_a; /* fix 'forbidden register spilled' error */
c6b71bff
GD
904 int dummy_value_d;
905 int dummy_value_c;
906 int dummy_value_S;
907 int dummy_value_D;
7f88f624 908 _unmask = ~mask; /* global variable for -fPIC version */
c6b71bff
GD
909 srcptr = png_ptr->row_buf + 1;
910 dstptr = row;
7f88f624
VZ
911 len = png_ptr->width &~7; /* reduce to multiple of 8 */
912 diff = (int) (png_ptr->width & 7); /* amount lost // */
c6b71bff
GD
913
914 __asm__ __volatile__ (
7f88f624
VZ
915 "movd _unmask, %%mm7 \n\t" /* load bit pattern */
916 "psubb %%mm6, %%mm6 \n\t" /* zero mm6 */
c6b71bff
GD
917 "punpcklbw %%mm7, %%mm7 \n\t"
918 "punpcklwd %%mm7, %%mm7 \n\t"
7f88f624 919 "punpckldq %%mm7, %%mm7 \n\t" /* fill reg with 8 masks */
c6b71bff
GD
920
921 "movq _mask24_0, %%mm0 \n\t"
922 "movq _mask24_1, %%mm1 \n\t"
923 "movq _mask24_2, %%mm2 \n\t"
924
925 "pand %%mm7, %%mm0 \n\t"
926 "pand %%mm7, %%mm1 \n\t"
927 "pand %%mm7, %%mm2 \n\t"
928
929 "pcmpeqb %%mm6, %%mm0 \n\t"
930 "pcmpeqb %%mm6, %%mm1 \n\t"
931 "pcmpeqb %%mm6, %%mm2 \n\t"
932
7f88f624
VZ
933/* preload "movl len, %%ecx \n\t" // load length of line */
934/* preload "movl srcptr, %%esi \n\t" // load source */
935/* preload "movl dstptr, %%edi \n\t" // load dest */
c6b71bff
GD
936
937 "cmpl $0, %%ecx \n\t"
938 "jz mainloop24end \n\t"
939
940 "mainloop24: \n\t"
941 "movq (%%esi), %%mm4 \n\t"
942 "pand %%mm0, %%mm4 \n\t"
943 "movq %%mm0, %%mm6 \n\t"
944 "movq (%%edi), %%mm7 \n\t"
945 "pandn %%mm7, %%mm6 \n\t"
946 "por %%mm6, %%mm4 \n\t"
947 "movq %%mm4, (%%edi) \n\t"
948
949 "movq 8(%%esi), %%mm5 \n\t"
950 "pand %%mm1, %%mm5 \n\t"
951 "movq %%mm1, %%mm7 \n\t"
952 "movq 8(%%edi), %%mm6 \n\t"
953 "pandn %%mm6, %%mm7 \n\t"
954 "por %%mm7, %%mm5 \n\t"
955 "movq %%mm5, 8(%%edi) \n\t"
956
957 "movq 16(%%esi), %%mm6 \n\t"
958 "pand %%mm2, %%mm6 \n\t"
959 "movq %%mm2, %%mm4 \n\t"
960 "movq 16(%%edi), %%mm7 \n\t"
961 "pandn %%mm7, %%mm4 \n\t"
962 "por %%mm4, %%mm6 \n\t"
963 "movq %%mm6, 16(%%edi) \n\t"
964
7f88f624 965 "addl $24, %%esi \n\t" /* inc by 24 bytes processed */
c6b71bff 966 "addl $24, %%edi \n\t"
7f88f624 967 "subl $8, %%ecx \n\t" /* dec by 8 pixels processed */
c6b71bff
GD
968
969 "ja mainloop24 \n\t"
970
971 "mainloop24end: \n\t"
7f88f624 972/* preload "movl diff, %%ecx \n\t" // (diff is in eax) */
c6b71bff
GD
973 "movl %%eax, %%ecx \n\t"
974 "cmpl $0, %%ecx \n\t"
975 "jz end24 \n\t"
7f88f624
VZ
976/* preload "movl mask, %%edx \n\t" */
977 "sall $24, %%edx \n\t" /* make low byte, high byte */
c6b71bff
GD
978
979 "secondloop24: \n\t"
7f88f624
VZ
980 "sall %%edx \n\t" /* move high bit to CF */
981 "jnc skip24 \n\t" /* if CF = 0 */
c6b71bff
GD
982 "movw (%%esi), %%ax \n\t"
983 "movw %%ax, (%%edi) \n\t"
984 "xorl %%eax, %%eax \n\t"
985 "movb 2(%%esi), %%al \n\t"
986 "movb %%al, 2(%%edi) \n\t"
987
988 "skip24: \n\t"
989 "addl $3, %%esi \n\t"
990 "addl $3, %%edi \n\t"
991 "decl %%ecx \n\t"
992 "jnz secondloop24 \n\t"
993
994 "end24: \n\t"
7f88f624 995 "EMMS \n\t" /* DONE */
c6b71bff 996
7f88f624 997 : "=a" (dummy_value_a), /* output regs (dummy) */
c6b71bff
GD
998 "=d" (dummy_value_d),
999 "=c" (dummy_value_c),
1000 "=S" (dummy_value_S),
1001 "=D" (dummy_value_D)
1002
7f88f624
VZ
1003 : "3" (srcptr), /* esi // input regs */
1004 "4" (dstptr), /* edi */
1005 "0" (diff), /* eax */
1006/* was (unmask) "b" RESERVED // ebx // Global Offset Table idx */
1007 "2" (len), /* ecx */
1008 "1" (mask) /* edx */
c6b71bff
GD
1009
1010#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
7f88f624 1011 : "%mm0", "%mm1", "%mm2" /* clobber list */
c6b71bff
GD
1012 , "%mm4", "%mm5", "%mm6", "%mm7"
1013#endif
1014 );
1015 }
1016 else /* mmx _not supported - Use modified C routine */
1017#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
1018 {
1019 register png_uint_32 i;
1020 png_uint_32 initial_val = BPP3 * png_pass_start[png_ptr->pass];
1021 /* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
1022 register int stride = BPP3 * png_pass_inc[png_ptr->pass];
1023 /* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
1024 register int rep_bytes = BPP3 * png_pass_width[png_ptr->pass];
1025 /* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
1026 png_uint_32 len = png_ptr->width &~7; /* reduce to mult. of 8 */
1027 int diff = (int) (png_ptr->width & 7); /* amount lost */
1028 register png_uint_32 final_val = BPP3 * len; /* GRR bugfix */
1029
1030 srcptr = png_ptr->row_buf + 1 + initial_val;
1031 dstptr = row + initial_val;
1032
1033 for (i = initial_val; i < final_val; i += stride)
1034 {
1035 png_memcpy(dstptr, srcptr, rep_bytes);
1036 srcptr += stride;
1037 dstptr += stride;
1038 }
1039 if (diff) /* number of leftover pixels: 3 for pngtest */
1040 {
1041 final_val+=diff*BPP3;
1042 for (; i < final_val; i += stride)
1043 {
1044 if (rep_bytes > (int)(final_val-i))
1045 rep_bytes = (int)(final_val-i);
1046 png_memcpy(dstptr, srcptr, rep_bytes);
1047 srcptr += stride;
1048 dstptr += stride;
1049 }
1050 }
1051 } /* end of else (_mmx_supported) */
1052
1053 break;
1054 } /* end 24 bpp */
1055
1056 case 32: /* png_ptr->row_info.pixel_depth */
1057 {
1058 png_bytep srcptr;
1059 png_bytep dstptr;
1060
1061#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
1062#if !defined(PNG_1_0_X)
1063 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
1064 /* && _mmx_supported */ )
1065#else
1066 if (_mmx_supported)
1067#endif
1068 {
1069 png_uint_32 len;
1070 int diff;
7f88f624 1071 int dummy_value_a; /* fix 'forbidden register spilled' error */
c6b71bff
GD
1072 int dummy_value_d;
1073 int dummy_value_c;
1074 int dummy_value_S;
1075 int dummy_value_D;
7f88f624 1076 _unmask = ~mask; /* global variable for -fPIC version */
c6b71bff
GD
1077 srcptr = png_ptr->row_buf + 1;
1078 dstptr = row;
7f88f624
VZ
1079 len = png_ptr->width &~7; /* reduce to multiple of 8 */
1080 diff = (int) (png_ptr->width & 7); /* amount lost // */
c6b71bff
GD
1081
1082 __asm__ __volatile__ (
7f88f624
VZ
1083 "movd _unmask, %%mm7 \n\t" /* load bit pattern */
1084 "psubb %%mm6, %%mm6 \n\t" /* zero mm6 */
c6b71bff
GD
1085 "punpcklbw %%mm7, %%mm7 \n\t"
1086 "punpcklwd %%mm7, %%mm7 \n\t"
7f88f624 1087 "punpckldq %%mm7, %%mm7 \n\t" /* fill reg with 8 masks */
c6b71bff
GD
1088
1089 "movq _mask32_0, %%mm0 \n\t"
1090 "movq _mask32_1, %%mm1 \n\t"
1091 "movq _mask32_2, %%mm2 \n\t"
1092 "movq _mask32_3, %%mm3 \n\t"
1093
1094 "pand %%mm7, %%mm0 \n\t"
1095 "pand %%mm7, %%mm1 \n\t"
1096 "pand %%mm7, %%mm2 \n\t"
1097 "pand %%mm7, %%mm3 \n\t"
1098
1099 "pcmpeqb %%mm6, %%mm0 \n\t"
1100 "pcmpeqb %%mm6, %%mm1 \n\t"
1101 "pcmpeqb %%mm6, %%mm2 \n\t"
1102 "pcmpeqb %%mm6, %%mm3 \n\t"
1103
7f88f624
VZ
1104/* preload "movl len, %%ecx \n\t" // load length of line */
1105/* preload "movl srcptr, %%esi \n\t" // load source */
1106/* preload "movl dstptr, %%edi \n\t" // load dest */
c6b71bff 1107
7f88f624 1108 "cmpl $0, %%ecx \n\t" /* lcr */
c6b71bff
GD
1109 "jz mainloop32end \n\t"
1110
1111 "mainloop32: \n\t"
1112 "movq (%%esi), %%mm4 \n\t"
1113 "pand %%mm0, %%mm4 \n\t"
1114 "movq %%mm0, %%mm6 \n\t"
1115 "movq (%%edi), %%mm7 \n\t"
1116 "pandn %%mm7, %%mm6 \n\t"
1117 "por %%mm6, %%mm4 \n\t"
1118 "movq %%mm4, (%%edi) \n\t"
1119
1120 "movq 8(%%esi), %%mm5 \n\t"
1121 "pand %%mm1, %%mm5 \n\t"
1122 "movq %%mm1, %%mm7 \n\t"
1123 "movq 8(%%edi), %%mm6 \n\t"
1124 "pandn %%mm6, %%mm7 \n\t"
1125 "por %%mm7, %%mm5 \n\t"
1126 "movq %%mm5, 8(%%edi) \n\t"
1127
1128 "movq 16(%%esi), %%mm6 \n\t"
1129 "pand %%mm2, %%mm6 \n\t"
1130 "movq %%mm2, %%mm4 \n\t"
1131 "movq 16(%%edi), %%mm7 \n\t"
1132 "pandn %%mm7, %%mm4 \n\t"
1133 "por %%mm4, %%mm6 \n\t"
1134 "movq %%mm6, 16(%%edi) \n\t"
1135
1136 "movq 24(%%esi), %%mm7 \n\t"
1137 "pand %%mm3, %%mm7 \n\t"
1138 "movq %%mm3, %%mm5 \n\t"
1139 "movq 24(%%edi), %%mm4 \n\t"
1140 "pandn %%mm4, %%mm5 \n\t"
1141 "por %%mm5, %%mm7 \n\t"
1142 "movq %%mm7, 24(%%edi) \n\t"
1143
7f88f624 1144 "addl $32, %%esi \n\t" /* inc by 32 bytes processed */
c6b71bff 1145 "addl $32, %%edi \n\t"
7f88f624 1146 "subl $8, %%ecx \n\t" /* dec by 8 pixels processed */
c6b71bff
GD
1147 "ja mainloop32 \n\t"
1148
1149 "mainloop32end: \n\t"
7f88f624 1150/* preload "movl diff, %%ecx \n\t" // (diff is in eax) */
c6b71bff
GD
1151 "movl %%eax, %%ecx \n\t"
1152 "cmpl $0, %%ecx \n\t"
1153 "jz end32 \n\t"
7f88f624
VZ
1154/* preload "movl mask, %%edx \n\t" */
1155 "sall $24, %%edx \n\t" /* low byte => high byte */
c6b71bff
GD
1156
1157 "secondloop32: \n\t"
7f88f624
VZ
1158 "sall %%edx \n\t" /* move high bit to CF */
1159 "jnc skip32 \n\t" /* if CF = 0 */
c6b71bff
GD
1160 "movl (%%esi), %%eax \n\t"
1161 "movl %%eax, (%%edi) \n\t"
1162
1163 "skip32: \n\t"
1164 "addl $4, %%esi \n\t"
1165 "addl $4, %%edi \n\t"
1166 "decl %%ecx \n\t"
1167 "jnz secondloop32 \n\t"
1168
1169 "end32: \n\t"
7f88f624 1170 "EMMS \n\t" /* DONE */
c6b71bff 1171
7f88f624 1172 : "=a" (dummy_value_a), /* output regs (dummy) */
c6b71bff
GD
1173 "=d" (dummy_value_d),
1174 "=c" (dummy_value_c),
1175 "=S" (dummy_value_S),
1176 "=D" (dummy_value_D)
1177
7f88f624
VZ
1178 : "3" (srcptr), /* esi // input regs */
1179 "4" (dstptr), /* edi */
1180 "0" (diff), /* eax */
1181/* was (unmask) "b" RESERVED // ebx // Global Offset Table idx */
1182 "2" (len), /* ecx */
1183 "1" (mask) /* edx */
c6b71bff
GD
1184
1185#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
7f88f624 1186 : "%mm0", "%mm1", "%mm2", "%mm3" /* clobber list */
c6b71bff
GD
1187 , "%mm4", "%mm5", "%mm6", "%mm7"
1188#endif
1189 );
1190 }
1191 else /* mmx _not supported - Use modified C routine */
1192#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
1193 {
1194 register png_uint_32 i;
1195 png_uint_32 initial_val = BPP4 * png_pass_start[png_ptr->pass];
1196 /* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
1197 register int stride = BPP4 * png_pass_inc[png_ptr->pass];
1198 /* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
1199 register int rep_bytes = BPP4 * png_pass_width[png_ptr->pass];
1200 /* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
1201 png_uint_32 len = png_ptr->width &~7; /* reduce to mult. of 8 */
1202 int diff = (int) (png_ptr->width & 7); /* amount lost */
1203 register png_uint_32 final_val = BPP4 * len; /* GRR bugfix */
1204
1205 srcptr = png_ptr->row_buf + 1 + initial_val;
1206 dstptr = row + initial_val;
1207
1208 for (i = initial_val; i < final_val; i += stride)
1209 {
1210 png_memcpy(dstptr, srcptr, rep_bytes);
1211 srcptr += stride;
1212 dstptr += stride;
1213 }
1214 if (diff) /* number of leftover pixels: 3 for pngtest */
1215 {
1216 final_val+=diff*BPP4;
1217 for (; i < final_val; i += stride)
1218 {
1219 if (rep_bytes > (int)(final_val-i))
1220 rep_bytes = (int)(final_val-i);
1221 png_memcpy(dstptr, srcptr, rep_bytes);
1222 srcptr += stride;
1223 dstptr += stride;
1224 }
1225 }
1226 } /* end of else (_mmx_supported) */
1227
1228 break;
1229 } /* end 32 bpp */
1230
1231 case 48: /* png_ptr->row_info.pixel_depth */
1232 {
1233 png_bytep srcptr;
1234 png_bytep dstptr;
1235
1236#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
1237#if !defined(PNG_1_0_X)
1238 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
1239 /* && _mmx_supported */ )
1240#else
1241 if (_mmx_supported)
1242#endif
1243 {
1244 png_uint_32 len;
1245 int diff;
7f88f624 1246 int dummy_value_a; /* fix 'forbidden register spilled' error */
c6b71bff
GD
1247 int dummy_value_d;
1248 int dummy_value_c;
1249 int dummy_value_S;
1250 int dummy_value_D;
7f88f624 1251 _unmask = ~mask; /* global variable for -fPIC version */
c6b71bff
GD
1252 srcptr = png_ptr->row_buf + 1;
1253 dstptr = row;
7f88f624
VZ
1254 len = png_ptr->width &~7; /* reduce to multiple of 8 */
1255 diff = (int) (png_ptr->width & 7); /* amount lost // */
c6b71bff
GD
1256
1257 __asm__ __volatile__ (
7f88f624
VZ
1258 "movd _unmask, %%mm7 \n\t" /* load bit pattern */
1259 "psubb %%mm6, %%mm6 \n\t" /* zero mm6 */
c6b71bff
GD
1260 "punpcklbw %%mm7, %%mm7 \n\t"
1261 "punpcklwd %%mm7, %%mm7 \n\t"
7f88f624 1262 "punpckldq %%mm7, %%mm7 \n\t" /* fill reg with 8 masks */
c6b71bff
GD
1263
1264 "movq _mask48_0, %%mm0 \n\t"
1265 "movq _mask48_1, %%mm1 \n\t"
1266 "movq _mask48_2, %%mm2 \n\t"
1267 "movq _mask48_3, %%mm3 \n\t"
1268 "movq _mask48_4, %%mm4 \n\t"
1269 "movq _mask48_5, %%mm5 \n\t"
1270
1271 "pand %%mm7, %%mm0 \n\t"
1272 "pand %%mm7, %%mm1 \n\t"
1273 "pand %%mm7, %%mm2 \n\t"
1274 "pand %%mm7, %%mm3 \n\t"
1275 "pand %%mm7, %%mm4 \n\t"
1276 "pand %%mm7, %%mm5 \n\t"
1277
1278 "pcmpeqb %%mm6, %%mm0 \n\t"
1279 "pcmpeqb %%mm6, %%mm1 \n\t"
1280 "pcmpeqb %%mm6, %%mm2 \n\t"
1281 "pcmpeqb %%mm6, %%mm3 \n\t"
1282 "pcmpeqb %%mm6, %%mm4 \n\t"
1283 "pcmpeqb %%mm6, %%mm5 \n\t"
1284
7f88f624
VZ
1285/* preload "movl len, %%ecx \n\t" // load length of line */
1286/* preload "movl srcptr, %%esi \n\t" // load source */
1287/* preload "movl dstptr, %%edi \n\t" // load dest */
c6b71bff
GD
1288
1289 "cmpl $0, %%ecx \n\t"
1290 "jz mainloop48end \n\t"
1291
1292 "mainloop48: \n\t"
1293 "movq (%%esi), %%mm7 \n\t"
1294 "pand %%mm0, %%mm7 \n\t"
1295 "movq %%mm0, %%mm6 \n\t"
1296 "pandn (%%edi), %%mm6 \n\t"
1297 "por %%mm6, %%mm7 \n\t"
1298 "movq %%mm7, (%%edi) \n\t"
1299
1300 "movq 8(%%esi), %%mm6 \n\t"
1301 "pand %%mm1, %%mm6 \n\t"
1302 "movq %%mm1, %%mm7 \n\t"
1303 "pandn 8(%%edi), %%mm7 \n\t"
1304 "por %%mm7, %%mm6 \n\t"
1305 "movq %%mm6, 8(%%edi) \n\t"
1306
1307 "movq 16(%%esi), %%mm6 \n\t"
1308 "pand %%mm2, %%mm6 \n\t"
1309 "movq %%mm2, %%mm7 \n\t"
1310 "pandn 16(%%edi), %%mm7 \n\t"
1311 "por %%mm7, %%mm6 \n\t"
1312 "movq %%mm6, 16(%%edi) \n\t"
1313
1314 "movq 24(%%esi), %%mm7 \n\t"
1315 "pand %%mm3, %%mm7 \n\t"
1316 "movq %%mm3, %%mm6 \n\t"
1317 "pandn 24(%%edi), %%mm6 \n\t"
1318 "por %%mm6, %%mm7 \n\t"
1319 "movq %%mm7, 24(%%edi) \n\t"
1320
1321 "movq 32(%%esi), %%mm6 \n\t"
1322 "pand %%mm4, %%mm6 \n\t"
1323 "movq %%mm4, %%mm7 \n\t"
1324 "pandn 32(%%edi), %%mm7 \n\t"
1325 "por %%mm7, %%mm6 \n\t"
1326 "movq %%mm6, 32(%%edi) \n\t"
1327
1328 "movq 40(%%esi), %%mm7 \n\t"
1329 "pand %%mm5, %%mm7 \n\t"
1330 "movq %%mm5, %%mm6 \n\t"
1331 "pandn 40(%%edi), %%mm6 \n\t"
1332 "por %%mm6, %%mm7 \n\t"
1333 "movq %%mm7, 40(%%edi) \n\t"
1334
7f88f624 1335 "addl $48, %%esi \n\t" /* inc by 48 bytes processed */
c6b71bff 1336 "addl $48, %%edi \n\t"
7f88f624 1337 "subl $8, %%ecx \n\t" /* dec by 8 pixels processed */
c6b71bff
GD
1338
1339 "ja mainloop48 \n\t"
1340
1341 "mainloop48end: \n\t"
7f88f624 1342/* preload "movl diff, %%ecx \n\t" // (diff is in eax) */
c6b71bff
GD
1343 "movl %%eax, %%ecx \n\t"
1344 "cmpl $0, %%ecx \n\t"
1345 "jz end48 \n\t"
7f88f624
VZ
1346/* preload "movl mask, %%edx \n\t" */
1347 "sall $24, %%edx \n\t" /* make low byte, high byte */
c6b71bff
GD
1348
1349 "secondloop48: \n\t"
7f88f624
VZ
1350 "sall %%edx \n\t" /* move high bit to CF */
1351 "jnc skip48 \n\t" /* if CF = 0 */
c6b71bff
GD
1352 "movl (%%esi), %%eax \n\t"
1353 "movl %%eax, (%%edi) \n\t"
1354
1355 "skip48: \n\t"
1356 "addl $4, %%esi \n\t"
1357 "addl $4, %%edi \n\t"
1358 "decl %%ecx \n\t"
1359 "jnz secondloop48 \n\t"
1360
1361 "end48: \n\t"
7f88f624 1362 "EMMS \n\t" /* DONE */
c6b71bff 1363
7f88f624 1364 : "=a" (dummy_value_a), /* output regs (dummy) */
c6b71bff
GD
1365 "=d" (dummy_value_d),
1366 "=c" (dummy_value_c),
1367 "=S" (dummy_value_S),
1368 "=D" (dummy_value_D)
1369
7f88f624
VZ
1370 : "3" (srcptr), /* esi // input regs */
1371 "4" (dstptr), /* edi */
1372 "0" (diff), /* eax */
1373/* was (unmask) "b" RESERVED // ebx // Global Offset Table idx */
1374 "2" (len), /* ecx */
1375 "1" (mask) /* edx */
c6b71bff
GD
1376
1377#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
7f88f624 1378 : "%mm0", "%mm1", "%mm2", "%mm3" /* clobber list */
c6b71bff
GD
1379 , "%mm4", "%mm5", "%mm6", "%mm7"
1380#endif
1381 );
1382 }
1383 else /* mmx _not supported - Use modified C routine */
1384#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
1385 {
1386 register png_uint_32 i;
1387 png_uint_32 initial_val = BPP6 * png_pass_start[png_ptr->pass];
1388 /* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
1389 register int stride = BPP6 * png_pass_inc[png_ptr->pass];
1390 /* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
1391 register int rep_bytes = BPP6 * png_pass_width[png_ptr->pass];
1392 /* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
1393 png_uint_32 len = png_ptr->width &~7; /* reduce to mult. of 8 */
1394 int diff = (int) (png_ptr->width & 7); /* amount lost */
1395 register png_uint_32 final_val = BPP6 * len; /* GRR bugfix */
1396
1397 srcptr = png_ptr->row_buf + 1 + initial_val;
1398 dstptr = row + initial_val;
1399
1400 for (i = initial_val; i < final_val; i += stride)
1401 {
1402 png_memcpy(dstptr, srcptr, rep_bytes);
1403 srcptr += stride;
1404 dstptr += stride;
1405 }
1406 if (diff) /* number of leftover pixels: 3 for pngtest */
1407 {
1408 final_val+=diff*BPP6;
1409 for (; i < final_val; i += stride)
1410 {
1411 if (rep_bytes > (int)(final_val-i))
1412 rep_bytes = (int)(final_val-i);
1413 png_memcpy(dstptr, srcptr, rep_bytes);
1414 srcptr += stride;
1415 dstptr += stride;
1416 }
1417 }
1418 } /* end of else (_mmx_supported) */
1419
1420 break;
1421 } /* end 48 bpp */
1422
1423 case 64: /* png_ptr->row_info.pixel_depth */
1424 {
1425 png_bytep srcptr;
1426 png_bytep dstptr;
1427 register png_uint_32 i;
1428 png_uint_32 initial_val = BPP8 * png_pass_start[png_ptr->pass];
1429 /* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
1430 register int stride = BPP8 * png_pass_inc[png_ptr->pass];
1431 /* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
1432 register int rep_bytes = BPP8 * png_pass_width[png_ptr->pass];
1433 /* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
1434 png_uint_32 len = png_ptr->width &~7; /* reduce to mult. of 8 */
1435 int diff = (int) (png_ptr->width & 7); /* amount lost */
1436 register png_uint_32 final_val = BPP8 * len; /* GRR bugfix */
1437
1438 srcptr = png_ptr->row_buf + 1 + initial_val;
1439 dstptr = row + initial_val;
1440
1441 for (i = initial_val; i < final_val; i += stride)
1442 {
1443 png_memcpy(dstptr, srcptr, rep_bytes);
1444 srcptr += stride;
1445 dstptr += stride;
1446 }
1447 if (diff) /* number of leftover pixels: 3 for pngtest */
1448 {
1449 final_val+=diff*BPP8;
1450 for (; i < final_val; i += stride)
1451 {
1452 if (rep_bytes > (int)(final_val-i))
1453 rep_bytes = (int)(final_val-i);
1454 png_memcpy(dstptr, srcptr, rep_bytes);
1455 srcptr += stride;
1456 dstptr += stride;
1457 }
1458 }
1459
1460 break;
1461 } /* end 64 bpp */
1462
1463 default: /* png_ptr->row_info.pixel_depth != 1,2,4,8,16,24,32,48,64 */
1464 {
1465 /* this should never happen */
1466 png_warning(png_ptr, "Invalid row_info.pixel_depth in pnggccrd");
1467 break;
1468 }
1469 } /* end switch (png_ptr->row_info.pixel_depth) */
1470
1471 } /* end if (non-trivial mask) */
1472
1473} /* end png_combine_row() */
1474
1475#endif /* PNG_HAVE_ASSEMBLER_COMBINE_ROW */
1476
1477
1478
1479
1480/*===========================================================================*/
1481/* */
1482/* P N G _ D O _ R E A D _ I N T E R L A C E */
1483/* */
1484/*===========================================================================*/
1485
1486#if defined(PNG_READ_INTERLACING_SUPPORTED)
1487#if defined(PNG_HAVE_ASSEMBLER_READ_INTERLACE)
1488
1489/* png_do_read_interlace() is called after any 16-bit to 8-bit conversion
1490 * has taken place. [GRR: what other steps come before and/or after?]
1491 */
1492
1493void /* PRIVATE */
1494png_do_read_interlace(png_structp png_ptr)
1495{
1496 png_row_infop row_info = &(png_ptr->row_info);
1497 png_bytep row = png_ptr->row_buf + 1;
1498 int pass = png_ptr->pass;
1499#if defined(PNG_READ_PACKSWAP_SUPPORTED)
1500 png_uint_32 transformations = png_ptr->transformations;
1501#endif
1502
1503 png_debug(1, "in png_do_read_interlace (pnggccrd.c)\n");
1504
1505#if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
1506 if (_mmx_supported == 2) {
1507#if !defined(PNG_1_0_X)
1508 /* this should have happened in png_init_mmx_flags() already */
1509 png_warning(png_ptr, "asm_flags may not have been initialized");
1510#endif
1511 png_mmx_support();
1512 }
1513#endif
1514
1515 if (row != NULL && row_info != NULL)
1516 {
1517 png_uint_32 final_width;
1518
1519 final_width = row_info->width * png_pass_inc[pass];
1520
1521 switch (row_info->pixel_depth)
1522 {
1523 case 1:
1524 {
1525 png_bytep sp, dp;
1526 int sshift, dshift;
1527 int s_start, s_end, s_inc;
1528 png_byte v;
1529 png_uint_32 i;
1530 int j;
1531
1532 sp = row + (png_size_t)((row_info->width - 1) >> 3);
1533 dp = row + (png_size_t)((final_width - 1) >> 3);
1534#if defined(PNG_READ_PACKSWAP_SUPPORTED)
1535 if (transformations & PNG_PACKSWAP)
1536 {
1537 sshift = (int)((row_info->width + 7) & 7);
1538 dshift = (int)((final_width + 7) & 7);
1539 s_start = 7;
1540 s_end = 0;
1541 s_inc = -1;
1542 }
1543 else
1544#endif
1545 {
1546 sshift = 7 - (int)((row_info->width + 7) & 7);
1547 dshift = 7 - (int)((final_width + 7) & 7);
1548 s_start = 0;
1549 s_end = 7;
1550 s_inc = 1;
1551 }
1552
1553 for (i = row_info->width; i; i--)
1554 {
1555 v = (png_byte)((*sp >> sshift) & 0x1);
1556 for (j = 0; j < png_pass_inc[pass]; j++)
1557 {
1558 *dp &= (png_byte)((0x7f7f >> (7 - dshift)) & 0xff);
1559 *dp |= (png_byte)(v << dshift);
1560 if (dshift == s_end)
1561 {
1562 dshift = s_start;
1563 dp--;
1564 }
1565 else
1566 dshift += s_inc;
1567 }
1568 if (sshift == s_end)
1569 {
1570 sshift = s_start;
1571 sp--;
1572 }
1573 else
1574 sshift += s_inc;
1575 }
1576 break;
1577 }
1578
1579 case 2:
1580 {
1581 png_bytep sp, dp;
1582 int sshift, dshift;
1583 int s_start, s_end, s_inc;
1584 png_uint_32 i;
1585
1586 sp = row + (png_size_t)((row_info->width - 1) >> 2);
1587 dp = row + (png_size_t)((final_width - 1) >> 2);
1588#if defined(PNG_READ_PACKSWAP_SUPPORTED)
1589 if (transformations & PNG_PACKSWAP)
1590 {
1591 sshift = (png_size_t)(((row_info->width + 3) & 3) << 1);
1592 dshift = (png_size_t)(((final_width + 3) & 3) << 1);
1593 s_start = 6;
1594 s_end = 0;
1595 s_inc = -2;
1596 }
1597 else
1598#endif
1599 {
1600 sshift = (png_size_t)((3 - ((row_info->width + 3) & 3)) << 1);
1601 dshift = (png_size_t)((3 - ((final_width + 3) & 3)) << 1);
1602 s_start = 0;
1603 s_end = 6;
1604 s_inc = 2;
1605 }
1606
1607 for (i = row_info->width; i; i--)
1608 {
1609 png_byte v;
1610 int j;
1611
1612 v = (png_byte)((*sp >> sshift) & 0x3);
1613 for (j = 0; j < png_pass_inc[pass]; j++)
1614 {
1615 *dp &= (png_byte)((0x3f3f >> (6 - dshift)) & 0xff);
1616 *dp |= (png_byte)(v << dshift);
1617 if (dshift == s_end)
1618 {
1619 dshift = s_start;
1620 dp--;
1621 }
1622 else
1623 dshift += s_inc;
1624 }
1625 if (sshift == s_end)
1626 {
1627 sshift = s_start;
1628 sp--;
1629 }
1630 else
1631 sshift += s_inc;
1632 }
1633 break;
1634 }
1635
1636 case 4:
1637 {
1638 png_bytep sp, dp;
1639 int sshift, dshift;
1640 int s_start, s_end, s_inc;
1641 png_uint_32 i;
1642
1643 sp = row + (png_size_t)((row_info->width - 1) >> 1);
1644 dp = row + (png_size_t)((final_width - 1) >> 1);
1645#if defined(PNG_READ_PACKSWAP_SUPPORTED)
1646 if (transformations & PNG_PACKSWAP)
1647 {
1648 sshift = (png_size_t)(((row_info->width + 1) & 1) << 2);
1649 dshift = (png_size_t)(((final_width + 1) & 1) << 2);
1650 s_start = 4;
1651 s_end = 0;
1652 s_inc = -4;
1653 }
1654 else
1655#endif
1656 {
1657 sshift = (png_size_t)((1 - ((row_info->width + 1) & 1)) << 2);
1658 dshift = (png_size_t)((1 - ((final_width + 1) & 1)) << 2);
1659 s_start = 0;
1660 s_end = 4;
1661 s_inc = 4;
1662 }
1663
1664 for (i = row_info->width; i; i--)
1665 {
1666 png_byte v;
1667 int j;
1668
1669 v = (png_byte)((*sp >> sshift) & 0xf);
1670 for (j = 0; j < png_pass_inc[pass]; j++)
1671 {
1672 *dp &= (png_byte)((0xf0f >> (4 - dshift)) & 0xff);
1673 *dp |= (png_byte)(v << dshift);
1674 if (dshift == s_end)
1675 {
1676 dshift = s_start;
1677 dp--;
1678 }
1679 else
1680 dshift += s_inc;
1681 }
1682 if (sshift == s_end)
1683 {
1684 sshift = s_start;
1685 sp--;
1686 }
1687 else
1688 sshift += s_inc;
1689 }
1690 break;
1691 }
1692
1693 /*====================================================================*/
1694
1695 default: /* 8-bit or larger (this is where the routine is modified) */
1696 {
1697#if 0
7f88f624
VZ
1698/* static unsigned long long _const4 = 0x0000000000FFFFFFLL; no good */
1699/* static unsigned long long const4 = 0x0000000000FFFFFFLL; no good */
1700/* unsigned long long _const4 = 0x0000000000FFFFFFLL; no good */
1701/* unsigned long long const4 = 0x0000000000FFFFFFLL; no good */
c6b71bff
GD
1702#endif
1703 png_bytep sptr, dp;
1704 png_uint_32 i;
1705 png_size_t pixel_bytes;
1706 int width = (int)row_info->width;
1707
1708 pixel_bytes = (row_info->pixel_depth >> 3);
1709
1710 /* point sptr at the last pixel in the pre-expanded row: */
1711 sptr = row + (width - 1) * pixel_bytes;
1712
1713 /* point dp at the last pixel position in the expanded row: */
1714 dp = row + (final_width - 1) * pixel_bytes;
1715
1716 /* New code by Nirav Chhatrapati - Intel Corporation */
1717
1718#if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
1719#if !defined(PNG_1_0_X)
1720 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_INTERLACE)
1721 /* && _mmx_supported */ )
1722#else
1723 if (_mmx_supported)
1724#endif
1725 {
1726 /*------------------------------------------------------------*/
1727 if (pixel_bytes == 3)
1728 {
1729 if (((pass == 0) || (pass == 1)) && width)
1730 {
7f88f624 1731 int dummy_value_c; /* fix 'forbidden register spilled' */
c6b71bff
GD
1732 int dummy_value_S;
1733 int dummy_value_D;
1734
1735 __asm__ __volatile__ (
1736 "subl $21, %%edi \n\t"
7f88f624 1737 /* (png_pass_inc[pass] - 1)*pixel_bytes */
c6b71bff
GD
1738
1739 ".loop3_pass0: \n\t"
7f88f624
VZ
1740 "movd (%%esi), %%mm0 \n\t" /* x x x x x 2 1 0 */
1741 "pand _const4, %%mm0 \n\t" /* z z z z z 2 1 0 */
1742 "movq %%mm0, %%mm1 \n\t" /* z z z z z 2 1 0 */
1743 "psllq $16, %%mm0 \n\t" /* z z z 2 1 0 z z */
1744 "movq %%mm0, %%mm2 \n\t" /* z z z 2 1 0 z z */
1745 "psllq $24, %%mm0 \n\t" /* 2 1 0 z z z z z */
1746 "psrlq $8, %%mm1 \n\t" /* z z z z z z 2 1 */
1747 "por %%mm2, %%mm0 \n\t" /* 2 1 0 2 1 0 z z */
1748 "por %%mm1, %%mm0 \n\t" /* 2 1 0 2 1 0 2 1 */
1749 "movq %%mm0, %%mm3 \n\t" /* 2 1 0 2 1 0 2 1 */
1750 "psllq $16, %%mm0 \n\t" /* 0 2 1 0 2 1 z z */
1751 "movq %%mm3, %%mm4 \n\t" /* 2 1 0 2 1 0 2 1 */
1752 "punpckhdq %%mm0, %%mm3 \n\t" /* 0 2 1 0 2 1 0 2 */
c6b71bff 1753 "movq %%mm4, 16(%%edi) \n\t"
7f88f624 1754 "psrlq $32, %%mm0 \n\t" /* z z z z 0 2 1 0 */
c6b71bff 1755 "movq %%mm3, 8(%%edi) \n\t"
7f88f624 1756 "punpckldq %%mm4, %%mm0 \n\t" /* 1 0 2 1 0 2 1 0 */
c6b71bff
GD
1757 "subl $3, %%esi \n\t"
1758 "movq %%mm0, (%%edi) \n\t"
1759 "subl $24, %%edi \n\t"
1760 "decl %%ecx \n\t"
1761 "jnz .loop3_pass0 \n\t"
7f88f624 1762 "EMMS \n\t" /* DONE */
c6b71bff 1763
7f88f624 1764 : "=c" (dummy_value_c), /* output regs (dummy) */
c6b71bff
GD
1765 "=S" (dummy_value_S),
1766 "=D" (dummy_value_D)
1767
7f88f624
VZ
1768 : "1" (sptr), /* esi // input regs */
1769 "2" (dp), /* edi */
1770 "0" (width) /* ecx */
1771/* doesn't work "i" (0x0000000000FFFFFFLL) // %1 (a.k.a. _const4) */
c6b71bff
GD
1772
1773#if 0 /* %mm0, ..., %mm4 not supported by gcc 2.7.2.3 or egcs 1.1 */
7f88f624 1774 : "%mm0", "%mm1", "%mm2" /* clobber list */
c6b71bff
GD
1775 , "%mm3", "%mm4"
1776#endif
1777 );
1778 }
1779 else if (((pass == 2) || (pass == 3)) && width)
1780 {
7f88f624 1781 int dummy_value_c; /* fix 'forbidden register spilled' */
c6b71bff
GD
1782 int dummy_value_S;
1783 int dummy_value_D;
1784
1785 __asm__ __volatile__ (
1786 "subl $9, %%edi \n\t"
7f88f624 1787 /* (png_pass_inc[pass] - 1)*pixel_bytes */
c6b71bff
GD
1788
1789 ".loop3_pass2: \n\t"
7f88f624
VZ
1790 "movd (%%esi), %%mm0 \n\t" /* x x x x x 2 1 0 */
1791 "pand _const4, %%mm0 \n\t" /* z z z z z 2 1 0 */
1792 "movq %%mm0, %%mm1 \n\t" /* z z z z z 2 1 0 */
1793 "psllq $16, %%mm0 \n\t" /* z z z 2 1 0 z z */
1794 "movq %%mm0, %%mm2 \n\t" /* z z z 2 1 0 z z */
1795 "psllq $24, %%mm0 \n\t" /* 2 1 0 z z z z z */
1796 "psrlq $8, %%mm1 \n\t" /* z z z z z z 2 1 */
1797 "por %%mm2, %%mm0 \n\t" /* 2 1 0 2 1 0 z z */
1798 "por %%mm1, %%mm0 \n\t" /* 2 1 0 2 1 0 2 1 */
c6b71bff 1799 "movq %%mm0, 4(%%edi) \n\t"
7f88f624 1800 "psrlq $16, %%mm0 \n\t" /* z z 2 1 0 2 1 0 */
c6b71bff
GD
1801 "subl $3, %%esi \n\t"
1802 "movd %%mm0, (%%edi) \n\t"
1803 "subl $12, %%edi \n\t"
1804 "decl %%ecx \n\t"
1805 "jnz .loop3_pass2 \n\t"
7f88f624 1806 "EMMS \n\t" /* DONE */
c6b71bff 1807
7f88f624 1808 : "=c" (dummy_value_c), /* output regs (dummy) */
c6b71bff
GD
1809 "=S" (dummy_value_S),
1810 "=D" (dummy_value_D)
1811
7f88f624
VZ
1812 : "1" (sptr), /* esi // input regs */
1813 "2" (dp), /* edi */
1814 "0" (width) /* ecx */
c6b71bff
GD
1815
1816#if 0 /* %mm0, ..., %mm2 not supported by gcc 2.7.2.3 or egcs 1.1 */
7f88f624 1817 : "%mm0", "%mm1", "%mm2" /* clobber list */
c6b71bff
GD
1818#endif
1819 );
1820 }
1821 else if (width) /* && ((pass == 4) || (pass == 5)) */
1822 {
7f88f624 1823 int width_mmx = ((width >> 1) << 1) - 8; /* GRR: huh? */
c6b71bff
GD
1824 if (width_mmx < 0)
1825 width_mmx = 0;
7f88f624 1826 width -= width_mmx; /* 8 or 9 pix, 24 or 27 bytes */
c6b71bff
GD
1827 if (width_mmx)
1828 {
7f88f624
VZ
1829 /* png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
1830 /* sptr points at last pixel in pre-expanded row */
1831 /* dp points at last pixel position in expanded row */
1832 int dummy_value_c; /* fix 'forbidden register spilled' */
c6b71bff
GD
1833 int dummy_value_S;
1834 int dummy_value_D;
1835
1836 __asm__ __volatile__ (
1837 "subl $3, %%esi \n\t"
1838 "subl $9, %%edi \n\t"
7f88f624 1839 /* (png_pass_inc[pass] + 1)*pixel_bytes */
c6b71bff
GD
1840
1841 ".loop3_pass4: \n\t"
7f88f624
VZ
1842 "movq (%%esi), %%mm0 \n\t" /* x x 5 4 3 2 1 0 */
1843 "movq %%mm0, %%mm1 \n\t" /* x x 5 4 3 2 1 0 */
1844 "movq %%mm0, %%mm2 \n\t" /* x x 5 4 3 2 1 0 */
1845 "psllq $24, %%mm0 \n\t" /* 4 3 2 1 0 z z z */
1846 "pand _const4, %%mm1 \n\t" /* z z z z z 2 1 0 */
1847 "psrlq $24, %%mm2 \n\t" /* z z z x x 5 4 3 */
1848 "por %%mm1, %%mm0 \n\t" /* 4 3 2 1 0 2 1 0 */
1849 "movq %%mm2, %%mm3 \n\t" /* z z z x x 5 4 3 */
1850 "psllq $8, %%mm2 \n\t" /* z z x x 5 4 3 z */
c6b71bff 1851 "movq %%mm0, (%%edi) \n\t"
7f88f624
VZ
1852 "psrlq $16, %%mm3 \n\t" /* z z z z z x x 5 */
1853 "pand _const6, %%mm3 \n\t" /* z z z z z z z 5 */
1854 "por %%mm3, %%mm2 \n\t" /* z z x x 5 4 3 5 */
c6b71bff
GD
1855 "subl $6, %%esi \n\t"
1856 "movd %%mm2, 8(%%edi) \n\t"
1857 "subl $12, %%edi \n\t"
1858 "subl $2, %%ecx \n\t"
1859 "jnz .loop3_pass4 \n\t"
7f88f624 1860 "EMMS \n\t" /* DONE */
c6b71bff 1861
7f88f624 1862 : "=c" (dummy_value_c), /* output regs (dummy) */
c6b71bff
GD
1863 "=S" (dummy_value_S),
1864 "=D" (dummy_value_D)
1865
7f88f624
VZ
1866 : "1" (sptr), /* esi // input regs */
1867 "2" (dp), /* edi */
1868 "0" (width_mmx) /* ecx */
c6b71bff
GD
1869
1870#if 0 /* %mm0, ..., %mm3 not supported by gcc 2.7.2.3 or egcs 1.1 */
7f88f624 1871 : "%mm0", "%mm1" /* clobber list */
c6b71bff
GD
1872 , "%mm2", "%mm3"
1873#endif
1874 );
1875 }
1876
1877 sptr -= width_mmx*3;
1878 dp -= width_mmx*6;
1879 for (i = width; i; i--)
1880 {
1881 png_byte v[8];
1882 int j;
1883
1884 png_memcpy(v, sptr, 3);
1885 for (j = 0; j < png_pass_inc[pass]; j++)
1886 {
1887 png_memcpy(dp, v, 3);
1888 dp -= 3;
1889 }
1890 sptr -= 3;
1891 }
1892 }
1893 } /* end of pixel_bytes == 3 */
1894
1895 /*------------------------------------------------------------*/
1896 else if (pixel_bytes == 1)
1897 {
1898 if (((pass == 0) || (pass == 1)) && width)
1899 {
1900 int width_mmx = ((width >> 2) << 2);
7f88f624 1901 width -= width_mmx; /* 0-3 pixels => 0-3 bytes */
c6b71bff
GD
1902 if (width_mmx)
1903 {
7f88f624 1904 int dummy_value_c; /* fix 'forbidden register spilled' */
c6b71bff
GD
1905 int dummy_value_S;
1906 int dummy_value_D;
1907
1908 __asm__ __volatile__ (
1909 "subl $3, %%esi \n\t"
1910 "subl $31, %%edi \n\t"
1911
1912 ".loop1_pass0: \n\t"
7f88f624
VZ
1913 "movd (%%esi), %%mm0 \n\t" /* x x x x 3 2 1 0 */
1914 "movq %%mm0, %%mm1 \n\t" /* x x x x 3 2 1 0 */
1915 "punpcklbw %%mm0, %%mm0 \n\t" /* 3 3 2 2 1 1 0 0 */
1916 "movq %%mm0, %%mm2 \n\t" /* 3 3 2 2 1 1 0 0 */
1917 "punpcklwd %%mm0, %%mm0 \n\t" /* 1 1 1 1 0 0 0 0 */
1918 "movq %%mm0, %%mm3 \n\t" /* 1 1 1 1 0 0 0 0 */
1919 "punpckldq %%mm0, %%mm0 \n\t" /* 0 0 0 0 0 0 0 0 */
1920 "punpckhdq %%mm3, %%mm3 \n\t" /* 1 1 1 1 1 1 1 1 */
c6b71bff 1921 "movq %%mm0, (%%edi) \n\t"
7f88f624 1922 "punpckhwd %%mm2, %%mm2 \n\t" /* 3 3 3 3 2 2 2 2 */
c6b71bff 1923 "movq %%mm3, 8(%%edi) \n\t"
7f88f624
VZ
1924 "movq %%mm2, %%mm4 \n\t" /* 3 3 3 3 2 2 2 2 */
1925 "punpckldq %%mm2, %%mm2 \n\t" /* 2 2 2 2 2 2 2 2 */
1926 "punpckhdq %%mm4, %%mm4 \n\t" /* 3 3 3 3 3 3 3 3 */
c6b71bff
GD
1927 "movq %%mm2, 16(%%edi) \n\t"
1928 "subl $4, %%esi \n\t"
1929 "movq %%mm4, 24(%%edi) \n\t"
1930 "subl $32, %%edi \n\t"
1931 "subl $4, %%ecx \n\t"
1932 "jnz .loop1_pass0 \n\t"
7f88f624 1933 "EMMS \n\t" /* DONE */
c6b71bff 1934
7f88f624 1935 : "=c" (dummy_value_c), /* output regs (dummy) */
c6b71bff
GD
1936 "=S" (dummy_value_S),
1937 "=D" (dummy_value_D)
1938
7f88f624
VZ
1939 : "1" (sptr), /* esi // input regs */
1940 "2" (dp), /* edi */
1941 "0" (width_mmx) /* ecx */
c6b71bff
GD
1942
1943#if 0 /* %mm0, ..., %mm4 not supported by gcc 2.7.2.3 or egcs 1.1 */
7f88f624 1944 : "%mm0", "%mm1", "%mm2" /* clobber list */
c6b71bff
GD
1945 , "%mm3", "%mm4"
1946#endif
1947 );
1948 }
1949
1950 sptr -= width_mmx;
1951 dp -= width_mmx*8;
1952 for (i = width; i; i--)
1953 {
1954 int j;
1955
1956 /* I simplified this part in version 1.0.4e
1957 * here and in several other instances where
1958 * pixel_bytes == 1 -- GR-P
1959 *
1960 * Original code:
1961 *
1962 * png_byte v[8];
1963 * png_memcpy(v, sptr, pixel_bytes);
1964 * for (j = 0; j < png_pass_inc[pass]; j++)
1965 * {
1966 * png_memcpy(dp, v, pixel_bytes);
1967 * dp -= pixel_bytes;
1968 * }
1969 * sptr -= pixel_bytes;
1970 *
1971 * Replacement code is in the next three lines:
1972 */
1973
1974 for (j = 0; j < png_pass_inc[pass]; j++)
1975 {
1976 *dp-- = *sptr;
1977 }
1978 --sptr;
1979 }
1980 }
1981 else if (((pass == 2) || (pass == 3)) && width)
1982 {
1983 int width_mmx = ((width >> 2) << 2);
7f88f624 1984 width -= width_mmx; /* 0-3 pixels => 0-3 bytes */
c6b71bff
GD
1985 if (width_mmx)
1986 {
7f88f624 1987 int dummy_value_c; /* fix 'forbidden register spilled' */
c6b71bff
GD
1988 int dummy_value_S;
1989 int dummy_value_D;
1990
1991 __asm__ __volatile__ (
1992 "subl $3, %%esi \n\t"
1993 "subl $15, %%edi \n\t"
1994
1995 ".loop1_pass2: \n\t"
7f88f624
VZ
1996 "movd (%%esi), %%mm0 \n\t" /* x x x x 3 2 1 0 */
1997 "punpcklbw %%mm0, %%mm0 \n\t" /* 3 3 2 2 1 1 0 0 */
1998 "movq %%mm0, %%mm1 \n\t" /* 3 3 2 2 1 1 0 0 */
1999 "punpcklwd %%mm0, %%mm0 \n\t" /* 1 1 1 1 0 0 0 0 */
2000 "punpckhwd %%mm1, %%mm1 \n\t" /* 3 3 3 3 2 2 2 2 */
c6b71bff
GD
2001 "movq %%mm0, (%%edi) \n\t"
2002 "subl $4, %%esi \n\t"
2003 "movq %%mm1, 8(%%edi) \n\t"
2004 "subl $16, %%edi \n\t"
2005 "subl $4, %%ecx \n\t"
2006 "jnz .loop1_pass2 \n\t"
7f88f624 2007 "EMMS \n\t" /* DONE */
c6b71bff 2008
7f88f624 2009 : "=c" (dummy_value_c), /* output regs (dummy) */
c6b71bff
GD
2010 "=S" (dummy_value_S),
2011 "=D" (dummy_value_D)
2012
7f88f624
VZ
2013 : "1" (sptr), /* esi // input regs */
2014 "2" (dp), /* edi */
2015 "0" (width_mmx) /* ecx */
c6b71bff
GD
2016
2017#if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
7f88f624 2018 : "%mm0", "%mm1" /* clobber list */
c6b71bff
GD
2019#endif
2020 );
2021 }
2022
2023 sptr -= width_mmx;
2024 dp -= width_mmx*4;
2025 for (i = width; i; i--)
2026 {
2027 int j;
2028
2029 for (j = 0; j < png_pass_inc[pass]; j++)
2030 {
2031 *dp-- = *sptr;
2032 }
2033 --sptr;
2034 }
2035 }
2036 else if (width) /* && ((pass == 4) || (pass == 5)) */
2037 {
2038 int width_mmx = ((width >> 3) << 3);
7f88f624 2039 width -= width_mmx; /* 0-7 pixels => 0-7 bytes */
c6b71bff
GD
2040 if (width_mmx)
2041 {
7f88f624 2042 int dummy_value_c; /* fix 'forbidden register spilled' */
c6b71bff
GD
2043 int dummy_value_S;
2044 int dummy_value_D;
2045
2046 __asm__ __volatile__ (
2047 "subl $7, %%esi \n\t"
2048 "subl $15, %%edi \n\t"
2049
2050 ".loop1_pass4: \n\t"
7f88f624
VZ
2051 "movq (%%esi), %%mm0 \n\t" /* 7 6 5 4 3 2 1 0 */
2052 "movq %%mm0, %%mm1 \n\t" /* 7 6 5 4 3 2 1 0 */
2053 "punpcklbw %%mm0, %%mm0 \n\t" /* 3 3 2 2 1 1 0 0 */
2054 "punpckhbw %%mm1, %%mm1 \n\t" /* 7 7 6 6 5 5 4 4 */
c6b71bff
GD
2055 "movq %%mm1, 8(%%edi) \n\t"
2056 "subl $8, %%esi \n\t"
2057 "movq %%mm0, (%%edi) \n\t"
2058 "subl $16, %%edi \n\t"
2059 "subl $8, %%ecx \n\t"
2060 "jnz .loop1_pass4 \n\t"
7f88f624 2061 "EMMS \n\t" /* DONE */
c6b71bff 2062
7f88f624 2063 : "=c" (dummy_value_c), /* output regs (dummy) */
c6b71bff
GD
2064 "=S" (dummy_value_S),
2065 "=D" (dummy_value_D)
2066
7f88f624
VZ
2067 : "1" (sptr), /* esi // input regs */
2068 "2" (dp), /* edi */
2069 "0" (width_mmx) /* ecx */
c6b71bff
GD
2070
2071#if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
7f88f624 2072 : "%mm0", "%mm1" /* clobber list */
c6b71bff
GD
2073#endif
2074 );
2075 }
2076
2077 sptr -= width_mmx;
2078 dp -= width_mmx*2;
2079 for (i = width; i; i--)
2080 {
2081 int j;
2082
2083 for (j = 0; j < png_pass_inc[pass]; j++)
2084 {
2085 *dp-- = *sptr;
2086 }
2087 --sptr;
2088 }
2089 }
2090 } /* end of pixel_bytes == 1 */
2091
2092 //--------------------------------------------------------------
2093 else if (pixel_bytes == 2)
2094 {
2095 if (((pass == 0) || (pass == 1)) && width)
2096 {
2097 int width_mmx = ((width >> 1) << 1);
7f88f624 2098 width -= width_mmx; /* 0,1 pixels => 0,2 bytes */
c6b71bff
GD
2099 if (width_mmx)
2100 {
7f88f624 2101 int dummy_value_c; /* fix 'forbidden register spilled' */
c6b71bff
GD
2102 int dummy_value_S;
2103 int dummy_value_D;
2104
2105 __asm__ __volatile__ (
2106 "subl $2, %%esi \n\t"
2107 "subl $30, %%edi \n\t"
2108
2109 ".loop2_pass0: \n\t"
7f88f624
VZ
2110 "movd (%%esi), %%mm0 \n\t" /* x x x x 3 2 1 0 */
2111 "punpcklwd %%mm0, %%mm0 \n\t" /* 3 2 3 2 1 0 1 0 */
2112 "movq %%mm0, %%mm1 \n\t" /* 3 2 3 2 1 0 1 0 */
2113 "punpckldq %%mm0, %%mm0 \n\t" /* 1 0 1 0 1 0 1 0 */
2114 "punpckhdq %%mm1, %%mm1 \n\t" /* 3 2 3 2 3 2 3 2 */
c6b71bff
GD
2115 "movq %%mm0, (%%edi) \n\t"
2116 "movq %%mm0, 8(%%edi) \n\t"
2117 "movq %%mm1, 16(%%edi) \n\t"
2118 "subl $4, %%esi \n\t"
2119 "movq %%mm1, 24(%%edi) \n\t"
2120 "subl $32, %%edi \n\t"
2121 "subl $2, %%ecx \n\t"
2122 "jnz .loop2_pass0 \n\t"
7f88f624 2123 "EMMS \n\t" /* DONE */
c6b71bff 2124
7f88f624 2125 : "=c" (dummy_value_c), /* output regs (dummy) */
c6b71bff
GD
2126 "=S" (dummy_value_S),
2127 "=D" (dummy_value_D)
2128
7f88f624
VZ
2129 : "1" (sptr), /* esi // input regs */
2130 "2" (dp), /* edi */
2131 "0" (width_mmx) /* ecx */
c6b71bff
GD
2132
2133#if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
7f88f624 2134 : "%mm0", "%mm1" /* clobber list */
c6b71bff
GD
2135#endif
2136 );
2137 }
2138
7f88f624
VZ
2139 sptr -= (width_mmx*2 - 2); /* sign fixed */
2140 dp -= (width_mmx*16 - 2); /* sign fixed */
c6b71bff
GD
2141 for (i = width; i; i--)
2142 {
2143 png_byte v[8];
2144 int j;
2145 sptr -= 2;
2146 png_memcpy(v, sptr, 2);
2147 for (j = 0; j < png_pass_inc[pass]; j++)
2148 {
2149 dp -= 2;
2150 png_memcpy(dp, v, 2);
2151 }
2152 }
2153 }
2154 else if (((pass == 2) || (pass == 3)) && width)
2155 {
2156 int width_mmx = ((width >> 1) << 1) ;
7f88f624 2157 width -= width_mmx; /* 0,1 pixels => 0,2 bytes */
c6b71bff
GD
2158 if (width_mmx)
2159 {
7f88f624 2160 int dummy_value_c; /* fix 'forbidden register spilled' */
c6b71bff
GD
2161 int dummy_value_S;
2162 int dummy_value_D;
2163
2164 __asm__ __volatile__ (
2165 "subl $2, %%esi \n\t"
2166 "subl $14, %%edi \n\t"
2167
2168 ".loop2_pass2: \n\t"
7f88f624
VZ
2169 "movd (%%esi), %%mm0 \n\t" /* x x x x 3 2 1 0 */
2170 "punpcklwd %%mm0, %%mm0 \n\t" /* 3 2 3 2 1 0 1 0 */
2171 "movq %%mm0, %%mm1 \n\t" /* 3 2 3 2 1 0 1 0 */
2172 "punpckldq %%mm0, %%mm0 \n\t" /* 1 0 1 0 1 0 1 0 */
2173 "punpckhdq %%mm1, %%mm1 \n\t" /* 3 2 3 2 3 2 3 2 */
c6b71bff
GD
2174 "movq %%mm0, (%%edi) \n\t"
2175 "subl $4, %%esi \n\t"
2176 "movq %%mm1, 8(%%edi) \n\t"
2177 "subl $16, %%edi \n\t"
2178 "subl $2, %%ecx \n\t"
2179 "jnz .loop2_pass2 \n\t"
7f88f624 2180 "EMMS \n\t" /* DONE */
c6b71bff 2181
7f88f624 2182 : "=c" (dummy_value_c), /* output regs (dummy) */
c6b71bff
GD
2183 "=S" (dummy_value_S),
2184 "=D" (dummy_value_D)
2185
7f88f624
VZ
2186 : "1" (sptr), /* esi // input regs */
2187 "2" (dp), /* edi */
2188 "0" (width_mmx) /* ecx */
c6b71bff
GD
2189
2190#if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
7f88f624 2191 : "%mm0", "%mm1" /* clobber list */
c6b71bff
GD
2192#endif
2193 );
2194 }
2195
7f88f624
VZ
2196 sptr -= (width_mmx*2 - 2); /* sign fixed */
2197 dp -= (width_mmx*8 - 2); /* sign fixed */
c6b71bff
GD
2198 for (i = width; i; i--)
2199 {
2200 png_byte v[8];
2201 int j;
2202 sptr -= 2;
2203 png_memcpy(v, sptr, 2);
2204 for (j = 0; j < png_pass_inc[pass]; j++)
2205 {
2206 dp -= 2;
2207 png_memcpy(dp, v, 2);
2208 }
2209 }
2210 }
7f88f624 2211 else if (width) /* pass == 4 or 5 */
c6b71bff
GD
2212 {
2213 int width_mmx = ((width >> 1) << 1) ;
7f88f624 2214 width -= width_mmx; /* 0,1 pixels => 0,2 bytes */
c6b71bff
GD
2215 if (width_mmx)
2216 {
7f88f624 2217 int dummy_value_c; /* fix 'forbidden register spilled' */
c6b71bff
GD
2218 int dummy_value_S;
2219 int dummy_value_D;
2220
2221 __asm__ __volatile__ (
2222 "subl $2, %%esi \n\t"
2223 "subl $6, %%edi \n\t"
2224
2225 ".loop2_pass4: \n\t"
7f88f624
VZ
2226 "movd (%%esi), %%mm0 \n\t" /* x x x x 3 2 1 0 */
2227 "punpcklwd %%mm0, %%mm0 \n\t" /* 3 2 3 2 1 0 1 0 */
c6b71bff
GD
2228 "subl $4, %%esi \n\t"
2229 "movq %%mm0, (%%edi) \n\t"
2230 "subl $8, %%edi \n\t"
2231 "subl $2, %%ecx \n\t"
2232 "jnz .loop2_pass4 \n\t"
7f88f624 2233 "EMMS \n\t" /* DONE */
c6b71bff 2234
7f88f624 2235 : "=c" (dummy_value_c), /* output regs (dummy) */
c6b71bff
GD
2236 "=S" (dummy_value_S),
2237 "=D" (dummy_value_D)
2238
7f88f624
VZ
2239 : "1" (sptr), /* esi // input regs */
2240 "2" (dp), /* edi */
2241 "0" (width_mmx) /* ecx */
c6b71bff
GD
2242
2243#if 0 /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
7f88f624 2244 : "%mm0" /* clobber list */
c6b71bff
GD
2245#endif
2246 );
2247 }
2248
7f88f624
VZ
2249 sptr -= (width_mmx*2 - 2); /* sign fixed */
2250 dp -= (width_mmx*4 - 2); /* sign fixed */
c6b71bff
GD
2251 for (i = width; i; i--)
2252 {
2253 png_byte v[8];
2254 int j;
2255 sptr -= 2;
2256 png_memcpy(v, sptr, 2);
2257 for (j = 0; j < png_pass_inc[pass]; j++)
2258 {
2259 dp -= 2;
2260 png_memcpy(dp, v, 2);
2261 }
2262 }
2263 }
2264 } /* end of pixel_bytes == 2 */
2265
2266 //--------------------------------------------------------------
2267 else if (pixel_bytes == 4)
2268 {
2269 if (((pass == 0) || (pass == 1)) && width)
2270 {
2271 int width_mmx = ((width >> 1) << 1);
7f88f624 2272 width -= width_mmx; /* 0,1 pixels => 0,4 bytes */
c6b71bff
GD
2273 if (width_mmx)
2274 {
7f88f624 2275 int dummy_value_c; /* fix 'forbidden register spilled' */
c6b71bff
GD
2276 int dummy_value_S;
2277 int dummy_value_D;
2278
2279 __asm__ __volatile__ (
2280 "subl $4, %%esi \n\t"
2281 "subl $60, %%edi \n\t"
2282
2283 ".loop4_pass0: \n\t"
7f88f624
VZ
2284 "movq (%%esi), %%mm0 \n\t" /* 7 6 5 4 3 2 1 0 */
2285 "movq %%mm0, %%mm1 \n\t" /* 7 6 5 4 3 2 1 0 */
2286 "punpckldq %%mm0, %%mm0 \n\t" /* 3 2 1 0 3 2 1 0 */
2287 "punpckhdq %%mm1, %%mm1 \n\t" /* 7 6 5 4 7 6 5 4 */
c6b71bff
GD
2288 "movq %%mm0, (%%edi) \n\t"
2289 "movq %%mm0, 8(%%edi) \n\t"
2290 "movq %%mm0, 16(%%edi) \n\t"
2291 "movq %%mm0, 24(%%edi) \n\t"
2292 "movq %%mm1, 32(%%edi) \n\t"
2293 "movq %%mm1, 40(%%edi) \n\t"
2294 "movq %%mm1, 48(%%edi) \n\t"
2295 "subl $8, %%esi \n\t"
2296 "movq %%mm1, 56(%%edi) \n\t"
2297 "subl $64, %%edi \n\t"
2298 "subl $2, %%ecx \n\t"
2299 "jnz .loop4_pass0 \n\t"
7f88f624 2300 "EMMS \n\t" /* DONE */
c6b71bff 2301
7f88f624 2302 : "=c" (dummy_value_c), /* output regs (dummy) */
c6b71bff
GD
2303 "=S" (dummy_value_S),
2304 "=D" (dummy_value_D)
2305
7f88f624
VZ
2306 : "1" (sptr), /* esi // input regs */
2307 "2" (dp), /* edi */
2308 "0" (width_mmx) /* ecx */
c6b71bff
GD
2309
2310#if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
7f88f624 2311 : "%mm0", "%mm1" /* clobber list */
c6b71bff
GD
2312#endif
2313 );
2314 }
2315
7f88f624
VZ
2316 sptr -= (width_mmx*4 - 4); /* sign fixed */
2317 dp -= (width_mmx*32 - 4); /* sign fixed */
c6b71bff
GD
2318 for (i = width; i; i--)
2319 {
2320 png_byte v[8];
2321 int j;
2322 sptr -= 4;
2323 png_memcpy(v, sptr, 4);
2324 for (j = 0; j < png_pass_inc[pass]; j++)
2325 {
2326 dp -= 4;
2327 png_memcpy(dp, v, 4);
2328 }
2329 }
2330 }
2331 else if (((pass == 2) || (pass == 3)) && width)
2332 {
2333 int width_mmx = ((width >> 1) << 1);
7f88f624 2334 width -= width_mmx; /* 0,1 pixels => 0,4 bytes */
c6b71bff
GD
2335 if (width_mmx)
2336 {
7f88f624 2337 int dummy_value_c; /* fix 'forbidden register spilled' */
c6b71bff
GD
2338 int dummy_value_S;
2339 int dummy_value_D;
2340
2341 __asm__ __volatile__ (
2342 "subl $4, %%esi \n\t"
2343 "subl $28, %%edi \n\t"
2344
2345 ".loop4_pass2: \n\t"
7f88f624
VZ
2346 "movq (%%esi), %%mm0 \n\t" /* 7 6 5 4 3 2 1 0 */
2347 "movq %%mm0, %%mm1 \n\t" /* 7 6 5 4 3 2 1 0 */
2348 "punpckldq %%mm0, %%mm0 \n\t" /* 3 2 1 0 3 2 1 0 */
2349 "punpckhdq %%mm1, %%mm1 \n\t" /* 7 6 5 4 7 6 5 4 */
c6b71bff
GD
2350 "movq %%mm0, (%%edi) \n\t"
2351 "movq %%mm0, 8(%%edi) \n\t"
2352 "movq %%mm1, 16(%%edi) \n\t"
2353 "movq %%mm1, 24(%%edi) \n\t"
2354 "subl $8, %%esi \n\t"
2355 "subl $32, %%edi \n\t"
2356 "subl $2, %%ecx \n\t"
2357 "jnz .loop4_pass2 \n\t"
7f88f624 2358 "EMMS \n\t" /* DONE */
c6b71bff 2359
7f88f624 2360 : "=c" (dummy_value_c), /* output regs (dummy) */
c6b71bff
GD
2361 "=S" (dummy_value_S),
2362 "=D" (dummy_value_D)
2363
7f88f624
VZ
2364 : "1" (sptr), /* esi // input regs */
2365 "2" (dp), /* edi */
2366 "0" (width_mmx) /* ecx */
c6b71bff
GD
2367
2368#if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
7f88f624 2369 : "%mm0", "%mm1" /* clobber list */
c6b71bff
GD
2370#endif
2371 );
2372 }
2373
7f88f624
VZ
2374 sptr -= (width_mmx*4 - 4); /* sign fixed */
2375 dp -= (width_mmx*16 - 4); /* sign fixed */
c6b71bff
GD
2376 for (i = width; i; i--)
2377 {
2378 png_byte v[8];
2379 int j;
2380 sptr -= 4;
2381 png_memcpy(v, sptr, 4);
2382 for (j = 0; j < png_pass_inc[pass]; j++)
2383 {
2384 dp -= 4;
2385 png_memcpy(dp, v, 4);
2386 }
2387 }
2388 }
7f88f624 2389 else if (width) /* pass == 4 or 5 */
c6b71bff
GD
2390 {
2391 int width_mmx = ((width >> 1) << 1) ;
7f88f624 2392 width -= width_mmx; /* 0,1 pixels => 0,4 bytes */
c6b71bff
GD
2393 if (width_mmx)
2394 {
7f88f624 2395 int dummy_value_c; /* fix 'forbidden register spilled' */
c6b71bff
GD
2396 int dummy_value_S;
2397 int dummy_value_D;
2398
2399 __asm__ __volatile__ (
2400 "subl $4, %%esi \n\t"
2401 "subl $12, %%edi \n\t"
2402
2403 ".loop4_pass4: \n\t"
7f88f624
VZ
2404 "movq (%%esi), %%mm0 \n\t" /* 7 6 5 4 3 2 1 0 */
2405 "movq %%mm0, %%mm1 \n\t" /* 7 6 5 4 3 2 1 0 */
2406 "punpckldq %%mm0, %%mm0 \n\t" /* 3 2 1 0 3 2 1 0 */
2407 "punpckhdq %%mm1, %%mm1 \n\t" /* 7 6 5 4 7 6 5 4 */
c6b71bff
GD
2408 "movq %%mm0, (%%edi) \n\t"
2409 "subl $8, %%esi \n\t"
2410 "movq %%mm1, 8(%%edi) \n\t"
2411 "subl $16, %%edi \n\t"
2412 "subl $2, %%ecx \n\t"
2413 "jnz .loop4_pass4 \n\t"
7f88f624 2414 "EMMS \n\t" /* DONE */
c6b71bff 2415
7f88f624 2416 : "=c" (dummy_value_c), /* output regs (dummy) */
c6b71bff
GD
2417 "=S" (dummy_value_S),
2418 "=D" (dummy_value_D)
2419
7f88f624
VZ
2420 : "1" (sptr), /* esi // input regs */
2421 "2" (dp), /* edi */
2422 "0" (width_mmx) /* ecx */
c6b71bff
GD
2423
2424#if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
7f88f624 2425 : "%mm0", "%mm1" /* clobber list */
c6b71bff
GD
2426#endif
2427 );
2428 }
2429
7f88f624
VZ
2430 sptr -= (width_mmx*4 - 4); /* sign fixed */
2431 dp -= (width_mmx*8 - 4); /* sign fixed */
c6b71bff
GD
2432 for (i = width; i; i--)
2433 {
2434 png_byte v[8];
2435 int j;
2436 sptr -= 4;
2437 png_memcpy(v, sptr, 4);
2438 for (j = 0; j < png_pass_inc[pass]; j++)
2439 {
2440 dp -= 4;
2441 png_memcpy(dp, v, 4);
2442 }
2443 }
2444 }
2445 } /* end of pixel_bytes == 4 */
2446
2447 //--------------------------------------------------------------
2448 else if (pixel_bytes == 8)
2449 {
7f88f624
VZ
2450/* GRR TEST: should work, but needs testing (special 64-bit version of rpng2?) */
2451 /* GRR NOTE: no need to combine passes here! */
c6b71bff
GD
2452 if (((pass == 0) || (pass == 1)) && width)
2453 {
7f88f624 2454 int dummy_value_c; /* fix 'forbidden register spilled' */
c6b71bff
GD
2455 int dummy_value_S;
2456 int dummy_value_D;
2457
7f88f624
VZ
2458 /* source is 8-byte RRGGBBAA */
2459 /* dest is 64-byte RRGGBBAA RRGGBBAA RRGGBBAA RRGGBBAA ... */
c6b71bff 2460 __asm__ __volatile__ (
7f88f624 2461 "subl $56, %%edi \n\t" /* start of last block */
c6b71bff
GD
2462
2463 ".loop8_pass0: \n\t"
7f88f624 2464 "movq (%%esi), %%mm0 \n\t" /* 7 6 5 4 3 2 1 0 */
c6b71bff
GD
2465 "movq %%mm0, (%%edi) \n\t"
2466 "movq %%mm0, 8(%%edi) \n\t"
2467 "movq %%mm0, 16(%%edi) \n\t"
2468 "movq %%mm0, 24(%%edi) \n\t"
2469 "movq %%mm0, 32(%%edi) \n\t"
2470 "movq %%mm0, 40(%%edi) \n\t"
2471 "movq %%mm0, 48(%%edi) \n\t"
2472 "subl $8, %%esi \n\t"
2473 "movq %%mm0, 56(%%edi) \n\t"
2474 "subl $64, %%edi \n\t"
2475 "decl %%ecx \n\t"
2476 "jnz .loop8_pass0 \n\t"
7f88f624 2477 "EMMS \n\t" /* DONE */
c6b71bff 2478
7f88f624 2479 : "=c" (dummy_value_c), /* output regs (dummy) */
c6b71bff
GD
2480 "=S" (dummy_value_S),
2481 "=D" (dummy_value_D)
2482
7f88f624
VZ
2483 : "1" (sptr), /* esi // input regs */
2484 "2" (dp), /* edi */
2485 "0" (width) /* ecx */
c6b71bff
GD
2486
2487#if 0 /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
7f88f624 2488 : "%mm0" /* clobber list */
c6b71bff
GD
2489#endif
2490 );
2491 }
2492 else if (((pass == 2) || (pass == 3)) && width)
2493 {
7f88f624
VZ
2494 /* source is 8-byte RRGGBBAA */
2495 /* dest is 32-byte RRGGBBAA RRGGBBAA RRGGBBAA RRGGBBAA */
2496 /* (recall that expansion is _in place_: sptr and dp */
2497 /* both point at locations within same row buffer) */
c6b71bff 2498 {
7f88f624 2499 int dummy_value_c; /* fix 'forbidden register spilled' */
c6b71bff
GD
2500 int dummy_value_S;
2501 int dummy_value_D;
2502
2503 __asm__ __volatile__ (
7f88f624 2504 "subl $24, %%edi \n\t" /* start of last block */
c6b71bff
GD
2505
2506 ".loop8_pass2: \n\t"
7f88f624 2507 "movq (%%esi), %%mm0 \n\t" /* 7 6 5 4 3 2 1 0 */
c6b71bff
GD
2508 "movq %%mm0, (%%edi) \n\t"
2509 "movq %%mm0, 8(%%edi) \n\t"
2510 "movq %%mm0, 16(%%edi) \n\t"
2511 "subl $8, %%esi \n\t"
2512 "movq %%mm0, 24(%%edi) \n\t"
2513 "subl $32, %%edi \n\t"
2514 "decl %%ecx \n\t"
2515 "jnz .loop8_pass2 \n\t"
7f88f624 2516 "EMMS \n\t" /* DONE */
c6b71bff 2517
7f88f624 2518 : "=c" (dummy_value_c), /* output regs (dummy) */
c6b71bff
GD
2519 "=S" (dummy_value_S),
2520 "=D" (dummy_value_D)
2521
7f88f624
VZ
2522 : "1" (sptr), /* esi // input regs */
2523 "2" (dp), /* edi */
2524 "0" (width) /* ecx */
c6b71bff
GD
2525
2526#if 0 /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
7f88f624 2527 : "%mm0" /* clobber list */
c6b71bff
GD
2528#endif
2529 );
2530 }
2531 }
7f88f624 2532 else if (width) /* pass == 4 or 5 */
c6b71bff 2533 {
7f88f624
VZ
2534 /* source is 8-byte RRGGBBAA */
2535 /* dest is 16-byte RRGGBBAA RRGGBBAA */
c6b71bff 2536 {
7f88f624 2537 int dummy_value_c; /* fix 'forbidden register spilled' */
c6b71bff
GD
2538 int dummy_value_S;
2539 int dummy_value_D;
2540
2541 __asm__ __volatile__ (
7f88f624 2542 "subl $8, %%edi \n\t" /* start of last block */
c6b71bff
GD
2543
2544 ".loop8_pass4: \n\t"
7f88f624 2545 "movq (%%esi), %%mm0 \n\t" /* 7 6 5 4 3 2 1 0 */
c6b71bff
GD
2546 "movq %%mm0, (%%edi) \n\t"
2547 "subl $8, %%esi \n\t"
2548 "movq %%mm0, 8(%%edi) \n\t"
2549 "subl $16, %%edi \n\t"
2550 "decl %%ecx \n\t"
2551 "jnz .loop8_pass4 \n\t"
7f88f624 2552 "EMMS \n\t" /* DONE */
c6b71bff 2553
7f88f624 2554 : "=c" (dummy_value_c), /* output regs (dummy) */
c6b71bff
GD
2555 "=S" (dummy_value_S),
2556 "=D" (dummy_value_D)
2557
7f88f624
VZ
2558 : "1" (sptr), /* esi // input regs */
2559 "2" (dp), /* edi */
2560 "0" (width) /* ecx */
c6b71bff
GD
2561
2562#if 0 /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
7f88f624 2563 : "%mm0" /* clobber list */
c6b71bff
GD
2564#endif
2565 );
2566 }
2567 }
2568
2569 } /* end of pixel_bytes == 8 */
2570
2571 //--------------------------------------------------------------
2572 else if (pixel_bytes == 6)
2573 {
2574 for (i = width; i; i--)
2575 {
2576 png_byte v[8];
2577 int j;
2578 png_memcpy(v, sptr, 6);
2579 for (j = 0; j < png_pass_inc[pass]; j++)
2580 {
2581 png_memcpy(dp, v, 6);
2582 dp -= 6;
2583 }
2584 sptr -= 6;
2585 }
2586 } /* end of pixel_bytes == 6 */
2587
2588 //--------------------------------------------------------------
2589 else
2590 {
2591 for (i = width; i; i--)
2592 {
2593 png_byte v[8];
2594 int j;
2595 png_memcpy(v, sptr, pixel_bytes);
2596 for (j = 0; j < png_pass_inc[pass]; j++)
2597 {
2598 png_memcpy(dp, v, pixel_bytes);
2599 dp -= pixel_bytes;
2600 }
2601 sptr-= pixel_bytes;
2602 }
2603 }
7f88f624 2604 } /* end of _mmx_supported ======================================== */
c6b71bff
GD
2605
2606 else /* MMX not supported: use modified C code - takes advantage
2607 * of inlining of png_memcpy for a constant */
2608 /* GRR 19991007: does it? or should pixel_bytes in each
2609 * block be replaced with immediate value (e.g., 1)? */
2610 /* GRR 19991017: replaced with constants in each case */
2611#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
2612 {
2613 if (pixel_bytes == 1)
2614 {
2615 for (i = width; i; i--)
2616 {
2617 int j;
2618 for (j = 0; j < png_pass_inc[pass]; j++)
2619 {
2620 *dp-- = *sptr;
2621 }
2622 --sptr;
2623 }
2624 }
2625 else if (pixel_bytes == 3)
2626 {
2627 for (i = width; i; i--)
2628 {
2629 png_byte v[8];
2630 int j;
2631 png_memcpy(v, sptr, 3);
2632 for (j = 0; j < png_pass_inc[pass]; j++)
2633 {
2634 png_memcpy(dp, v, 3);
2635 dp -= 3;
2636 }
2637 sptr -= 3;
2638 }
2639 }
2640 else if (pixel_bytes == 2)
2641 {
2642 for (i = width; i; i--)
2643 {
2644 png_byte v[8];
2645 int j;
2646 png_memcpy(v, sptr, 2);
2647 for (j = 0; j < png_pass_inc[pass]; j++)
2648 {
2649 png_memcpy(dp, v, 2);
2650 dp -= 2;
2651 }
2652 sptr -= 2;
2653 }
2654 }
2655 else if (pixel_bytes == 4)
2656 {
2657 for (i = width; i; i--)
2658 {
2659 png_byte v[8];
2660 int j;
2661 png_memcpy(v, sptr, 4);
2662 for (j = 0; j < png_pass_inc[pass]; j++)
2663 {
2664#ifdef PNG_DEBUG
2665 if (dp < row || dp+3 > row+png_ptr->row_buf_size)
2666 {
2667 printf("dp out of bounds: row=%d, dp=%d, rp=%d\n",
2668 row, dp, row+png_ptr->row_buf_size);
2669 printf("row_buf=%d\n",png_ptr->row_buf_size);
2670 }
2671#endif
2672 png_memcpy(dp, v, 4);
2673 dp -= 4;
2674 }
2675 sptr -= 4;
2676 }
2677 }
2678 else if (pixel_bytes == 6)
2679 {
2680 for (i = width; i; i--)
2681 {
2682 png_byte v[8];
2683 int j;
2684 png_memcpy(v, sptr, 6);
2685 for (j = 0; j < png_pass_inc[pass]; j++)
2686 {
2687 png_memcpy(dp, v, 6);
2688 dp -= 6;
2689 }
2690 sptr -= 6;
2691 }
2692 }
2693 else if (pixel_bytes == 8)
2694 {
2695 for (i = width; i; i--)
2696 {
2697 png_byte v[8];
2698 int j;
2699 png_memcpy(v, sptr, 8);
2700 for (j = 0; j < png_pass_inc[pass]; j++)
2701 {
2702 png_memcpy(dp, v, 8);
2703 dp -= 8;
2704 }
2705 sptr -= 8;
2706 }
2707 }
2708 else /* GRR: should never be reached */
2709 {
2710 for (i = width; i; i--)
2711 {
2712 png_byte v[8];
2713 int j;
2714 png_memcpy(v, sptr, pixel_bytes);
2715 for (j = 0; j < png_pass_inc[pass]; j++)
2716 {
2717 png_memcpy(dp, v, pixel_bytes);
2718 dp -= pixel_bytes;
2719 }
2720 sptr -= pixel_bytes;
2721 }
2722 }
2723
2724 } /* end if (MMX not supported) */
2725 break;
2726 }
2727 } /* end switch (row_info->pixel_depth) */
2728
2729 row_info->width = final_width;
2730 row_info->rowbytes = ((final_width *
2731 (png_uint_32)row_info->pixel_depth + 7) >> 3);
2732 }
2733
2734} /* end png_do_read_interlace() */
2735
2736#endif /* PNG_HAVE_ASSEMBLER_READ_INTERLACE */
2737#endif /* PNG_READ_INTERLACING_SUPPORTED */
2738
2739
2740
2741#if defined(PNG_HAVE_ASSEMBLER_READ_FILTER_ROW)
2742#if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
2743
7f88f624
VZ
2744/* These variables are utilized in the functions below. They are declared */
2745/* globally here to ensure alignment on 8-byte boundaries. */
c6b71bff
GD
2746
/* 64-bit cells shared between the C code and the inline MMX assembly in the
 * filter routines.  The assembly refers to the objects by symbol name (e.g.
 * "movq _ActiveMask, %%mm7"), so the identifiers must not be renamed. */
2747union uAll {
2748 long long use; /* the 64-bit payload; the member the C code assigns to */
2749 double align; /* never referenced in the visible code; presumably here only to influence alignment -- TODO confirm */
2750} _LBCarryMask = {0x0101010101010101LL}, /* 0x01 in every byte: ANDed with Prior(x) to isolate each byte's lsb */
2751 _HBClearMask = {0x7f7f7f7f7f7f7f7fLL}, /* 0x7f in every byte: ANDed after a 64-bit "psrlq $1" to clear bit 7 of each byte */
 /* _ActiveMask, _ShiftBpp and _ShiftRem are assigned (via .use) in the
  * per-bpp switch of the filter code before each asm block reads them;
  * _ActiveMask2 and _ActiveMaskEnd are not referenced in the visible
  * portion of this file. */
2752 _ActiveMask, _ActiveMask2, _ActiveMaskEnd, _ShiftBpp, _ShiftRem;
2753
2754#ifdef PNG_THREAD_UNSAFE_OK
7f88f624
VZ
2755/*===========================================================================*/
2756/* */
2757/* P N G _ R E A D _ F I L T E R _ R O W _ M M X _ A V G */
2758/* */
2759/*===========================================================================*/
c6b71bff 2760
7f88f624 2761/* Optimized code for PNG Average filter decoder */
c6b71bff
GD
2762
2763static void /* PRIVATE */
2764png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row,
2765 png_bytep prev_row)
2766{
2767 int bpp;
7f88f624 2768 int dummy_value_c; /* fix 'forbidden register 2 (cx) was spilled' error */
c6b71bff
GD
2769 int dummy_value_S;
2770 int dummy_value_D;
2771
7f88f624
VZ
2772 bpp = (row_info->pixel_depth + 7) >> 3; /* get # bytes per pixel */
2773 _FullLength = row_info->rowbytes; /* # of bytes to filter */
c6b71bff
GD
2774
2775 __asm__ __volatile__ (
7f88f624 2776 /* initialize address pointers and offset */
c6b71bff 2777#ifdef __PIC__
7f88f624 2778 "pushl %%ebx \n\t" /* save index to Global Offset Table */
c6b71bff 2779#endif
7f88f624
VZ
2780/*pre "movl row, %%edi \n\t" */ /* edi: Avg(x) */
2781 "xorl %%ebx, %%ebx \n\t" /* ebx: x */
c6b71bff 2782 "movl %%edi, %%edx \n\t"
7f88f624
VZ
2783/*pre "movl prev_row, %%esi \n\t" */ /* esi: Prior(x) */
2784/*pre "subl bpp, %%edx \n\t" */ /* (bpp is preloaded into ecx) */
2785 "subl %%ecx, %%edx \n\t" /* edx: Raw(x-bpp) */
c6b71bff
GD
2786
2787 "xorl %%eax,%%eax \n\t"
2788
7f88f624
VZ
2789 /* Compute the Raw value for the first bpp bytes */
2790 /* Raw(x) = Avg(x) + (Prior(x)/2) */
c6b71bff 2791 "avg_rlp: \n\t"
7f88f624 2792 "movb (%%esi,%%ebx,),%%al \n\t" /* load al with Prior(x) */
c6b71bff 2793 "incl %%ebx \n\t"
7f88f624
VZ
2794 "shrb %%al \n\t" /* divide by 2 */
2795 "addb -1(%%edi,%%ebx,),%%al \n\t" /* add Avg(x); -1 to offset inc ebx */
2796/* pre "cmpl bpp, %%ebx \n\t" */ /* (bpp is preloaded into ecx) */
c6b71bff 2797 "cmpl %%ecx, %%ebx \n\t"
7f88f624
VZ
2798 "movb %%al,-1(%%edi,%%ebx,) \n\t" /* write Raw(x); -1 to offset inc ebx */
2799 "jb avg_rlp \n\t" /* mov does not affect flags */
2800
2801 /* get # of bytes to alignment */
2802 "movl %%edi, _dif \n\t" /* take start of row */
2803 "addl %%ebx, _dif \n\t" /* add bpp */
2804 "addl $0xf, _dif \n\t" /* add 7+8 to incr past alignment bdry */
2805 "andl $0xfffffff8, _dif \n\t" /* mask to alignment boundary */
2806 "subl %%edi, _dif \n\t" /* subtract from start => value ebx at */
2807 "jz avg_go \n\t" /* alignment */
2808
2809 /* fix alignment */
2810 /* Compute the Raw value for the bytes up to the alignment boundary */
2811 /* Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2) */
c6b71bff
GD
2812 "xorl %%ecx, %%ecx \n\t"
2813
2814 "avg_lp1: \n\t"
2815 "xorl %%eax, %%eax \n\t"
7f88f624
VZ
2816 "movb (%%esi,%%ebx,), %%cl \n\t" /* load cl with Prior(x) */
2817 "movb (%%edx,%%ebx,), %%al \n\t" /* load al with Raw(x-bpp) */
c6b71bff
GD
2818 "addw %%cx, %%ax \n\t"
2819 "incl %%ebx \n\t"
7f88f624
VZ
2820 "shrw %%ax \n\t" /* divide by 2 */
2821 "addb -1(%%edi,%%ebx,), %%al \n\t" /* add Avg(x); -1 to offset inc ebx */
2822 "cmpl _dif, %%ebx \n\t" /* check if at alignment boundary */
2823 "movb %%al, -1(%%edi,%%ebx,) \n\t" /* write Raw(x); -1 to offset inc ebx */
2824 "jb avg_lp1 \n\t" /* repeat until at alignment boundary */
c6b71bff
GD
2825
2826 "avg_go: \n\t"
2827 "movl _FullLength, %%eax \n\t"
2828 "movl %%eax, %%ecx \n\t"
7f88f624
VZ
2829 "subl %%ebx, %%eax \n\t" /* subtract alignment fix */
2830 "andl $0x00000007, %%eax \n\t" /* calc bytes over mult of 8 */
2831 "subl %%eax, %%ecx \n\t" /* drop over bytes from original length */
c6b71bff
GD
2832 "movl %%ecx, _MMXLength \n\t"
2833#ifdef __PIC__
7f88f624 2834 "popl %%ebx \n\t" /* restore index to Global Offset Table */
c6b71bff
GD
2835#endif
2836
7f88f624 2837 : "=c" (dummy_value_c), /* output regs (dummy) */
c6b71bff
GD
2838 "=S" (dummy_value_S),
2839 "=D" (dummy_value_D)
2840
7f88f624
VZ
2841 : "0" (bpp), /* ecx // input regs */
2842 "1" (prev_row), /* esi */
2843 "2" (row) /* edi */
c6b71bff 2844
7f88f624 2845 : "%eax", "%edx" /* clobber list */
c6b71bff
GD
2846#ifndef __PIC__
2847 , "%ebx"
2848#endif
7f88f624
VZ
2849 /* GRR: INCLUDE "memory" as clobbered? (_dif, _MMXLength) */
2850 /* (seems to work fine without...) */
c6b71bff
GD
2851 );
2852
7f88f624 2853 /* now do the math for the rest of the row */
c6b71bff
GD
2854 switch (bpp)
2855 {
2856 case 3:
2857 {
2858 _ActiveMask.use = 0x0000000000ffffffLL;
7f88f624
VZ
2859 _ShiftBpp.use = 24; /* == 3 * 8 */
2860 _ShiftRem.use = 40; /* == 64 - 24 */
c6b71bff
GD
2861
2862 __asm__ __volatile__ (
7f88f624 2863 /* re-init address pointers and offset */
c6b71bff 2864 "movq _ActiveMask, %%mm7 \n\t"
7f88f624
VZ
2865 "movl _dif, %%ecx \n\t" /* ecx: x = offset to */
2866 "movq _LBCarryMask, %%mm5 \n\t" /* alignment boundary */
2867/* preload "movl row, %%edi \n\t" // edi: Avg(x) */
c6b71bff 2868 "movq _HBClearMask, %%mm4 \n\t"
7f88f624 2869/* preload "movl prev_row, %%esi \n\t" // esi: Prior(x) */
c6b71bff 2870
7f88f624
VZ
2871 /* prime the pump: load the first Raw(x-bpp) data set */
2872 "movq -8(%%edi,%%ecx,), %%mm2 \n\t" /* load previous aligned 8 bytes */
2873 /* (correct pos. in loop below) */
c6b71bff 2874 "avg_3lp: \n\t"
7f88f624 2875 "movq (%%edi,%%ecx,), %%mm0 \n\t" /* load mm0 with Avg(x) */
c6b71bff 2876 "movq %%mm5, %%mm3 \n\t"
7f88f624
VZ
2877 "psrlq _ShiftRem, %%mm2 \n\t" /* correct position Raw(x-bpp) */
2878 /* data */
2879 "movq (%%esi,%%ecx,), %%mm1 \n\t" /* load mm1 with Prior(x) */
c6b71bff 2880 "movq %%mm7, %%mm6 \n\t"
7f88f624
VZ
2881 "pand %%mm1, %%mm3 \n\t" /* get lsb for each prev_row byte */
2882 "psrlq $1, %%mm1 \n\t" /* divide prev_row bytes by 2 */
2883 "pand %%mm4, %%mm1 \n\t" /* clear invalid bit 7 of each */
2884 /* byte */
2885 "paddb %%mm1, %%mm0 \n\t" /* add (Prev_row/2) to Avg for */
2886 /* each byte */
2887 /* add 1st active group (Raw(x-bpp)/2) to average with LBCarry */
2888 "movq %%mm3, %%mm1 \n\t" /* now use mm1 for getting */
2889 /* LBCarrys */
2890 "pand %%mm2, %%mm1 \n\t" /* get LBCarrys for each byte */
2891 /* where both */
2892 /* lsb's were == 1 (only valid for active group) */
2893 "psrlq $1, %%mm2 \n\t" /* divide raw bytes by 2 */
2894 "pand %%mm4, %%mm2 \n\t" /* clear invalid bit 7 of each */
2895 /* byte */
2896 "paddb %%mm1, %%mm2 \n\t" /* add LBCarrys to (Raw(x-bpp)/2) */
2897 /* for each byte */
2898 "pand %%mm6, %%mm2 \n\t" /* leave only Active Group 1 */
2899 /* bytes to add to Avg */
2900 "paddb %%mm2, %%mm0 \n\t" /* add (Raw/2) + LBCarrys to */
2901 /* Avg for each Active */
2902 /* byte */
2903 /* add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry */
2904 "psllq _ShiftBpp, %%mm6 \n\t" /* shift the mm6 mask to cover */
2905 /* bytes 3-5 */
2906 "movq %%mm0, %%mm2 \n\t" /* mov updated Raws to mm2 */
2907 "psllq _ShiftBpp, %%mm2 \n\t" /* shift data to pos. correctly */
2908 "movq %%mm3, %%mm1 \n\t" /* now use mm1 for getting */
2909 /* LBCarrys */
2910 "pand %%mm2, %%mm1 \n\t" /* get LBCarrys for each byte */
2911 /* where both */
2912 /* lsb's were == 1 (only valid for active group) */
2913 "psrlq $1, %%mm2 \n\t" /* divide raw bytes by 2 */
2914 "pand %%mm4, %%mm2 \n\t" /* clear invalid bit 7 of each */
2915 /* byte */
2916 "paddb %%mm1, %%mm2 \n\t" /* add LBCarrys to (Raw(x-bpp)/2) */
2917 /* for each byte */
2918 "pand %%mm6, %%mm2 \n\t" /* leave only Active Group 2 */
2919 /* bytes to add to Avg */
2920 "paddb %%mm2, %%mm0 \n\t" /* add (Raw/2) + LBCarrys to */
2921 /* Avg for each Active */
2922 /* byte */
2923
2924 /* add 3rd active group (Raw(x-bpp)/2) to average with _LBCarry */
2925 "psllq _ShiftBpp, %%mm6 \n\t" /* shift mm6 mask to cover last */
2926 /* two */
2927 /* bytes */
2928 "movq %%mm0, %%mm2 \n\t" /* mov updated Raws to mm2 */
2929 "psllq _ShiftBpp, %%mm2 \n\t" /* shift data to pos. correctly */
2930 /* Data only needs to be shifted once here to */
2931 /* get the correct x-bpp offset. */
2932 "movq %%mm3, %%mm1 \n\t" /* now use mm1 for getting */
2933 /* LBCarrys */
2934 "pand %%mm2, %%mm1 \n\t" /* get LBCarrys for each byte */
2935 /* where both */
2936 /* lsb's were == 1 (only valid for active group) */
2937 "psrlq $1, %%mm2 \n\t" /* divide raw bytes by 2 */
2938 "pand %%mm4, %%mm2 \n\t" /* clear invalid bit 7 of each */
2939 /* byte */
2940 "paddb %%mm1, %%mm2 \n\t" /* add LBCarrys to (Raw(x-bpp)/2) */
2941 /* for each byte */
2942 "pand %%mm6, %%mm2 \n\t" /* leave only Active Group 3 */
2943 /* bytes to add to Avg */
c6b71bff 2944 "addl $8, %%ecx \n\t"
7f88f624
VZ
2945 "paddb %%mm2, %%mm0 \n\t" /* add (Raw/2) + LBCarrys to */
2946 /* Avg for each Active */
2947 /* byte */
2948 /* now ready to write back to memory */
c6b71bff 2949 "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
7f88f624 2950 /* move updated Raw(x) to use as Raw(x-bpp) for next loop */
c6b71bff 2951 "cmpl _MMXLength, %%ecx \n\t"
7f88f624 2952 "movq %%mm0, %%mm2 \n\t" /* mov updated Raw(x) to mm2 */
c6b71bff
GD
2953 "jb avg_3lp \n\t"
2954
7f88f624 2955 : "=S" (dummy_value_S), /* output regs (dummy) */
c6b71bff
GD
2956 "=D" (dummy_value_D)
2957
7f88f624
VZ
2958 : "0" (prev_row), /* esi // input regs */
2959 "1" (row) /* edi */
c6b71bff 2960
7f88f624 2961 : "%ecx" /* clobber list */
c6b71bff
GD
2962#if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
2963 , "%mm0", "%mm1", "%mm2", "%mm3"
2964 , "%mm4", "%mm5", "%mm6", "%mm7"
2965#endif
2966 );
2967 }
7f88f624 2968 break; /* end 3 bpp */
c6b71bff
GD
2969
2970 case 6:
2971 case 4:
7f88f624
VZ
2972 //case 7: /* who wrote this? PNG doesn't support 5 or 7 bytes/pixel */
2973 //case 5: /* GRR BOGUS */
c6b71bff 2974 {
7f88f624
VZ
2975 _ActiveMask.use = 0xffffffffffffffffLL; /* use shift below to clear */
2976 /* appropriate inactive bytes */
c6b71bff
GD
2977 _ShiftBpp.use = bpp << 3;
2978 _ShiftRem.use = 64 - _ShiftBpp.use;
2979
2980 __asm__ __volatile__ (
2981 "movq _HBClearMask, %%mm4 \n\t"
2982
7f88f624
VZ
2983 /* re-init address pointers and offset */
2984 "movl _dif, %%ecx \n\t" /* ecx: x = offset to */
2985 /* alignment boundary */
c6b71bff 2986
7f88f624 2987 /* load _ActiveMask and clear all bytes except for 1st active group */
c6b71bff 2988 "movq _ActiveMask, %%mm7 \n\t"
7f88f624 2989/* preload "movl row, %%edi \n\t" // edi: Avg(x) */
c6b71bff 2990 "psrlq _ShiftRem, %%mm7 \n\t"
7f88f624 2991/* preload "movl prev_row, %%esi \n\t" // esi: Prior(x) */
c6b71bff
GD
2992 "movq %%mm7, %%mm6 \n\t"
2993 "movq _LBCarryMask, %%mm5 \n\t"
7f88f624
VZ
2994 "psllq _ShiftBpp, %%mm6 \n\t" /* create mask for 2nd active */
2995 /* group */
c6b71bff 2996
7f88f624
VZ
2997 /* prime the pump: load the first Raw(x-bpp) data set */
2998 "movq -8(%%edi,%%ecx,), %%mm2 \n\t" /* load previous aligned 8 bytes */
2999 /* (we correct pos. in loop below) */
c6b71bff
GD
3000 "avg_4lp: \n\t"
3001 "movq (%%edi,%%ecx,), %%mm0 \n\t"
7f88f624 3002 "psrlq _ShiftRem, %%mm2 \n\t" /* shift data to pos. correctly */
c6b71bff 3003 "movq (%%esi,%%ecx,), %%mm1 \n\t"
7f88f624 3004 /* add (Prev_row/2) to average */
c6b71bff 3005 "movq %%mm5, %%mm3 \n\t"
7f88f624
VZ
3006 "pand %%mm1, %%mm3 \n\t" /* get lsb for each prev_row byte */
3007 "psrlq $1, %%mm1 \n\t" /* divide prev_row bytes by 2 */
3008 "pand %%mm4, %%mm1 \n\t" /* clear invalid bit 7 of each */
3009 /* byte */
3010 "paddb %%mm1, %%mm0 \n\t" /* add (Prev_row/2) to Avg for */
3011 /* each byte */
3012 /* add 1st active group (Raw(x-bpp)/2) to average with _LBCarry */
3013 "movq %%mm3, %%mm1 \n\t" /* now use mm1 for getting */
3014 /* LBCarrys */
3015 "pand %%mm2, %%mm1 \n\t" /* get LBCarrys for each byte */
3016 /* where both */
3017 /* lsb's were == 1 (only valid for active group) */
3018 "psrlq $1, %%mm2 \n\t" /* divide raw bytes by 2 */
3019 "pand %%mm4, %%mm2 \n\t" /* clear invalid bit 7 of each */
3020 /* byte */
3021 "paddb %%mm1, %%mm2 \n\t" /* add LBCarrys to (Raw(x-bpp)/2) */
3022 /* for each byte */
3023 "pand %%mm7, %%mm2 \n\t" /* leave only Active Group 1 */
3024 /* bytes to add to Avg */
3025 "paddb %%mm2, %%mm0 \n\t" /* add (Raw/2) + LBCarrys to Avg */
3026 /* for each Active */
3027 /* byte */
3028 /* add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry */
3029 "movq %%mm0, %%mm2 \n\t" /* mov updated Raws to mm2 */
3030 "psllq _ShiftBpp, %%mm2 \n\t" /* shift data to pos. correctly */
c6b71bff 3031 "addl $8, %%ecx \n\t"
7f88f624
VZ
3032 "movq %%mm3, %%mm1 \n\t" /* now use mm1 for getting */
3033 /* LBCarrys */
3034 "pand %%mm2, %%mm1 \n\t" /* get LBCarrys for each byte */
3035 /* where both */
3036 /* lsb's were == 1 (only valid for active group) */
3037 "psrlq $1, %%mm2 \n\t" /* divide raw bytes by 2 */
3038 "pand %%mm4, %%mm2 \n\t" /* clear invalid bit 7 of each */
3039 /* byte */
3040 "paddb %%mm1, %%mm2 \n\t" /* add LBCarrys to (Raw(x-bpp)/2) */
3041 /* for each byte */
3042 "pand %%mm6, %%mm2 \n\t" /* leave only Active Group 2 */
3043 /* bytes to add to Avg */
3044 "paddb %%mm2, %%mm0 \n\t" /* add (Raw/2) + LBCarrys to */
3045 /* Avg for each Active */
3046 /* byte */
c6b71bff 3047 "cmpl _MMXLength, %%ecx \n\t"
7f88f624 3048 /* now ready to write back to memory */
c6b71bff 3049 "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
7f88f624
VZ
3050 /* prep Raw(x-bpp) for next loop */
3051 "movq %%mm0, %%mm2 \n\t" /* mov updated Raws to mm2 */
c6b71bff
GD
3052 "jb avg_4lp \n\t"
3053
7f88f624 3054 : "=S" (dummy_value_S), /* output regs (dummy) */
c6b71bff
GD
3055 "=D" (dummy_value_D)
3056
7f88f624
VZ
3057 : "0" (prev_row), /* esi // input regs */
3058 "1" (row) /* edi */
c6b71bff 3059
7f88f624 3060 : "%ecx" /* clobber list */
c6b71bff
GD
3061#if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
3062 , "%mm0", "%mm1", "%mm2", "%mm3"
3063 , "%mm4", "%mm5", "%mm6", "%mm7"
3064#endif
3065 );
3066 }
7f88f624 3067 break; /* end 4,6 bpp */
c6b71bff
GD
3068
3069 case 2:
3070 {
3071 _ActiveMask.use = 0x000000000000ffffLL;
7f88f624
VZ
3072 _ShiftBpp.use = 16; /* == 2 * 8 */
3073 _ShiftRem.use = 48; /* == 64 - 16 */
c6b71bff
GD
3074
3075 __asm__ __volatile__ (
7f88f624 3076 /* load _ActiveMask */
c6b71bff 3077 "movq _ActiveMask, %%mm7 \n\t"
7f88f624
VZ
3078 /* re-init address pointers and offset */
3079 "movl _dif, %%ecx \n\t" /* ecx: x = offset to alignment */
3080 /* boundary */
c6b71bff 3081 "movq _LBCarryMask, %%mm5 \n\t"
7f88f624 3082/* preload "movl row, %%edi \n\t" // edi: Avg(x) */
c6b71bff 3083 "movq _HBClearMask, %%mm4 \n\t"
7f88f624 3084/* preload "movl prev_row, %%esi \n\t" // esi: Prior(x) */
c6b71bff 3085
7f88f624
VZ
3086 /* prime the pump: load the first Raw(x-bpp) data set */
3087 "movq -8(%%edi,%%ecx,), %%mm2 \n\t" /* load previous aligned 8 bytes */
3088 /* (we correct pos. in loop below) */
c6b71bff
GD
3089 "avg_2lp: \n\t"
3090 "movq (%%edi,%%ecx,), %%mm0 \n\t"
7f88f624
VZ
3091 "psrlq _ShiftRem, %%mm2 \n\t" /* shift data to pos. correctly */
3092 "movq (%%esi,%%ecx,), %%mm1 \n\t" /* (GRR BUGFIX: was psllq) */
3093 /* add (Prev_row/2) to average */
c6b71bff 3094 "movq %%mm5, %%mm3 \n\t"
7f88f624
VZ
3095 "pand %%mm1, %%mm3 \n\t" /* get lsb for each prev_row byte */
3096 "psrlq $1, %%mm1 \n\t" /* divide prev_row bytes by 2 */
3097 "pand %%mm4, %%mm1 \n\t" /* clear invalid bit 7 of each */
3098 /* byte */
c6b71bff 3099 "movq %%mm7, %%mm6 \n\t"
7f88f624
VZ
3100 "paddb %%mm1, %%mm0 \n\t" /* add (Prev_row/2) to Avg for */
3101 /* each byte */
3102
3103 /* add 1st active group (Raw(x-bpp)/2) to average with _LBCarry */
3104 "movq %%mm3, %%mm1 \n\t" /* now use mm1 for getting */
3105 /* LBCarrys */
3106 "pand %%mm2, %%mm1 \n\t" /* get LBCarrys for each byte */
3107 /* where both */
3108 /* lsb's were == 1 (only valid */
3109 /* for active group) */
3110 "psrlq $1, %%mm2 \n\t" /* divide raw bytes by 2 */
3111 "pand %%mm4, %%mm2 \n\t" /* clear invalid bit 7 of each */
3112 /* byte */
3113 "paddb %%mm1, %%mm2 \n\t" /* add LBCarrys to (Raw(x-bpp)/2) */
3114 /* for each byte */
3115 "pand %%mm6, %%mm2 \n\t" /* leave only Active Group 1 */
3116 /* bytes to add to Avg */
3117 "paddb %%mm2, %%mm0 \n\t" /* add (Raw/2) + LBCarrys to Avg */
3118 /* for each Active byte */
3119
3120 /* add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry */
3121 "psllq _ShiftBpp, %%mm6 \n\t" /* shift the mm6 mask to cover */
3122 /* bytes 2 & 3 */
3123 "movq %%mm0, %%mm2 \n\t" /* mov updated Raws to mm2 */
3124 "psllq _ShiftBpp, %%mm2 \n\t" /* shift data to pos. correctly */
3125 "movq %%mm3, %%mm1 \n\t" /* now use mm1 for getting */
3126 /* LBCarrys */
3127 "pand %%mm2, %%mm1 \n\t" /* get LBCarrys for each byte */
3128 /* where both */
3129 /* lsb's were == 1 (only valid */
3130 /* for active group) */
3131 "psrlq $1, %%mm2 \n\t" /* divide raw bytes by 2 */
3132 "pand %%mm4, %%mm2 \n\t" /* clear invalid bit 7 of each */
3133 /* byte */
3134 "paddb %%mm1, %%mm2 \n\t" /* add LBCarrys to (Raw(x-bpp)/2) */
3135 /* for each byte */
3136 "pand %%mm6, %%mm2 \n\t" /* leave only Active Group 2 */
3137 /* bytes to add to Avg */
3138 "paddb %%mm2, %%mm0 \n\t" /* add (Raw/2) + LBCarrys to */
3139 /* Avg for each Active byte */
3140
3141 /* add 3rd active group (Raw(x-bpp)/2) to average with _LBCarry */
3142 "psllq _ShiftBpp, %%mm6 \n\t" /* shift the mm6 mask to cover */
3143 /* bytes 4 & 5 */
3144 "movq %%mm0, %%mm2 \n\t" /* mov updated Raws to mm2 */
3145 "psllq _ShiftBpp, %%mm2 \n\t" /* shift data to pos. correctly */
3146 "movq %%mm3, %%mm1 \n\t" /* now use mm1 for getting */
3147 /* LBCarrys */
3148 "pand %%mm2, %%mm1 \n\t" /* get LBCarrys for each byte */
3149 /* where both lsb's were == 1 */
3150 /* (only valid for active group) */
3151 "psrlq $1, %%mm2 \n\t" /* divide raw bytes by 2 */
3152 "pand %%mm4, %%mm2 \n\t" /* clear invalid bit 7 of each */
3153 /* byte */
3154 "paddb %%mm1, %%mm2 \n\t" /* add LBCarrys to (Raw(x-bpp)/2) */
3155 /* for each byte */
3156 "pand %%mm6, %%mm2 \n\t" /* leave only Active Group 3 */
3157 /* bytes to add to Avg */
3158 "paddb %%mm2, %%mm0 \n\t" /* add (Raw/2) + LBCarrys to */
3159 /* Avg for each Active byte */
3160
3161 /* add 4th active group (Raw(x-bpp)/2) to average with _LBCarry */
3162 "psllq _ShiftBpp, %%mm6 \n\t" /* shift the mm6 mask to cover */
3163 /* bytes 6 & 7 */
3164 "movq %%mm0, %%mm2 \n\t" /* mov updated Raws to mm2 */
3165 "psllq _ShiftBpp, %%mm2 \n\t" /* shift data to pos. correctly */
c6b71bff 3166 "addl $8, %%ecx \n\t"
7f88f624
VZ
3167 "movq %%mm3, %%mm1 \n\t" /* now use mm1 for getting */
3168 /* LBCarrys */
3169 "pand %%mm2, %%mm1 \n\t" /* get LBCarrys for each byte */
3170 /* where both */
3171 /* lsb's were == 1 (only valid */
3172 /* for active group) */
3173 "psrlq $1, %%mm2 \n\t" /* divide raw bytes by 2 */
3174 "pand %%mm4, %%mm2 \n\t" /* clear invalid bit 7 of each */
3175 /* byte */
3176 "paddb %%mm1, %%mm2 \n\t" /* add LBCarrys to (Raw(x-bpp)/2) */
3177 /* for each byte */
3178 "pand %%mm6, %%mm2 \n\t" /* leave only Active Group 4 */
3179 /* bytes to add to Avg */
3180 "paddb %%mm2, %%mm0 \n\t" /* add (Raw/2) + LBCarrys to */
3181 /* Avg for each Active byte */
c6b71bff
GD
3182
3183 "cmpl _MMXLength, %%ecx \n\t"
7f88f624 3184 /* now ready to write back to memory */
c6b71bff 3185 "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
7f88f624
VZ
3186 /* prep Raw(x-bpp) for next loop */
3187 "movq %%mm0, %%mm2 \n\t" /* mov updated Raws to mm2 */
c6b71bff
GD
3188 "jb avg_2lp \n\t"
3189
7f88f624 3190 : "=S" (dummy_value_S), /* output regs (dummy) */
c6b71bff
GD
3191 "=D" (dummy_value_D)
3192
7f88f624
VZ
3193 : "0" (prev_row), /* esi // input regs */
3194 "1" (row) /* edi */
c6b71bff 3195
7f88f624 3196 : "%ecx" /* clobber list */
c6b71bff
GD
3197#if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
3198 , "%mm0", "%mm1", "%mm2", "%mm3"
3199 , "%mm4", "%mm5", "%mm6", "%mm7"
3200#endif
3201 );
3202 }
7f88f624 3203 break; /* end 2 bpp */
c6b71bff
GD
3204
3205 case 1:
3206 {
3207 __asm__ __volatile__ (
7f88f624 3208 /* re-init address pointers and offset */
c6b71bff 3209#ifdef __PIC__
7f88f624 3210 "pushl %%ebx \n\t" /* save Global Offset Table index */
c6b71bff 3211#endif
7f88f624
VZ
3212 "movl _dif, %%ebx \n\t" /* ebx: x = offset to alignment */
3213 /* boundary */
3214/* preload "movl row, %%edi \n\t" // edi: Avg(x) */
3215 "cmpl _FullLength, %%ebx \n\t" /* test if offset at end of array */
c6b71bff 3216 "jnb avg_1end \n\t"
7f88f624
VZ
3217 /* do Avg decode for remaining bytes */
3218/* preload "movl prev_row, %%esi \n\t" // esi: Prior(x) */
c6b71bff 3219 "movl %%edi, %%edx \n\t"
7f88f624
VZ
3220/* preload "subl bpp, %%edx \n\t" // (bpp is preloaded into ecx) */
3221 "subl %%ecx, %%edx \n\t" /* edx: Raw(x-bpp) */
3222 "xorl %%ecx, %%ecx \n\t" /* zero ecx before using cl & cx */
3223 /* in loop below */
c6b71bff 3224 "avg_1lp: \n\t"
7f88f624 3225 /* Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2) */
c6b71bff 3226 "xorl %%eax, %%eax \n\t"
7f88f624
VZ
3227 "movb (%%esi,%%ebx,), %%cl \n\t" /* load cl with Prior(x) */
3228 "movb (%%edx,%%ebx,), %%al \n\t" /* load al with Raw(x-bpp) */
c6b71bff
GD
3229 "addw %%cx, %%ax \n\t"
3230 "incl %%ebx \n\t"
7f88f624
VZ
3231 "shrw %%ax \n\t" /* divide by 2 */
3232 "addb -1(%%edi,%%ebx,), %%al \n\t" /* add Avg(x); -1 to offset */
3233 /* inc ebx */
3234 "cmpl _FullLength, %%ebx \n\t" /* check if at end of array */
3235 "movb %%al, -1(%%edi,%%ebx,) \n\t" /* write back Raw(x); */
3236 /* mov does not affect flags; -1 to offset inc ebx */
c6b71bff
GD
3237 "jb avg_1lp \n\t"
3238
3239 "avg_1end: \n\t"
3240#ifdef __PIC__
7f88f624 3241 "popl %%ebx \n\t" /* Global Offset Table index */
c6b71bff
GD
3242#endif
3243
7f88f624 3244 : "=c" (dummy_value_c), /* output regs (dummy) */
c6b71bff
GD
3245 "=S" (dummy_value_S),
3246 "=D" (dummy_value_D)
3247
7f88f624
VZ
3248 : "0" (bpp), /* ecx // input regs */
3249 "1" (prev_row), /* esi */
3250 "2" (row) /* edi */
c6b71bff 3251
7f88f624 3252 : "%eax", "%edx" /* clobber list */
c6b71bff
GD
3253#ifndef __PIC__
3254 , "%ebx"
3255#endif
3256 );
3257 }
7f88f624 3258 return; /* end 1 bpp */
c6b71bff
GD
3259
3260 case 8:
3261 {
3262 __asm__ __volatile__ (
7f88f624
VZ
3263 /* re-init address pointers and offset */
3264 "movl _dif, %%ecx \n\t" /* ecx: x == offset to alignment */
3265 "movq _LBCarryMask, %%mm5 \n\t" /* boundary */
3266/* preload "movl row, %%edi \n\t" // edi: Avg(x) */
c6b71bff 3267 "movq _HBClearMask, %%mm4 \n\t"
7f88f624 3268/* preload "movl prev_row, %%esi \n\t" // esi: Prior(x) */
c6b71bff 3269
7f88f624
VZ
3270 /* prime the pump: load the first Raw(x-bpp) data set */
3271 "movq -8(%%edi,%%ecx,), %%mm2 \n\t" /* load previous aligned 8 bytes */
3272 /* (NO NEED to correct pos. in loop below) */
c6b71bff
GD
3273
3274 "avg_8lp: \n\t"
3275 "movq (%%edi,%%ecx,), %%mm0 \n\t"
3276 "movq %%mm5, %%mm3 \n\t"
3277 "movq (%%esi,%%ecx,), %%mm1 \n\t"
3278 "addl $8, %%ecx \n\t"
7f88f624
VZ
3279 "pand %%mm1, %%mm3 \n\t" /* get lsb for each prev_row byte */
3280 "psrlq $1, %%mm1 \n\t" /* divide prev_row bytes by 2 */
3281 "pand %%mm2, %%mm3 \n\t" /* get LBCarrys for each byte */
3282 /* where both lsb's were == 1 */
3283 "psrlq $1, %%mm2 \n\t" /* divide raw bytes by 2 */
3284 "pand %%mm4, %%mm1 \n\t" /* clear invalid bit 7, each byte */
3285 "paddb %%mm3, %%mm0 \n\t" /* add LBCarrys to Avg, each byte */
3286 "pand %%mm4, %%mm2 \n\t" /* clear invalid bit 7, each byte */
3287 "paddb %%mm1, %%mm0 \n\t" /* add (Prev_row/2) to Avg, each */
3288 "paddb %%mm2, %%mm0 \n\t" /* add (Raw/2) to Avg for each */
c6b71bff
GD
3289 "cmpl _MMXLength, %%ecx \n\t"
3290 "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
7f88f624 3291 "movq %%mm0, %%mm2 \n\t" /* reuse as Raw(x-bpp) */
c6b71bff
GD
3292 "jb avg_8lp \n\t"
3293
7f88f624 3294 : "=S" (dummy_value_S), /* output regs (dummy) */
c6b71bff
GD
3295 "=D" (dummy_value_D)
3296
7f88f624
VZ
3297 : "0" (prev_row), /* esi // input regs */
3298 "1" (row) /* edi */
c6b71bff 3299
7f88f624 3300 : "%ecx" /* clobber list */
c6b71bff
GD
3301#if 0 /* %mm0, ..., %mm5 not supported by gcc 2.7.2.3 or egcs 1.1 */
3302 , "%mm0", "%mm1", "%mm2"
3303 , "%mm3", "%mm4", "%mm5"
3304#endif
3305 );
3306 }
7f88f624 3307 break; /* end 8 bpp */
c6b71bff 3308
7f88f624 3309 default: /* bpp greater than 8 (!= 1,2,3,4,[5],6,[7],8) */
c6b71bff
GD
3310 {
3311
3312#ifdef PNG_DEBUG
7f88f624 3313 /* GRR: PRINT ERROR HERE: SHOULD NEVER BE REACHED */
c6b71bff
GD
3314 png_debug(1,
3315 "Internal logic error in pnggccrd (png_read_filter_row_mmx_avg())\n");
3316#endif
3317
3318#if 0
3319 __asm__ __volatile__ (
3320 "movq _LBCarryMask, %%mm5 \n\t"
7f88f624
VZ
3321 /* re-init address pointers and offset */
3322 "movl _dif, %%ebx \n\t" /* ebx: x = offset to */
3323 /* alignment boundary */
3324 "movl row, %%edi \n\t" /* edi: Avg(x) */
c6b71bff
GD
3325 "movq _HBClearMask, %%mm4 \n\t"
3326 "movl %%edi, %%edx \n\t"
7f88f624
VZ
3327 "movl prev_row, %%esi \n\t" /* esi: Prior(x) */
3328 "subl bpp, %%edx \n\t" /* edx: Raw(x-bpp) */
c6b71bff
GD
3329 "avg_Alp: \n\t"
3330 "movq (%%edi,%%ebx,), %%mm0 \n\t"
3331 "movq %%mm5, %%mm3 \n\t"
3332 "movq (%%esi,%%ebx,), %%mm1 \n\t"
7f88f624 3333 "pand %%mm1, %%mm3 \n\t" /* get lsb for each prev_row byte */
c6b71bff 3334 "movq (%%edx,%%ebx,), %%mm2 \n\t"
7f88f624
VZ
3335 "psrlq $1, %%mm1 \n\t" /* divide prev_row bytes by 2 */
3336 "pand %%mm2, %%mm3 \n\t" /* get LBCarrys for each byte */
3337 /* where both lsb's were == 1 */
3338 "psrlq $1, %%mm2 \n\t" /* divide raw bytes by 2 */
3339 "pand %%mm4, %%mm1 \n\t" /* clear invalid bit 7 of each */
3340 /* byte */
3341 "paddb %%mm3, %%mm0 \n\t" /* add LBCarrys to Avg for each */
3342 /* byte */
3343 "pand %%mm4, %%mm2 \n\t" /* clear invalid bit 7 of each */
3344 /* byte */
3345 "paddb %%mm1, %%mm0 \n\t" /* add (Prev_row/2) to Avg for */
3346 /* each byte */
c6b71bff 3347 "addl $8, %%ebx \n\t"
7f88f624
VZ
3348 "paddb %%mm2, %%mm0 \n\t" /* add (Raw/2) to Avg for each */
3349 /* byte */
c6b71bff
GD
3350 "cmpl _MMXLength, %%ebx \n\t"
3351 "movq %%mm0, -8(%%edi,%%ebx,) \n\t"
3352 "jb avg_Alp \n\t"
3353
7f88f624 3354 : /* FIXASM: output regs/vars go here, e.g.: "=m" (memory_var) */
c6b71bff 3355
7f88f624 3356 : /* FIXASM: input regs, e.g.: "c" (count), "S" (src), "D" (dest) */
c6b71bff 3357
7f88f624 3358 : "%ebx", "%edx", "%edi", "%esi" /* CHECKASM: clobber list */
c6b71bff
GD
3359 );
3360#endif /* 0 - NEVER REACHED */
3361 }
3362 break;
3363
7f88f624 3364 } /* end switch (bpp) */
c6b71bff
GD
3365
3366 __asm__ __volatile__ (
7f88f624
VZ
3367 /* MMX acceleration complete; now do clean-up */
3368 /* check if any remaining bytes left to decode */
c6b71bff 3369#ifdef __PIC__
7f88f624 3370 "pushl %%ebx \n\t" /* save index to Global Offset Table */
c6b71bff 3371#endif
7f88f624
VZ
3372 "movl _MMXLength, %%ebx \n\t" /* ebx: x == offset bytes after MMX */
3373/* pre "movl row, %%edi \n\t" */ /* edi: Avg(x) */
3374 "cmpl _FullLength, %%ebx \n\t" /* test if offset at end of array */
c6b71bff
GD
3375 "jnb avg_end \n\t"
3376
7f88f624
VZ
3377 /* do Avg decode for remaining bytes */
3378/*pre "movl prev_row, %%esi \n\t" */ /* esi: Prior(x) */
c6b71bff 3379 "movl %%edi, %%edx \n\t"
7f88f624
VZ
3380/*pre "subl bpp, %%edx \n\t" */ /* (bpp is preloaded into ecx) */
3381 "subl %%ecx, %%edx \n\t" /* edx: Raw(x-bpp) */
3382 "xorl %%ecx, %%ecx \n\t" /* zero ecx before using cl & cx below */
c6b71bff
GD
3383
3384 "avg_lp2: \n\t"
7f88f624 3385 /* Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2) */
c6b71bff 3386 "xorl %%eax, %%eax \n\t"
7f88f624
VZ
3387 "movb (%%esi,%%ebx,), %%cl \n\t" /* load cl with Prior(x) */
3388 "movb (%%edx,%%ebx,), %%al \n\t" /* load al with Raw(x-bpp) */
c6b71bff
GD
3389 "addw %%cx, %%ax \n\t"
3390 "incl %%ebx \n\t"
7f88f624
VZ
3391 "shrw %%ax \n\t" /* divide by 2 */
3392 "addb -1(%%edi,%%ebx,), %%al \n\t" /* add Avg(x); -1 to offset inc ebx */
3393 "cmpl _FullLength, %%ebx \n\t" /* check if at end of array */
3394 "movb %%al, -1(%%edi,%%ebx,) \n\t" /* write back Raw(x) [mov does not */
3395 "jb avg_lp2 \n\t" /* affect flags; -1 to offset inc ebx] */
c6b71bff
GD
3396
3397 "avg_end: \n\t"
7f88f624 3398 "EMMS \n\t" /* end MMX; prep for poss. FP instrs. */
c6b71bff 3399#ifdef __PIC__
7f88f624 3400 "popl %%ebx \n\t" /* restore index to Global Offset Table */
c6b71bff
GD
3401#endif
3402
7f88f624 3403 : "=c" (dummy_value_c), /* output regs (dummy) */
c6b71bff
GD
3404 "=S" (dummy_value_S),
3405 "=D" (dummy_value_D)
3406
7f88f624
VZ
3407 : "0" (bpp), /* ecx // input regs */
3408 "1" (prev_row), /* esi */
3409 "2" (row) /* edi */
c6b71bff 3410
7f88f624 3411 : "%eax", "%edx" /* clobber list */
c6b71bff
GD
3412#ifndef __PIC__
3413 , "%ebx"
3414#endif
3415 );
3416
3417} /* end png_read_filter_row_mmx_avg() */
3418#endif
3419
3420
3421
3422#ifdef PNG_THREAD_UNSAFE_OK
7f88f624
VZ
3423/*===========================================================================*/
3424/* */
3425/* P N G _ R E A D _ F I L T E R _ R O W _ M M X _ P A E T H */
3426/* */
3427/*===========================================================================*/
c6b71bff 3428
7f88f624 3429/* Optimized code for PNG Paeth filter decoder */
c6b71bff
GD
3430
3431static void /* PRIVATE */
3432png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
3433 png_bytep prev_row)
3434{
3435 int bpp;
7f88f624 3436 int dummy_value_c; /* fix 'forbidden register 2 (cx) was spilled' error */
c6b71bff
GD
3437 int dummy_value_S;
3438 int dummy_value_D;
3439
7f88f624
VZ
3440 bpp = (row_info->pixel_depth + 7) >> 3; /* Get # bytes per pixel */
3441 _FullLength = row_info->rowbytes; /* # of bytes to filter */
c6b71bff
GD
3442
3443 __asm__ __volatile__ (
3444#ifdef __PIC__
7f88f624 3445 "pushl %%ebx \n\t" /* save index to Global Offset Table */
c6b71bff 3446#endif
7f88f624
VZ
3447 "xorl %%ebx, %%ebx \n\t" /* ebx: x offset */
3448/*pre "movl row, %%edi \n\t" */
3449 "xorl %%edx, %%edx \n\t" /* edx: x-bpp offset */
3450/*pre "movl prev_row, %%esi \n\t" */
c6b71bff
GD
3451 "xorl %%eax, %%eax \n\t"
3452
7f88f624
VZ
3453 /* Compute the Raw value for the first bpp bytes */
3454 /* Note: the formula works out to be always */
3455 /* Paeth(x) = Raw(x) + Prior(x) where x < bpp */
c6b71bff
GD
3456 "paeth_rlp: \n\t"
3457 "movb (%%edi,%%ebx,), %%al \n\t"
3458 "addb (%%esi,%%ebx,), %%al \n\t"
3459 "incl %%ebx \n\t"
7f88f624 3460/*pre "cmpl bpp, %%ebx \n\t" (bpp is preloaded into ecx) */
c6b71bff
GD
3461 "cmpl %%ecx, %%ebx \n\t"
3462 "movb %%al, -1(%%edi,%%ebx,) \n\t"
3463 "jb paeth_rlp \n\t"
7f88f624
VZ
3464 /* get # of bytes to alignment */
3465 "movl %%edi, _dif \n\t" /* take start of row */
3466 "addl %%ebx, _dif \n\t" /* add bpp */
c6b71bff 3467 "xorl %%ecx, %%ecx \n\t"
7f88f624
VZ
3468 "addl $0xf, _dif \n\t" /* add 7 + 8 to incr past alignment */
3469 /* boundary */
3470 "andl $0xfffffff8, _dif \n\t" /* mask to alignment boundary */
3471 "subl %%edi, _dif \n\t" /* subtract from start ==> value ebx */
3472 /* at alignment */
c6b71bff 3473 "jz paeth_go \n\t"
7f88f624 3474 /* fix alignment */
c6b71bff
GD
3475
3476 "paeth_lp1: \n\t"
3477 "xorl %%eax, %%eax \n\t"
7f88f624
VZ
3478 /* pav = p - a = (a + b - c) - a = b - c */
3479 "movb (%%esi,%%ebx,), %%al \n\t" /* load Prior(x) into al */
3480 "movb (%%esi,%%edx,), %%cl \n\t" /* load Prior(x-bpp) into cl */
3481 "subl %%ecx, %%eax \n\t" /* subtract Prior(x-bpp) */
3482 "movl %%eax, _patemp \n\t" /* Save pav for later use */
c6b71bff 3483 "xorl %%eax, %%eax \n\t"
7f88f624
VZ
3484 /* pbv = p - b = (a + b - c) - b = a - c */
3485 "movb (%%edi,%%edx,), %%al \n\t" /* load Raw(x-bpp) into al */
3486 "subl %%ecx, %%eax \n\t" /* subtract Prior(x-bpp) */
c6b71bff 3487 "movl %%eax, %%ecx \n\t"
7f88f624
VZ
3488 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
3489 "addl _patemp, %%eax \n\t" /* pcv = pav + pbv */
3490 /* pc = abs(pcv) */
c6b71bff
GD
3491 "testl $0x80000000, %%eax \n\t"
3492 "jz paeth_pca \n\t"
7f88f624 3493 "negl %%eax \n\t" /* reverse sign of neg values */
c6b71bff
GD
3494
3495 "paeth_pca: \n\t"
7f88f624
VZ
3496 "movl %%eax, _pctemp \n\t" /* save pc for later use */
3497 /* pb = abs(pbv) */
c6b71bff
GD
3498 "testl $0x80000000, %%ecx \n\t"
3499 "jz paeth_pba \n\t"
7f88f624 3500 "negl %%ecx \n\t" /* reverse sign of neg values */
c6b71bff
GD
3501
3502 "paeth_pba: \n\t"
7f88f624
VZ
3503 "movl %%ecx, _pbtemp \n\t" /* save pb for later use */
3504 /* pa = abs(pav) */
c6b71bff
GD
3505 "movl _patemp, %%eax \n\t"
3506 "testl $0x80000000, %%eax \n\t"
3507 "jz paeth_paa \n\t"
7f88f624 3508 "negl %%eax \n\t" /* reverse sign of neg values */
c6b71bff
GD
3509
3510 "paeth_paa: \n\t"
7f88f624
VZ
3511 "movl %%eax, _patemp \n\t" /* save pa for later use */
3512 /* test if pa <= pb */
c6b71bff
GD
3513 "cmpl %%ecx, %%eax \n\t"
3514 "jna paeth_abb \n\t"
7f88f624 3515 /* pa > pb; now test if pb <= pc */
c6b71bff
GD
3516 "cmpl _pctemp, %%ecx \n\t"
3517 "jna paeth_bbc \n\t"
7f88f624
VZ
3518 /* pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp) */
3519 "movb (%%esi,%%edx,), %%cl \n\t" /* load Prior(x-bpp) into cl */
c6b71bff
GD
3520 "jmp paeth_paeth \n\t"
3521
3522 "paeth_bbc: \n\t"
7f88f624
VZ
3523 /* pb <= pc; Raw(x) = Paeth(x) + Prior(x) */
3524 "movb (%%esi,%%ebx,), %%cl \n\t" /* load Prior(x) into cl */
c6b71bff
GD
3525 "jmp paeth_paeth \n\t"
3526
3527 "paeth_abb: \n\t"
7f88f624 3528 /* pa <= pb; now test if pa <= pc */
c6b71bff
GD
3529 "cmpl _pctemp, %%eax \n\t"
3530 "jna paeth_abc \n\t"
7f88f624
VZ
3531 /* pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp) */
3532 "movb (%%esi,%%edx,), %%cl \n\t" /* load Prior(x-bpp) into cl */
c6b71bff
GD
3533 "jmp paeth_paeth \n\t"
3534
3535 "paeth_abc: \n\t"
7f88f624
VZ
3536 /* pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp) */
3537 "movb (%%edi,%%edx,), %%cl \n\t" /* load Raw(x-bpp) into cl */
c6b71bff
GD
3538
3539 "paeth_paeth: \n\t"
3540 "incl %%ebx \n\t"
3541 "incl %%edx \n\t"
7f88f624 3542 /* Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256 */
c6b71bff
GD
3543 "addb %%cl, -1(%%edi,%%ebx,) \n\t"
3544 "cmpl _dif, %%ebx \n\t"
3545 "jb paeth_lp1 \n\t"
3546
3547 "paeth_go: \n\t"
3548 "movl _FullLength, %%ecx \n\t"
3549 "movl %%ecx, %%eax \n\t"
7f88f624
VZ
3550 "subl %%ebx, %%eax \n\t" /* subtract alignment fix */
3551 "andl $0x00000007, %%eax \n\t" /* calc bytes over mult of 8 */
3552 "subl %%eax, %%ecx \n\t" /* drop over bytes from original length */
c6b71bff
GD
3553 "movl %%ecx, _MMXLength \n\t"
3554#ifdef __PIC__
7f88f624 3555 "popl %%ebx \n\t" /* restore index to Global Offset Table */
c6b71bff
GD
3556#endif
3557
7f88f624 3558 : "=c" (dummy_value_c), /* output regs (dummy) */
c6b71bff
GD
3559 "=S" (dummy_value_S),
3560 "=D" (dummy_value_D)
3561
7f88f624
VZ
3562 : "0" (bpp), /* ecx // input regs */
3563 "1" (prev_row), /* esi */
3564 "2" (row) /* edi */
c6b71bff 3565
7f88f624 3566 : "%eax", "%edx" /* clobber list */
c6b71bff
GD
3567#ifndef __PIC__
3568 , "%ebx"
3569#endif
3570 );
3571
7f88f624 3572 /* now do the math for the rest of the row */
c6b71bff
GD
3573 switch (bpp)
3574 {
3575 case 3:
3576 {
3577 _ActiveMask.use = 0x0000000000ffffffLL;
3578 _ActiveMaskEnd.use = 0xffff000000000000LL;
7f88f624
VZ
3579 _ShiftBpp.use = 24; /* == bpp(3) * 8 */
3580 _ShiftRem.use = 40; /* == 64 - 24 */
c6b71bff
GD
3581
3582 __asm__ __volatile__ (
3583 "movl _dif, %%ecx \n\t"
7f88f624
VZ
3584/* preload "movl row, %%edi \n\t" */
3585/* preload "movl prev_row, %%esi \n\t" */
c6b71bff 3586 "pxor %%mm0, %%mm0 \n\t"
7f88f624 3587 /* prime the pump: load the first Raw(x-bpp) data set */
c6b71bff
GD
3588 "movq -8(%%edi,%%ecx,), %%mm1 \n\t"
3589 "paeth_3lp: \n\t"
7f88f624
VZ
3590 "psrlq _ShiftRem, %%mm1 \n\t" /* shift last 3 bytes to 1st */
3591 /* 3 bytes */
3592 "movq (%%esi,%%ecx,), %%mm2 \n\t" /* load b=Prior(x) */
3593 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack High bytes of a */
3594 "movq -8(%%esi,%%ecx,), %%mm3 \n\t" /* prep c=Prior(x-bpp) bytes */
3595 "punpcklbw %%mm0, %%mm2 \n\t" /* unpack High bytes of b */
3596 "psrlq _ShiftRem, %%mm3 \n\t" /* shift last 3 bytes to 1st */
3597 /* 3 bytes */
3598 /* pav = p - a = (a + b - c) - a = b - c */
c6b71bff 3599 "movq %%mm2, %%mm4 \n\t"
7f88f624
VZ
3600 "punpcklbw %%mm0, %%mm3 \n\t" /* unpack High bytes of c */
3601 /* pbv = p - b = (a + b - c) - b = a - c */
c6b71bff
GD
3602 "movq %%mm1, %%mm5 \n\t"
3603 "psubw %%mm3, %%mm4 \n\t"
3604 "pxor %%mm7, %%mm7 \n\t"
7f88f624 3605 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
c6b71bff
GD
3606 "movq %%mm4, %%mm6 \n\t"
3607 "psubw %%mm3, %%mm5 \n\t"
3608
7f88f624
VZ
3609 /* pa = abs(p-a) = abs(pav) */
3610 /* pb = abs(p-b) = abs(pbv) */
3611 /* pc = abs(p-c) = abs(pcv) */
3612 "pcmpgtw %%mm4, %%mm0 \n\t" /* create mask pav bytes < 0 */
c6b71bff 3613 "paddw %%mm5, %%mm6 \n\t"
7f88f624
VZ
3614 "pand %%mm4, %%mm0 \n\t" /* only pav bytes < 0 in mm0 */
3615 "pcmpgtw %%mm5, %%mm7 \n\t" /* create mask pbv bytes < 0 */
c6b71bff 3616 "psubw %%mm0, %%mm4 \n\t"
7f88f624 3617 "pand %%mm5, %%mm7 \n\t" /* only pbv bytes < 0 in mm7 */
c6b71bff
GD
3618 "psubw %%mm0, %%mm4 \n\t"
3619 "psubw %%mm7, %%mm5 \n\t"
3620 "pxor %%mm0, %%mm0 \n\t"
7f88f624
VZ
3621 "pcmpgtw %%mm6, %%mm0 \n\t" /* create mask pcv bytes < 0 */
3622 "pand %%mm6, %%mm0 \n\t" /* only pcv bytes < 0 in mm0 */
c6b71bff
GD
3623 "psubw %%mm7, %%mm5 \n\t"
3624 "psubw %%mm0, %%mm6 \n\t"
7f88f624 3625 /* test pa <= pb */
c6b71bff
GD
3626 "movq %%mm4, %%mm7 \n\t"
3627 "psubw %%mm0, %%mm6 \n\t"
7f88f624 3628 "pcmpgtw %%mm5, %%mm7 \n\t" /* pa > pb? */
c6b71bff 3629 "movq %%mm7, %%mm0 \n\t"
7f88f624 3630 /* use mm7 mask to merge pa & pb */
c6b71bff 3631 "pand %%mm7, %%mm5 \n\t"
7f88f624 3632 /* use mm0 mask copy to merge a & b */
c6b71bff
GD
3633 "pand %%mm0, %%mm2 \n\t"
3634 "pandn %%mm4, %%mm7 \n\t"
3635 "pandn %%mm1, %%mm0 \n\t"
3636 "paddw %%mm5, %%mm7 \n\t"
3637 "paddw %%mm2, %%mm0 \n\t"
7f88f624
VZ
3638 /* test ((pa <= pb)? pa:pb) <= pc */
3639 "pcmpgtw %%mm6, %%mm7 \n\t" /* pab > pc? */
c6b71bff
GD
3640 "pxor %%mm1, %%mm1 \n\t"
3641 "pand %%mm7, %%mm3 \n\t"
3642 "pandn %%mm0, %%mm7 \n\t"
3643 "paddw %%mm3, %%mm7 \n\t"
3644 "pxor %%mm0, %%mm0 \n\t"
3645 "packuswb %%mm1, %%mm7 \n\t"
7f88f624 3646 "movq (%%esi,%%ecx,), %%mm3 \n\t" /* load c=Prior(x-bpp) */
c6b71bff 3647 "pand _ActiveMask, %%mm7 \n\t"
7f88f624
VZ
3648 "movq %%mm3, %%mm2 \n\t" /* load b=Prior(x) step 1 */
3649 "paddb (%%edi,%%ecx,), %%mm7 \n\t" /* add Paeth predictor with Raw(x) */
3650 "punpcklbw %%mm0, %%mm3 \n\t" /* unpack High bytes of c */
3651 "movq %%mm7, (%%edi,%%ecx,) \n\t" /* write back updated value */
3652 "movq %%mm7, %%mm1 \n\t" /* now mm1 will be used as */
3653 /* Raw(x-bpp) */
3654 /* now do Paeth for 2nd set of bytes (3-5) */
3655 "psrlq _ShiftBpp, %%mm2 \n\t" /* load b=Prior(x) step 2 */
3656 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack High bytes of a */
c6b71bff 3657 "pxor %%mm7, %%mm7 \n\t"
7f88f624
VZ
3658 "punpcklbw %%mm0, %%mm2 \n\t" /* unpack High bytes of b */
3659 /* pbv = p - b = (a + b - c) - b = a - c */
c6b71bff 3660 "movq %%mm1, %%mm5 \n\t"
7f88f624 3661 /* pav = p - a = (a + b - c) - a = b - c */
c6b71bff
GD
3662 "movq %%mm2, %%mm4 \n\t"
3663 "psubw %%mm3, %%mm5 \n\t"
3664 "psubw %%mm3, %%mm4 \n\t"
7f88f624
VZ
3665 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = */
3666 /* pav + pbv = pbv + pav */
c6b71bff
GD
3667 "movq %%mm5, %%mm6 \n\t"
3668 "paddw %%mm4, %%mm6 \n\t"
3669
7f88f624
VZ
3670 /* pa = abs(p-a) = abs(pav) */
3671 /* pb = abs(p-b) = abs(pbv) */
3672 /* pc = abs(p-c) = abs(pcv) */
3673 "pcmpgtw %%mm5, %%mm0 \n\t" /* create mask pbv bytes < 0 */
3674 "pcmpgtw %%mm4, %%mm7 \n\t" /* create mask pav bytes < 0 */
3675 "pand %%mm5, %%mm0 \n\t" /* only pbv bytes < 0 in mm0 */
3676 "pand %%mm4, %%mm7 \n\t" /* only pav bytes < 0 in mm7 */
c6b71bff
GD
3677 "psubw %%mm0, %%mm5 \n\t"
3678 "psubw %%mm7, %%mm4 \n\t"
3679 "psubw %%mm0, %%mm5 \n\t"
3680 "psubw %%mm7, %%mm4 \n\t"
3681 "pxor %%mm0, %%mm0 \n\t"
7f88f624
VZ
3682 "pcmpgtw %%mm6, %%mm0 \n\t" /* create mask pcv bytes < 0 */
3683 "pand %%mm6, %%mm0 \n\t" /* only pcv bytes < 0 in mm0 */
c6b71bff 3684 "psubw %%mm0, %%mm6 \n\t"
7f88f624 3685 /* test pa <= pb */
c6b71bff
GD
3686 "movq %%mm4, %%mm7 \n\t"
3687 "psubw %%mm0, %%mm6 \n\t"
7f88f624 3688 "pcmpgtw %%mm5, %%mm7 \n\t" /* pa > pb? */
c6b71bff 3689 "movq %%mm7, %%mm0 \n\t"
7f88f624 3690 /* use mm7 mask to merge pa & pb */
c6b71bff 3691 "pand %%mm7, %%mm5 \n\t"
7f88f624 3692 /* use mm0 mask copy to merge a & b */
c6b71bff
GD
3693 "pand %%mm0, %%mm2 \n\t"
3694 "pandn %%mm4, %%mm7 \n\t"
3695 "pandn %%mm1, %%mm0 \n\t"
3696 "paddw %%mm5, %%mm7 \n\t"
3697 "paddw %%mm2, %%mm0 \n\t"
7f88f624
VZ
3698 /* test ((pa <= pb)? pa:pb) <= pc */
3699 "pcmpgtw %%mm6, %%mm7 \n\t" /* pab > pc? */
3700 "movq (%%esi,%%ecx,), %%mm2 \n\t" /* load b=Prior(x) */
c6b71bff
GD
3701 "pand %%mm7, %%mm3 \n\t"
3702 "pandn %%mm0, %%mm7 \n\t"
3703 "pxor %%mm1, %%mm1 \n\t"
3704 "paddw %%mm3, %%mm7 \n\t"
3705 "pxor %%mm0, %%mm0 \n\t"
3706 "packuswb %%mm1, %%mm7 \n\t"
7f88f624 3707 "movq %%mm2, %%mm3 \n\t" /* load c=Prior(x-bpp) step 1 */
c6b71bff 3708 "pand _ActiveMask, %%mm7 \n\t"
7f88f624
VZ
3709 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack High bytes of b */
3710 "psllq _ShiftBpp, %%mm7 \n\t" /* shift bytes to 2nd group of */
3711 /* 3 bytes */
3712 /* pav = p - a = (a + b - c) - a = b - c */
c6b71bff 3713 "movq %%mm2, %%mm4 \n\t"
7f88f624
VZ
3714 "paddb (%%edi,%%ecx,), %%mm7 \n\t" /* add Paeth predictor with Raw(x) */
3715 "psllq _ShiftBpp, %%mm3 \n\t" /* load c=Prior(x-bpp) step 2 */
3716 "movq %%mm7, (%%edi,%%ecx,) \n\t" /* write back updated value */
c6b71bff 3717 "movq %%mm7, %%mm1 \n\t"
7f88f624
VZ
3718 "punpckhbw %%mm0, %%mm3 \n\t" /* unpack High bytes of c */
3719 "psllq _ShiftBpp, %%mm1 \n\t" /* shift bytes */
3720 /* now mm1 will be used as Raw(x-bpp) */
3721 /* now do Paeth for 3rd, and final, set of bytes (6-7) */
c6b71bff 3722 "pxor %%mm7, %%mm7 \n\t"
7f88f624 3723 "punpckhbw %%mm0, %%mm1 \n\t" /* unpack High bytes of a */
c6b71bff 3724 "psubw %%mm3, %%mm4 \n\t"
7f88f624 3725 /* pbv = p - b = (a + b - c) - b = a - c */
c6b71bff 3726 "movq %%mm1, %%mm5 \n\t"
7f88f624 3727 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
c6b71bff
GD
3728 "movq %%mm4, %%mm6 \n\t"
3729 "psubw %%mm3, %%mm5 \n\t"
3730 "pxor %%mm0, %%mm0 \n\t"
3731 "paddw %%mm5, %%mm6 \n\t"
3732
7f88f624
VZ
3733 /* pa = abs(p-a) = abs(pav) */
3734 /* pb = abs(p-b) = abs(pbv) */
3735 /* pc = abs(p-c) = abs(pcv) */
3736 "pcmpgtw %%mm4, %%mm0 \n\t" /* create mask pav bytes < 0 */
3737 "pcmpgtw %%mm5, %%mm7 \n\t" /* create mask pbv bytes < 0 */
3738 "pand %%mm4, %%mm0 \n\t" /* only pav bytes < 0 in mm0 */
3739 "pand %%mm5, %%mm7 \n\t" /* only pbv bytes < 0 in mm7 */
c6b71bff
GD
3740 "psubw %%mm0, %%mm4 \n\t"
3741 "psubw %%mm7, %%mm5 \n\t"
3742 "psubw %%mm0, %%mm4 \n\t"
3743 "psubw %%mm7, %%mm5 \n\t"
3744 "pxor %%mm0, %%mm0 \n\t"
7f88f624
VZ
3745 "pcmpgtw %%mm6, %%mm0 \n\t" /* create mask pcv bytes < 0 */
3746 "pand %%mm6, %%mm0 \n\t" /* only pcv bytes < 0 in mm0 */
c6b71bff 3747 "psubw %%mm0, %%mm6 \n\t"
7f88f624 3748 /* test pa <= pb */
c6b71bff
GD
3749 "movq %%mm4, %%mm7 \n\t"
3750 "psubw %%mm0, %%mm6 \n\t"
7f88f624 3751 "pcmpgtw %%mm5, %%mm7 \n\t" /* pa > pb? */
c6b71bff 3752 "movq %%mm7, %%mm0 \n\t"
7f88f624 3753 /* use mm0 mask copy to merge a & b */
c6b71bff 3754 "pand %%mm0, %%mm2 \n\t"
7f88f624 3755 /* use mm7 mask to merge pa & pb */
c6b71bff
GD
3756 "pand %%mm7, %%mm5 \n\t"
3757 "pandn %%mm1, %%mm0 \n\t"
3758 "pandn %%mm4, %%mm7 \n\t"
3759 "paddw %%mm2, %%mm0 \n\t"
3760 "paddw %%mm5, %%mm7 \n\t"
7f88f624
VZ
3761 /* test ((pa <= pb)? pa:pb) <= pc */
3762 "pcmpgtw %%mm6, %%mm7 \n\t" /* pab > pc? */
c6b71bff
GD
3763 "pand %%mm7, %%mm3 \n\t"
3764 "pandn %%mm0, %%mm7 \n\t"
3765 "paddw %%mm3, %%mm7 \n\t"
3766 "pxor %%mm1, %%mm1 \n\t"
3767 "packuswb %%mm7, %%mm1 \n\t"
7f88f624 3768 /* step ecx to next set of 8 bytes and repeat loop til done */
c6b71bff
GD
3769 "addl $8, %%ecx \n\t"
3770 "pand _ActiveMaskEnd, %%mm1 \n\t"
7f88f624
VZ
3771 "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" /* add Paeth predictor with */
3772 /* Raw(x) */
c6b71bff
GD
3773
3774 "cmpl _MMXLength, %%ecx \n\t"
7f88f624
VZ
3775 "pxor %%mm0, %%mm0 \n\t" /* pxor does not affect flags */
3776 "movq %%mm1, -8(%%edi,%%ecx,) \n\t" /* write back updated value */
3777 /* mm1 will be used as Raw(x-bpp) next loop */
3778 /* mm3 ready to be used as Prior(x-bpp) next loop */
c6b71bff
GD
3779 "jb paeth_3lp \n\t"
3780
7f88f624 3781 : "=S" (dummy_value_S), /* output regs (dummy) */
c6b71bff
GD
3782 "=D" (dummy_value_D)
3783
7f88f624
VZ
3784 : "0" (prev_row), /* esi // input regs */
3785 "1" (row) /* edi */
c6b71bff 3786
7f88f624 3787 : "%ecx" /* clobber list */
c6b71bff
GD
3788#if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
3789 , "%mm0", "%mm1", "%mm2", "%mm3"
3790 , "%mm4", "%mm5", "%mm6", "%mm7"
3791#endif
3792 );
3793 }
7f88f624 3794 break; /* end 3 bpp */
c6b71bff
GD
3795
3796 case 6:
7f88f624
VZ
3797 //case 7: /* GRR BOGUS */
3798 //case 5: /* GRR BOGUS */
c6b71bff
GD
3799 {
3800 _ActiveMask.use = 0x00000000ffffffffLL;
3801 _ActiveMask2.use = 0xffffffff00000000LL;
7f88f624 3802 _ShiftBpp.use = bpp << 3; /* == bpp * 8 */
c6b71bff
GD
3803 _ShiftRem.use = 64 - _ShiftBpp.use;
3804
3805 __asm__ __volatile__ (
3806 "movl _dif, %%ecx \n\t"
7f88f624
VZ
3807/* preload "movl row, %%edi \n\t" */
3808/* preload "movl prev_row, %%esi \n\t" */
3809 /* prime the pump: load the first Raw(x-bpp) data set */
c6b71bff
GD
3810 "movq -8(%%edi,%%ecx,), %%mm1 \n\t"
3811 "pxor %%mm0, %%mm0 \n\t"
3812
3813 "paeth_6lp: \n\t"
7f88f624 3814 /* must shift to position Raw(x-bpp) data */
c6b71bff 3815 "psrlq _ShiftRem, %%mm1 \n\t"
7f88f624
VZ
3816 /* do first set of 4 bytes */
3817 "movq -8(%%esi,%%ecx,), %%mm3 \n\t" /* read c=Prior(x-bpp) bytes */
3818 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack Low bytes of a */
3819 "movq (%%esi,%%ecx,), %%mm2 \n\t" /* load b=Prior(x) */
3820 "punpcklbw %%mm0, %%mm2 \n\t" /* unpack Low bytes of b */
3821 /* must shift to position Prior(x-bpp) data */
c6b71bff 3822 "psrlq _ShiftRem, %%mm3 \n\t"
7f88f624 3823 /* pav = p - a = (a + b - c) - a = b - c */
c6b71bff 3824 "movq %%mm2, %%mm4 \n\t"
7f88f624
VZ
3825 "punpcklbw %%mm0, %%mm3 \n\t" /* unpack Low bytes of c */
3826 /* pbv = p - b = (a + b - c) - b = a - c */
c6b71bff
GD
3827 "movq %%mm1, %%mm5 \n\t"
3828 "psubw %%mm3, %%mm4 \n\t"
3829 "pxor %%mm7, %%mm7 \n\t"
7f88f624 3830 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
c6b71bff
GD
3831 "movq %%mm4, %%mm6 \n\t"
3832 "psubw %%mm3, %%mm5 \n\t"
7f88f624
VZ
3833 /* pa = abs(p-a) = abs(pav) */
3834 /* pb = abs(p-b) = abs(pbv) */
3835 /* pc = abs(p-c) = abs(pcv) */
3836 "pcmpgtw %%mm4, %%mm0 \n\t" /* create mask pav bytes < 0 */
c6b71bff 3837 "paddw %%mm5, %%mm6 \n\t"
7f88f624
VZ
3838 "pand %%mm4, %%mm0 \n\t" /* only pav bytes < 0 in mm0 */
3839 "pcmpgtw %%mm5, %%mm7 \n\t" /* create mask pbv bytes < 0 */
c6b71bff 3840 "psubw %%mm0, %%mm4 \n\t"
7f88f624 3841 "pand %%mm5, %%mm7 \n\t" /* only pbv bytes < 0 in mm7 */
c6b71bff
GD
3842 "psubw %%mm0, %%mm4 \n\t"
3843 "psubw %%mm7, %%mm5 \n\t"
3844 "pxor %%mm0, %%mm0 \n\t"
7f88f624
VZ
3845 "pcmpgtw %%mm6, %%mm0 \n\t" /* create mask pcv bytes < 0 */
3846 "pand %%mm6, %%mm0 \n\t" /* only pcv bytes < 0 in mm0 */
c6b71bff
GD
3847 "psubw %%mm7, %%mm5 \n\t"
3848 "psubw %%mm0, %%mm6 \n\t"
7f88f624 3849 /* test pa <= pb */
c6b71bff
GD
3850 "movq %%mm4, %%mm7 \n\t"
3851 "psubw %%mm0, %%mm6 \n\t"
7f88f624 3852 "pcmpgtw %%mm5, %%mm7 \n\t" /* pa > pb? */
c6b71bff 3853 "movq %%mm7, %%mm0 \n\t"
7f88f624 3854 /* use mm7 mask to merge pa & pb */
c6b71bff 3855 "pand %%mm7, %%mm5 \n\t"
7f88f624 3856 /* use mm0 mask copy to merge a & b */
c6b71bff
GD
3857 "pand %%mm0, %%mm2 \n\t"
3858 "pandn %%mm4, %%mm7 \n\t"
3859 "pandn %%mm1, %%mm0 \n\t"
3860 "paddw %%mm5, %%mm7 \n\t"
3861 "paddw %%mm2, %%mm0 \n\t"
7f88f624
VZ
3862 /* test ((pa <= pb)? pa:pb) <= pc */
3863 "pcmpgtw %%mm6, %%mm7 \n\t" /* pab > pc? */
c6b71bff
GD
3864 "pxor %%mm1, %%mm1 \n\t"
3865 "pand %%mm7, %%mm3 \n\t"
3866 "pandn %%mm0, %%mm7 \n\t"
3867 "paddw %%mm3, %%mm7 \n\t"
3868 "pxor %%mm0, %%mm0 \n\t"
3869 "packuswb %%mm1, %%mm7 \n\t"
7f88f624 3870 "movq -8(%%esi,%%ecx,), %%mm3 \n\t" /* load c=Prior(x-bpp) */
c6b71bff
GD
3871 "pand _ActiveMask, %%mm7 \n\t"
3872 "psrlq _ShiftRem, %%mm3 \n\t"
7f88f624
VZ
3873 "movq (%%esi,%%ecx,), %%mm2 \n\t" /* load b=Prior(x) step 1 */
3874 "paddb (%%edi,%%ecx,), %%mm7 \n\t" /* add Paeth predictor and Raw(x) */
c6b71bff 3875 "movq %%mm2, %%mm6 \n\t"
7f88f624 3876 "movq %%mm7, (%%edi,%%ecx,) \n\t" /* write back updated value */
c6b71bff
GD
3877 "movq -8(%%edi,%%ecx,), %%mm1 \n\t"
3878 "psllq _ShiftBpp, %%mm6 \n\t"
3879 "movq %%mm7, %%mm5 \n\t"
3880 "psrlq _ShiftRem, %%mm1 \n\t"
3881 "por %%mm6, %%mm3 \n\t"
3882 "psllq _ShiftBpp, %%mm5 \n\t"
7f88f624 3883 "punpckhbw %%mm0, %%mm3 \n\t" /* unpack High bytes of c */
c6b71bff 3884 "por %%mm5, %%mm1 \n\t"
7f88f624
VZ
3885 /* do second set of 4 bytes */
3886 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack High bytes of b */
3887 "punpckhbw %%mm0, %%mm1 \n\t" /* unpack High bytes of a */
3888 /* pav = p - a = (a + b - c) - a = b - c */
c6b71bff 3889 "movq %%mm2, %%mm4 \n\t"
7f88f624 3890 /* pbv = p - b = (a + b - c) - b = a - c */
c6b71bff
GD
3891 "movq %%mm1, %%mm5 \n\t"
3892 "psubw %%mm3, %%mm4 \n\t"
3893 "pxor %%mm7, %%mm7 \n\t"
7f88f624 3894 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
c6b71bff
GD
3895 "movq %%mm4, %%mm6 \n\t"
3896 "psubw %%mm3, %%mm5 \n\t"
7f88f624
VZ
3897 /* pa = abs(p-a) = abs(pav) */
3898 /* pb = abs(p-b) = abs(pbv) */
3899 /* pc = abs(p-c) = abs(pcv) */
3900 "pcmpgtw %%mm4, %%mm0 \n\t" /* create mask pav bytes < 0 */
c6b71bff 3901 "paddw %%mm5, %%mm6 \n\t"
7f88f624
VZ
3902 "pand %%mm4, %%mm0 \n\t" /* only pav bytes < 0 in mm0 */
3903 "pcmpgtw %%mm5, %%mm7 \n\t" /* create mask pbv bytes < 0 */
c6b71bff 3904 "psubw %%mm0, %%mm4 \n\t"
7f88f624 3905 "pand %%mm5, %%mm7 \n\t" /* only pbv bytes < 0 in mm7 */
c6b71bff
GD
3906 "psubw %%mm0, %%mm4 \n\t"
3907 "psubw %%mm7, %%mm5 \n\t"
3908 "pxor %%mm0, %%mm0 \n\t"
7f88f624
VZ
3909 "pcmpgtw %%mm6, %%mm0 \n\t" /* create mask pcv bytes < 0 */
3910 "pand %%mm6, %%mm0 \n\t" /* only pcv bytes < 0 in mm0 */
c6b71bff
GD
3911 "psubw %%mm7, %%mm5 \n\t"
3912 "psubw %%mm0, %%mm6 \n\t"
7f88f624 3913 /* test pa <= pb */
c6b71bff
GD
3914 "movq %%mm4, %%mm7 \n\t"
3915 "psubw %%mm0, %%mm6 \n\t"
7f88f624 3916 "pcmpgtw %%mm5, %%mm7 \n\t" /* pa > pb? */
c6b71bff 3917 "movq %%mm7, %%mm0 \n\t"
7f88f624 3918 /* use mm7 mask to merge pa & pb */
c6b71bff 3919 "pand %%mm7, %%mm5 \n\t"
7f88f624 3920 /* use mm0 mask copy to merge a & b */
c6b71bff
GD
3921 "pand %%mm0, %%mm2 \n\t"
3922 "pandn %%mm4, %%mm7 \n\t"
3923 "pandn %%mm1, %%mm0 \n\t"
3924 "paddw %%mm5, %%mm7 \n\t"
3925 "paddw %%mm2, %%mm0 \n\t"
7f88f624
VZ
3926 /* test ((pa <= pb)? pa:pb) <= pc */
3927 "pcmpgtw %%mm6, %%mm7 \n\t" /* pab > pc? */
c6b71bff
GD
3928 "pxor %%mm1, %%mm1 \n\t"
3929 "pand %%mm7, %%mm3 \n\t"
3930 "pandn %%mm0, %%mm7 \n\t"
3931 "pxor %%mm1, %%mm1 \n\t"
3932 "paddw %%mm3, %%mm7 \n\t"
3933 "pxor %%mm0, %%mm0 \n\t"
7f88f624 3934 /* step ecx to next set of 8 bytes and repeat loop til done */
c6b71bff
GD
3935 "addl $8, %%ecx \n\t"
3936 "packuswb %%mm7, %%mm1 \n\t"
7f88f624 3937 "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" /* add Paeth predictor with Raw(x) */
c6b71bff 3938 "cmpl _MMXLength, %%ecx \n\t"
7f88f624
VZ
3939 "movq %%mm1, -8(%%edi,%%ecx,) \n\t" /* write back updated value */
3940 /* mm1 will be used as Raw(x-bpp) next loop */
c6b71bff
GD
3941 "jb paeth_6lp \n\t"
3942
7f88f624 3943 : "=S" (dummy_value_S), /* output regs (dummy) */
c6b71bff
GD
3944 "=D" (dummy_value_D)
3945
7f88f624
VZ
3946 : "0" (prev_row), /* esi // input regs */
3947 "1" (row) /* edi */
c6b71bff 3948
7f88f624 3949 : "%ecx" /* clobber list */
c6b71bff
GD
3950#if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
3951 , "%mm0", "%mm1", "%mm2", "%mm3"
3952 , "%mm4", "%mm5", "%mm6", "%mm7"
3953#endif
3954 );
3955 }
7f88f624 3956 break; /* end 6 bpp */
c6b71bff
GD
3957
3958 case 4:
3959 {
3960 _ActiveMask.use = 0x00000000ffffffffLL;
3961
3962 __asm__ __volatile__ (
3963 "movl _dif, %%ecx \n\t"
7f88f624
VZ
3964/* preload "movl row, %%edi \n\t" */
3965/* preload "movl prev_row, %%esi \n\t" */
c6b71bff 3966 "pxor %%mm0, %%mm0 \n\t"
7f88f624
VZ
3967 /* prime the pump: load the first Raw(x-bpp) data set */
3968 "movq -8(%%edi,%%ecx,), %%mm1 \n\t" /* only time should need to read */
3969 /* a=Raw(x-bpp) bytes */
c6b71bff 3970 "paeth_4lp: \n\t"
7f88f624
VZ
3971 /* do first set of 4 bytes */
3972 "movq -8(%%esi,%%ecx,), %%mm3 \n\t" /* read c=Prior(x-bpp) bytes */
3973 "punpckhbw %%mm0, %%mm1 \n\t" /* unpack High bytes of a */
3974 "movq (%%esi,%%ecx,), %%mm2 \n\t" /* load b=Prior(x) */
3975 "punpcklbw %%mm0, %%mm2 \n\t" /* unpack Low bytes of b */
3976 /* pav = p - a = (a + b - c) - a = b - c */
c6b71bff 3977 "movq %%mm2, %%mm4 \n\t"
7f88f624
VZ
3978 "punpckhbw %%mm0, %%mm3 \n\t" /* unpack High bytes of c */
3979 /* pbv = p - b = (a + b - c) - b = a - c */
c6b71bff
GD
3980 "movq %%mm1, %%mm5 \n\t"
3981 "psubw %%mm3, %%mm4 \n\t"
3982 "pxor %%mm7, %%mm7 \n\t"
7f88f624 3983 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
c6b71bff
GD
3984 "movq %%mm4, %%mm6 \n\t"
3985 "psubw %%mm3, %%mm5 \n\t"
7f88f624
VZ
3986 /* pa = abs(p-a) = abs(pav) */
3987 /* pb = abs(p-b) = abs(pbv) */
3988 /* pc = abs(p-c) = abs(pcv) */
3989 "pcmpgtw %%mm4, %%mm0 \n\t" /* create mask pav bytes < 0 */
c6b71bff 3990 "paddw %%mm5, %%mm6 \n\t"
7f88f624
VZ
3991 "pand %%mm4, %%mm0 \n\t" /* only pav bytes < 0 in mm0 */
3992 "pcmpgtw %%mm5, %%mm7 \n\t" /* create mask pbv bytes < 0 */
c6b71bff 3993 "psubw %%mm0, %%mm4 \n\t"
7f88f624 3994 "pand %%mm5, %%mm7 \n\t" /* only pbv bytes < 0 in mm7 */
c6b71bff
GD
3995 "psubw %%mm0, %%mm4 \n\t"
3996 "psubw %%mm7, %%mm5 \n\t"
3997 "pxor %%mm0, %%mm0 \n\t"
7f88f624
VZ
3998 "pcmpgtw %%mm6, %%mm0 \n\t" /* create mask pcv bytes < 0 */
3999 "pand %%mm6, %%mm0 \n\t" /* only pcv bytes < 0 in mm0 */
c6b71bff
GD
4000 "psubw %%mm7, %%mm5 \n\t"
4001 "psubw %%mm0, %%mm6 \n\t"
7f88f624 4002 /* test pa <= pb */
c6b71bff
GD
4003 "movq %%mm4, %%mm7 \n\t"
4004 "psubw %%mm0, %%mm6 \n\t"
7f88f624 4005 "pcmpgtw %%mm5, %%mm7 \n\t" /* pa > pb? */
c6b71bff 4006 "movq %%mm7, %%mm0 \n\t"
7f88f624 4007 /* use mm7 mask to merge pa & pb */
c6b71bff 4008 "pand %%mm7, %%mm5 \n\t"
7f88f624 4009 /* use mm0 mask copy to merge a & b */
c6b71bff
GD
4010 "pand %%mm0, %%mm2 \n\t"
4011 "pandn %%mm4, %%mm7 \n\t"
4012 "pandn %%mm1, %%mm0 \n\t"
4013 "paddw %%mm5, %%mm7 \n\t"
4014 "paddw %%mm2, %%mm0 \n\t"
7f88f624
VZ
4015 /* test ((pa <= pb)? pa:pb) <= pc */
4016 "pcmpgtw %%mm6, %%mm7 \n\t" /* pab > pc? */
c6b71bff
GD
4017 "pxor %%mm1, %%mm1 \n\t"
4018 "pand %%mm7, %%mm3 \n\t"
4019 "pandn %%mm0, %%mm7 \n\t"
4020 "paddw %%mm3, %%mm7 \n\t"
4021 "pxor %%mm0, %%mm0 \n\t"
4022 "packuswb %%mm1, %%mm7 \n\t"
7f88f624 4023 "movq (%%esi,%%ecx,), %%mm3 \n\t" /* load c=Prior(x-bpp) */
c6b71bff 4024 "pand _ActiveMask, %%mm7 \n\t"
7f88f624
VZ
4025 "movq %%mm3, %%mm2 \n\t" /* load b=Prior(x) step 1 */
4026 "paddb (%%edi,%%ecx,), %%mm7 \n\t" /* add Paeth predictor with Raw(x) */
4027 "punpcklbw %%mm0, %%mm3 \n\t" /* unpack Low bytes of c */
4028 "movq %%mm7, (%%edi,%%ecx,) \n\t" /* write back updated value */
4029 "movq %%mm7, %%mm1 \n\t" /* now mm1 will be used as Raw(x-bpp) */
4030 /* do second set of 4 bytes */
4031 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack High bytes of b */
4032 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack Low bytes of a */
4033 /* pav = p - a = (a + b - c) - a = b - c */
c6b71bff 4034 "movq %%mm2, %%mm4 \n\t"
7f88f624 4035 /* pbv = p - b = (a + b - c) - b = a - c */
c6b71bff
GD
4036 "movq %%mm1, %%mm5 \n\t"
4037 "psubw %%mm3, %%mm4 \n\t"
4038 "pxor %%mm7, %%mm7 \n\t"
7f88f624 4039 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
c6b71bff
GD
4040 "movq %%mm4, %%mm6 \n\t"
4041 "psubw %%mm3, %%mm5 \n\t"
7f88f624
VZ
4042 /* pa = abs(p-a) = abs(pav) */
4043 /* pb = abs(p-b) = abs(pbv) */
4044 /* pc = abs(p-c) = abs(pcv) */
4045 "pcmpgtw %%mm4, %%mm0 \n\t" /* create mask pav bytes < 0 */
c6b71bff 4046 "paddw %%mm5, %%mm6 \n\t"
7f88f624
VZ
4047 "pand %%mm4, %%mm0 \n\t" /* only pav bytes < 0 in mm0 */
4048 "pcmpgtw %%mm5, %%mm7 \n\t" /* create mask pbv bytes < 0 */
c6b71bff 4049 "psubw %%mm0, %%mm4 \n\t"
7f88f624 4050 "pand %%mm5, %%mm7 \n\t" /* only pbv bytes < 0 in mm7 */
c6b71bff
GD
4051 "psubw %%mm0, %%mm4 \n\t"
4052 "psubw %%mm7, %%mm5 \n\t"
4053 "pxor %%mm0, %%mm0 \n\t"
7f88f624
VZ
4054 "pcmpgtw %%mm6, %%mm0 \n\t" /* create mask pcv bytes < 0 */
4055 "pand %%mm6, %%mm0 \n\t" /* only pcv bytes < 0 in mm0 */
c6b71bff
GD
4056 "psubw %%mm7, %%mm5 \n\t"
4057 "psubw %%mm0, %%mm6 \n\t"
7f88f624 4058 /* test pa <= pb */
c6b71bff
GD
4059 "movq %%mm4, %%mm7 \n\t"
4060 "psubw %%mm0, %%mm6 \n\t"
7f88f624 4061 "pcmpgtw %%mm5, %%mm7 \n\t" /* pa > pb? */
c6b71bff 4062 "movq %%mm7, %%mm0 \n\t"
7f88f624 4063 /* use mm7 mask to merge pa & pb */
c6b71bff 4064 "pand %%mm7, %%mm5 \n\t"
7f88f624 4065 /* use mm0 mask copy to merge a & b */
c6b71bff
GD
4066 "pand %%mm0, %%mm2 \n\t"
4067 "pandn %%mm4, %%mm7 \n\t"
4068 "pandn %%mm1, %%mm0 \n\t"
4069 "paddw %%mm5, %%mm7 \n\t"
4070 "paddw %%mm2, %%mm0 \n\t"
7f88f624
VZ
4071 /* test ((pa <= pb)? pa:pb) <= pc */
4072 "pcmpgtw %%mm6, %%mm7 \n\t" /* pab > pc? */
c6b71bff
GD
4073 "pxor %%mm1, %%mm1 \n\t"
4074 "pand %%mm7, %%mm3 \n\t"
4075 "pandn %%mm0, %%mm7 \n\t"
4076 "pxor %%mm1, %%mm1 \n\t"
4077 "paddw %%mm3, %%mm7 \n\t"
4078 "pxor %%mm0, %%mm0 \n\t"
7f88f624 4079 /* step ecx to next set of 8 bytes and repeat loop til done */
c6b71bff
GD
4080 "addl $8, %%ecx \n\t"
4081 "packuswb %%mm7, %%mm1 \n\t"
7f88f624 4082 "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" /* add predictor with Raw(x) */
c6b71bff 4083 "cmpl _MMXLength, %%ecx \n\t"
7f88f624
VZ
4084 "movq %%mm1, -8(%%edi,%%ecx,) \n\t" /* write back updated value */
4085 /* mm1 will be used as Raw(x-bpp) next loop */
c6b71bff
GD
4086 "jb paeth_4lp \n\t"
4087
7f88f624 4088 : "=S" (dummy_value_S), /* output regs (dummy) */
c6b71bff
GD
4089 "=D" (dummy_value_D)
4090
7f88f624
VZ
4091 : "0" (prev_row), /* esi // input regs */
4092 "1" (row) /* edi */
c6b71bff 4093
7f88f624 4094 : "%ecx" /* clobber list */
c6b71bff
GD
4095#if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
4096 , "%mm0", "%mm1", "%mm2", "%mm3"
4097 , "%mm4", "%mm5", "%mm6", "%mm7"
4098#endif
4099 );
4100 }
7f88f624 4101 break; /* end 4 bpp */
c6b71bff 4102
7f88f624 4103 case 8: /* bpp == 8 */
c6b71bff
GD
4104 {
4105 _ActiveMask.use = 0x00000000ffffffffLL;
4106
4107 __asm__ __volatile__ (
4108 "movl _dif, %%ecx \n\t"
7f88f624
VZ
4109/* preload "movl row, %%edi \n\t" */
4110/* preload "movl prev_row, %%esi \n\t" */
c6b71bff 4111 "pxor %%mm0, %%mm0 \n\t"
7f88f624
VZ
4112 /* prime the pump: load the first Raw(x-bpp) data set */
4113 "movq -8(%%edi,%%ecx,), %%mm1 \n\t" /* only time should need to read */
4114 /* a=Raw(x-bpp) bytes */
c6b71bff 4115 "paeth_8lp: \n\t"
7f88f624
VZ
4116 /* do first set of 4 bytes */
4117 "movq -8(%%esi,%%ecx,), %%mm3 \n\t" /* read c=Prior(x-bpp) bytes */
4118 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack Low bytes of a */
4119 "movq (%%esi,%%ecx,), %%mm2 \n\t" /* load b=Prior(x) */
4120 "punpcklbw %%mm0, %%mm2 \n\t" /* unpack Low bytes of b */
4121 /* pav = p - a = (a + b - c) - a = b - c */
c6b71bff 4122 "movq %%mm2, %%mm4 \n\t"
7f88f624
VZ
4123 "punpcklbw %%mm0, %%mm3 \n\t" /* unpack Low bytes of c */
4124 /* pbv = p - b = (a + b - c) - b = a - c */
c6b71bff
GD
4125 "movq %%mm1, %%mm5 \n\t"
4126 "psubw %%mm3, %%mm4 \n\t"
4127 "pxor %%mm7, %%mm7 \n\t"
7f88f624 4128 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
c6b71bff
GD
4129 "movq %%mm4, %%mm6 \n\t"
4130 "psubw %%mm3, %%mm5 \n\t"
7f88f624
VZ
4131 /* pa = abs(p-a) = abs(pav) */
4132 /* pb = abs(p-b) = abs(pbv) */
4133 /* pc = abs(p-c) = abs(pcv) */
4134 "pcmpgtw %%mm4, %%mm0 \n\t" /* create mask pav bytes < 0 */
c6b71bff 4135 "paddw %%mm5, %%mm6 \n\t"
7f88f624
VZ
4136 "pand %%mm4, %%mm0 \n\t" /* only pav bytes < 0 in mm0 */
4137 "pcmpgtw %%mm5, %%mm7 \n\t" /* create mask pbv bytes < 0 */
c6b71bff 4138 "psubw %%mm0, %%mm4 \n\t"
7f88f624 4139 "pand %%mm5, %%mm7 \n\t" /* only pbv bytes < 0 in mm7 */
c6b71bff
GD
4140 "psubw %%mm0, %%mm4 \n\t"
4141 "psubw %%mm7, %%mm5 \n\t"
4142 "pxor %%mm0, %%mm0 \n\t"
7f88f624
VZ
4143 "pcmpgtw %%mm6, %%mm0 \n\t" /* create mask pcv bytes < 0 */
4144 "pand %%mm6, %%mm0 \n\t" /* only pcv bytes < 0 in mm0 */
c6b71bff
GD
4145 "psubw %%mm7, %%mm5 \n\t"
4146 "psubw %%mm0, %%mm6 \n\t"
7f88f624 4147 /* test pa <= pb */
c6b71bff
GD
4148 "movq %%mm4, %%mm7 \n\t"
4149 "psubw %%mm0, %%mm6 \n\t"
7f88f624 4150 "pcmpgtw %%mm5, %%mm7 \n\t" /* pa > pb? */
c6b71bff 4151 "movq %%mm7, %%mm0 \n\t"
7f88f624 4152 /* use mm7 mask to merge pa & pb */
c6b71bff 4153 "pand %%mm7, %%mm5 \n\t"
7f88f624 4154 /* use mm0 mask copy to merge a & b */
c6b71bff
GD
4155 "pand %%mm0, %%mm2 \n\t"
4156 "pandn %%mm4, %%mm7 \n\t"
4157 "pandn %%mm1, %%mm0 \n\t"
4158 "paddw %%mm5, %%mm7 \n\t"
4159 "paddw %%mm2, %%mm0 \n\t"
7f88f624
VZ
4160 /* test ((pa <= pb)? pa:pb) <= pc */
4161 "pcmpgtw %%mm6, %%mm7 \n\t" /* pab > pc? */
c6b71bff
GD
4162 "pxor %%mm1, %%mm1 \n\t"
4163 "pand %%mm7, %%mm3 \n\t"
4164 "pandn %%mm0, %%mm7 \n\t"
4165 "paddw %%mm3, %%mm7 \n\t"
4166 "pxor %%mm0, %%mm0 \n\t"
4167 "packuswb %%mm1, %%mm7 \n\t"
7f88f624 4168 "movq -8(%%esi,%%ecx,), %%mm3 \n\t" /* read c=Prior(x-bpp) bytes */
c6b71bff 4169 "pand _ActiveMask, %%mm7 \n\t"
7f88f624
VZ
4170 "movq (%%esi,%%ecx,), %%mm2 \n\t" /* load b=Prior(x) */
4171 "paddb (%%edi,%%ecx,), %%mm7 \n\t" /* add Paeth predictor with Raw(x) */
4172 "punpckhbw %%mm0, %%mm3 \n\t" /* unpack High bytes of c */
4173 "movq %%mm7, (%%edi,%%ecx,) \n\t" /* write back updated value */
4174 "movq -8(%%edi,%%ecx,), %%mm1 \n\t" /* read a=Raw(x-bpp) bytes */
4175
4176 /* do second set of 4 bytes */
4177 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack High bytes of b */
4178 "punpckhbw %%mm0, %%mm1 \n\t" /* unpack High bytes of a */
4179 /* pav = p - a = (a + b - c) - a = b - c */
c6b71bff 4180 "movq %%mm2, %%mm4 \n\t"
7f88f624 4181 /* pbv = p - b = (a + b - c) - b = a - c */
c6b71bff
GD
4182 "movq %%mm1, %%mm5 \n\t"
4183 "psubw %%mm3, %%mm4 \n\t"
4184 "pxor %%mm7, %%mm7 \n\t"
7f88f624 4185 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
c6b71bff
GD
4186 "movq %%mm4, %%mm6 \n\t"
4187 "psubw %%mm3, %%mm5 \n\t"
7f88f624
VZ
4188 /* pa = abs(p-a) = abs(pav) */
4189 /* pb = abs(p-b) = abs(pbv) */
4190 /* pc = abs(p-c) = abs(pcv) */
4191 "pcmpgtw %%mm4, %%mm0 \n\t" /* create mask pav bytes < 0 */
c6b71bff 4192 "paddw %%mm5, %%mm6 \n\t"
7f88f624
VZ
4193 "pand %%mm4, %%mm0 \n\t" /* only pav bytes < 0 in mm0 */
4194 "pcmpgtw %%mm5, %%mm7 \n\t" /* create mask pbv bytes < 0 */
c6b71bff 4195 "psubw %%mm0, %%mm4 \n\t"
7f88f624 4196 "pand %%mm5, %%mm7 \n\t" /* only pbv bytes < 0 in mm7 */
c6b71bff
GD
4197 "psubw %%mm0, %%mm4 \n\t"
4198 "psubw %%mm7, %%mm5 \n\t"
4199 "pxor %%mm0, %%mm0 \n\t"
7f88f624
VZ
4200 "pcmpgtw %%mm6, %%mm0 \n\t" /* create mask pcv bytes < 0 */
4201 "pand %%mm6, %%mm0 \n\t" /* only pcv bytes < 0 in mm0 */
c6b71bff
GD
4202 "psubw %%mm7, %%mm5 \n\t"
4203 "psubw %%mm0, %%mm6 \n\t"
7f88f624 4204 /* test pa <= pb */
c6b71bff
GD
4205 "movq %%mm4, %%mm7 \n\t"
4206 "psubw %%mm0, %%mm6 \n\t"
7f88f624 4207 "pcmpgtw %%mm5, %%mm7 \n\t" /* pa > pb? */
c6b71bff 4208 "movq %%mm7, %%mm0 \n\t"
7f88f624 4209 /* use mm7 mask to merge pa & pb */
c6b71bff 4210 "pand %%mm7, %%mm5 \n\t"
7f88f624 4211 /* use mm0 mask copy to merge a & b */
c6b71bff
GD
4212 "pand %%mm0, %%mm2 \n\t"
4213 "pandn %%mm4, %%mm7 \n\t"
4214 "pandn %%mm1, %%mm0 \n\t"
4215 "paddw %%mm5, %%mm7 \n\t"
4216 "paddw %%mm2, %%mm0 \n\t"
7f88f624
VZ
4217 /* test ((pa <= pb)? pa:pb) <= pc */
4218 "pcmpgtw %%mm6, %%mm7 \n\t" /* pab > pc? */
c6b71bff
GD
4219 "pxor %%mm1, %%mm1 \n\t"
4220 "pand %%mm7, %%mm3 \n\t"
4221 "pandn %%mm0, %%mm7 \n\t"
4222 "pxor %%mm1, %%mm1 \n\t"
4223 "paddw %%mm3, %%mm7 \n\t"
4224 "pxor %%mm0, %%mm0 \n\t"
7f88f624 4225 /* step ecx to next set of 8 bytes and repeat loop til done */
c6b71bff
GD
4226 "addl $8, %%ecx \n\t"
4227 "packuswb %%mm7, %%mm1 \n\t"
7f88f624 4228 "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" /* add Paeth predictor with Raw(x) */
c6b71bff 4229 "cmpl _MMXLength, %%ecx \n\t"
7f88f624
VZ
4230 "movq %%mm1, -8(%%edi,%%ecx,) \n\t" /* write back updated value */
4231 /* mm1 will be used as Raw(x-bpp) next loop */
c6b71bff
GD
4232 "jb paeth_8lp \n\t"
4233
7f88f624 4234 : "=S" (dummy_value_S), /* output regs (dummy) */
c6b71bff
GD
4235 "=D" (dummy_value_D)
4236
7f88f624
VZ
4237 : "0" (prev_row), /* esi // input regs */
4238 "1" (row) /* edi */
c6b71bff 4239
7f88f624 4240 : "%ecx" /* clobber list */
c6b71bff
GD
4241#if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
4242 , "%mm0", "%mm1", "%mm2", "%mm3"
4243 , "%mm4", "%mm5", "%mm6", "%mm7"
4244#endif
4245 );
4246 }
7f88f624 4247 break; /* end 8 bpp */
c6b71bff 4248
7f88f624
VZ
4249 case 1: /* bpp = 1 */
4250 case 2: /* bpp = 2 */
4251 default: /* bpp > 8 */
c6b71bff
GD
4252 {
4253 __asm__ __volatile__ (
4254#ifdef __PIC__
7f88f624 4255 "pushl %%ebx \n\t" /* save Global Offset Table index */
c6b71bff
GD
4256#endif
4257 "movl _dif, %%ebx \n\t"
4258 "cmpl _FullLength, %%ebx \n\t"
4259 "jnb paeth_dend \n\t"
4260
7f88f624
VZ
4261/* preload "movl row, %%edi \n\t" */
4262/* preload "movl prev_row, %%esi \n\t" */
4263 /* do Paeth decode for remaining bytes */
c6b71bff 4264 "movl %%ebx, %%edx \n\t"
7f88f624
VZ
4265/* preload "subl bpp, %%edx \n\t" // (bpp is preloaded into ecx) */
4266 "subl %%ecx, %%edx \n\t" /* edx = ebx - bpp */
4267 "xorl %%ecx, %%ecx \n\t" /* zero ecx before using cl & cx */
c6b71bff
GD
4268
4269 "paeth_dlp: \n\t"
4270 "xorl %%eax, %%eax \n\t"
7f88f624
VZ
4271 /* pav = p - a = (a + b - c) - a = b - c */
4272 "movb (%%esi,%%ebx,), %%al \n\t" /* load Prior(x) into al */
4273 "movb (%%esi,%%edx,), %%cl \n\t" /* load Prior(x-bpp) into cl */
4274 "subl %%ecx, %%eax \n\t" /* subtract Prior(x-bpp) */
4275 "movl %%eax, _patemp \n\t" /* Save pav for later use */
c6b71bff 4276 "xorl %%eax, %%eax \n\t"
7f88f624
VZ
4277 /* pbv = p - b = (a + b - c) - b = a - c */
4278 "movb (%%edi,%%edx,), %%al \n\t" /* load Raw(x-bpp) into al */
4279 "subl %%ecx, %%eax \n\t" /* subtract Prior(x-bpp) */
c6b71bff 4280 "movl %%eax, %%ecx \n\t"
7f88f624
VZ
4281 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
4282 "addl _patemp, %%eax \n\t" /* pcv = pav + pbv */
4283 /* pc = abs(pcv) */
c6b71bff
GD
4284 "testl $0x80000000, %%eax \n\t"
4285 "jz paeth_dpca \n\t"
7f88f624 4286 "negl %%eax \n\t" /* reverse sign of neg values */
c6b71bff
GD
4287
4288 "paeth_dpca: \n\t"
7f88f624
VZ
4289 "movl %%eax, _pctemp \n\t" /* save pc for later use */
4290 /* pb = abs(pbv) */
c6b71bff
GD
4291 "testl $0x80000000, %%ecx \n\t"
4292 "jz paeth_dpba \n\t"
7f88f624 4293 "negl %%ecx \n\t" /* reverse sign of neg values */
c6b71bff
GD
4294
4295 "paeth_dpba: \n\t"
7f88f624
VZ
4296 "movl %%ecx, _pbtemp \n\t" /* save pb for later use */
4297 /* pa = abs(pav) */
c6b71bff
GD
4298 "movl _patemp, %%eax \n\t"
4299 "testl $0x80000000, %%eax \n\t"
4300 "jz paeth_dpaa \n\t"
7f88f624 4301 "negl %%eax \n\t" /* reverse sign of neg values */
c6b71bff
GD
4302
4303 "paeth_dpaa: \n\t"
7f88f624
VZ
4304 "movl %%eax, _patemp \n\t" /* save pa for later use */
4305 /* test if pa <= pb */
c6b71bff
GD
4306 "cmpl %%ecx, %%eax \n\t"
4307 "jna paeth_dabb \n\t"
7f88f624 4308 /* pa > pb; now test if pb <= pc */
c6b71bff
GD
4309 "cmpl _pctemp, %%ecx \n\t"
4310 "jna paeth_dbbc \n\t"
7f88f624
VZ
4311 /* pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp) */
4312 "movb (%%esi,%%edx,), %%cl \n\t" /* load Prior(x-bpp) into cl */
c6b71bff
GD
4313 "jmp paeth_dpaeth \n\t"
4314
4315 "paeth_dbbc: \n\t"
7f88f624
VZ
4316 /* pb <= pc; Raw(x) = Paeth(x) + Prior(x) */
4317 "movb (%%esi,%%ebx,), %%cl \n\t" /* load Prior(x) into cl */
c6b71bff
GD
4318 "jmp paeth_dpaeth \n\t"
4319
4320 "paeth_dabb: \n\t"
7f88f624 4321 /* pa <= pb; now test if pa <= pc */
c6b71bff
GD
4322 "cmpl _pctemp, %%eax \n\t"
4323 "jna paeth_dabc \n\t"
7f88f624
VZ
4324 /* pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp) */
4325 "movb (%%esi,%%edx,), %%cl \n\t" /* load Prior(x-bpp) into cl */
c6b71bff
GD
4326 "jmp paeth_dpaeth \n\t"
4327
4328 "paeth_dabc: \n\t"
7f88f624
VZ
4329 /* pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp) */
4330 "movb (%%edi,%%edx,), %%cl \n\t" /* load Raw(x-bpp) into cl */
c6b71bff
GD
4331
4332 "paeth_dpaeth: \n\t"
4333 "incl %%ebx \n\t"
4334 "incl %%edx \n\t"
7f88f624 4335 /* Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256 */
c6b71bff
GD
4336 "addb %%cl, -1(%%edi,%%ebx,) \n\t"
4337 "cmpl _FullLength, %%ebx \n\t"
4338 "jb paeth_dlp \n\t"
4339
4340 "paeth_dend: \n\t"
4341#ifdef __PIC__
7f88f624 4342 "popl %%ebx \n\t" /* index to Global Offset Table */
c6b71bff
GD
4343#endif
4344
7f88f624 4345 : "=c" (dummy_value_c), /* output regs (dummy) */
c6b71bff
GD
4346 "=S" (dummy_value_S),
4347 "=D" (dummy_value_D)
4348
7f88f624
VZ
4349 : "0" (bpp), /* ecx // input regs */
4350 "1" (prev_row), /* esi */
4351 "2" (row) /* edi */
c6b71bff 4352
7f88f624 4353 : "%eax", "%edx" /* clobber list */
c6b71bff
GD
4354#ifndef __PIC__
4355 , "%ebx"
4356#endif
4357 );
4358 }
7f88f624 4359 return; /* No need to go further with this one */
c6b71bff 4360
7f88f624 4361 } /* end switch (bpp) */
c6b71bff
GD
4362
4363 __asm__ __volatile__ (
7f88f624
VZ
4364 /* MMX acceleration complete; now do clean-up */
4365 /* check if any remaining bytes left to decode */
c6b71bff 4366#ifdef __PIC__
7f88f624 4367 "pushl %%ebx \n\t" /* save index to Global Offset Table */
c6b71bff
GD
4368#endif
4369 "movl _MMXLength, %%ebx \n\t"
4370 "cmpl _FullLength, %%ebx \n\t"
4371 "jnb paeth_end \n\t"
7f88f624
VZ
4372/*pre "movl row, %%edi \n\t" */
4373/*pre "movl prev_row, %%esi \n\t" */
4374 /* do Paeth decode for remaining bytes */
c6b71bff 4375 "movl %%ebx, %%edx \n\t"
7f88f624
VZ
4376/*pre "subl bpp, %%edx \n\t" */ /* (bpp is preloaded into ecx) */
4377 "subl %%ecx, %%edx \n\t" /* edx = ebx - bpp */
4378 "xorl %%ecx, %%ecx \n\t" /* zero ecx before using cl & cx below */
c6b71bff
GD
4379
4380 "paeth_lp2: \n\t"
4381 "xorl %%eax, %%eax \n\t"
7f88f624
VZ
4382 /* pav = p - a = (a + b - c) - a = b - c */
4383 "movb (%%esi,%%ebx,), %%al \n\t" /* load Prior(x) into al */
4384 "movb (%%esi,%%edx,), %%cl \n\t" /* load Prior(x-bpp) into cl */
4385 "subl %%ecx, %%eax \n\t" /* subtract Prior(x-bpp) */
4386 "movl %%eax, _patemp \n\t" /* Save pav for later use */
c6b71bff 4387 "xorl %%eax, %%eax \n\t"
7f88f624
VZ
4388 /* pbv = p - b = (a + b - c) - b = a - c */
4389 "movb (%%edi,%%edx,), %%al \n\t" /* load Raw(x-bpp) into al */
4390 "subl %%ecx, %%eax \n\t" /* subtract Prior(x-bpp) */
c6b71bff 4391 "movl %%eax, %%ecx \n\t"
7f88f624
VZ
4392 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
4393 "addl _patemp, %%eax \n\t" /* pcv = pav + pbv */
4394 /* pc = abs(pcv) */
c6b71bff
GD
4395 "testl $0x80000000, %%eax \n\t"
4396 "jz paeth_pca2 \n\t"
7f88f624 4397 "negl %%eax \n\t" /* reverse sign of neg values */
c6b71bff
GD
4398
4399 "paeth_pca2: \n\t"
7f88f624
VZ
4400 "movl %%eax, _pctemp \n\t" /* save pc for later use */
4401 /* pb = abs(pbv) */
c6b71bff
GD
4402 "testl $0x80000000, %%ecx \n\t"
4403 "jz paeth_pba2 \n\t"
7f88f624 4404 "negl %%ecx \n\t" /* reverse sign of neg values */
c6b71bff
GD
4405
4406 "paeth_pba2: \n\t"
7f88f624
VZ
4407 "movl %%ecx, _pbtemp \n\t" /* save pb for later use */
4408 /* pa = abs(pav) */
c6b71bff
GD
4409 "movl _patemp, %%eax \n\t"
4410 "testl $0x80000000, %%eax \n\t"
4411 "jz paeth_paa2 \n\t"
7f88f624 4412 "negl %%eax \n\t" /* reverse sign of neg values */
c6b71bff
GD
4413
4414 "paeth_paa2: \n\t"
7f88f624
VZ
4415 "movl %%eax, _patemp \n\t" /* save pa for later use */
4416 /* test if pa <= pb */
c6b71bff
GD
4417 "cmpl %%ecx, %%eax \n\t"
4418 "jna paeth_abb2 \n\t"
7f88f624 4419 /* pa > pb; now test if pb <= pc */
c6b71bff
GD
4420 "cmpl _pctemp, %%ecx \n\t"
4421 "jna paeth_bbc2 \n\t"
7f88f624
VZ
4422 /* pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp) */
4423 "movb (%%esi,%%edx,), %%cl \n\t" /* load Prior(x-bpp) into cl */
c6b71bff
GD
4424 "jmp paeth_paeth2 \n\t"
4425
4426 "paeth_bbc2: \n\t"
7f88f624
VZ
4427 /* pb <= pc; Raw(x) = Paeth(x) + Prior(x) */
4428 "movb (%%esi,%%ebx,), %%cl \n\t" /* load Prior(x) into cl */
c6b71bff
GD
4429 "jmp paeth_paeth2 \n\t"
4430
4431 "paeth_abb2: \n\t"
7f88f624 4432 /* pa <= pb; now test if pa <= pc */
c6b71bff
GD
4433 "cmpl _pctemp, %%eax \n\t"
4434 "jna paeth_abc2 \n\t"
7f88f624
VZ
4435 /* pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp) */
4436 "movb (%%esi,%%edx,), %%cl \n\t" /* load Prior(x-bpp) into cl */
c6b71bff
GD
4437 "jmp paeth_paeth2 \n\t"
4438
4439 "paeth_abc2: \n\t"
7f88f624
VZ
4440 /* pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp) */
4441 "movb (%%edi,%%edx,), %%cl \n\t" /* load Raw(x-bpp) into cl */
c6b71bff
GD
4442
4443 "paeth_paeth2: \n\t"
4444 "incl %%ebx \n\t"
4445 "incl %%edx \n\t"
7f88f624 4446 /* Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256 */
c6b71bff
GD
4447 "addb %%cl, -1(%%edi,%%ebx,) \n\t"
4448 "cmpl _FullLength, %%ebx \n\t"
4449 "jb paeth_lp2 \n\t"
4450
4451 "paeth_end: \n\t"
7f88f624 4452 "EMMS \n\t" /* end MMX; prep for poss. FP instrs. */
c6b71bff 4453#ifdef __PIC__
7f88f624 4454 "popl %%ebx \n\t" /* restore index to Global Offset Table */
c6b71bff
GD
4455#endif
4456
7f88f624 4457 : "=c" (dummy_value_c), /* output regs (dummy) */
c6b71bff
GD
4458 "=S" (dummy_value_S),
4459 "=D" (dummy_value_D)
4460
7f88f624
VZ
4461 : "0" (bpp), /* ecx // input regs */
4462 "1" (prev_row), /* esi */
4463 "2" (row) /* edi */
c6b71bff 4464
7f88f624 4465 : "%eax", "%edx" /* clobber list (no input regs!) */
c6b71bff
GD
4466#ifndef __PIC__
4467 , "%ebx"
4468#endif
4469 );
4470
4471} /* end png_read_filter_row_mmx_paeth() */
4472#endif
4473
4474
4475
4476
4477#ifdef PNG_THREAD_UNSAFE_OK
7f88f624
VZ
4478/*===========================================================================*/
4479/* */
4480/* P N G _ R E A D _ F I L T E R _ R O W _ M M X _ S U B */
4481/* */
4482/*===========================================================================*/
c6b71bff 4483
7f88f624 4484/* Optimized code for PNG Sub filter decoder */
c6b71bff
GD
4485
4486static void /* PRIVATE */
4487png_read_filter_row_mmx_sub(png_row_infop row_info, png_bytep row)
4488{
4489 int bpp;
4490 int dummy_value_a;
4491 int dummy_value_D;
4492
7f88f624
VZ
4493 bpp = (row_info->pixel_depth + 7) >> 3; /* calc number of bytes per pixel */
4494 _FullLength = row_info->rowbytes - bpp; /* number of bytes to filter */
c6b71bff
GD
4495
4496 __asm__ __volatile__ (
7f88f624
VZ
4497/*pre "movl row, %%edi \n\t" */
4498 "movl %%edi, %%esi \n\t" /* lp = row */
4499/*pre "movl bpp, %%eax \n\t" */
4500 "addl %%eax, %%edi \n\t" /* rp = row + bpp */
4501/*irr "xorl %%eax, %%eax \n\t" */
4502 /* get # of bytes to alignment */
4503 "movl %%edi, _dif \n\t" /* take start of row */
4504 "addl $0xf, _dif \n\t" /* add 7 + 8 to incr past */
4505 /* alignment boundary */
c6b71bff 4506 "xorl %%ecx, %%ecx \n\t"
7f88f624
VZ
4507 "andl $0xfffffff8, _dif \n\t" /* mask to alignment boundary */
4508 "subl %%edi, _dif \n\t" /* subtract from start ==> value */
4509 "jz sub_go \n\t" /* ecx at alignment */
c6b71bff 4510
7f88f624 4511 "sub_lp1: \n\t" /* fix alignment */
c6b71bff
GD
4512 "movb (%%esi,%%ecx,), %%al \n\t"
4513 "addb %%al, (%%edi,%%ecx,) \n\t"
4514 "incl %%ecx \n\t"
4515 "cmpl _dif, %%ecx \n\t"
4516 "jb sub_lp1 \n\t"
4517
4518 "sub_go: \n\t"
4519 "movl _FullLength, %%eax \n\t"
4520 "movl %%eax, %%edx \n\t"
7f88f624
VZ
4521 "subl %%ecx, %%edx \n\t" /* subtract alignment fix */
4522 "andl $0x00000007, %%edx \n\t" /* calc bytes over mult of 8 */
4523 "subl %%edx, %%eax \n\t" /* drop over bytes from length */
c6b71bff
GD
4524 "movl %%eax, _MMXLength \n\t"
4525
7f88f624
VZ
4526 : "=a" (dummy_value_a), /* 0 // output regs (dummy) */
4527 "=D" (dummy_value_D) /* 1 */
c6b71bff 4528
7f88f624
VZ
4529 : "0" (bpp), /* eax // input regs */
4530 "1" (row) /* edi */
c6b71bff 4531
7f88f624 4532 : "%ebx", "%ecx", "%edx" /* clobber list */
c6b71bff
GD
4533 , "%esi"
4534
4535#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4536 , "%mm0", "%mm1", "%mm2", "%mm3"
4537 , "%mm4", "%mm5", "%mm6", "%mm7"
4538#endif
4539 );
4540
7f88f624 4541 /* now do the math for the rest of the row */
c6b71bff
GD
4542 switch (bpp)
4543 {
4544 case 3:
4545 {
4546 _ActiveMask.use = 0x0000ffffff000000LL;
7f88f624
VZ
4547 _ShiftBpp.use = 24; /* == 3 * 8 */
4548 _ShiftRem.use = 40; /* == 64 - 24 */
c6b71bff
GD
4549
4550 __asm__ __volatile__ (
7f88f624
VZ
4551/* preload "movl row, %%edi \n\t" */
4552 "movq _ActiveMask, %%mm7 \n\t" /* load _ActiveMask for 2nd */
4553 /* active byte group */
4554 "movl %%edi, %%esi \n\t" /* lp = row */
4555/* preload "movl bpp, %%eax \n\t" */
4556 "addl %%eax, %%edi \n\t" /* rp = row + bpp */
c6b71bff
GD
4557 "movq %%mm7, %%mm6 \n\t"
4558 "movl _dif, %%edx \n\t"
7f88f624
VZ
4559 "psllq _ShiftBpp, %%mm6 \n\t" /* move mask in mm6 to cover */
4560 /* 3rd active byte group */
4561 /* prime the pump: load the first Raw(x-bpp) data set */
c6b71bff
GD
4562 "movq -8(%%edi,%%edx,), %%mm1 \n\t"
4563
7f88f624
VZ
4564 "sub_3lp: \n\t" /* shift data for adding first */
4565 "psrlq _ShiftRem, %%mm1 \n\t" /* bpp bytes (no need for mask; */
4566 /* shift clears inactive bytes) */
4567 /* add 1st active group */
c6b71bff
GD
4568 "movq (%%edi,%%edx,), %%mm0 \n\t"
4569 "paddb %%mm1, %%mm0 \n\t"
4570
7f88f624
VZ
4571 /* add 2nd active group */
4572 "movq %%mm0, %%mm1 \n\t" /* mov updated Raws to mm1 */
4573 "psllq _ShiftBpp, %%mm1 \n\t" /* shift data to pos. correctly */
4574 "pand %%mm7, %%mm1 \n\t" /* mask to use 2nd active group */
c6b71bff
GD
4575 "paddb %%mm1, %%mm0 \n\t"
4576
7f88f624
VZ
4577 /* add 3rd active group */
4578 "movq %%mm0, %%mm1 \n\t" /* mov updated Raws to mm1 */
4579 "psllq _ShiftBpp, %%mm1 \n\t" /* shift data to pos. correctly */
4580 "pand %%mm6, %%mm1 \n\t" /* mask to use 3rd active group */
c6b71bff
GD
4581 "addl $8, %%edx \n\t"
4582 "paddb %%mm1, %%mm0 \n\t"
4583
4584 "cmpl _MMXLength, %%edx \n\t"
7f88f624
VZ
4585 "movq %%mm0, -8(%%edi,%%edx,) \n\t" /* write updated Raws to array */
4586 "movq %%mm0, %%mm1 \n\t" /* prep 1st add at top of loop */
c6b71bff
GD
4587 "jb sub_3lp \n\t"
4588
7f88f624
VZ
4589 : "=a" (dummy_value_a), /* 0 // output regs (dummy) */
4590 "=D" (dummy_value_D) /* 1 */
c6b71bff 4591
7f88f624
VZ
4592 : "0" (bpp), /* eax // input regs */
4593 "1" (row) /* edi */
c6b71bff 4594
7f88f624 4595 : "%edx", "%esi" /* clobber list */
c6b71bff
GD
4596#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4597 , "%mm0", "%mm1", "%mm6", "%mm7"
4598#endif
4599 );
4600 }
4601 break;
4602
4603 case 1:
4604 {
4605 __asm__ __volatile__ (
4606 "movl _dif, %%edx \n\t"
7f88f624 4607/* preload "movl row, %%edi \n\t" */
c6b71bff
GD
4608 "cmpl _FullLength, %%edx \n\t"
4609 "jnb sub_1end \n\t"
7f88f624 4610 "movl %%edi, %%esi \n\t" /* lp = row */
c6b71bff 4611 "xorl %%eax, %%eax \n\t"
7f88f624
VZ
4612/* preload "movl bpp, %%eax \n\t" */
4613 "addl %%eax, %%edi \n\t" /* rp = row + bpp */
c6b71bff
GD
4614
4615 "sub_1lp: \n\t"
4616 "movb (%%esi,%%edx,), %%al \n\t"
4617 "addb %%al, (%%edi,%%edx,) \n\t"
4618 "incl %%edx \n\t"
4619 "cmpl _FullLength, %%edx \n\t"
4620 "jb sub_1lp \n\t"
4621
4622 "sub_1end: \n\t"
4623
7f88f624
VZ
4624 : "=a" (dummy_value_a), /* 0 // output regs (dummy) */
4625 "=D" (dummy_value_D) /* 1 */
c6b71bff 4626
7f88f624
VZ
4627 : "0" (bpp), /* eax // input regs */
4628 "1" (row) /* edi */
c6b71bff 4629
7f88f624 4630 : "%edx", "%esi" /* clobber list */
c6b71bff
GD
4631 );
4632 }
4633 return;
4634
4635 case 6:
4636 case 4:
7f88f624
VZ
4637 //case 7: /* GRR BOGUS */
4638 //case 5: /* GRR BOGUS */
c6b71bff
GD
4639 {
4640 _ShiftBpp.use = bpp << 3;
4641 _ShiftRem.use = 64 - _ShiftBpp.use;
4642
4643 __asm__ __volatile__ (
7f88f624 4644/* preload "movl row, %%edi \n\t" */
c6b71bff 4645 "movl _dif, %%edx \n\t"
7f88f624
VZ
4646 "movl %%edi, %%esi \n\t" /* lp = row */
4647/* preload "movl bpp, %%eax \n\t" */
4648 "addl %%eax, %%edi \n\t" /* rp = row + bpp */
c6b71bff 4649
7f88f624 4650 /* prime the pump: load the first Raw(x-bpp) data set */
c6b71bff
GD
4651 "movq -8(%%edi,%%edx,), %%mm1 \n\t"
4652
7f88f624
VZ
4653 "sub_4lp: \n\t" /* shift data for adding first */
4654 "psrlq _ShiftRem, %%mm1 \n\t" /* bpp bytes (no need for mask; */
4655 /* shift clears inactive bytes) */
c6b71bff
GD
4656 "movq (%%edi,%%edx,), %%mm0 \n\t"
4657 "paddb %%mm1, %%mm0 \n\t"
4658
7f88f624
VZ
4659 /* add 2nd active group */
4660 "movq %%mm0, %%mm1 \n\t" /* mov updated Raws to mm1 */
4661 "psllq _ShiftBpp, %%mm1 \n\t" /* shift data to pos. correctly */
c6b71bff
GD
4662 "addl $8, %%edx \n\t"
4663 "paddb %%mm1, %%mm0 \n\t"
4664
4665 "cmpl _MMXLength, %%edx \n\t"
4666 "movq %%mm0, -8(%%edi,%%edx,) \n\t"
7f88f624 4667 "movq %%mm0, %%mm1 \n\t" /* prep 1st add at top of loop */
c6b71bff
GD
4668 "jb sub_4lp \n\t"
4669
7f88f624
VZ
4670 : "=a" (dummy_value_a), /* 0 // output regs (dummy) */
4671 "=D" (dummy_value_D) /* 1 */
c6b71bff 4672
7f88f624
VZ
4673 : "0" (bpp), /* eax // input regs */
4674 "1" (row) /* edi */
c6b71bff 4675
7f88f624 4676 : "%edx", "%esi" /* clobber list */
c6b71bff
GD
4677#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4678 , "%mm0", "%mm1"
4679#endif
4680 );
4681 }
4682 break;
4683
4684 case 2:
4685 {
4686 _ActiveMask.use = 0x00000000ffff0000LL;
7f88f624
VZ
4687 _ShiftBpp.use = 16; /* == 2 * 8 */
4688 _ShiftRem.use = 48; /* == 64 - 16 */
c6b71bff
GD
4689
4690 __asm__ __volatile__ (
7f88f624
VZ
4691 "movq _ActiveMask, %%mm7 \n\t" /* load _ActiveMask for 2nd */
4692 /* active byte group */
c6b71bff
GD
4693 "movl _dif, %%edx \n\t"
4694 "movq %%mm7, %%mm6 \n\t"
7f88f624
VZ
4695/* preload "movl row, %%edi \n\t" */
4696 "psllq _ShiftBpp, %%mm6 \n\t" /* move mask in mm6 to cover */
4697 /* 3rd active byte group */
4698 "movl %%edi, %%esi \n\t" /* lp = row */
c6b71bff 4699 "movq %%mm6, %%mm5 \n\t"
7f88f624
VZ
4700/* preload "movl bpp, %%eax \n\t" */
4701 "addl %%eax, %%edi \n\t" /* rp = row + bpp */
4702 "psllq _ShiftBpp, %%mm5 \n\t" /* move mask in mm5 to cover */
4703 /* 4th active byte group */
4704 /* prime the pump: load the first Raw(x-bpp) data set */
c6b71bff
GD
4705 "movq -8(%%edi,%%edx,), %%mm1 \n\t"
4706
7f88f624
VZ
4707 "sub_2lp: \n\t" /* shift data for adding first */
4708 "psrlq _ShiftRem, %%mm1 \n\t" /* bpp bytes (no need for mask; */
4709 /* shift clears inactive bytes) */
4710 /* add 1st active group */
c6b71bff
GD
4711 "movq (%%edi,%%edx,), %%mm0 \n\t"
4712 "paddb %%mm1, %%mm0 \n\t"
4713
7f88f624
VZ
4714 /* add 2nd active group */
4715 "movq %%mm0, %%mm1 \n\t" /* mov updated Raws to mm1 */
4716 "psllq _ShiftBpp, %%mm1 \n\t" /* shift data to pos. correctly */
4717 "pand %%mm7, %%mm1 \n\t" /* mask to use 2nd active group */
c6b71bff
GD
4718 "paddb %%mm1, %%mm0 \n\t"
4719
7f88f624
VZ
4720 /* add 3rd active group */
4721 "movq %%mm0, %%mm1 \n\t" /* mov updated Raws to mm1 */
4722 "psllq _ShiftBpp, %%mm1 \n\t" /* shift data to pos. correctly */
4723 "pand %%mm6, %%mm1 \n\t" /* mask to use 3rd active group */
c6b71bff
GD
4724 "paddb %%mm1, %%mm0 \n\t"
4725
7f88f624
VZ
4726 /* add 4th active group */
4727 "movq %%mm0, %%mm1 \n\t" /* mov updated Raws to mm1 */
4728 "psllq _ShiftBpp, %%mm1 \n\t" /* shift data to pos. correctly */
4729 "pand %%mm5, %%mm1 \n\t" /* mask to use 4th active group */
c6b71bff
GD
4730 "addl $8, %%edx \n\t"
4731 "paddb %%mm1, %%mm0 \n\t"
4732 "cmpl _MMXLength, %%edx \n\t"
7f88f624
VZ
4733 "movq %%mm0, -8(%%edi,%%edx,) \n\t" /* write updated Raws to array */
4734 "movq %%mm0, %%mm1 \n\t" /* prep 1st add at top of loop */
c6b71bff
GD
4735 "jb sub_2lp \n\t"
4736
7f88f624
VZ
4737 : "=a" (dummy_value_a), /* 0 // output regs (dummy) */
4738 "=D" (dummy_value_D) /* 1 */
c6b71bff 4739
7f88f624
VZ
4740 : "0" (bpp), /* eax // input regs */
4741 "1" (row) /* edi */
c6b71bff 4742
7f88f624 4743 : "%edx", "%esi" /* clobber list */
c6b71bff
GD
4744#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4745 , "%mm0", "%mm1", "%mm5", "%mm6", "%mm7"
4746#endif
4747 );
4748 }
4749 break;
4750
4751 case 8:
4752 {
4753 __asm__ __volatile__ (
7f88f624 4754/* preload "movl row, %%edi \n\t" */
c6b71bff 4755 "movl _dif, %%edx \n\t"
7f88f624
VZ
4756 "movl %%edi, %%esi \n\t" /* lp = row */
4757/* preload "movl bpp, %%eax \n\t" */
4758 "addl %%eax, %%edi \n\t" /* rp = row + bpp */
c6b71bff
GD
4759 "movl _MMXLength, %%ecx \n\t"
4760
7f88f624 4761 /* prime the pump: load the first Raw(x-bpp) data set */
c6b71bff 4762 "movq -8(%%edi,%%edx,), %%mm7 \n\t"
7f88f624 4763 "andl $0x0000003f, %%ecx \n\t" /* calc bytes over mult of 64 */
c6b71bff
GD
4764
4765 "sub_8lp: \n\t"
7f88f624 4766 "movq (%%edi,%%edx,), %%mm0 \n\t" /* load Sub(x) for 1st 8 bytes */
c6b71bff 4767 "paddb %%mm7, %%mm0 \n\t"
7f88f624
VZ
4768 "movq 8(%%edi,%%edx,), %%mm1 \n\t" /* load Sub(x) for 2nd 8 bytes */
4769 "movq %%mm0, (%%edi,%%edx,) \n\t" /* write Raw(x) for 1st 8 bytes */
c6b71bff 4770
7f88f624
VZ
4771 /* Now mm0 will be used as Raw(x-bpp) for the 2nd group of 8 bytes. */
4772 /* This will be repeated for each group of 8 bytes with the 8th */
4773 /* group being used as the Raw(x-bpp) for the 1st group of the */
4774 /* next loop. */
c6b71bff
GD
4775
4776 "paddb %%mm0, %%mm1 \n\t"
7f88f624
VZ
4777 "movq 16(%%edi,%%edx,), %%mm2 \n\t" /* load Sub(x) for 3rd 8 bytes */
4778 "movq %%mm1, 8(%%edi,%%edx,) \n\t" /* write Raw(x) for 2nd 8 bytes */
c6b71bff 4779 "paddb %%mm1, %%mm2 \n\t"
7f88f624
VZ
4780 "movq 24(%%edi,%%edx,), %%mm3 \n\t" /* load Sub(x) for 4th 8 bytes */
4781 "movq %%mm2, 16(%%edi,%%edx,) \n\t" /* write Raw(x) for 3rd 8 bytes */
c6b71bff 4782 "paddb %%mm2, %%mm3 \n\t"
7f88f624
VZ
4783 "movq 32(%%edi,%%edx,), %%mm4 \n\t" /* load Sub(x) for 5th 8 bytes */
4784 "movq %%mm3, 24(%%edi,%%edx,) \n\t" /* write Raw(x) for 4th 8 bytes */
c6b71bff 4785 "paddb %%mm3, %%mm4 \n\t"
7f88f624
VZ
4786 "movq 40(%%edi,%%edx,), %%mm5 \n\t" /* load Sub(x) for 6th 8 bytes */
4787 "movq %%mm4, 32(%%edi,%%edx,) \n\t" /* write Raw(x) for 5th 8 bytes */
c6b71bff 4788 "paddb %%mm4, %%mm5 \n\t"
7f88f624
VZ
4789 "movq 48(%%edi,%%edx,), %%mm6 \n\t" /* load Sub(x) for 7th 8 bytes */
4790 "movq %%mm5, 40(%%edi,%%edx,) \n\t" /* write Raw(x) for 6th 8 bytes */
c6b71bff 4791 "paddb %%mm5, %%mm6 \n\t"
7f88f624
VZ
4792 "movq 56(%%edi,%%edx,), %%mm7 \n\t" /* load Sub(x) for 8th 8 bytes */
4793 "movq %%mm6, 48(%%edi,%%edx,) \n\t" /* write Raw(x) for 7th 8 bytes */
c6b71bff
GD
4794 "addl $64, %%edx \n\t"
4795 "paddb %%mm6, %%mm7 \n\t"
4796 "cmpl %%ecx, %%edx \n\t"
7f88f624 4797 "movq %%mm7, -8(%%edi,%%edx,) \n\t" /* write Raw(x) for 8th 8 bytes */
c6b71bff
GD
4798 "jb sub_8lp \n\t"
4799
4800 "cmpl _MMXLength, %%edx \n\t"
4801 "jnb sub_8lt8 \n\t"
4802
4803 "sub_8lpA: \n\t"
4804 "movq (%%edi,%%edx,), %%mm0 \n\t"
4805 "addl $8, %%edx \n\t"
4806 "paddb %%mm7, %%mm0 \n\t"
4807 "cmpl _MMXLength, %%edx \n\t"
7f88f624
VZ
4808 "movq %%mm0, -8(%%edi,%%edx,) \n\t" /* -8 to offset early addl edx */
4809 "movq %%mm0, %%mm7 \n\t" /* move calculated Raw(x) data */
4810 /* to mm1 to be new Raw(x-bpp) */
4811 /* for next loop */
c6b71bff
GD
4812 "jb sub_8lpA \n\t"
4813
4814 "sub_8lt8: \n\t"
4815
7f88f624
VZ
4816 : "=a" (dummy_value_a), /* 0 // output regs (dummy) */
4817 "=D" (dummy_value_D) /* 1 */
c6b71bff 4818
7f88f624
VZ
4819 : "0" (bpp), /* eax // input regs */
4820 "1" (row) /* edi */
c6b71bff 4821
7f88f624 4822 : "%ecx", "%edx", "%esi" /* clobber list */
c6b71bff
GD
4823#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4824 , "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7"
4825#endif
4826 );
4827 }
4828 break;
4829
7f88f624 4830 default: /* bpp greater than 8 bytes GRR BOGUS */
c6b71bff
GD
4831 {
4832 __asm__ __volatile__ (
4833 "movl _dif, %%edx \n\t"
7f88f624
VZ
4834/* preload "movl row, %%edi \n\t" */
4835 "movl %%edi, %%esi \n\t" /* lp = row */
4836/* preload "movl bpp, %%eax \n\t" */
4837 "addl %%eax, %%edi \n\t" /* rp = row + bpp */
c6b71bff
GD
4838
4839 "sub_Alp: \n\t"
4840 "movq (%%edi,%%edx,), %%mm0 \n\t"
4841 "movq (%%esi,%%edx,), %%mm1 \n\t"
4842 "addl $8, %%edx \n\t"
4843 "paddb %%mm1, %%mm0 \n\t"
4844 "cmpl _MMXLength, %%edx \n\t"
7f88f624
VZ
4845 "movq %%mm0, -8(%%edi,%%edx,) \n\t" /* mov does not affect flags; */
4846 /* -8 to offset addl edx */
c6b71bff
GD
4847 "jb sub_Alp \n\t"
4848
7f88f624
VZ
4849 : "=a" (dummy_value_a), /* 0 // output regs (dummy) */
4850 "=D" (dummy_value_D) /* 1 */
c6b71bff 4851
7f88f624
VZ
4852 : "0" (bpp), /* eax // input regs */
4853 "1" (row) /* edi */
c6b71bff 4854
7f88f624 4855 : "%edx", "%esi" /* clobber list */
c6b71bff
GD
4856#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4857 , "%mm0", "%mm1"
4858#endif
4859 );
4860 }
4861 break;
4862
7f88f624 4863 } /* end switch (bpp) */
c6b71bff
GD
4864
4865 __asm__ __volatile__ (
4866 "movl _MMXLength, %%edx \n\t"
7f88f624 4867/* pre "movl row, %%edi \n\t" */
c6b71bff
GD
4868 "cmpl _FullLength, %%edx \n\t"
4869 "jnb sub_end \n\t"
4870
7f88f624
VZ
4871 "movl %%edi, %%esi \n\t" /* lp = row */
4872/* pre "movl bpp, %%eax \n\t" */
4873 "addl %%eax, %%edi \n\t" /* rp = row + bpp */
c6b71bff
GD
4874 "xorl %%eax, %%eax \n\t"
4875
4876 "sub_lp2: \n\t"
4877 "movb (%%esi,%%edx,), %%al \n\t"
4878 "addb %%al, (%%edi,%%edx,) \n\t"
4879 "incl %%edx \n\t"
4880 "cmpl _FullLength, %%edx \n\t"
4881 "jb sub_lp2 \n\t"
4882
4883 "sub_end: \n\t"
7f88f624 4884 "EMMS \n\t" /* end MMX instructions */
c6b71bff 4885
7f88f624
VZ
4886 : "=a" (dummy_value_a), /* 0 // output regs (dummy) */
4887 "=D" (dummy_value_D) /* 1 */
c6b71bff 4888
7f88f624
VZ
4889 : "0" (bpp), /* eax // input regs */
4890 "1" (row) /* edi */
c6b71bff 4891
7f88f624 4892 : "%edx", "%esi" /* clobber list */
c6b71bff
GD
4893 );
4894
7f88f624 4895} /* end of png_read_filter_row_mmx_sub() */
c6b71bff
GD
4896#endif
4897
4898
4899
4900
7f88f624
VZ
4901/*===========================================================================*/
4902/* */
4903/* P N G _ R E A D _ F I L T E R _ R O W _ M M X _ U P */
4904/* */
4905/*===========================================================================*/
c6b71bff 4906
7f88f624 4907/* Optimized code for PNG Up filter decoder */
c6b71bff
GD
4908
4909static void /* PRIVATE */
4910png_read_filter_row_mmx_up(png_row_infop row_info, png_bytep row,
4911 png_bytep prev_row)
4912{
4913 png_uint_32 len;
7f88f624 4914 int dummy_value_d; /* fix 'forbidden register 3 (dx) was spilled' error */
c6b71bff
GD
4915 int dummy_value_S;
4916 int dummy_value_D;
4917
7f88f624 4918 len = row_info->rowbytes; /* number of bytes to filter */
c6b71bff
GD
4919
4920 __asm__ __volatile__ (
7f88f624
VZ
4921/* pre "movl row, %%edi \n\t" */
4922 /* get # of bytes to alignment */
c6b71bff
GD
4923#ifdef __PIC__
4924 "pushl %%ebx \n\t"
4925#endif
4926 "movl %%edi, %%ecx \n\t"
4927 "xorl %%ebx, %%ebx \n\t"
4928 "addl $0x7, %%ecx \n\t"
4929 "xorl %%eax, %%eax \n\t"
4930 "andl $0xfffffff8, %%ecx \n\t"
7f88f624 4931/* pre "movl prev_row, %%esi \n\t" */
c6b71bff
GD
4932 "subl %%edi, %%ecx \n\t"
4933 "jz up_go \n\t"
4934
7f88f624 4935 "up_lp1: \n\t" /* fix alignment */
c6b71bff
GD
4936 "movb (%%edi,%%ebx,), %%al \n\t"
4937 "addb (%%esi,%%ebx,), %%al \n\t"
4938 "incl %%ebx \n\t"
4939 "cmpl %%ecx, %%ebx \n\t"
7f88f624
VZ
4940 "movb %%al, -1(%%edi,%%ebx,) \n\t" /* mov does not affect flags; -1 to */
4941 "jb up_lp1 \n\t" /* offset incl ebx */
c6b71bff
GD
4942
4943 "up_go: \n\t"
7f88f624 4944/* pre "movl len, %%edx \n\t" */
c6b71bff 4945 "movl %%edx, %%ecx \n\t"
7f88f624
VZ
4946 "subl %%ebx, %%edx \n\t" /* subtract alignment fix */
4947 "andl $0x0000003f, %%edx \n\t" /* calc bytes over mult of 64 */
4948 "subl %%edx, %%ecx \n\t" /* drop over bytes from length */
c6b71bff 4949
7f88f624
VZ
4950 /* unrolled loop - use all MMX registers and interleave to reduce */
4951 /* number of branch instructions (loops) and reduce partial stalls */
c6b71bff
GD
4952 "up_loop: \n\t"
4953 "movq (%%esi,%%ebx,), %%mm1 \n\t"
4954 "movq (%%edi,%%ebx,), %%mm0 \n\t"
4955 "movq 8(%%esi,%%ebx,), %%mm3 \n\t"
4956 "paddb %%mm1, %%mm0 \n\t"
4957 "movq 8(%%edi,%%ebx,), %%mm2 \n\t"
4958 "movq %%mm0, (%%edi,%%ebx,) \n\t"
4959 "paddb %%mm3, %%mm2 \n\t"
4960 "movq 16(%%esi,%%ebx,), %%mm5 \n\t"
4961 "movq %%mm2, 8(%%edi,%%ebx,) \n\t"
4962 "movq 16(%%edi,%%ebx,), %%mm4 \n\t"
4963 "movq 24(%%esi,%%ebx,), %%mm7 \n\t"
4964 "paddb %%mm5, %%mm4 \n\t"
4965 "movq 24(%%edi,%%ebx,), %%mm6 \n\t"
4966 "movq %%mm4, 16(%%edi,%%ebx,) \n\t"
4967 "paddb %%mm7, %%mm6 \n\t"
4968 "movq 32(%%esi,%%ebx,), %%mm1 \n\t"
4969 "movq %%mm6, 24(%%edi,%%ebx,) \n\t"
4970 "movq 32(%%edi,%%ebx,), %%mm0 \n\t"
4971 "movq 40(%%esi,%%ebx,), %%mm3 \n\t"
4972 "paddb %%mm1, %%mm0 \n\t"
4973 "movq 40(%%edi,%%ebx,), %%mm2 \n\t"
4974 "movq %%mm0, 32(%%edi,%%ebx,) \n\t"
4975 "paddb %%mm3, %%mm2 \n\t"
4976 "movq 48(%%esi,%%ebx,), %%mm5 \n\t"
4977 "movq %%mm2, 40(%%edi,%%ebx,) \n\t"
4978 "movq 48(%%edi,%%ebx,), %%mm4 \n\t"
4979 "movq 56(%%esi,%%ebx,), %%mm7 \n\t"
4980 "paddb %%mm5, %%mm4 \n\t"
4981 "movq 56(%%edi,%%ebx,), %%mm6 \n\t"
4982 "movq %%mm4, 48(%%edi,%%ebx,) \n\t"
4983 "addl $64, %%ebx \n\t"
4984 "paddb %%mm7, %%mm6 \n\t"
4985 "cmpl %%ecx, %%ebx \n\t"
7f88f624
VZ
4986 "movq %%mm6, -8(%%edi,%%ebx,) \n\t" /* (+56)movq does not affect flags; */
4987 "jb up_loop \n\t" /* -8 to offset addl ebx */
c6b71bff 4988
7f88f624 4989 "cmpl $0, %%edx \n\t" /* test for bytes over mult of 64 */
c6b71bff
GD
4990 "jz up_end \n\t"
4991
7f88f624
VZ
4992 "cmpl $8, %%edx \n\t" /* test for less than 8 bytes */
4993 "jb up_lt8 \n\t" /* [added by lcreeve@netins.net] */
c6b71bff
GD
4994
4995 "addl %%edx, %%ecx \n\t"
7f88f624
VZ
4996 "andl $0x00000007, %%edx \n\t" /* calc bytes over mult of 8 */
4997 "subl %%edx, %%ecx \n\t" /* drop over bytes from length */
c6b71bff
GD
4998 "jz up_lt8 \n\t"
4999
7f88f624 5000 "up_lpA: \n\t" /* use MMX regs to update 8 bytes sim. */
c6b71bff
GD
5001 "movq (%%esi,%%ebx,), %%mm1 \n\t"
5002 "movq (%%edi,%%ebx,), %%mm0 \n\t"
5003 "addl $8, %%ebx \n\t"
5004 "paddb %%mm1, %%mm0 \n\t"
5005 "cmpl %%ecx, %%ebx \n\t"
7f88f624
VZ
5006 "movq %%mm0, -8(%%edi,%%ebx,) \n\t" /* movq does not affect flags; -8 to */
5007 "jb up_lpA \n\t" /* offset add ebx */
5008 "cmpl $0, %%edx \n\t" /* test for bytes over mult of 8 */
c6b71bff
GD
5009 "jz up_end \n\t"
5010
5011 "up_lt8: \n\t"
5012 "xorl %%eax, %%eax \n\t"
7f88f624 5013 "addl %%edx, %%ecx \n\t" /* move over byte count into counter */
c6b71bff 5014
7f88f624 5015 "up_lp2: \n\t" /* use x86 regs for remaining bytes */
c6b71bff
GD
5016 "movb (%%edi,%%ebx,), %%al \n\t"
5017 "addb (%%esi,%%ebx,), %%al \n\t"
5018 "incl %%ebx \n\t"
5019 "cmpl %%ecx, %%ebx \n\t"
7f88f624
VZ
5020 "movb %%al, -1(%%edi,%%ebx,) \n\t" /* mov does not affect flags; -1 to */
5021 "jb up_lp2 \n\t" /* offset inc ebx */
c6b71bff
GD
5022
5023 "up_end: \n\t"
7f88f624 5024 "EMMS \n\t" /* conversion of filtered row complete */
c6b71bff
GD
5025#ifdef __PIC__
5026 "popl %%ebx \n\t"
5027#endif
5028
7f88f624
VZ
5029 : "=d" (dummy_value_d), /* 0 // output regs (dummy) */
5030 "=S" (dummy_value_S), /* 1 */
5031 "=D" (dummy_value_D) /* 2 */
c6b71bff 5032
7f88f624
VZ
5033 : "0" (len), /* edx // input regs */
5034 "1" (prev_row), /* esi */
5035 "2" (row) /* edi */
c6b71bff 5036
7f88f624 5037 : "%eax", "%ebx", "%ecx" /* clobber list (no input regs!) */
c6b71bff
GD
5038
5039#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
5040 , "%mm0", "%mm1", "%mm2", "%mm3"
5041 , "%mm4", "%mm5", "%mm6", "%mm7"
5042#endif
5043 );
5044
7f88f624 5045} /* end of png_read_filter_row_mmx_up() */
c6b71bff
GD
5046
5047#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
5048
5049
5050
5051
5052/*===========================================================================*/
5053/* */
5054/* P N G _ R E A D _ F I L T E R _ R O W */
5055/* */
5056/*===========================================================================*/
5057
5058
5059/* Optimized png_read_filter_row routines */
5060
5061void /* PRIVATE */
5062png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep
5063 row, png_bytep prev_row, int filter)
5064{
5065#ifdef PNG_DEBUG
5066 char filnm[10];
5067#endif
5068
5069#if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
5070/* GRR: these are superseded by png_ptr->asm_flags: */
7f88f624
VZ
5071#define UseMMX_sub 1 /* GRR: converted 20000730 */
5072#define UseMMX_up 1 /* GRR: converted 20000729 */
5073#define UseMMX_avg 1 /* GRR: converted 20000828 (+ 16-bit bugfix 20000916) */
5074#define UseMMX_paeth 1 /* GRR: converted 20000828 */
c6b71bff
GD
5075
5076 if (_mmx_supported == 2) {
5077 /* this should have happened in png_init_mmx_flags() already */
2b5f62a0 5078#if !defined(PNG_1_0_X)
c6b71bff 5079 png_warning(png_ptr, "asm_flags may not have been initialized");
2b5f62a0 5080#endif
c6b71bff
GD
5081 png_mmx_support();
5082 }
5083#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
5084
5085#ifdef PNG_DEBUG
5086 png_debug(1, "in png_read_filter_row (pnggccrd.c)\n");
5087 switch (filter)
5088 {
5089 case 0: sprintf(filnm, "none");
5090 break;
5091 case 1: sprintf(filnm, "sub-%s",
5092#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
5093#if !defined(PNG_1_0_X)
5094 (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB)? "MMX" :
5095#endif
5096#endif
5097"x86");
5098 break;
5099 case 2: sprintf(filnm, "up-%s",
5100#ifdef PNG_ASSEMBLER_CODE_SUPPORTED
5101#if !defined(PNG_1_0_X)
5102 (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP)? "MMX" :
5103#endif
5104#endif
5105 "x86");
5106 break;
5107 case 3: sprintf(filnm, "avg-%s",
5108#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
5109#if !defined(PNG_1_0_X)
5110 (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG)? "MMX" :
5111#endif
5112#endif
5113 "x86");
5114 break;
5115 case 4: sprintf(filnm, "Paeth-%s",
5116#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
5117#if !defined(PNG_1_0_X)
5118 (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH)? "MMX":
5119#endif
5120#endif
5121"x86");
5122 break;
5123 default: sprintf(filnm, "unknw");
5124 break;
5125 }
5126 png_debug2(0, "row_number=%5ld, %5s, ", png_ptr->row_number, filnm);
5127 png_debug1(0, "row=0x%08lx, ", (unsigned long)row);
5128 png_debug2(0, "pixdepth=%2d, bytes=%d, ", (int)row_info->pixel_depth,
5129 (int)((row_info->pixel_depth + 7) >> 3));
5130 png_debug1(0,"rowbytes=%8ld\n", row_info->rowbytes);
5131#endif /* PNG_DEBUG */
5132
5133 switch (filter)
5134 {
5135 case PNG_FILTER_VALUE_NONE:
5136 break;
5137
5138 case PNG_FILTER_VALUE_SUB:
5139#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
5140#if !defined(PNG_1_0_X)
5141 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB) &&
5142 (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
5143 (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
5144#else
5145 if (_mmx_supported)
5146#endif
5147 {
5148 png_read_filter_row_mmx_sub(row_info, row);
5149 }
5150 else
5151#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
5152 {
5153 png_uint_32 i;
5154 png_uint_32 istop = row_info->rowbytes;
5155 png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
5156 png_bytep rp = row + bpp;
5157 png_bytep lp = row;
5158
5159 for (i = bpp; i < istop; i++)
5160 {
5161 *rp = (png_byte)(((int)(*rp) + (int)(*lp++)) & 0xff);
5162 rp++;
5163 }
5164 } /* end !UseMMX_sub */
5165 break;
5166
5167 case PNG_FILTER_VALUE_UP:
5168#if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
5169#if !defined(PNG_1_0_X)
5170 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP) &&
5171 (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
5172 (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
5173#else
5174 if (_mmx_supported)
5175#endif
5176 {
5177 png_read_filter_row_mmx_up(row_info, row, prev_row);
5178 }
5179 else
5180#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
5181 {
5182 png_uint_32 i;
5183 png_uint_32 istop = row_info->rowbytes;
5184 png_bytep rp = row;
5185 png_bytep pp = prev_row;
5186
5187 for (i = 0; i < istop; ++i)
5188 {
5189 *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
5190 rp++;
5191 }
5192 } /* end !UseMMX_up */
5193 break;
5194
5195 case PNG_FILTER_VALUE_AVG:
5196#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
5197#if !defined(PNG_1_0_X)
5198 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG) &&
5199 (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
5200 (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
5201#else
5202 if (_mmx_supported)
5203#endif
5204 {
5205 png_read_filter_row_mmx_avg(row_info, row, prev_row);
5206 }
5207 else
5208#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
5209 {
5210 png_uint_32 i;
5211 png_bytep rp = row;
5212 png_bytep pp = prev_row;
5213 png_bytep lp = row;
5214 png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
5215 png_uint_32 istop = row_info->rowbytes - bpp;
5216
5217 for (i = 0; i < bpp; i++)
5218 {
5219 *rp = (png_byte)(((int)(*rp) +
5220 ((int)(*pp++) >> 1)) & 0xff);
5221 rp++;
5222 }
5223
5224 for (i = 0; i < istop; i++)
5225 {
5226 *rp = (png_byte)(((int)(*rp) +
5227 ((int)(*pp++ + *lp++) >> 1)) & 0xff);
5228 rp++;
5229 }
5230 } /* end !UseMMX_avg */
5231 break;
5232
5233 case PNG_FILTER_VALUE_PAETH:
5234#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
5235#if !defined(PNG_1_0_X)
5236 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH) &&
5237 (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
5238 (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
5239#else
5240 if (_mmx_supported)
5241#endif
5242 {
5243 png_read_filter_row_mmx_paeth(row_info, row, prev_row);
5244 }
5245 else
5246#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
5247 {
5248 png_uint_32 i;
5249 png_bytep rp = row;
5250 png_bytep pp = prev_row;
5251 png_bytep lp = row;
5252 png_bytep cp = prev_row;
5253 png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
5254 png_uint_32 istop = row_info->rowbytes - bpp;
5255
5256 for (i = 0; i < bpp; i++)
5257 {
5258 *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
5259 rp++;
5260 }
5261
5262 for (i = 0; i < istop; i++) /* use leftover rp,pp */
5263 {
5264 int a, b, c, pa, pb, pc, p;
5265
5266 a = *lp++;
5267 b = *pp++;
5268 c = *cp++;
5269
5270 p = b - c;
5271 pc = a - c;
5272
5273#ifdef PNG_USE_ABS
5274 pa = abs(p);
5275 pb = abs(pc);
5276 pc = abs(p + pc);
5277#else
5278 pa = p < 0 ? -p : p;
5279 pb = pc < 0 ? -pc : pc;
5280 pc = (p + pc) < 0 ? -(p + pc) : p + pc;
5281#endif
5282
5283 /*
5284 if (pa <= pb && pa <= pc)
5285 p = a;
5286 else if (pb <= pc)
5287 p = b;
5288 else
5289 p = c;
5290 */
5291
5292 p = (pa <= pb && pa <= pc) ? a : (pb <= pc) ? b : c;
5293
5294 *rp = (png_byte)(((int)(*rp) + p) & 0xff);
5295 rp++;
5296 }
5297 } /* end !UseMMX_paeth */
5298 break;
5299
5300 default:
5301 png_warning(png_ptr, "Ignoring bad row-filter type");
5302 *row=0;
5303 break;
5304 }
5305}
5306
5307#endif /* PNG_HAVE_ASSEMBLER_READ_FILTER_ROW */
5308
5309
5310/*===========================================================================*/
5311/* */
5312/* P N G _ M M X _ S U P P O R T */
5313/* */
5314/*===========================================================================*/
5315
5316/* GRR NOTES: (1) the following code assumes 386 or better (pushfl/popfl)
5317 * (2) all instructions compile with gcc 2.7.2.3 and later
5318 * (3) the function is moved down here to prevent gcc from
5319 * inlining it in multiple places and then barfing be-
5320 * cause the ".NOT_SUPPORTED" label is multiply defined
5321 * [is there a way to signal that a *single* function should
5322 * not be inlined? is there a way to modify the label for
5323 * each inlined instance, e.g., by appending _1, _2, etc.?
5324 * maybe if don't use leading "." in label name? (nope...sigh)]
5325 */
5326
5327int PNGAPI
5328png_mmx_support(void)
5329{
5330#if defined(PNG_MMX_CODE_SUPPORTED)
5331 __asm__ __volatile__ (
7f88f624
VZ
5332 "pushl %%ebx \n\t" /* ebx gets clobbered by CPUID instruction */
5333 "pushl %%ecx \n\t" /* so does ecx... */
5334 "pushl %%edx \n\t" /* ...and edx (but ecx & edx safe on Linux) */
5335/* ".byte 0x66 \n\t" // convert 16-bit pushf to 32-bit pushfd */
5336/* "pushf \n\t" // 16-bit pushf */
5337 "pushfl \n\t" /* save Eflag to stack */
5338 "popl %%eax \n\t" /* get Eflag from stack into eax */
5339 "movl %%eax, %%ecx \n\t" /* make another copy of Eflag in ecx */
5340 "xorl $0x200000, %%eax \n\t" /* toggle ID bit in Eflag (i.e., bit 21) */
5341 "pushl %%eax \n\t" /* save modified Eflag back to stack */
5342/* ".byte 0x66 \n\t" // convert 16-bit popf to 32-bit popfd */
5343/* "popf \n\t" // 16-bit popf */
5344 "popfl \n\t" /* restore modified value to Eflag reg */
5345 "pushfl \n\t" /* save Eflag to stack */
5346 "popl %%eax \n\t" /* get Eflag from stack */
5347 "pushl %%ecx \n\t" /* save original Eflag to stack */
5348 "popfl \n\t" /* restore original Eflag */
5349 "xorl %%ecx, %%eax \n\t" /* compare new Eflag with original Eflag */
5350 "jz 0f \n\t" /* if same, CPUID instr. is not supported */
5351
5352 "xorl %%eax, %%eax \n\t" /* set eax to zero */
5353/* ".byte 0x0f, 0xa2 \n\t" // CPUID instruction (two-byte opcode) */
5354 "cpuid \n\t" /* get the CPU identification info */
5355 "cmpl $1, %%eax \n\t" /* make sure eax return non-zero value */
5356 "jl 0f \n\t" /* if eax is zero, MMX is not supported */
5357
5358 "xorl %%eax, %%eax \n\t" /* set eax to zero and... */
5359 "incl %%eax \n\t" /* ...increment eax to 1. This pair is */
5360 /* faster than the instruction "mov eax, 1" */
5361 "cpuid \n\t" /* get the CPU identification info again */
5362 "andl $0x800000, %%edx \n\t" /* mask out all bits but MMX bit (23) */
5363 "cmpl $0, %%edx \n\t" /* 0 = MMX not supported */
5364 "jz 0f \n\t" /* non-zero = yes, MMX IS supported */
5365
5366 "movl $1, %%eax \n\t" /* set return value to 1 */
5367 "jmp 1f \n\t" /* DONE: have MMX support */
5368
5369 "0: \n\t" /* .NOT_SUPPORTED: target label for jump instructions */
5370 "movl $0, %%eax \n\t" /* set return value to 0 */
5371 "1: \n\t" /* .RETURN: target label for jump instructions */
5372 "movl %%eax, _mmx_supported \n\t" /* save in global static variable, too */
5373 "popl %%edx \n\t" /* restore edx */
5374 "popl %%ecx \n\t" /* restore ecx */
5375 "popl %%ebx \n\t" /* restore ebx */
5376
5377/* "ret \n\t" // DONE: no MMX support */
5378 /* (fall through to standard C "ret") */
5379
5380 : /* output list (none) */
5381
5382 : /* any variables used on input (none) */
5383
5384 : "%eax" /* clobber list */
5385/* , "%ebx", "%ecx", "%edx" // GRR: we handle these manually */
5386/* , "memory" // if write to a variable gcc thought was in a reg */
5387/* , "cc" // "condition codes" (flag bits) */
c6b71bff
GD
5388 );
5389#else
5390 _mmx_supported = 0;
5391#endif /* PNG_MMX_CODE_SUPPORTED */
5392
5393 return _mmx_supported;
5394}
5395
5396
5397#endif /* PNG_USE_PNGGCCRD */