src/png/pnggccrd.c

   1 /* pnggccrd.c - mixed C/assembler version of utilities to read a PNG file
   2  *
   3  * For Intel x86 CPU (Pentium-MMX or later) and GNU C compiler.
   4  *
   5  *     See http://www.intel.com/drg/pentiumII/appnotes/916/916.htm
   6  *     and http://www.intel.com/drg/pentiumII/appnotes/923/923.htm
   7  *     for Intel's performance analysis of the MMX vs. non-MMX code.
   8  *
   9  * libpng version 1.2.5rc3 - September 18, 2002
  10  * For conditions of distribution and use, see copyright notice in png.h
  11  * Copyright (c) 1998-2002 Glenn Randers-Pehrson
  12  * Copyright (c) 1998, Intel Corporation
  13  *
  14  * Based on MSVC code contributed by Nirav Chhatrapati, Intel Corp., 1998.
  15  * Interface to libpng contributed by Gilles Vollant, 1999.
  16  * GNU C port by Greg Roelofs, 1999-2001.
  17  *
  18  * Lines 2350-4300 converted in place with intel2gas 1.3.1:
  19  *
  20  *   intel2gas -mdI pnggccrd.c.partially-msvc -o pnggccrd.c
  21  *
  22  * and then cleaned up by hand.  See http://hermes.terminal.at/intel2gas/ .
  23  *
  24  * NOTE:  A sufficiently recent version of GNU as (or as.exe under DOS/Windows)
  25  *        is required to assemble the newer MMX instructions such as movq.
  26  *        For djgpp, see
  27  *
  28  *           ftp://ftp.simtel.net/pub/simtelnet/gnu/djgpp/v2gnu/bnu281b.zip
  29  *
  30  *        (or a later version in the same directory).  For Linux, check your
  31  *        distribution's web site(s) or try these links:
  32  *
  33  *           http://rufus.w3.org/linux/RPM/binutils.html
  34  *           http://www.debian.org/Packages/stable/devel/binutils.html
  35  *           ftp://ftp.slackware.com/pub/linux/slackware/slackware/slakware/d1/
  36  *             binutils.tgz
  37  *
  38  *        For other platforms, see the main GNU site:
  39  *
  40  *           ftp://ftp.gnu.org/pub/gnu/binutils/
  41  *
  42  *        Version 2.5.2l.15 is definitely too old...
  43  */
  44
  45 /*
  46  * TEMPORARY PORTING NOTES AND CHANGELOG (mostly by Greg Roelofs)
  47  * =====================================
  48  *
  49  * 19991006:
  50  *  - fixed sign error in post-MMX cleanup code (16- & 32-bit cases)
  51  *
  52  * 19991007:
  53  *  - additional optimizations (possible or definite):
  54  *     x [DONE] write MMX code for 64-bit case (pixel_bytes == 8) [not tested]
  55  *     - write MMX code for 48-bit case (pixel_bytes == 6)
  56  *     - figure out what's up with 24-bit case (pixel_bytes == 3):
  57  *        why subtract 8 from width_mmx in the pass 4/5 case?
  58  *        (only width_mmx case) (near line 1606)
  59  *     x [DONE] replace pixel_bytes within each block with the true
  60  *        constant value (or are compilers smart enough to do that?)
  61  *     - rewrite all MMX interlacing code so it's aligned with
  62  *        the *beginning* of the row buffer, not the end.  This
  63  *        would not only allow one to eliminate half of the memory
  64  *        writes for odd passes (that is, pass == odd), it may also
  65  *        eliminate some unaligned-data-access exceptions (assuming
  66  *        there's a penalty for not aligning 64-bit accesses on
  67  *        64-bit boundaries).  The only catch is that the "leftover"
  68  *        pixel(s) at the end of the row would have to be saved,
  69  *        but there are enough unused MMX registers in every case,
  70  *        so this is not a problem.  A further benefit is that the
  71  *        post-MMX cleanup code (C code) in at least some of the
  72  *        cases could be done within the assembler block.
  73  *  x [DONE] the "v3 v2 v1 v0 v7 v6 v5 v4" comments are confusing,
  74  *     inconsistent, and don't match the MMX Programmer's Reference
  75  *     Manual conventions anyway.  They should be changed to
  76  *     "b7 b6 b5 b4 b3 b2 b1 b0," where b0 indicates the byte that
  77  *     was lowest in memory (e.g., corresponding to a left pixel)
  78  *     and b7 is the byte that was highest (e.g., a right pixel).
  79  *
  80  * 19991016:
  81  *  - Brennan's Guide notwithstanding, gcc under Linux does *not*
  82  *     want globals prefixed by underscores when referencing them--
  83  *     i.e., if the variable is const4, then refer to it as const4,
  84  *     not _const4.  This seems to be a djgpp-specific requirement.
  85  *     Also, such variables apparently *must* be declared outside
  86  *     of functions; neither static nor automatic variables work if
  87  *     defined within the scope of a single function, but both
  88  *     static and truly global (multi-module) variables work fine.
  89  *
  90  * 19991023:
  91  *  - fixed png_combine_row() non-MMX replication bug (odd passes only?)
  92  *  - switched from string-concatenation-with-macros to cleaner method of
  93  *     renaming global variables for djgpp--i.e., always use prefixes in
  94  *     inlined assembler code (== strings) and conditionally rename the
  95  *     variables, not the other way around.  Hence _const4, _mask8_0, etc.
  96  *
  97  * 19991024:
  98  *  - fixed mmxsupport()/png_do_read_interlace() first-row bug
  99  *     This one was severely weird:  even though mmxsupport() doesn't touch
 100  *     ebx (where "row" pointer was stored), it nevertheless managed to zero
 101  *     the register (even in static/non-fPIC code--see below), which in turn
 102  *     caused png_do_read_interlace() to return prematurely on the first row of
 103  *     interlaced images (i.e., without expanding the interlaced pixels).
 104  *     Inspection of the generated assembly code didn't turn up any clues,
 105  *     although it did point at a minor optimization (i.e., get rid of
 106  *     mmx_supported_local variable and just use eax).  Possibly the CPUID
 107  *     instruction is more destructive than it looks?  (Not yet checked.)
 108  *  - "info gcc" was next to useless, so compared fPIC and non-fPIC assembly
 109  *     listings...  Apparently register spillage has to do with ebx, since
 110  *     it's used to index the global offset table.  Commenting it out of the
 111  *     input-reg lists in png_combine_row() eliminated compiler barfage, so
 112  *     ifdef'd with __PIC__ macro:  if defined, use a global for unmask
 113  *
 114  * 19991107:
 115  *  - verified CPUID clobberage:  12-char string constant ("GenuineIntel",
 116  *     "AuthenticAMD", etc.) placed in ebx:ecx:edx.  Still need to polish.
 117  *
 118  * 19991120:
 119  *  - made "diff" variable (now "_dif") global to simplify conversion of
 120  *     filtering routines (running out of regs, sigh).  "diff" is still used
 121  *     in interlacing routines, however.
 122  *  - fixed up both versions of mmxsupport() (ORIG_THAT_USED_TO_CLOBBER_EBX
 123  *     macro determines which is used); original not yet tested.
 124  *
 125  * 20000213:
 126  *  - when compiling with gcc, be sure to use  -fomit-frame-pointer
 127  *
 128  * 20000319:
 129  *  - fixed a register-name typo in png_do_read_interlace(), default (MMX) case,
 130  *     pass == 4 or 5, that caused visible corruption of interlaced images
 131  *
 132  * 20000623:
 133  *  - Various problems were reported with gcc 2.95.2 in the Cygwin environment,
 134  *     many of the form "forbidden register 0 (ax) was spilled for class AREG."
 135  *     This is explained at http://gcc.gnu.org/fom_serv/cache/23.html, and
 136  *     Chuck Wilson supplied a patch involving dummy output registers.  See
 137  *     http://sourceforge.net/bugs/?func=detailbug&bug_id=108741&group_id=5624
 138  *     for the original (anonymous) SourceForge bug report.
 139  *
 140  * 20000706:
 141  *  - Chuck Wilson passed along these remaining gcc 2.95.2 errors:
 142  *       pnggccrd.c: In function `png_combine_row':
 143  *       pnggccrd.c:525: more than 10 operands in `asm'
 144  *       pnggccrd.c:669: more than 10 operands in `asm'
 145  *       pnggccrd.c:828: more than 10 operands in `asm'
 146  *       pnggccrd.c:994: more than 10 operands in `asm'
 147  *       pnggccrd.c:1177: more than 10 operands in `asm'
 148  *     They are all the same problem and can be worked around by using the
 149  *     global _unmask variable unconditionally, not just in the -fPIC case.
 150  *     Reportedly earlier versions of gcc also have the problem with more than
 151  *     10 operands; they just don't report it.  Much strangeness ensues, etc.
 152  *
 153  * 20000729:
 154  *  - enabled png_read_filter_row_mmx_up() (shortest remaining unconverted
 155  *     MMX routine); began converting png_read_filter_row_mmx_sub()
 156  *  - to finish remaining sections:
 157  *     - clean up indentation and comments
 158  *     - preload local variables
 159  *     - add output and input regs (order of former determines numerical
 160  *        mapping of latter)
 161  *     - avoid all usage of ebx (including bx, bh, bl) register [20000823]
 162  *     - remove "$" from addressing of Shift and Mask variables [20000823]
 163  *
 164  * 20000731:
 165  *  - global union vars causing segfaults in png_read_filter_row_mmx_sub()?
 166  *
 167  * 20000822:
 168  *  - ARGH, stupid png_read_filter_row_mmx_sub() segfault only happens with
 169  *     shared-library (-fPIC) version!  Code works just fine as part of static
 170  *     library.  Damn damn damn damn damn, should have tested that sooner.
 171  *     ebx is getting clobbered again (explicitly this time); need to save it
 172  *     on stack or rewrite asm code to avoid using it altogether.  Blargh!
 173  *
 174  * 20000823:
 175  *  - first section was trickiest; all remaining sections have ebx -> edx now.
 176  *     (-fPIC works again.)  Also added missing underscores to various Shift*
 177  *     and *Mask* globals and got rid of leading "$" signs.
 178  *
 179  * 20000826:
 180  *  - added visual separators to help navigate microscopic printed copies
 181  *     (http://pobox.com/~newt/code/gpr-latest.zip, mode 10); started working
 182  *     on png_read_filter_row_mmx_avg()
 183  *
 184  * 20000828:
 185  *  - finished png_read_filter_row_mmx_avg():  only Paeth left! (930 lines...)
 186  *     What the hell, did png_read_filter_row_mmx_paeth(), too.  Comments not
 187  *     cleaned up/shortened in either routine, but functionality is complete
 188  *     and seems to be working fine.
 189  *
 190  * 20000829:
 191  *  - ahhh, figured out last(?) bit of gcc/gas asm-fu:  if register is listed
 192  *     as an input reg (with dummy output variables, etc.), then it *cannot*
 193  *     also appear in the clobber list or gcc 2.95.2 will barf.  The solution
 194  *     is simple enough...
 195  *
 196  * 20000914:
 197  *  - bug in png_read_filter_row_mmx_avg():  16-bit grayscale not handled
 198  *     correctly (but 48-bit RGB just fine)
 199  *
 200  * 20000916:
 201  *  - fixed bug in png_read_filter_row_mmx_avg(), bpp == 2 case; three errors:
 202  *     - "_ShiftBpp.use = 24;"      should have been   "_ShiftBpp.use = 16;"
 203  *     - "_ShiftRem.use = 40;"      should have been   "_ShiftRem.use = 48;"
 204  *     - "psllq _ShiftRem, %%mm2"   should have been   "psrlq _ShiftRem, %%mm2"
 205  *
 206  * 20010101:
 207  *  - added new png_init_mmx_flags() function (here only because it needs to
 208  *     call mmxsupport(), which should probably become global png_mmxsupport());
 209  *     modified other MMX routines to run conditionally (png_ptr->asm_flags)
 210  *
 211  * 20010103:
 212  *  - renamed mmxsupport() to png_mmx_support(), with auto-set of mmx_supported,
 213  *     and made it public; moved png_init_mmx_flags() to png.c as internal func
 214  *
 215  * 20010104:
 216  *  - removed dependency on png_read_filter_row_c() (C code already duplicated
 217  *     within MMX version of png_read_filter_row()) so no longer necessary to
 218  *     compile it into pngrutil.o
 219  *
 220  * 20010310:
 221  *  - fixed buffer-overrun bug in png_combine_row() C code (non-MMX)
 222  *
 223  * 20020304:
 224  *  - eliminated incorrect use of width_mmx in pixel_bytes == 8 case
 225  *
 226  * STILL TO DO:
 227  *     - test png_do_read_interlace() 64-bit case (pixel_bytes == 8)
 228  *     - write MMX code for 48-bit case (pixel_bytes == 6)
 229  *     - figure out what's up with 24-bit case (pixel_bytes == 3):
 230  *        why subtract 8 from width_mmx in the pass 4/5 case?
 231  *        (only width_mmx case) (near line 1606)
 232  *     - rewrite all MMX interlacing code so it's aligned with beginning
 233  *        of the row buffer, not the end (see 19991007 for details)
 234  *     x pick one version of mmxsupport() and get rid of the other
 235  *     - add error messages to any remaining bogus default cases
 236  *     - enable pixel_depth == 8 cases in png_read_filter_row()? (test speed)
 237  *     x add support for runtime enable/disable/query of various MMX routines
 238  */
 239
 240 #define PNG_INTERNAL
 241 #include "png.h"
 242
 243 #if defined(PNG_USE_PNGGCCRD)
 244
  245 int PNGAPI png_mmx_support(void);
      /* ^ Forward declaration only.  png_combine_row() below calls it when
       *   _mmx_supported still holds its initial value of 2. */
  246
  247 #ifdef PNG_USE_LOCAL_ARRAYS
      /* Per-pass lookup tables, indexed by png_ptr->pass (seven entries each).
       * The non-MMX 8-bit path of png_combine_row() reads them as:
       *   png_pass_start - offset of the first run copied in a row
       *   png_pass_inc   - step from the start of one run to the next
       *   png_pass_width - number of bytes copied per run
       * Comments in that routine attribute the same values to tables in png.c;
       * these file-local copies exist only when PNG_USE_LOCAL_ARRAYS is defined.
       */
  248 static const int FARDATA png_pass_start[7] = {0, 4, 0, 2, 0, 1, 0};
  249 static const int FARDATA png_pass_inc[7]   = {8, 8, 4, 4, 2, 2, 1};
  250 static const int FARDATA png_pass_width[7] = {8, 4, 4, 2, 2, 1, 1};
  251 #endif
 252
 253 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
  254 /* djgpp, Win32, and Cygwin add their own underscores to global variables,
  255  * so define them without: */
      /* The inline assembly strings in this file always spell these symbols
       * with a leading underscore (e.g. "movd _unmask, %%mm7"), and macros are
       * not expanded inside string literals.  On the platforms listed above,
       * each C identifier is therefore renamed to its unprefixed form, so that
       * the underscore those toolchains add produces the name the assembly
       * text expects. */
  256 #if defined(__DJGPP__) || defined(WIN32) || defined(__CYGWIN__)
  257 #  define _mmx_supported  mmx_supported
  258 #  define _const4         const4
  259 #  define _const6         const6
  260 #  define _mask8_0        mask8_0
  261 #  define _mask16_1       mask16_1
  262 #  define _mask16_0       mask16_0
  263 #  define _mask24_2       mask24_2
  264 #  define _mask24_1       mask24_1
  265 #  define _mask24_0       mask24_0
  266 #  define _mask32_3       mask32_3
  267 #  define _mask32_2       mask32_2
  268 #  define _mask32_1       mask32_1
  269 #  define _mask32_0       mask32_0
  270 #  define _mask48_5       mask48_5
  271 #  define _mask48_4       mask48_4
  272 #  define _mask48_3       mask48_3
  273 #  define _mask48_2       mask48_2
  274 #  define _mask48_1       mask48_1
  275 #  define _mask48_0       mask48_0
  276 #  define _LBCarryMask    LBCarryMask
  277 #  define _HBClearMask    HBClearMask
  278 #  define _ActiveMask     ActiveMask
  279 #  define _ActiveMask2    ActiveMask2
  280 #  define _ActiveMaskEnd  ActiveMaskEnd
  281 #  define _ShiftBpp       ShiftBpp
  282 #  define _ShiftRem       ShiftRem
      /* The variables renamed below are themselves defined only when
       * PNG_THREAD_UNSAFE_OK is set, so their renames carry the same guard. */
  283 #ifdef PNG_THREAD_UNSAFE_OK
  284 #  define _unmask         unmask
  285 #  define _FullLength     FullLength
  286 #  define _MMXLength      MMXLength
  287 #  define _dif            dif
  288 #  define _patemp         patemp
  289 #  define _pbtemp         pbtemp
  290 #  define _pctemp         pctemp
  291 #endif
  292 #endif
 293
 294
  295 /* These constants are used in the inlined MMX assembly code.
  296    Ignore gcc's "At top level: defined but not used" warnings. */
  297
  298 /* GRR 20000706:  originally _unmask was needed only when compiling with -fPIC,
  299  *  since that case uses the %ebx register for indexing the Global Offset Table
  300  *  and there were no other registers available.  But gcc 2.95 and later emit
  301  *  "more than 10 operands in `asm'" errors when %ebx is used to preload unmask
  302  *  in the non-PIC case, so we'll just use the global unconditionally now.
  303  */
      /* png_combine_row() stores ~mask in _unmask immediately before its asm
       * block, which then loads the value with "movd _unmask, %%mm7". */
  304 #ifdef PNG_THREAD_UNSAFE_OK
  305 static int _unmask;
  306 #endif
  307
      /* Bit-selection patterns loaded with movq by the png_combine_row() asm.
       * In the 8-bit case, _mask8_0 is ANDed with the replicated inverted mask
       * byte and compared against zero to build a per-byte select mask.  Each
       * byte of _mask8_0 carries a different single bit: 0x80 in the least
       * significant (lowest-addressed on x86) byte, up to 0x01 in the most
       * significant byte.
       *
       * The 16/24/32/48 groups hold the same eight bits with each bit repeated
       * 2/3/4/6 times and spread over consecutive quadwords, starting with
       * 0x80 in the low bytes of the _0 constant.  Presumably _maskNN_k is the
       * k-th quadword for NN-bit pixels; only the loads of _mask16_0 and
       * _mask16_1 are visible in this section -- TODO confirm against the
       * remaining asm blocks. */
  308 static unsigned long long _mask8_0  = 0x0102040810204080LL;
  309
  310 static unsigned long long _mask16_1 = 0x0101020204040808LL;
  311 static unsigned long long _mask16_0 = 0x1010202040408080LL;
  312
  313 static unsigned long long _mask24_2 = 0x0101010202020404LL;
  314 static unsigned long long _mask24_1 = 0x0408080810101020LL;
  315 static unsigned long long _mask24_0 = 0x2020404040808080LL;
  316
  317 static unsigned long long _mask32_3 = 0x0101010102020202LL;
  318 static unsigned long long _mask32_2 = 0x0404040408080808LL;
  319 static unsigned long long _mask32_1 = 0x1010101020202020LL;
  320 static unsigned long long _mask32_0 = 0x4040404080808080LL;
  321
  322 static unsigned long long _mask48_5 = 0x0101010101010202LL;
  323 static unsigned long long _mask48_4 = 0x0202020204040404LL;
  324 static unsigned long long _mask48_3 = 0x0404080808080808LL;
  325 static unsigned long long _mask48_2 = 0x1010101010102020LL;
  326 static unsigned long long _mask48_1 = 0x2020202040404040LL;
  327 static unsigned long long _mask48_0 = 0x4040808080808080LL;
  328
      /* _const4 keeps the low three bytes of a quadword and _const6 the low
       * byte.  NOTE(review): the code that uses them is not in this section;
       * presumably it is the asm further down the file -- confirm. */
  329 static unsigned long long _const4   = 0x0000000000FFFFFFLL;
  330 /* static unsigned long long _const5 = 0x000000FFFFFF0000LL; */  /* NOT USED */
  331 static unsigned long long _const6   = 0x00000000000000FFLL;
  332
  333 /* These are used in the row-filter routines and should/would be local */
  334 /*  variables if not for gcc addressing limitations. */
  335 /* WARNING: Their presence probably defeats the thread safety of libpng. */
      /* Only the declarations appear in this section.  They are compiled in
       * only when PNG_THREAD_UNSAFE_OK is defined. */
  336
  337 #ifdef PNG_THREAD_UNSAFE_OK
  338 static png_uint_32  _FullLength;
  339 static png_uint_32  _MMXLength;
  340 static int          _dif;
  341 static int          _patemp; /* temp variables for Paeth routine */
  342 static int          _pbtemp;
  343 static int          _pctemp;
  344 #endif
  345
      /* png_squelch_warnings:  assigns each listed file-scope variable to
       * itself.  No value changes.  Judging by the function's name and the
       * earlier remark about gcc's "defined but not used" warnings, the point
       * is to give the compiler a C-level reference to variables that are
       * otherwise named only inside inline-assembly strings.
       *
       * Takes no arguments and returns nothing.  The first group exists only
       * under PNG_THREAD_UNSAFE_OK, matching the guard on those variables'
       * definitions.
       *
       * NOTE(review): _unmask and _FullLength are not listed.  _unmask is
       * assigned from C in png_combine_row(); confirm that _FullLength is
       * likewise referenced from C elsewhere in the file.
       */
  346 void /* PRIVATE */
  347 png_squelch_warnings(void)
  348 {
  349 #ifdef PNG_THREAD_UNSAFE_OK
         /* row-filter scratch globals */
  350    _dif = _dif;
  351    _patemp = _patemp;
  352    _pbtemp = _pbtemp;
  353    _pctemp = _pctemp;
  354    _MMXLength = _MMXLength;
  355 #endif
         /* 64-bit constants and bit-selection patterns */
  356    _const4  = _const4;
  357    _const6  = _const6;
  358    _mask8_0  = _mask8_0;
  359    _mask16_1 = _mask16_1;
  360    _mask16_0 = _mask16_0;
  361    _mask24_2 = _mask24_2;
  362    _mask24_1 = _mask24_1;
  363    _mask24_0 = _mask24_0;
  364    _mask32_3 = _mask32_3;
  365    _mask32_2 = _mask32_2;
  366    _mask32_1 = _mask32_1;
  367    _mask32_0 = _mask32_0;
  368    _mask48_5 = _mask48_5;
  369    _mask48_4 = _mask48_4;
  370    _mask48_3 = _mask48_3;
  371    _mask48_2 = _mask48_2;
  372    _mask48_1 = _mask48_1;
  373    _mask48_0 = _mask48_0;
  374 }
 375 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
 376
  377
      /* MMX-availability flag.  The initial value 2 acts as "not determined
       * yet":  png_combine_row() tests for it, emits a warning, and calls
       * png_mmx_support().  The changelog at the top of this file states that
       * png_mmx_support() sets this variable; its definition is not in this
       * section.  In PNG_1_0_X builds a nonzero value selects the MMX path in
       * png_combine_row(). */
  378 static int _mmx_supported = 2;
 379
 380 /*===========================================================================*/
 381 /*                                                                           */
 382 /*                       P N G _ C O M B I N E _ R O W                       */
 383 /*                                                                           */
 384 /*===========================================================================*/
 385
 386 #if defined(PNG_HAVE_ASSEMBLER_COMBINE_ROW)
  387
      /* Symbolic names for the bytes-per-pixel values.  NOTE(review): their
       * uses are not visible in this section; presumably they appear in the
       * MMX routines further down the file. */
  388 #define BPP2  2
  389 #define BPP3  3 /* bytes per pixel (a.k.a. pixel_bytes) */
  390 #define BPP4  4
  391 #define BPP6  6 /* (defined only to help avoid cut-and-paste errors) */
  392 #define BPP8  8
 393
 394 /* Combines the row recently read in with the previous row.
 395    This routine takes care of alpha and transparency if requested.
 396    This routine also handles the two methods of progressive display
 397    of interlaced images, depending on the mask value.
 398    The mask value describes which pixels are to be combined with
 399    the row.  The pattern always repeats every 8 pixels, so just 8
 400    bits are needed.  A one indicates the pixel is to be combined; a
 401    zero indicates the pixel is to be skipped.  This is in addition
 402    to any alpha or transparency value associated with the pixel.
 403    If you want all pixels to be combined, pass 0xff (255) in mask. */
 404
 405 /* Use this routine for the x86 platform - it uses a faster MMX routine
 406    if the machine supports MMX. */
 407
 408 void /* PRIVATE */
 409 png_combine_row(png_structp png_ptr, png_bytep row, int mask)
 410 {
 411    png_debug(1, "in png_combine_row (pnggccrd.c)\n");
 412
 413 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
 414    if (_mmx_supported == 2) {
 415        /* this should have happened in png_init_mmx_flags() already */
 416        png_warning(png_ptr, "asm_flags may not have been initialized");
 417        png_mmx_support();
 418    }
 419 #endif
 420
 421    if (mask == 0xff)
 422    {
 423       png_debug(2,"mask == 0xff:  doing single png_memcpy()\n");
 424       png_memcpy(row, png_ptr->row_buf + 1,
 425        (png_size_t)((png_ptr->width * png_ptr->row_info.pixel_depth + 7) >> 3));
 426    }
 427    else   /* (png_combine_row() is never called with mask == 0) */
 428    {
 429       switch (png_ptr->row_info.pixel_depth)
 430       {
 431          case 1:        /* png_ptr->row_info.pixel_depth */
 432          {
 433             png_bytep sp;
 434             png_bytep dp;
 435             int s_inc, s_start, s_end;
 436             int m;
 437             int shift;
 438             png_uint_32 i;
 439
 440             sp = png_ptr->row_buf + 1;
 441             dp = row;
 442             m = 0x80;
 443 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
 444             if (png_ptr->transformations & PNG_PACKSWAP)
 445             {
 446                 s_start = 0;
 447                 s_end = 7;
 448                 s_inc = 1;
 449             }
 450             else
 451 #endif
 452             {
 453                 s_start = 7;
 454                 s_end = 0;
 455                 s_inc = -1;
 456             }
 457
 458             shift = s_start;
 459
 460             for (i = 0; i < png_ptr->width; i++)
 461             {
 462                if (m & mask)
 463                {
 464                   int value;
 465
 466                   value = (*sp >> shift) & 0x1;
 467                   *dp &= (png_byte)((0x7f7f >> (7 - shift)) & 0xff);
 468                   *dp |= (png_byte)(value << shift);
 469                }
 470
 471                if (shift == s_end)
 472                {
 473                   shift = s_start;
 474                   sp++;
 475                   dp++;
 476                }
 477                else
 478                   shift += s_inc;
 479
 480                if (m == 1)
 481                   m = 0x80;
 482                else
 483                   m >>= 1;
 484             }
 485             break;
 486          }
 487
 488          case 2:        /* png_ptr->row_info.pixel_depth */
 489          {
 490             png_bytep sp;
 491             png_bytep dp;
 492             int s_start, s_end, s_inc;
 493             int m;
 494             int shift;
 495             png_uint_32 i;
 496             int value;
 497
 498             sp = png_ptr->row_buf + 1;
 499             dp = row;
 500             m = 0x80;
 501 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
 502             if (png_ptr->transformations & PNG_PACKSWAP)
 503             {
 504                s_start = 0;
 505                s_end = 6;
 506                s_inc = 2;
 507             }
 508             else
 509 #endif
 510             {
 511                s_start = 6;
 512                s_end = 0;
 513                s_inc = -2;
 514             }
 515
 516             shift = s_start;
 517
 518             for (i = 0; i < png_ptr->width; i++)
 519             {
 520                if (m & mask)
 521                {
 522                   value = (*sp >> shift) & 0x3;
 523                   *dp &= (png_byte)((0x3f3f >> (6 - shift)) & 0xff);
 524                   *dp |= (png_byte)(value << shift);
 525                }
 526
 527                if (shift == s_end)
 528                {
 529                   shift = s_start;
 530                   sp++;
 531                   dp++;
 532                }
 533                else
 534                   shift += s_inc;
 535                if (m == 1)
 536                   m = 0x80;
 537                else
 538                   m >>= 1;
 539             }
 540             break;
 541          }
 542
 543          case 4:        /* png_ptr->row_info.pixel_depth */
 544          {
 545             png_bytep sp;
 546             png_bytep dp;
 547             int s_start, s_end, s_inc;
 548             int m;
 549             int shift;
 550             png_uint_32 i;
 551             int value;
 552
 553             sp = png_ptr->row_buf + 1;
 554             dp = row;
 555             m = 0x80;
 556 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
 557             if (png_ptr->transformations & PNG_PACKSWAP)
 558             {
 559                s_start = 0;
 560                s_end = 4;
 561                s_inc = 4;
 562             }
 563             else
 564 #endif
 565             {
 566                s_start = 4;
 567                s_end = 0;
 568                s_inc = -4;
 569             }
 570             shift = s_start;
 571
 572             for (i = 0; i < png_ptr->width; i++)
 573             {
 574                if (m & mask)
 575                {
 576                   value = (*sp >> shift) & 0xf;
 577                   *dp &= (png_byte)((0xf0f >> (4 - shift)) & 0xff);
 578                   *dp |= (png_byte)(value << shift);
 579                }
 580
 581                if (shift == s_end)
 582                {
 583                   shift = s_start;
 584                   sp++;
 585                   dp++;
 586                }
 587                else
 588                   shift += s_inc;
 589                if (m == 1)
 590                   m = 0x80;
 591                else
 592                   m >>= 1;
 593             }
 594             break;
 595          }
 596
 597          case 8:        /* png_ptr->row_info.pixel_depth */
 598          {
 599             png_bytep srcptr;
 600             png_bytep dstptr;
 601
 602 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
 603 #if !defined(PNG_1_0_X)
 604             if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
 605                 /* && _mmx_supported */ )
 606 #else
 607             if (_mmx_supported)
 608 #endif
 609             {
 610                png_uint_32 len;
 611                int diff;
 612                int dummy_value_a;   /* fix 'forbidden register spilled' error */
 613                int dummy_value_d;
 614                int dummy_value_c;
 615                int dummy_value_S;
 616                int dummy_value_D;
 617                _unmask = ~mask;            /* global variable for -fPIC version */
 618                srcptr = png_ptr->row_buf + 1;
 619                dstptr = row;
 620                len  = png_ptr->width &~7;  /* reduce to multiple of 8 */
 621                diff = (int) (png_ptr->width & 7);  /* amount lost */
 622
 623                __asm__ __volatile__ (
 624                   "movd      _unmask, %%mm7  \n\t" /* load bit pattern */
 625                   "psubb     %%mm6, %%mm6    \n\t" /* zero mm6 */
 626                   "punpcklbw %%mm7, %%mm7    \n\t"
 627                   "punpcklwd %%mm7, %%mm7    \n\t"
 628                   "punpckldq %%mm7, %%mm7    \n\t" /* fill reg with 8 masks */
 629
 630                   "movq      _mask8_0, %%mm0 \n\t"
 631                   "pand      %%mm7, %%mm0    \n\t" /* nonzero if keep byte */
 632                   "pcmpeqb   %%mm6, %%mm0    \n\t" /* zeros->1s, v versa */
 633
 634 /* preload        "movl      len, %%ecx      \n\t" // load length of line */
 635 /* preload        "movl      srcptr, %%esi   \n\t" // load source */
 636 /* preload        "movl      dstptr, %%edi   \n\t" // load dest */
 637
 638                   "cmpl      $0, %%ecx       \n\t" /* len == 0 ? */
 639                   "je        mainloop8end    \n\t"
 640
 641                 "mainloop8:                  \n\t"
 642                   "movq      (%%esi), %%mm4  \n\t" /* *srcptr */
 643                   "pand      %%mm0, %%mm4    \n\t"
 644                   "movq      %%mm0, %%mm6    \n\t"
 645                   "pandn     (%%edi), %%mm6  \n\t" /* *dstptr */
 646                   "por       %%mm6, %%mm4    \n\t"
 647                   "movq      %%mm4, (%%edi)  \n\t"
 648                   "addl      $8, %%esi       \n\t" /* inc by 8 bytes processed */
 649                   "addl      $8, %%edi       \n\t"
 650                   "subl      $8, %%ecx       \n\t" /* dec by 8 pixels processed */
 651                   "ja        mainloop8       \n\t"
 652
 653                 "mainloop8end:               \n\t"
 654 /* preload        "movl      diff, %%ecx     \n\t" // (diff is in eax) */
 655                   "movl      %%eax, %%ecx    \n\t"
 656                   "cmpl      $0, %%ecx       \n\t"
 657                   "jz        end8            \n\t"
 658 /* preload        "movl      mask, %%edx     \n\t" */
 659                   "sall      $24, %%edx      \n\t" /* make low byte, high byte */
 660
 661                 "secondloop8:                \n\t"
 662                   "sall      %%edx           \n\t" /* move high bit to CF */
 663                   "jnc       skip8           \n\t" /* if CF = 0 */
 664                   "movb      (%%esi), %%al   \n\t"
 665                   "movb      %%al, (%%edi)   \n\t"
 666
 667                 "skip8:                      \n\t"
 668                   "incl      %%esi           \n\t"
 669                   "incl      %%edi           \n\t"
 670                   "decl      %%ecx           \n\t"
 671                   "jnz       secondloop8     \n\t"
 672
 673                 "end8:                       \n\t"
 674                   "EMMS                      \n\t"  /* DONE */
 675
 676                   : "=a" (dummy_value_a),           /* output regs (dummy) */
 677                     "=d" (dummy_value_d),
 678                     "=c" (dummy_value_c),
 679                     "=S" (dummy_value_S),
 680                     "=D" (dummy_value_D)
 681
 682                   : "3" (srcptr),      /* esi       // input regs */
 683                     "4" (dstptr),      /* edi */
 684                     "0" (diff),        /* eax */
 685 /* was (unmask)     "b"    RESERVED    // ebx       // Global Offset Table idx */
 686                     "2" (len),         /* ecx */
 687                     "1" (mask)         /* edx */
 688
 689 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
 690                   : "%mm0", "%mm4", "%mm6", "%mm7"  /* clobber list */
 691 #endif
 692                );
 693             }
 694             else /* mmx _not supported - Use modified C routine */
 695 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
 696             {
                    /* C fallback for 8-bit pixels (one byte per pixel, so no BPPn
                     * multiplier is needed).  Starting initial_val bytes into the
                     * row, copy rep_bytes bytes from png_ptr->row_buf + 1 to `row'
                     * every `stride' bytes.  All three values come from the
                     * png_pass_* tables indexed by png_ptr->pass; `mask' is not
                     * read on this path.
                     */
 697                register png_uint_32 i;
 698                png_uint_32 initial_val = png_pass_start[png_ptr->pass];
 699                  /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
 700                register int stride = png_pass_inc[png_ptr->pass];
 701                  /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
 702                register int rep_bytes = png_pass_width[png_ptr->pass];
 703                  /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
 704                png_uint_32 len = png_ptr->width &~7;  /* reduce to mult. of 8 */
 705                int diff = (int) (png_ptr->width & 7); /* leftover pixels (width % 8) */
 706                register png_uint_32 final_val = len;  /* GRR bugfix */
 707
 708                srcptr = png_ptr->row_buf + 1 + initial_val;
 709                dstptr = row + initial_val;
 710
                    /* Part of the row that consists of whole groups of 8 pixels. */
 711                for (i = initial_val; i < final_val; i += stride)
 712                {
 713                   png_memcpy(dstptr, srcptr, rep_bytes);
 714                   srcptr += stride;
 715                   dstptr += stride;
 716                }
 717                if (diff)  /* number of leftover pixels:  3 for pngtest */
 718                {
                       /* Extend the limit to the full row width and keep going
                        * from where the loop above stopped; rep_bytes is clamped
                        * so that a copy never runs past final_val.
                        */
 719                   final_val+=diff /* *BPP1 */ ;
 720                   for (; i < final_val; i += stride)
 721                   {
 722                      if (rep_bytes > (int)(final_val-i))
 723                         rep_bytes = (int)(final_val-i);
 724                      png_memcpy(dstptr, srcptr, rep_bytes);
 725                      srcptr += stride;
 726                      dstptr += stride;
 727                   }
 728                }
 729
 730             } /* end of else (_mmx_supported) */
 731
 732             break;
 733          }       /* end 8 bpp */
 734
 735          case 16:       /* png_ptr->row_info.pixel_depth */
 736          {
                 /* 16-bit pixels (2 bytes each).  Copies selected pixels from the
                  * source row at png_ptr->row_buf + 1 into `row' and leaves the
                  * other destination pixels unchanged.  The MMX branch selects
                  * pixels from the bits of `mask'; the C branch derives their
                  * positions from png_ptr->pass and never reads `mask'.
                  */
 737             png_bytep srcptr;
 738             png_bytep dstptr;
 739
 740 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
 741 #if !defined(PNG_1_0_X)
 742             if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
 743                 /* && _mmx_supported */ )
 744 #else
 745             if (_mmx_supported)
 746 #endif
 747             {
 748                png_uint_32 len;
 749                int diff;
 750                int dummy_value_a;   /* fix 'forbidden register spilled' error */
 751                int dummy_value_d;
 752                int dummy_value_c;
 753                int dummy_value_S;
 754                int dummy_value_D;
 755                _unmask = ~mask;            /* global variable for -fPIC version */
 756                srcptr = png_ptr->row_buf + 1;
 757                dstptr = row;
 758                len  = png_ptr->width &~7;  /* reduce to multiple of 8 */
 759                diff = (int) (png_ptr->width & 7); /* leftover pixels (width % 8) */
 760
                    /* Registers on entry (see the constraint lists at the end):
                     *   eax = diff, ecx = len, edx = mask, esi = srcptr,
                     *   edi = dstptr.
                     * _unmask, _mask16_0 and _mask16_1 are referenced by symbol
                     * name and are defined outside this block.
                     */
 761                __asm__ __volatile__ (
                       /* Replicate the low byte of _unmask into all 8 bytes of
                        * mm7, AND it with each _mask16_N constant and compare the
                        * result with zero.  Each byte of mm0/mm1 ends up 0xff
                        * where the AND was zero and 0x00 otherwise; 0xff means
                        * "take the source byte" in the loop below.  Presumably
                        * each _mask16_N byte holds the mask bit of the pixel
                        * that byte belongs to -- verify against the definitions.
                        */
 762                   "movd      _unmask, %%mm7   \n\t" /* load bit pattern */
 763                   "psubb     %%mm6, %%mm6     \n\t" /* zero mm6 */
 764                   "punpcklbw %%mm7, %%mm7     \n\t"
 765                   "punpcklwd %%mm7, %%mm7     \n\t"
 766                   "punpckldq %%mm7, %%mm7     \n\t" /* fill reg with 8 masks */
 767
 768                   "movq      _mask16_0, %%mm0 \n\t"
 769                   "movq      _mask16_1, %%mm1 \n\t"
 770
 771                   "pand      %%mm7, %%mm0     \n\t"
 772                   "pand      %%mm7, %%mm1     \n\t"
 773
 774                   "pcmpeqb   %%mm6, %%mm0     \n\t"
 775                   "pcmpeqb   %%mm6, %%mm1     \n\t"
 776
 777 /* preload        "movl      len, %%ecx       \n\t" // load length of line */
 778 /* preload        "movl      srcptr, %%esi    \n\t" // load source */
 779 /* preload        "movl      dstptr, %%edi    \n\t" // load dest */
 780
 781                   "cmpl      $0, %%ecx        \n\t"
 782                   "jz        mainloop16end    \n\t"
 783
                       /* Main loop: 8 pixels = 16 bytes per iteration, handled as
                        * two quadwords:  dst = (src & sel) | (dst & ~sel).
                        */
 784                 "mainloop16:                  \n\t"
 785                   "movq      (%%esi), %%mm4   \n\t"
 786                   "pand      %%mm0, %%mm4     \n\t"
 787                   "movq      %%mm0, %%mm6     \n\t"
 788                   "movq      (%%edi), %%mm7   \n\t"
 789                   "pandn     %%mm7, %%mm6     \n\t"
 790                   "por       %%mm6, %%mm4     \n\t"
 791                   "movq      %%mm4, (%%edi)   \n\t"
 792
 793                   "movq      8(%%esi), %%mm5  \n\t"
 794                   "pand      %%mm1, %%mm5     \n\t"
 795                   "movq      %%mm1, %%mm7     \n\t"
 796                   "movq      8(%%edi), %%mm6  \n\t"
 797                   "pandn     %%mm6, %%mm7     \n\t"
 798                   "por       %%mm7, %%mm5     \n\t"
 799                   "movq      %%mm5, 8(%%edi)  \n\t"
 800
 801                   "addl      $16, %%esi       \n\t" /* inc by 16 bytes processed */
 802                   "addl      $16, %%edi       \n\t"
 803                   "subl      $8, %%ecx        \n\t" /* dec by 8 pixels processed */
 804                   "ja        mainloop16       \n\t" /* loop while pixels remain */
 805
                       /* Leftover pixels (diff of them): shift the low byte of
                        * mask to the top of edx, then move one bit at a time into
                        * CF, most significant bit first, and copy the 2-byte pixel
                        * only when that bit is set.
                        */
 806                 "mainloop16end:               \n\t"
 807 /* preload        "movl      diff, %%ecx      \n\t" // (diff is in eax) */
 808                   "movl      %%eax, %%ecx     \n\t"
 809                   "cmpl      $0, %%ecx        \n\t"
 810                   "jz        end16            \n\t"
 811 /* preload        "movl      mask, %%edx      \n\t" */
 812                   "sall      $24, %%edx       \n\t" /* low byte => high byte */
 813
 814                 "secondloop16:                \n\t"
 815                   "sall      %%edx            \n\t" /* move high bit to CF */
 816                   "jnc       skip16           \n\t" /* if CF = 0 */
 817                   "movw      (%%esi), %%ax    \n\t"
 818                   "movw      %%ax, (%%edi)    \n\t"
 819
 820                 "skip16:                      \n\t"
 821                   "addl      $2, %%esi        \n\t"
 822                   "addl      $2, %%edi        \n\t"
 823                   "decl      %%ecx            \n\t"
 824                   "jnz       secondloop16     \n\t"
 825
 826                 "end16:                       \n\t"
 827                   "EMMS                       \n\t" /* DONE */
 828
                       /* All five registers are modified by the code above, so
                        * they are declared as (unused) outputs and the inputs are
                        * tied to them by operand number.
                        */
 829                   : "=a" (dummy_value_a),           /* output regs (dummy) */
 830                     "=c" (dummy_value_c),
 831                     "=d" (dummy_value_d),
 832                     "=S" (dummy_value_S),
 833                     "=D" (dummy_value_D)
 834
 835                   : "0" (diff),        /* eax       // input regs */
 836 /* was (unmask)     " "    RESERVED    // ebx       // Global Offset Table idx */
 837                     "1" (len),         /* ecx */
 838                     "2" (mask),        /* edx */
 839                     "3" (srcptr),      /* esi */
 840                     "4" (dstptr)       /* edi */
 841
                       /* NOTE(review): the asm stores through %edi and reads
                        * _unmask, but no "memory" clobber is declared -- confirm
                        * that this is safe with the compilers in use.
                        */
 842 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
 843                   : "%mm0", "%mm1", "%mm4"          /* clobber list */
 844                   , "%mm5", "%mm6", "%mm7"
 845 #endif
 846                );
 847             }
 848             else /* mmx _not supported - Use modified C routine */
 849 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
 850             {
                    /* C fallback.  Starting initial_val bytes into the row, copy
                     * rep_bytes bytes every `stride' bytes.  The values come from
                     * the png_pass_* tables indexed by png_ptr->pass, scaled by
                     * BPP2 (defined outside this block).
                     */
 851                register png_uint_32 i;
 852                png_uint_32 initial_val = BPP2 * png_pass_start[png_ptr->pass];
 853                  /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
 854                register int stride = BPP2 * png_pass_inc[png_ptr->pass];
 855                  /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
 856                register int rep_bytes = BPP2 * png_pass_width[png_ptr->pass];
 857                  /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
 858                png_uint_32 len = png_ptr->width &~7;  /* reduce to mult. of 8 */
 859                int diff = (int) (png_ptr->width & 7); /* leftover pixels (width % 8) */
 860                register png_uint_32 final_val = BPP2 * len;   /* GRR bugfix */
 861
 862                srcptr = png_ptr->row_buf + 1 + initial_val;
 863                dstptr = row + initial_val;
 864
                    /* Part of the row that consists of whole groups of 8 pixels. */
 865                for (i = initial_val; i < final_val; i += stride)
 866                {
 867                   png_memcpy(dstptr, srcptr, rep_bytes);
 868                   srcptr += stride;
 869                   dstptr += stride;
 870                }
 871                if (diff)  /* number of leftover pixels:  3 for pngtest */
 872                {
                       /* Extend the limit to the full row and continue; rep_bytes
                        * is clamped so that a copy never runs past final_val.
                        */
 873                   final_val+=diff*BPP2;
 874                   for (; i < final_val; i += stride)
 875                   {
 876                      if (rep_bytes > (int)(final_val-i))
 877                         rep_bytes = (int)(final_val-i);
 878                      png_memcpy(dstptr, srcptr, rep_bytes);
 879                      srcptr += stride;
 880                      dstptr += stride;
 881                   }
 882                }
 883             } /* end of else (_mmx_supported) */
 884
 885             break;
 886          }       /* end 16 bpp */
 887
 888          case 24:       /* png_ptr->row_info.pixel_depth */
 889          {
                 /* 24-bit pixels (3 bytes each).  Copies selected pixels from the
                  * source row at png_ptr->row_buf + 1 into `row' and leaves the
                  * other destination pixels unchanged.  The MMX branch selects
                  * pixels from the bits of `mask'; the C branch derives their
                  * positions from png_ptr->pass and never reads `mask'.
                  */
 890             png_bytep srcptr;
 891             png_bytep dstptr;
 892
 893 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
 894 #if !defined(PNG_1_0_X)
 895             if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
 896                 /* && _mmx_supported */ )
 897 #else
 898             if (_mmx_supported)
 899 #endif
 900             {
 901                png_uint_32 len;
 902                int diff;
 903                int dummy_value_a;   /* fix 'forbidden register spilled' error */
 904                int dummy_value_d;
 905                int dummy_value_c;
 906                int dummy_value_S;
 907                int dummy_value_D;
 908                _unmask = ~mask;            /* global variable for -fPIC version */
 909                srcptr = png_ptr->row_buf + 1;
 910                dstptr = row;
 911                len  = png_ptr->width &~7;  /* reduce to multiple of 8 */
 912                diff = (int) (png_ptr->width & 7); /* leftover pixels (width % 8) */
 913
                    /* Registers on entry (see the constraint lists at the end):
                     *   eax = diff, ecx = len, edx = mask, esi = srcptr,
                     *   edi = dstptr.
                     * _unmask and _mask24_0.._mask24_2 are referenced by symbol
                     * name and are defined outside this block.
                     */
 914                __asm__ __volatile__ (
                       /* Replicate the low byte of _unmask into all 8 bytes of
                        * mm7, AND it with each _mask24_N constant and compare the
                        * result with zero.  Each byte of mm0..mm2 ends up 0xff
                        * where the AND was zero and 0x00 otherwise; 0xff means
                        * "take the source byte" in the loop below.  Presumably
                        * each _mask24_N byte holds the mask bit of the pixel
                        * that byte belongs to -- verify against the definitions.
                        */
 915                   "movd      _unmask, %%mm7   \n\t" /* load bit pattern */
 916                   "psubb     %%mm6, %%mm6     \n\t" /* zero mm6 */
 917                   "punpcklbw %%mm7, %%mm7     \n\t"
 918                   "punpcklwd %%mm7, %%mm7     \n\t"
 919                   "punpckldq %%mm7, %%mm7     \n\t" /* fill reg with 8 masks */
 920
 921                   "movq      _mask24_0, %%mm0 \n\t"
 922                   "movq      _mask24_1, %%mm1 \n\t"
 923                   "movq      _mask24_2, %%mm2 \n\t"
 924
 925                   "pand      %%mm7, %%mm0     \n\t"
 926                   "pand      %%mm7, %%mm1     \n\t"
 927                   "pand      %%mm7, %%mm2     \n\t"
 928
 929                   "pcmpeqb   %%mm6, %%mm0     \n\t"
 930                   "pcmpeqb   %%mm6, %%mm1     \n\t"
 931                   "pcmpeqb   %%mm6, %%mm2     \n\t"
 932
 933 /* preload        "movl      len, %%ecx       \n\t" // load length of line */
 934 /* preload        "movl      srcptr, %%esi    \n\t" // load source */
 935 /* preload        "movl      dstptr, %%edi    \n\t" // load dest */
 936
 937                   "cmpl      $0, %%ecx        \n\t"
 938                   "jz        mainloop24end    \n\t"
 939
                       /* Main loop: 8 pixels = 24 bytes per iteration, handled as
                        * three quadwords:  dst = (src & sel) | (dst & ~sel).
                        */
 940                 "mainloop24:                  \n\t"
 941                   "movq      (%%esi), %%mm4   \n\t"
 942                   "pand      %%mm0, %%mm4     \n\t"
 943                   "movq      %%mm0, %%mm6     \n\t"
 944                   "movq      (%%edi), %%mm7   \n\t"
 945                   "pandn     %%mm7, %%mm6     \n\t"
 946                   "por       %%mm6, %%mm4     \n\t"
 947                   "movq      %%mm4, (%%edi)   \n\t"
 948
 949                   "movq      8(%%esi), %%mm5  \n\t"
 950                   "pand      %%mm1, %%mm5     \n\t"
 951                   "movq      %%mm1, %%mm7     \n\t"
 952                   "movq      8(%%edi), %%mm6  \n\t"
 953                   "pandn     %%mm6, %%mm7     \n\t"
 954                   "por       %%mm7, %%mm5     \n\t"
 955                   "movq      %%mm5, 8(%%edi)  \n\t"
 956
 957                   "movq      16(%%esi), %%mm6 \n\t"
 958                   "pand      %%mm2, %%mm6     \n\t"
 959                   "movq      %%mm2, %%mm4     \n\t"
 960                   "movq      16(%%edi), %%mm7 \n\t"
 961                   "pandn     %%mm7, %%mm4     \n\t"
 962                   "por       %%mm4, %%mm6     \n\t"
 963                   "movq      %%mm6, 16(%%edi) \n\t"
 964
 965                   "addl      $24, %%esi       \n\t" /* inc by 24 bytes processed */
 966                   "addl      $24, %%edi       \n\t"
 967                   "subl      $8, %%ecx        \n\t" /* dec by 8 pixels processed */
 968
 969                   "ja        mainloop24       \n\t" /* loop while pixels remain */
 970
                       /* Leftover pixels (diff of them): shift the low byte of
                        * mask to the top of edx, then move one bit at a time into
                        * CF, most significant bit first, and copy the 3-byte pixel
                        * (one word plus one byte) only when that bit is set.
                        */
 971                 "mainloop24end:               \n\t"
 972 /* preload        "movl      diff, %%ecx      \n\t" // (diff is in eax) */
 973                   "movl      %%eax, %%ecx     \n\t"
 974                   "cmpl      $0, %%ecx        \n\t"
 975                   "jz        end24            \n\t"
 976 /* preload        "movl      mask, %%edx      \n\t" */
 977                   "sall      $24, %%edx       \n\t" /* low byte => high byte */
 978
 979                 "secondloop24:                \n\t"
 980                   "sall      %%edx            \n\t" /* move high bit to CF */
 981                   "jnc       skip24           \n\t" /* if CF = 0 */
 982                   "movw      (%%esi), %%ax    \n\t" /* bytes 0-1 of the pixel */
 983                   "movw      %%ax, (%%edi)    \n\t"
 984                   "xorl      %%eax, %%eax     \n\t"
 985                   "movb      2(%%esi), %%al   \n\t" /* byte 2 of the pixel */
 986                   "movb      %%al, 2(%%edi)   \n\t"
 987
 988                 "skip24:                      \n\t"
 989                   "addl      $3, %%esi        \n\t"
 990                   "addl      $3, %%edi        \n\t"
 991                   "decl      %%ecx            \n\t"
 992                   "jnz       secondloop24     \n\t"
 993
 994                 "end24:                       \n\t"
 995                   "EMMS                       \n\t" /* DONE */
 996
                       /* All five registers are modified by the code above, so
                        * they are declared as (unused) outputs and the inputs are
                        * tied to them by operand number (note that operand 1 is
                        * edx and operand 2 is ecx here).
                        */
 997                   : "=a" (dummy_value_a),           /* output regs (dummy) */
 998                     "=d" (dummy_value_d),
 999                     "=c" (dummy_value_c),
1000                     "=S" (dummy_value_S),
1001                     "=D" (dummy_value_D)
1002
1003                   : "3" (srcptr),      /* esi       // input regs */
1004                     "4" (dstptr),      /* edi */
1005                     "0" (diff),        /* eax */
1006 /* was (unmask)     "b"    RESERVED    // ebx       // Global Offset Table idx */
1007                     "2" (len),         /* ecx */
1008                     "1" (mask)         /* edx */
1009
                       /* NOTE(review): the asm stores through %edi and reads
                        * _unmask, but no "memory" clobber is declared -- confirm
                        * that this is safe with the compilers in use.
                        */
1010 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
1011                   : "%mm0", "%mm1", "%mm2"          /* clobber list */
1012                   , "%mm4", "%mm5", "%mm6", "%mm7"
1013 #endif
1014                );
1015             }
1016             else /* mmx _not supported - Use modified C routine */
1017 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
1018             {
                    /* C fallback.  Starting initial_val bytes into the row, copy
                     * rep_bytes bytes every `stride' bytes.  The values come from
                     * the png_pass_* tables indexed by png_ptr->pass, scaled by
                     * BPP3 (defined outside this block).
                     */
1019                register png_uint_32 i;
1020                png_uint_32 initial_val = BPP3 * png_pass_start[png_ptr->pass];
1021                  /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
1022                register int stride = BPP3 * png_pass_inc[png_ptr->pass];
1023                  /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
1024                register int rep_bytes = BPP3 * png_pass_width[png_ptr->pass];
1025                  /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
1026                png_uint_32 len = png_ptr->width &~7;  /* reduce to mult. of 8 */
1027                int diff = (int) (png_ptr->width & 7); /* leftover pixels (width % 8) */
1028                register png_uint_32 final_val = BPP3 * len;   /* GRR bugfix */
1029
1030                srcptr = png_ptr->row_buf + 1 + initial_val;
1031                dstptr = row + initial_val;
1032
                    /* Part of the row that consists of whole groups of 8 pixels. */
1033                for (i = initial_val; i < final_val; i += stride)
1034                {
1035                   png_memcpy(dstptr, srcptr, rep_bytes);
1036                   srcptr += stride;
1037                   dstptr += stride;
1038                }
1039                if (diff)  /* number of leftover pixels:  3 for pngtest */
1040                {
                       /* Extend the limit to the full row and continue; rep_bytes
                        * is clamped so that a copy never runs past final_val.
                        */
1041                   final_val+=diff*BPP3;
1042                   for (; i < final_val; i += stride)
1043                   {
1044                      if (rep_bytes > (int)(final_val-i))
1045                         rep_bytes = (int)(final_val-i);
1046                      png_memcpy(dstptr, srcptr, rep_bytes);
1047                      srcptr += stride;
1048                      dstptr += stride;
1049                   }
1050                }
1051             } /* end of else (_mmx_supported) */
1052
1053             break;
1054          }       /* end 24 bpp */
1055
1056          case 32:       /* png_ptr->row_info.pixel_depth */
1057          {
                 /* 32-bit pixels (4 bytes each).  Copies selected pixels from the
                  * source row at png_ptr->row_buf + 1 into `row' and leaves the
                  * other destination pixels unchanged.  The MMX branch selects
                  * pixels from the bits of `mask'; the C branch derives their
                  * positions from png_ptr->pass and never reads `mask'.
                  */
1058             png_bytep srcptr;
1059             png_bytep dstptr;
1060
1061 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
1062 #if !defined(PNG_1_0_X)
1063             if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
1064                 /* && _mmx_supported */ )
1065 #else
1066             if (_mmx_supported)
1067 #endif
1068             {
1069                png_uint_32 len;
1070                int diff;
1071                int dummy_value_a;   /* fix 'forbidden register spilled' error */
1072                int dummy_value_d;
1073                int dummy_value_c;
1074                int dummy_value_S;
1075                int dummy_value_D;
1076                _unmask = ~mask;            /* global variable for -fPIC version */
1077                srcptr = png_ptr->row_buf + 1;
1078                dstptr = row;
1079                len  = png_ptr->width &~7;  /* reduce to multiple of 8 */
1080                diff = (int) (png_ptr->width & 7); /* leftover pixels (width % 8) */
1081
                    /* Registers on entry (see the constraint lists at the end):
                     *   eax = diff, ecx = len, edx = mask, esi = srcptr,
                     *   edi = dstptr.
                     * _unmask and _mask32_0.._mask32_3 are referenced by symbol
                     * name and are defined outside this block.
                     */
1082                __asm__ __volatile__ (
                       /* Replicate the low byte of _unmask into all 8 bytes of
                        * mm7, AND it with each _mask32_N constant and compare the
                        * result with zero.  Each byte of mm0..mm3 ends up 0xff
                        * where the AND was zero and 0x00 otherwise; 0xff means
                        * "take the source byte" in the loop below.  Presumably
                        * each _mask32_N byte holds the mask bit of the pixel
                        * that byte belongs to -- verify against the definitions.
                        */
1083                   "movd      _unmask, %%mm7   \n\t" /* load bit pattern */
1084                   "psubb     %%mm6, %%mm6     \n\t" /* zero mm6 */
1085                   "punpcklbw %%mm7, %%mm7     \n\t"
1086                   "punpcklwd %%mm7, %%mm7     \n\t"
1087                   "punpckldq %%mm7, %%mm7     \n\t" /* fill reg with 8 masks */
1088
1089                   "movq      _mask32_0, %%mm0 \n\t"
1090                   "movq      _mask32_1, %%mm1 \n\t"
1091                   "movq      _mask32_2, %%mm2 \n\t"
1092                   "movq      _mask32_3, %%mm3 \n\t"
1093
1094                   "pand      %%mm7, %%mm0     \n\t"
1095                   "pand      %%mm7, %%mm1     \n\t"
1096                   "pand      %%mm7, %%mm2     \n\t"
1097                   "pand      %%mm7, %%mm3     \n\t"
1098
1099                   "pcmpeqb   %%mm6, %%mm0     \n\t"
1100                   "pcmpeqb   %%mm6, %%mm1     \n\t"
1101                   "pcmpeqb   %%mm6, %%mm2     \n\t"
1102                   "pcmpeqb   %%mm6, %%mm3     \n\t"
1103
1104 /* preload        "movl      len, %%ecx       \n\t" // load length of line */
1105 /* preload        "movl      srcptr, %%esi    \n\t" // load source */
1106 /* preload        "movl      dstptr, %%edi    \n\t" // load dest */
1107
1108                   "cmpl      $0, %%ecx        \n\t" /* lcr */
1109                   "jz        mainloop32end    \n\t"
1110
                       /* Main loop: 8 pixels = 32 bytes per iteration, handled as
                        * four quadwords:  dst = (src & sel) | (dst & ~sel).
                        */
1111                 "mainloop32:                  \n\t"
1112                   "movq      (%%esi), %%mm4   \n\t"
1113                   "pand      %%mm0, %%mm4     \n\t"
1114                   "movq      %%mm0, %%mm6     \n\t"
1115                   "movq      (%%edi), %%mm7   \n\t"
1116                   "pandn     %%mm7, %%mm6     \n\t"
1117                   "por       %%mm6, %%mm4     \n\t"
1118                   "movq      %%mm4, (%%edi)   \n\t"
1119
1120                   "movq      8(%%esi), %%mm5  \n\t"
1121                   "pand      %%mm1, %%mm5     \n\t"
1122                   "movq      %%mm1, %%mm7     \n\t"
1123                   "movq      8(%%edi), %%mm6  \n\t"
1124                   "pandn     %%mm6, %%mm7     \n\t"
1125                   "por       %%mm7, %%mm5     \n\t"
1126                   "movq      %%mm5, 8(%%edi)  \n\t"
1127
1128                   "movq      16(%%esi), %%mm6 \n\t"
1129                   "pand      %%mm2, %%mm6     \n\t"
1130                   "movq      %%mm2, %%mm4     \n\t"
1131                   "movq      16(%%edi), %%mm7 \n\t"
1132                   "pandn     %%mm7, %%mm4     \n\t"
1133                   "por       %%mm4, %%mm6     \n\t"
1134                   "movq      %%mm6, 16(%%edi) \n\t"
1135
1136                   "movq      24(%%esi), %%mm7 \n\t"
1137                   "pand      %%mm3, %%mm7     \n\t"
1138                   "movq      %%mm3, %%mm5     \n\t"
1139                   "movq      24(%%edi), %%mm4 \n\t"
1140                   "pandn     %%mm4, %%mm5     \n\t"
1141                   "por       %%mm5, %%mm7     \n\t"
1142                   "movq      %%mm7, 24(%%edi) \n\t"
1143
1144                   "addl      $32, %%esi       \n\t" /* inc by 32 bytes processed */
1145                   "addl      $32, %%edi       \n\t"
1146                   "subl      $8, %%ecx        \n\t" /* dec by 8 pixels processed */
1147                   "ja        mainloop32       \n\t" /* loop while pixels remain */
1148
                       /* Leftover pixels (diff of them): shift the low byte of
                        * mask to the top of edx, then move one bit at a time into
                        * CF, most significant bit first, and copy the 4-byte pixel
                        * only when that bit is set.
                        */
1149                 "mainloop32end:               \n\t"
1150 /* preload        "movl      diff, %%ecx      \n\t" // (diff is in eax) */
1151                   "movl      %%eax, %%ecx     \n\t"
1152                   "cmpl      $0, %%ecx        \n\t"
1153                   "jz        end32            \n\t"
1154 /* preload        "movl      mask, %%edx      \n\t" */
1155                   "sall      $24, %%edx       \n\t" /* low byte => high byte */
1156
1157                 "secondloop32:                \n\t"
1158                   "sall      %%edx            \n\t" /* move high bit to CF */
1159                   "jnc       skip32           \n\t" /* if CF = 0 */
1160                   "movl      (%%esi), %%eax   \n\t"
1161                   "movl      %%eax, (%%edi)   \n\t"
1162
1163                 "skip32:                      \n\t"
1164                   "addl      $4, %%esi        \n\t"
1165                   "addl      $4, %%edi        \n\t"
1166                   "decl      %%ecx            \n\t"
1167                   "jnz       secondloop32     \n\t"
1168
1169                 "end32:                       \n\t"
1170                   "EMMS                       \n\t" /* DONE */
1171
                       /* All five registers are modified by the code above, so
                        * they are declared as (unused) outputs and the inputs are
                        * tied to them by operand number (note that operand 1 is
                        * edx and operand 2 is ecx here).
                        */
1172                   : "=a" (dummy_value_a),           /* output regs (dummy) */
1173                     "=d" (dummy_value_d),
1174                     "=c" (dummy_value_c),
1175                     "=S" (dummy_value_S),
1176                     "=D" (dummy_value_D)
1177
1178                   : "3" (srcptr),      /* esi       // input regs */
1179                     "4" (dstptr),      /* edi */
1180                     "0" (diff),        /* eax */
1181 /* was (unmask)     "b"    RESERVED    // ebx       // Global Offset Table idx */
1182                     "2" (len),         /* ecx */
1183                     "1" (mask)         /* edx */
1184
                       /* NOTE(review): the asm stores through %edi and reads
                        * _unmask, but no "memory" clobber is declared -- confirm
                        * that this is safe with the compilers in use.
                        */
1185 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
1186                   : "%mm0", "%mm1", "%mm2", "%mm3"  /* clobber list */
1187                   , "%mm4", "%mm5", "%mm6", "%mm7"
1188 #endif
1189                );
1190             }
1191             else /* mmx _not supported - Use modified C routine */
1192 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
1193             {
                    /* C fallback.  Starting initial_val bytes into the row, copy
                     * rep_bytes bytes every `stride' bytes.  The values come from
                     * the png_pass_* tables indexed by png_ptr->pass, scaled by
                     * BPP4 (defined outside this block).
                     */
1194                register png_uint_32 i;
1195                png_uint_32 initial_val = BPP4 * png_pass_start[png_ptr->pass];
1196                  /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
1197                register int stride = BPP4 * png_pass_inc[png_ptr->pass];
1198                  /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
1199                register int rep_bytes = BPP4 * png_pass_width[png_ptr->pass];
1200                  /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
1201                png_uint_32 len = png_ptr->width &~7;  /* reduce to mult. of 8 */
1202                int diff = (int) (png_ptr->width & 7); /* leftover pixels (width % 8) */
1203                register png_uint_32 final_val = BPP4 * len;   /* GRR bugfix */
1204
1205                srcptr = png_ptr->row_buf + 1 + initial_val;
1206                dstptr = row + initial_val;
1207
                    /* Part of the row that consists of whole groups of 8 pixels. */
1208                for (i = initial_val; i < final_val; i += stride)
1209                {
1210                   png_memcpy(dstptr, srcptr, rep_bytes);
1211                   srcptr += stride;
1212                   dstptr += stride;
1213                }
1214                if (diff)  /* number of leftover pixels:  3 for pngtest */
1215                {
                       /* Extend the limit to the full row and continue; rep_bytes
                        * is clamped so that a copy never runs past final_val.
                        */
1216                   final_val+=diff*BPP4;
1217                   for (; i < final_val; i += stride)
1218                   {
1219                      if (rep_bytes > (int)(final_val-i))
1220                         rep_bytes = (int)(final_val-i);
1221                      png_memcpy(dstptr, srcptr, rep_bytes);
1222                      srcptr += stride;
1223                      dstptr += stride;
1224                   }
1225                }
1226             } /* end of else (_mmx_supported) */
1227
1228             break;
1229          }       /* end 32 bpp */
1230
1231          case 48:       /* png_ptr->row_info.pixel_depth */
1232          {
1233             png_bytep srcptr;
1234             png_bytep dstptr;
1235
1236 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
1237 #if !defined(PNG_1_0_X)
1238             if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
1239                 /* && _mmx_supported */ )
1240 #else
1241             if (_mmx_supported)
1242 #endif
1243             {
                    /* MMX path for 48-bit pixels (6 bytes each).  Copies the
                     * pixels selected by the bits of `mask' from the source row at
                     * png_ptr->row_buf + 1 into `row' and leaves the other
                     * destination pixels unchanged.
                     *
                     * Bugfix: the loop for the width % 8 leftover pixels used to
                     * copy only 4 bytes per pixel and to advance both pointers by
                     * 4, so trailing pixels were truncated and misaligned.  It now
                     * copies all 6 bytes and advances by 6, in line with the main
                     * loop (48 bytes per 8 pixels).
                     */
1244                png_uint_32 len;
1245                int diff;
1246                int dummy_value_a;   /* fix 'forbidden register spilled' error */
1247                int dummy_value_d;
1248                int dummy_value_c;
1249                int dummy_value_S;
1250                int dummy_value_D;
1251                _unmask = ~mask;            /* global variable for -fPIC version */
1252                srcptr = png_ptr->row_buf + 1;
1253                dstptr = row;
1254                len  = png_ptr->width &~7;  /* reduce to multiple of 8 */
1255                diff = (int) (png_ptr->width & 7); /* leftover pixels (width % 8) */
1256
                    /* Registers on entry (see the constraint lists at the end):
                     *   eax = diff, ecx = len, edx = mask, esi = srcptr,
                     *   edi = dstptr.
                     * _unmask and _mask48_0.._mask48_5 are referenced by symbol
                     * name and are defined outside this block.
                     */
1257                __asm__ __volatile__ (
                       /* Replicate the low byte of _unmask into all 8 bytes of
                        * mm7, AND it with each _mask48_N constant and compare the
                        * result with zero.  Each byte of mm0..mm5 ends up 0xff
                        * where the AND was zero and 0x00 otherwise; 0xff means
                        * "take the source byte" in the loop below.  Presumably
                        * each _mask48_N byte holds the mask bit of the pixel
                        * that byte belongs to -- verify against the definitions.
                        */
1258                   "movd      _unmask, %%mm7   \n\t" /* load bit pattern */
1259                   "psubb     %%mm6, %%mm6     \n\t" /* zero mm6 */
1260                   "punpcklbw %%mm7, %%mm7     \n\t"
1261                   "punpcklwd %%mm7, %%mm7     \n\t"
1262                   "punpckldq %%mm7, %%mm7     \n\t" /* fill reg with 8 masks */
1263
1264                   "movq      _mask48_0, %%mm0 \n\t"
1265                   "movq      _mask48_1, %%mm1 \n\t"
1266                   "movq      _mask48_2, %%mm2 \n\t"
1267                   "movq      _mask48_3, %%mm3 \n\t"
1268                   "movq      _mask48_4, %%mm4 \n\t"
1269                   "movq      _mask48_5, %%mm5 \n\t"
1270
1271                   "pand      %%mm7, %%mm0     \n\t"
1272                   "pand      %%mm7, %%mm1     \n\t"
1273                   "pand      %%mm7, %%mm2     \n\t"
1274                   "pand      %%mm7, %%mm3     \n\t"
1275                   "pand      %%mm7, %%mm4     \n\t"
1276                   "pand      %%mm7, %%mm5     \n\t"
1277
1278                   "pcmpeqb   %%mm6, %%mm0     \n\t"
1279                   "pcmpeqb   %%mm6, %%mm1     \n\t"
1280                   "pcmpeqb   %%mm6, %%mm2     \n\t"
1281                   "pcmpeqb   %%mm6, %%mm3     \n\t"
1282                   "pcmpeqb   %%mm6, %%mm4     \n\t"
1283                   "pcmpeqb   %%mm6, %%mm5     \n\t"
1284
1285 /* preload        "movl      len, %%ecx       \n\t" // load length of line */
1286 /* preload        "movl      srcptr, %%esi    \n\t" // load source */
1287 /* preload        "movl      dstptr, %%edi    \n\t" // load dest */
1288
1289                   "cmpl      $0, %%ecx        \n\t"
1290                   "jz        mainloop48end    \n\t"
1291
                       /* Main loop: 8 pixels = 48 bytes per iteration, handled as
                        * six quadwords:  dst = (src & sel) | (dst & ~sel).  mm0..mm5
                        * hold the select masks, so only mm6/mm7 are scratch.
                        */
1292                 "mainloop48:                  \n\t"
1293                   "movq      (%%esi), %%mm7   \n\t"
1294                   "pand      %%mm0, %%mm7     \n\t"
1295                   "movq      %%mm0, %%mm6     \n\t"
1296                   "pandn     (%%edi), %%mm6   \n\t"
1297                   "por       %%mm6, %%mm7     \n\t"
1298                   "movq      %%mm7, (%%edi)   \n\t"
1299
1300                   "movq      8(%%esi), %%mm6  \n\t"
1301                   "pand      %%mm1, %%mm6     \n\t"
1302                   "movq      %%mm1, %%mm7     \n\t"
1303                   "pandn     8(%%edi), %%mm7  \n\t"
1304                   "por       %%mm7, %%mm6     \n\t"
1305                   "movq      %%mm6, 8(%%edi)  \n\t"
1306
1307                   "movq      16(%%esi), %%mm6 \n\t"
1308                   "pand      %%mm2, %%mm6     \n\t"
1309                   "movq      %%mm2, %%mm7     \n\t"
1310                   "pandn     16(%%edi), %%mm7 \n\t"
1311                   "por       %%mm7, %%mm6     \n\t"
1312                   "movq      %%mm6, 16(%%edi) \n\t"
1313
1314                   "movq      24(%%esi), %%mm7 \n\t"
1315                   "pand      %%mm3, %%mm7     \n\t"
1316                   "movq      %%mm3, %%mm6     \n\t"
1317                   "pandn     24(%%edi), %%mm6 \n\t"
1318                   "por       %%mm6, %%mm7     \n\t"
1319                   "movq      %%mm7, 24(%%edi) \n\t"
1320
1321                   "movq      32(%%esi), %%mm6 \n\t"
1322                   "pand      %%mm4, %%mm6     \n\t"
1323                   "movq      %%mm4, %%mm7     \n\t"
1324                   "pandn     32(%%edi), %%mm7 \n\t"
1325                   "por       %%mm7, %%mm6     \n\t"
1326                   "movq      %%mm6, 32(%%edi) \n\t"
1327
1328                   "movq      40(%%esi), %%mm7 \n\t"
1329                   "pand      %%mm5, %%mm7     \n\t"
1330                   "movq      %%mm5, %%mm6     \n\t"
1331                   "pandn     40(%%edi), %%mm6 \n\t"
1332                   "por       %%mm6, %%mm7     \n\t"
1333                   "movq      %%mm7, 40(%%edi) \n\t"
1334
1335                   "addl      $48, %%esi       \n\t" /* inc by 48 bytes processed */
1336                   "addl      $48, %%edi       \n\t"
1337                   "subl      $8, %%ecx        \n\t" /* dec by 8 pixels processed */
1338
1339                   "ja        mainloop48       \n\t" /* loop while pixels remain */
1340
                       /* Leftover pixels (diff of them): shift the low byte of
                        * mask to the top of edx, then move one bit at a time into
                        * CF, most significant bit first, and copy the 6-byte pixel
                        * (one dword plus one word) only when that bit is set.
                        */
1341                 "mainloop48end:               \n\t"
1342 /* preload        "movl      diff, %%ecx      \n\t" // (diff is in eax) */
1343                   "movl      %%eax, %%ecx     \n\t"
1344                   "cmpl      $0, %%ecx        \n\t"
1345                   "jz        end48            \n\t"
1346 /* preload        "movl      mask, %%edx      \n\t" */
1347                   "sall      $24, %%edx       \n\t" /* low byte => high byte */
1348
1349                 "secondloop48:                \n\t"
1350                   "sall      %%edx            \n\t" /* move high bit to CF */
1351                   "jnc       skip48           \n\t" /* if CF = 0 */
1352                   "movl      (%%esi), %%eax   \n\t" /* bytes 0-3 of the pixel */
1353                   "movl      %%eax, (%%edi)   \n\t"
                       "movw      4(%%esi), %%ax   \n\t" /* bytes 4-5 of the pixel */
                       "movw      %%ax, 4(%%edi)   \n\t"
1354
1355                 "skip48:                      \n\t"
1356                   "addl      $6, %%esi        \n\t" /* one 48-bit pixel = 6 bytes */
1357                   "addl      $6, %%edi        \n\t"
1358                   "decl      %%ecx            \n\t"
1359                   "jnz       secondloop48     \n\t"
1360
1361                 "end48:                       \n\t"
1362                   "EMMS                       \n\t" /* DONE */
1363
                       /* All five registers are modified by the code above, so
                        * they are declared as (unused) outputs and the inputs are
                        * tied to them by operand number (note that operand 1 is
                        * edx and operand 2 is ecx here).
                        */
1364                   : "=a" (dummy_value_a),           /* output regs (dummy) */
1365                     "=d" (dummy_value_d),
1366                     "=c" (dummy_value_c),
1367                     "=S" (dummy_value_S),
1368                     "=D" (dummy_value_D)
1369
1370                   : "3" (srcptr),      /* esi       // input regs */
1371                     "4" (dstptr),      /* edi */
1372                     "0" (diff),        /* eax */
1373 /* was (unmask)     "b"    RESERVED    // ebx       // Global Offset Table idx */
1374                     "2" (len),         /* ecx */
1375                     "1" (mask)         /* edx */
1376
                       /* NOTE(review): the asm stores through %edi and reads
                        * _unmask, but no "memory" clobber is declared -- confirm
                        * that this is safe with the compilers in use.
                        */
1377 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
1378                   : "%mm0", "%mm1", "%mm2", "%mm3"  /* clobber list */
1379                   , "%mm4", "%mm5", "%mm6", "%mm7"
1380 #endif
1381                );
1382             }
1383             else /* mmx _not supported - Use modified C routine */
1384 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
1385             {
1386                register png_uint_32 i;
1387                png_uint_32 initial_val = BPP6 * png_pass_start[png_ptr->pass];
1388                  /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
1389                register int stride = BPP6 * png_pass_inc[png_ptr->pass];
1390                  /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
1391                register int rep_bytes = BPP6 * png_pass_width[png_ptr->pass];
1392                  /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
1393                png_uint_32 len = png_ptr->width &~7;  /* reduce to mult. of 8 */
1394                int diff = (int) (png_ptr->width & 7); /* amount lost */
1395                register png_uint_32 final_val = BPP6 * len;   /* GRR bugfix */
1396
1397                srcptr = png_ptr->row_buf + 1 + initial_val;
1398                dstptr = row + initial_val;
1399
1400                for (i = initial_val; i < final_val; i += stride)
1401                {
1402                   png_memcpy(dstptr, srcptr, rep_bytes);
1403                   srcptr += stride;
1404                   dstptr += stride;
1405                }
1406                if (diff)  /* number of leftover pixels:  3 for pngtest */
1407                {
1408                   final_val+=diff*BPP6;
1409                   for (; i < final_val; i += stride)
1410                   {
1411                      if (rep_bytes > (int)(final_val-i))
1412                         rep_bytes = (int)(final_val-i);
1413                      png_memcpy(dstptr, srcptr, rep_bytes);
1414                      srcptr += stride;
1415                      dstptr += stride;
1416                   }
1417                }
1418             } /* end of else (_mmx_supported) */
1419
1420             break;
1421          }       /* end 48 bpp */
1422
1423          case 64:       /* png_ptr->row_info.pixel_depth */
1424          {
1425             png_bytep srcptr;
1426             png_bytep dstptr;
1427             register png_uint_32 i;
1428             png_uint_32 initial_val = BPP8 * png_pass_start[png_ptr->pass];
1429               /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
1430             register int stride = BPP8 * png_pass_inc[png_ptr->pass];
1431               /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
1432             register int rep_bytes = BPP8 * png_pass_width[png_ptr->pass];
1433               /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
1434             png_uint_32 len = png_ptr->width &~7;  /* reduce to mult. of 8 */
1435             int diff = (int) (png_ptr->width & 7); /* amount lost */
1436             register png_uint_32 final_val = BPP8 * len;   /* GRR bugfix */
1437
1438             srcptr = png_ptr->row_buf + 1 + initial_val;
1439             dstptr = row + initial_val;
1440
1441             for (i = initial_val; i < final_val; i += stride)
1442             {
1443                png_memcpy(dstptr, srcptr, rep_bytes);
1444                srcptr += stride;
1445                dstptr += stride;
1446             }
1447             if (diff)  /* number of leftover pixels:  3 for pngtest */
1448             {
1449                final_val+=diff*BPP8;
1450                for (; i < final_val; i += stride)
1451                {
1452                   if (rep_bytes > (int)(final_val-i))
1453                      rep_bytes = (int)(final_val-i);
1454                   png_memcpy(dstptr, srcptr, rep_bytes);
1455                   srcptr += stride;
1456                   dstptr += stride;
1457                }
1458             }
1459
1460             break;
1461          }       /* end 64 bpp */
1462
1463          default: /* png_ptr->row_info.pixel_depth != 1,2,4,8,16,24,32,48,64 */
1464          {
1465             /* this should never happen */
1466             png_warning(png_ptr, "Invalid row_info.pixel_depth in pnggccrd");
1467             break;
1468          }
1469       } /* end switch (png_ptr->row_info.pixel_depth) */
1470
1471    } /* end if (non-trivial mask) */
1472
1473 } /* end png_combine_row() */
1474
1475 #endif /* PNG_HAVE_ASSEMBLER_COMBINE_ROW */
1476
1477
1478
1479
1480 /*===========================================================================*/
1481 /*                                                                           */
1482 /*                 P N G _ D O _ R E A D _ I N T E R L A C E                 */
1483 /*                                                                           */
1484 /*===========================================================================*/
1485
1486 #if defined(PNG_READ_INTERLACING_SUPPORTED)
1487 #if defined(PNG_HAVE_ASSEMBLER_READ_INTERLACE)
1488
1489 /* png_do_read_interlace() is called after any 16-bit to 8-bit conversion
1490  * has taken place.  [GRR: what other steps come before and/or after?]
1491  */
1492
1493 void /* PRIVATE */
1494 png_do_read_interlace(png_structp png_ptr)
1495 {
1496    png_row_infop row_info = &(png_ptr->row_info);
1497    png_bytep row = png_ptr->row_buf + 1;
1498    int pass = png_ptr->pass;
1499 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
1500    png_uint_32 transformations = png_ptr->transformations;
1501 #endif
1502
1503    png_debug(1, "in png_do_read_interlace (pnggccrd.c)\n");
1504
1505 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
1506    if (_mmx_supported == 2) {
1507 #if !defined(PNG_1_0_X)
1508        /* this should have happened in png_init_mmx_flags() already */
1509        png_warning(png_ptr, "asm_flags may not have been initialized");
1510 #endif
1511        png_mmx_support();
1512    }
1513 #endif
1514
1515    if (row != NULL && row_info != NULL)
1516    {
1517       png_uint_32 final_width;
1518
1519       final_width = row_info->width * png_pass_inc[pass];
1520
1521       switch (row_info->pixel_depth)
1522       {
1523          case 1:
1524          {
1525             png_bytep sp, dp;
1526             int sshift, dshift;
1527             int s_start, s_end, s_inc;
1528             png_byte v;
1529             png_uint_32 i;
1530             int j;
1531
1532             sp = row + (png_size_t)((row_info->width - 1) >> 3);
1533             dp = row + (png_size_t)((final_width - 1) >> 3);
1534 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
1535             if (transformations & PNG_PACKSWAP)
1536             {
1537                sshift = (int)((row_info->width + 7) & 7);
1538                dshift = (int)((final_width + 7) & 7);
1539                s_start = 7;
1540                s_end = 0;
1541                s_inc = -1;
1542             }
1543             else
1544 #endif
1545             {
1546                sshift = 7 - (int)((row_info->width + 7) & 7);
1547                dshift = 7 - (int)((final_width + 7) & 7);
1548                s_start = 0;
1549                s_end = 7;
1550                s_inc = 1;
1551             }
1552
1553             for (i = row_info->width; i; i--)
1554             {
1555                v = (png_byte)((*sp >> sshift) & 0x1);
1556                for (j = 0; j < png_pass_inc[pass]; j++)
1557                {
1558                   *dp &= (png_byte)((0x7f7f >> (7 - dshift)) & 0xff);
1559                   *dp |= (png_byte)(v << dshift);
1560                   if (dshift == s_end)
1561                   {
1562                      dshift = s_start;
1563                      dp--;
1564                   }
1565                   else
1566                      dshift += s_inc;
1567                }
1568                if (sshift == s_end)
1569                {
1570                   sshift = s_start;
1571                   sp--;
1572                }
1573                else
1574                   sshift += s_inc;
1575             }
1576             break;
1577          }
1578
1579          case 2:
1580          {
1581             png_bytep sp, dp;
1582             int sshift, dshift;
1583             int s_start, s_end, s_inc;
1584             png_uint_32 i;
1585
1586             sp = row + (png_size_t)((row_info->width - 1) >> 2);
1587             dp = row + (png_size_t)((final_width - 1) >> 2);
1588 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
1589             if (transformations & PNG_PACKSWAP)
1590             {
1591                sshift = (png_size_t)(((row_info->width + 3) & 3) << 1);
1592                dshift = (png_size_t)(((final_width + 3) & 3) << 1);
1593                s_start = 6;
1594                s_end = 0;
1595                s_inc = -2;
1596             }
1597             else
1598 #endif
1599             {
1600                sshift = (png_size_t)((3 - ((row_info->width + 3) & 3)) << 1);
1601                dshift = (png_size_t)((3 - ((final_width + 3) & 3)) << 1);
1602                s_start = 0;
1603                s_end = 6;
1604                s_inc = 2;
1605             }
1606
1607             for (i = row_info->width; i; i--)
1608             {
1609                png_byte v;
1610                int j;
1611
1612                v = (png_byte)((*sp >> sshift) & 0x3);
1613                for (j = 0; j < png_pass_inc[pass]; j++)
1614                {
1615                   *dp &= (png_byte)((0x3f3f >> (6 - dshift)) & 0xff);
1616                   *dp |= (png_byte)(v << dshift);
1617                   if (dshift == s_end)
1618                   {
1619                      dshift = s_start;
1620                      dp--;
1621                   }
1622                   else
1623                      dshift += s_inc;
1624                }
1625                if (sshift == s_end)
1626                {
1627                   sshift = s_start;
1628                   sp--;
1629                }
1630                else
1631                   sshift += s_inc;
1632             }
1633             break;
1634          }
1635
1636          case 4:
1637          {
1638             png_bytep sp, dp;
1639             int sshift, dshift;
1640             int s_start, s_end, s_inc;
1641             png_uint_32 i;
1642
1643             sp = row + (png_size_t)((row_info->width - 1) >> 1);
1644             dp = row + (png_size_t)((final_width - 1) >> 1);
1645 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
1646             if (transformations & PNG_PACKSWAP)
1647             {
1648                sshift = (png_size_t)(((row_info->width + 1) & 1) << 2);
1649                dshift = (png_size_t)(((final_width + 1) & 1) << 2);
1650                s_start = 4;
1651                s_end = 0;
1652                s_inc = -4;
1653             }
1654             else
1655 #endif
1656             {
1657                sshift = (png_size_t)((1 - ((row_info->width + 1) & 1)) << 2);
1658                dshift = (png_size_t)((1 - ((final_width + 1) & 1)) << 2);
1659                s_start = 0;
1660                s_end = 4;
1661                s_inc = 4;
1662             }
1663
1664             for (i = row_info->width; i; i--)
1665             {
1666                png_byte v;
1667                int j;
1668
1669                v = (png_byte)((*sp >> sshift) & 0xf);
1670                for (j = 0; j < png_pass_inc[pass]; j++)
1671                {
1672                   *dp &= (png_byte)((0xf0f >> (4 - dshift)) & 0xff);
1673                   *dp |= (png_byte)(v << dshift);
1674                   if (dshift == s_end)
1675                   {
1676                      dshift = s_start;
1677                      dp--;
1678                   }
1679                   else
1680                      dshift += s_inc;
1681                }
1682                if (sshift == s_end)
1683                {
1684                   sshift = s_start;
1685                   sp--;
1686                }
1687                else
1688                   sshift += s_inc;
1689             }
1690             break;
1691          }
1692
1693        /*====================================================================*/
1694
1695          default: /* 8-bit or larger (this is where the routine is modified) */
1696          {
1697 #if 0
1698 /*          static unsigned long long _const4 = 0x0000000000FFFFFFLL;  no good */
1699 /*          static unsigned long long const4 = 0x0000000000FFFFFFLL;   no good */
1700 /*          unsigned long long _const4 = 0x0000000000FFFFFFLL;         no good */
1701 /*          unsigned long long const4 = 0x0000000000FFFFFFLL;          no good */
1702 #endif
1703             png_bytep sptr, dp;
1704             png_uint_32 i;
1705             png_size_t pixel_bytes;
1706             int width = (int)row_info->width;
1707
1708             pixel_bytes = (row_info->pixel_depth >> 3);
1709
1710             /* point sptr at the last pixel in the pre-expanded row: */
1711             sptr = row + (width - 1) * pixel_bytes;
1712
1713             /* point dp at the last pixel position in the expanded row: */
1714             dp = row + (final_width - 1) * pixel_bytes;
1715
1716             /* New code by Nirav Chhatrapati - Intel Corporation */
1717
1718 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
1719 #if !defined(PNG_1_0_X)
1720             if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_INTERLACE)
1721                 /* && _mmx_supported */ )
1722 #else
1723             if (_mmx_supported)
1724 #endif
1725             {
1726                //--------------------------------------------------------------
1727                if (pixel_bytes == 3)
1728                {
1729                   if (((pass == 0) || (pass == 1)) && width)
1730                   {
1731                      int dummy_value_c;   /* fix 'forbidden register spilled' */
1732                      int dummy_value_S;
1733                      int dummy_value_D;
1734
1735                      __asm__ __volatile__ (
1736                         "subl $21, %%edi         \n\t"
1737                                      /* (png_pass_inc[pass] - 1)*pixel_bytes */
1738
1739                      ".loop3_pass0:              \n\t"
1740                         "movd (%%esi), %%mm0     \n\t" /* x x x x x 2 1 0 */
1741                         "pand _const4, %%mm0     \n\t" /* z z z z z 2 1 0 */
1742                         "movq %%mm0, %%mm1       \n\t" /* z z z z z 2 1 0 */
1743                         "psllq $16, %%mm0        \n\t" /* z z z 2 1 0 z z */
1744                         "movq %%mm0, %%mm2       \n\t" /* z z z 2 1 0 z z */
1745                         "psllq $24, %%mm0        \n\t" /* 2 1 0 z z z z z */
1746                         "psrlq $8, %%mm1         \n\t" /* z z z z z z 2 1 */
1747                         "por %%mm2, %%mm0        \n\t" /* 2 1 0 2 1 0 z z */
1748                         "por %%mm1, %%mm0        \n\t" /* 2 1 0 2 1 0 2 1 */
1749                         "movq %%mm0, %%mm3       \n\t" /* 2 1 0 2 1 0 2 1 */
1750                         "psllq $16, %%mm0        \n\t" /* 0 2 1 0 2 1 z z */
1751                         "movq %%mm3, %%mm4       \n\t" /* 2 1 0 2 1 0 2 1 */
1752                         "punpckhdq %%mm0, %%mm3  \n\t" /* 0 2 1 0 2 1 0 2 */
1753                         "movq %%mm4, 16(%%edi)   \n\t"
1754                         "psrlq $32, %%mm0        \n\t" /* z z z z 0 2 1 0 */
1755                         "movq %%mm3, 8(%%edi)    \n\t"
1756                         "punpckldq %%mm4, %%mm0  \n\t" /* 1 0 2 1 0 2 1 0 */
1757                         "subl $3, %%esi          \n\t"
1758                         "movq %%mm0, (%%edi)     \n\t"
1759                         "subl $24, %%edi         \n\t"
1760                         "decl %%ecx              \n\t"
1761                         "jnz .loop3_pass0        \n\t"
1762                         "EMMS                    \n\t" /* DONE */
1763
1764                         : "=c" (dummy_value_c),        /* output regs (dummy) */
1765                           "=S" (dummy_value_S),
1766                           "=D" (dummy_value_D)
1767
1768                         : "1" (sptr),      /* esi      // input regs */
1769                           "2" (dp),        /* edi */
1770                           "0" (width)      /* ecx */
1771 /* doesn't work           "i" (0x0000000000FFFFFFLL)   // %1 (a.k.a. _const4) */
1772
1773 #if 0  /* %mm0, ..., %mm4 not supported by gcc 2.7.2.3 or egcs 1.1 */
1774                         : "%mm0", "%mm1", "%mm2"       /* clobber list */
1775                         , "%mm3", "%mm4"
1776 #endif
1777                      );
1778                   }
1779                   else if (((pass == 2) || (pass == 3)) && width)
1780                   {
1781                      int dummy_value_c;   /* fix 'forbidden register spilled' */
1782                      int dummy_value_S;
1783                      int dummy_value_D;
1784
1785                      __asm__ __volatile__ (
1786                         "subl $9, %%edi          \n\t"
1787                                      /* (png_pass_inc[pass] - 1)*pixel_bytes */
1788
1789                      ".loop3_pass2:              \n\t"
1790                         "movd (%%esi), %%mm0     \n\t" /* x x x x x 2 1 0 */
1791                         "pand _const4, %%mm0     \n\t" /* z z z z z 2 1 0 */
1792                         "movq %%mm0, %%mm1       \n\t" /* z z z z z 2 1 0 */
1793                         "psllq $16, %%mm0        \n\t" /* z z z 2 1 0 z z */
1794                         "movq %%mm0, %%mm2       \n\t" /* z z z 2 1 0 z z */
1795                         "psllq $24, %%mm0        \n\t" /* 2 1 0 z z z z z */
1796                         "psrlq $8, %%mm1         \n\t" /* z z z z z z 2 1 */
1797                         "por %%mm2, %%mm0        \n\t" /* 2 1 0 2 1 0 z z */
1798                         "por %%mm1, %%mm0        \n\t" /* 2 1 0 2 1 0 2 1 */
1799                         "movq %%mm0, 4(%%edi)    \n\t"
1800                         "psrlq $16, %%mm0        \n\t" /* z z 2 1 0 2 1 0 */
1801                         "subl $3, %%esi          \n\t"
1802                         "movd %%mm0, (%%edi)     \n\t"
1803                         "subl $12, %%edi         \n\t"
1804                         "decl %%ecx              \n\t"
1805                         "jnz .loop3_pass2        \n\t"
1806                         "EMMS                    \n\t" /* DONE */
1807
1808                         : "=c" (dummy_value_c),        /* output regs (dummy) */
1809                           "=S" (dummy_value_S),
1810                           "=D" (dummy_value_D)
1811
1812                         : "1" (sptr),      /* esi      // input regs */
1813                           "2" (dp),        /* edi */
1814                           "0" (width)      /* ecx */
1815
1816 #if 0  /* %mm0, ..., %mm2 not supported by gcc 2.7.2.3 or egcs 1.1 */
1817                         : "%mm0", "%mm1", "%mm2"       /* clobber list */
1818 #endif
1819                      );
1820                   }
1821                   else if (width) /* && ((pass == 4) || (pass == 5)) */
1822                   {
1823                      int width_mmx = ((width >> 1) << 1) - 8;   /* GRR:  huh? */
1824                      if (width_mmx < 0)
1825                          width_mmx = 0;
1826                      width -= width_mmx;        /* 8 or 9 pix, 24 or 27 bytes */
1827                      if (width_mmx)
1828                      {
1829                         /* png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
1830                         /* sptr points at last pixel in pre-expanded row */
1831                         /* dp points at last pixel position in expanded row */
1832                         int dummy_value_c;  /* fix 'forbidden register spilled' */
1833                         int dummy_value_S;
1834                         int dummy_value_D;
1835
1836                         __asm__ __volatile__ (
1837                            "subl $3, %%esi          \n\t"
1838                            "subl $9, %%edi          \n\t"
1839                                         /* (png_pass_inc[pass] + 1)*pixel_bytes */
1840
1841                         ".loop3_pass4:              \n\t"
1842                            "movq (%%esi), %%mm0     \n\t" /* x x 5 4 3 2 1 0 */
1843                            "movq %%mm0, %%mm1       \n\t" /* x x 5 4 3 2 1 0 */
1844                            "movq %%mm0, %%mm2       \n\t" /* x x 5 4 3 2 1 0 */
1845                            "psllq $24, %%mm0        \n\t" /* 4 3 2 1 0 z z z */
1846                            "pand _const4, %%mm1     \n\t" /* z z z z z 2 1 0 */
1847                            "psrlq $24, %%mm2        \n\t" /* z z z x x 5 4 3 */
1848                            "por %%mm1, %%mm0        \n\t" /* 4 3 2 1 0 2 1 0 */
1849                            "movq %%mm2, %%mm3       \n\t" /* z z z x x 5 4 3 */
1850                            "psllq $8, %%mm2         \n\t" /* z z x x 5 4 3 z */
1851                            "movq %%mm0, (%%edi)     \n\t"
1852                            "psrlq $16, %%mm3        \n\t" /* z z z z z x x 5 */
1853                            "pand _const6, %%mm3     \n\t" /* z z z z z z z 5 */
1854                            "por %%mm3, %%mm2        \n\t" /* z z x x 5 4 3 5 */
1855                            "subl $6, %%esi          \n\t"
1856                            "movd %%mm2, 8(%%edi)    \n\t"
1857                            "subl $12, %%edi         \n\t"
1858                            "subl $2, %%ecx          \n\t"
1859                            "jnz .loop3_pass4        \n\t"
1860                            "EMMS                    \n\t" /* DONE */
1861
1862                            : "=c" (dummy_value_c),        /* output regs (dummy) */
1863                              "=S" (dummy_value_S),
1864                              "=D" (dummy_value_D)
1865
1866                            : "1" (sptr),      /* esi      // input regs */
1867                              "2" (dp),        /* edi */
1868                              "0" (width_mmx)  /* ecx */
1869
1870 #if 0  /* %mm0, ..., %mm3 not supported by gcc 2.7.2.3 or egcs 1.1 */
1871                            : "%mm0", "%mm1"               /* clobber list */
1872                            , "%mm2", "%mm3"
1873 #endif
1874                         );
1875                      }
1876
1877                      sptr -= width_mmx*3;
1878                      dp -= width_mmx*6;
1879                      for (i = width; i; i--)
1880                      {
1881                         png_byte v[8];
1882                         int j;
1883
1884                         png_memcpy(v, sptr, 3);
1885                         for (j = 0; j < png_pass_inc[pass]; j++)
1886                         {
1887                            png_memcpy(dp, v, 3);
1888                            dp -= 3;
1889                         }
1890                         sptr -= 3;
1891                      }
1892                   }
1893                } /* end of pixel_bytes == 3 */
1894
1895                //--------------------------------------------------------------
1896                else if (pixel_bytes == 1)
1897                {
1898                   if (((pass == 0) || (pass == 1)) && width)
1899                   {
1900                      int width_mmx = ((width >> 2) << 2);
1901                      width -= width_mmx;        /* 0-3 pixels => 0-3 bytes */
1902                      if (width_mmx)
1903                      {
1904                         int dummy_value_c;  /* fix 'forbidden register spilled' */
1905                         int dummy_value_S;
1906                         int dummy_value_D;
1907
1908                         __asm__ __volatile__ (
1909                            "subl $3, %%esi          \n\t"
1910                            "subl $31, %%edi         \n\t"
1911
1912                         ".loop1_pass0:              \n\t"
1913                            "movd (%%esi), %%mm0     \n\t" /* x x x x 3 2 1 0 */
1914                            "movq %%mm0, %%mm1       \n\t" /* x x x x 3 2 1 0 */
1915                            "punpcklbw %%mm0, %%mm0  \n\t" /* 3 3 2 2 1 1 0 0 */
1916                            "movq %%mm0, %%mm2       \n\t" /* 3 3 2 2 1 1 0 0 */
1917                            "punpcklwd %%mm0, %%mm0  \n\t" /* 1 1 1 1 0 0 0 0 */
1918                            "movq %%mm0, %%mm3       \n\t" /* 1 1 1 1 0 0 0 0 */
1919                            "punpckldq %%mm0, %%mm0  \n\t" /* 0 0 0 0 0 0 0 0 */
1920                            "punpckhdq %%mm3, %%mm3  \n\t" /* 1 1 1 1 1 1 1 1 */
1921                            "movq %%mm0, (%%edi)     \n\t"
1922                            "punpckhwd %%mm2, %%mm2  \n\t" /* 3 3 3 3 2 2 2 2 */
1923                            "movq %%mm3, 8(%%edi)    \n\t"
1924                            "movq %%mm2, %%mm4       \n\t" /* 3 3 3 3 2 2 2 2 */
1925                            "punpckldq %%mm2, %%mm2  \n\t" /* 2 2 2 2 2 2 2 2 */
1926                            "punpckhdq %%mm4, %%mm4  \n\t" /* 3 3 3 3 3 3 3 3 */
1927                            "movq %%mm2, 16(%%edi)   \n\t"
1928                            "subl $4, %%esi          \n\t"
1929                            "movq %%mm4, 24(%%edi)   \n\t"
1930                            "subl $32, %%edi         \n\t"
1931                            "subl $4, %%ecx          \n\t"
1932                            "jnz .loop1_pass0        \n\t"
1933                            "EMMS                    \n\t" /* DONE */
1934
1935                            : "=c" (dummy_value_c),        /* output regs (dummy) */
1936                              "=S" (dummy_value_S),
1937                              "=D" (dummy_value_D)
1938
1939                            : "1" (sptr),      /* esi      // input regs */
1940                              "2" (dp),        /* edi */
1941                              "0" (width_mmx)  /* ecx */
1942
1943 #if 0  /* %mm0, ..., %mm4 not supported by gcc 2.7.2.3 or egcs 1.1 */
1944                            : "%mm0", "%mm1", "%mm2"       /* clobber list */
1945                            , "%mm3", "%mm4"
1946 #endif
1947                         );
1948                      }
1949
1950                      sptr -= width_mmx;
1951                      dp -= width_mmx*8;
1952                      for (i = width; i; i--)
1953                      {
1954                         int j;
1955
1956                        /* I simplified this part in version 1.0.4e
1957                         * here and in several other instances where
1958                         * pixel_bytes == 1  -- GR-P
1959                         *
1960                         * Original code:
1961                         *
1962                         * png_byte v[8];
1963                         * png_memcpy(v, sptr, pixel_bytes);
1964                         * for (j = 0; j < png_pass_inc[pass]; j++)
1965                         * {
1966                         *    png_memcpy(dp, v, pixel_bytes);
1967                         *    dp -= pixel_bytes;
1968                         * }
1969                         * sptr -= pixel_bytes;
1970                         *
1971                         * Replacement code is in the next three lines:
1972                         */
1973
1974                         for (j = 0; j < png_pass_inc[pass]; j++)
1975                         {
1976                            *dp-- = *sptr;
1977                         }
1978                         --sptr;
1979                      }
1980                   }
1981                   else if (((pass == 2) || (pass == 3)) && width)
1982                   {
1983                      int width_mmx = ((width >> 2) << 2);
1984                      width -= width_mmx;        /* 0-3 pixels => 0-3 bytes */
1985                      if (width_mmx)
1986                      {
1987                         int dummy_value_c;  /* fix 'forbidden register spilled' */
1988                         int dummy_value_S;
1989                         int dummy_value_D;
1990
1991                         __asm__ __volatile__ (
1992                            "subl $3, %%esi          \n\t"
1993                            "subl $15, %%edi         \n\t"
1994
1995                         ".loop1_pass2:              \n\t"
1996                            "movd (%%esi), %%mm0     \n\t" /* x x x x 3 2 1 0 */
1997                            "punpcklbw %%mm0, %%mm0  \n\t" /* 3 3 2 2 1 1 0 0 */
1998                            "movq %%mm0, %%mm1       \n\t" /* 3 3 2 2 1 1 0 0 */
1999                            "punpcklwd %%mm0, %%mm0  \n\t" /* 1 1 1 1 0 0 0 0 */
2000                            "punpckhwd %%mm1, %%mm1  \n\t" /* 3 3 3 3 2 2 2 2 */
2001                            "movq %%mm0, (%%edi)     \n\t"
2002                            "subl $4, %%esi          \n\t"
2003                            "movq %%mm1, 8(%%edi)    \n\t"
2004                            "subl $16, %%edi         \n\t"
2005                            "subl $4, %%ecx          \n\t"
2006                            "jnz .loop1_pass2        \n\t"
2007                            "EMMS                    \n\t" /* DONE */
2008
2009                            : "=c" (dummy_value_c),        /* output regs (dummy) */
2010                              "=S" (dummy_value_S),
2011                              "=D" (dummy_value_D)
2012
2013                            : "1" (sptr),      /* esi      // input regs */
2014                              "2" (dp),        /* edi */
2015                              "0" (width_mmx)  /* ecx */
2016
2017 #if 0  /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2018                            : "%mm0", "%mm1"               /* clobber list */
2019 #endif
2020                         );
2021                      }
2022
2023                      sptr -= width_mmx;
2024                      dp -= width_mmx*4;
2025                      for (i = width; i; i--)
2026                      {
2027                         int j;
2028
2029                         for (j = 0; j < png_pass_inc[pass]; j++)
2030                         {
2031                            *dp-- = *sptr;
2032                         }
2033                         --sptr;
2034                      }
2035                   }
2036                   else if (width)  /* && ((pass == 4) || (pass == 5)) */
2037                   {
2038                      int width_mmx = ((width >> 3) << 3);
2039                      width -= width_mmx;        /* 0-7 pixels => 0-7 bytes */
2040                      if (width_mmx)
2041                      {
2042                         int dummy_value_c;  /* fix 'forbidden register spilled' */
2043                         int dummy_value_S;
2044                         int dummy_value_D;
2045
2046                         __asm__ __volatile__ (
2047                            "subl $7, %%esi          \n\t"
2048                            "subl $15, %%edi         \n\t"
2049
2050                         ".loop1_pass4:              \n\t"
2051                            "movq (%%esi), %%mm0     \n\t" /* 7 6 5 4 3 2 1 0 */
2052                            "movq %%mm0, %%mm1       \n\t" /* 7 6 5 4 3 2 1 0 */
2053                            "punpcklbw %%mm0, %%mm0  \n\t" /* 3 3 2 2 1 1 0 0 */
2054                            "punpckhbw %%mm1, %%mm1  \n\t" /* 7 7 6 6 5 5 4 4 */
2055                            "movq %%mm1, 8(%%edi)    \n\t"
2056                            "subl $8, %%esi          \n\t"
2057                            "movq %%mm0, (%%edi)     \n\t"
2058                            "subl $16, %%edi         \n\t"
2059                            "subl $8, %%ecx          \n\t"
2060                            "jnz .loop1_pass4        \n\t"
2061                            "EMMS                    \n\t" /* DONE */
2062
2063                            : "=c" (dummy_value_c),        /* output regs (none) */
2064                              "=S" (dummy_value_S),
2065                              "=D" (dummy_value_D)
2066
2067                            : "1" (sptr),      /* esi      // input regs */
2068                              "2" (dp),        /* edi */
2069                              "0" (width_mmx)  /* ecx */
2070
2071 #if 0  /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2072                            : "%mm0", "%mm1"               /* clobber list */
2073 #endif
2074                         );
2075                      }
2076
2077                      sptr -= width_mmx;
2078                      dp -= width_mmx*2;
2079                      for (i = width; i; i--)
2080                      {
2081                         int j;
2082
2083                         for (j = 0; j < png_pass_inc[pass]; j++)
2084                         {
2085                            *dp-- = *sptr;
2086                         }
2087                         --sptr;
2088                      }
2089                   }
2090                } /* end of pixel_bytes == 1 */
2091
2092                //--------------------------------------------------------------
2093                else if (pixel_bytes == 2)
2094                {
2095                   if (((pass == 0) || (pass == 1)) && width)
2096                   {
2097                      int width_mmx = ((width >> 1) << 1);
2098                      width -= width_mmx;        /* 0,1 pixels => 0,2 bytes */
2099                      if (width_mmx)
2100                      {
2101                         int dummy_value_c;  /* fix 'forbidden register spilled' */
2102                         int dummy_value_S;
2103                         int dummy_value_D;
2104
2105                         __asm__ __volatile__ (
2106                            "subl $2, %%esi          \n\t"
2107                            "subl $30, %%edi         \n\t"
2108
2109                         ".loop2_pass0:              \n\t"
2110                            "movd (%%esi), %%mm0     \n\t" /* x x x x 3 2 1 0 */
2111                            "punpcklwd %%mm0, %%mm0  \n\t" /* 3 2 3 2 1 0 1 0 */
2112                            "movq %%mm0, %%mm1       \n\t" /* 3 2 3 2 1 0 1 0 */
2113                            "punpckldq %%mm0, %%mm0  \n\t" /* 1 0 1 0 1 0 1 0 */
2114                            "punpckhdq %%mm1, %%mm1  \n\t" /* 3 2 3 2 3 2 3 2 */
2115                            "movq %%mm0, (%%edi)     \n\t"
2116                            "movq %%mm0, 8(%%edi)    \n\t"
2117                            "movq %%mm1, 16(%%edi)   \n\t"
2118                            "subl $4, %%esi          \n\t"
2119                            "movq %%mm1, 24(%%edi)   \n\t"
2120                            "subl $32, %%edi         \n\t"
2121                            "subl $2, %%ecx          \n\t"
2122                            "jnz .loop2_pass0        \n\t"
2123                            "EMMS                    \n\t" /* DONE */
2124
2125                            : "=c" (dummy_value_c),        /* output regs (dummy) */
2126                              "=S" (dummy_value_S),
2127                              "=D" (dummy_value_D)
2128
2129                            : "1" (sptr),      /* esi      // input regs */
2130                              "2" (dp),        /* edi */
2131                              "0" (width_mmx)  /* ecx */
2132
2133 #if 0  /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2134                            : "%mm0", "%mm1"               /* clobber list */
2135 #endif
2136                         );
2137                      }
2138
2139                      sptr -= (width_mmx*2 - 2); /* sign fixed */
2140                      dp -= (width_mmx*16 - 2);  /* sign fixed */
2141                      for (i = width; i; i--)
2142                      {
2143                         png_byte v[8];
2144                         int j;
2145                         sptr -= 2;
2146                         png_memcpy(v, sptr, 2);
2147                         for (j = 0; j < png_pass_inc[pass]; j++)
2148                         {
2149                            dp -= 2;
2150                            png_memcpy(dp, v, 2);
2151                         }
2152                      }
2153                   }
2154                   else if (((pass == 2) || (pass == 3)) && width)
2155                   {
2156                      int width_mmx = ((width >> 1) << 1) ;
2157                      width -= width_mmx;        /* 0,1 pixels => 0,2 bytes */
2158                      if (width_mmx)
2159                      {
2160                         int dummy_value_c;  /* fix 'forbidden register spilled' */
2161                         int dummy_value_S;
2162                         int dummy_value_D;
2163
2164                         __asm__ __volatile__ (
2165                            "subl $2, %%esi          \n\t"
2166                            "subl $14, %%edi         \n\t"
2167
2168                         ".loop2_pass2:              \n\t"
2169                            "movd (%%esi), %%mm0     \n\t" /* x x x x 3 2 1 0 */
2170                            "punpcklwd %%mm0, %%mm0  \n\t" /* 3 2 3 2 1 0 1 0 */
2171                            "movq %%mm0, %%mm1       \n\t" /* 3 2 3 2 1 0 1 0 */
2172                            "punpckldq %%mm0, %%mm0  \n\t" /* 1 0 1 0 1 0 1 0 */
2173                            "punpckhdq %%mm1, %%mm1  \n\t" /* 3 2 3 2 3 2 3 2 */
2174                            "movq %%mm0, (%%edi)     \n\t"
2175                            "subl $4, %%esi          \n\t"
2176                            "movq %%mm1, 8(%%edi)    \n\t"
2177                            "subl $16, %%edi         \n\t"
2178                            "subl $2, %%ecx          \n\t"
2179                            "jnz .loop2_pass2        \n\t"
2180                            "EMMS                    \n\t" /* DONE */
2181
2182                            : "=c" (dummy_value_c),        /* output regs (dummy) */
2183                              "=S" (dummy_value_S),
2184                              "=D" (dummy_value_D)
2185
2186                            : "1" (sptr),      /* esi      // input regs */
2187                              "2" (dp),        /* edi */
2188                              "0" (width_mmx)  /* ecx */
2189
2190 #if 0  /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2191                            : "%mm0", "%mm1"               /* clobber list */
2192 #endif
2193                         );
2194                      }
2195
2196                      sptr -= (width_mmx*2 - 2); /* sign fixed */
2197                      dp -= (width_mmx*8 - 2);   /* sign fixed */
2198                      for (i = width; i; i--)
2199                      {
2200                         png_byte v[8];
2201                         int j;
2202                         sptr -= 2;
2203                         png_memcpy(v, sptr, 2);
2204                         for (j = 0; j < png_pass_inc[pass]; j++)
2205                         {
2206                            dp -= 2;
2207                            png_memcpy(dp, v, 2);
2208                         }
2209                      }
2210                   }
2211                   else if (width)  /* pass == 4 or 5 */
2212                   {
2213                      int width_mmx = ((width >> 1) << 1) ;
2214                      width -= width_mmx;        /* 0,1 pixels => 0,2 bytes */
2215                      if (width_mmx)
2216                      {
2217                         int dummy_value_c;  /* fix 'forbidden register spilled' */
2218                         int dummy_value_S;
2219                         int dummy_value_D;
2220
2221                         __asm__ __volatile__ (
2222                            "subl $2, %%esi          \n\t"
2223                            "subl $6, %%edi          \n\t"
2224
2225                         ".loop2_pass4:              \n\t"
2226                            "movd (%%esi), %%mm0     \n\t" /* x x x x 3 2 1 0 */
2227                            "punpcklwd %%mm0, %%mm0  \n\t" /* 3 2 3 2 1 0 1 0 */
2228                            "subl $4, %%esi          \n\t"
2229                            "movq %%mm0, (%%edi)     \n\t"
2230                            "subl $8, %%edi          \n\t"
2231                            "subl $2, %%ecx          \n\t"
2232                            "jnz .loop2_pass4        \n\t"
2233                            "EMMS                    \n\t" /* DONE */
2234
2235                            : "=c" (dummy_value_c),        /* output regs (dummy) */
2236                              "=S" (dummy_value_S),
2237                              "=D" (dummy_value_D)
2238
2239                            : "1" (sptr),      /* esi      // input regs */
2240                              "2" (dp),        /* edi */
2241                              "0" (width_mmx)  /* ecx */
2242
2243 #if 0  /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
2244                            : "%mm0"                       /* clobber list */
2245 #endif
2246                         );
2247                      }
2248
2249                      sptr -= (width_mmx*2 - 2); /* sign fixed */
2250                      dp -= (width_mmx*4 - 2);   /* sign fixed */
2251                      for (i = width; i; i--)
2252                      {
2253                         png_byte v[8];
2254                         int j;
2255                         sptr -= 2;
2256                         png_memcpy(v, sptr, 2);
2257                         for (j = 0; j < png_pass_inc[pass]; j++)
2258                         {
2259                            dp -= 2;
2260                            png_memcpy(dp, v, 2);
2261                         }
2262                      }
2263                   }
2264                } /* end of pixel_bytes == 2 */
2265
2266                //--------------------------------------------------------------
2267                else if (pixel_bytes == 4)
2268                {
2269                   if (((pass == 0) || (pass == 1)) && width)
2270                   {
2271                      int width_mmx = ((width >> 1) << 1);
2272                      width -= width_mmx;        /* 0,1 pixels => 0,4 bytes */
2273                      if (width_mmx)
2274                      {
2275                         int dummy_value_c;  /* fix 'forbidden register spilled' */
2276                         int dummy_value_S;
2277                         int dummy_value_D;
2278
2279                         __asm__ __volatile__ (
2280                            "subl $4, %%esi          \n\t"
2281                            "subl $60, %%edi         \n\t"
2282
2283                         ".loop4_pass0:              \n\t"
2284                            "movq (%%esi), %%mm0     \n\t" /* 7 6 5 4 3 2 1 0 */
2285                            "movq %%mm0, %%mm1       \n\t" /* 7 6 5 4 3 2 1 0 */
2286                            "punpckldq %%mm0, %%mm0  \n\t" /* 3 2 1 0 3 2 1 0 */
2287                            "punpckhdq %%mm1, %%mm1  \n\t" /* 7 6 5 4 7 6 5 4 */
2288                            "movq %%mm0, (%%edi)     \n\t"
2289                            "movq %%mm0, 8(%%edi)    \n\t"
2290                            "movq %%mm0, 16(%%edi)   \n\t"
2291                            "movq %%mm0, 24(%%edi)   \n\t"
2292                            "movq %%mm1, 32(%%edi)   \n\t"
2293                            "movq %%mm1, 40(%%edi)   \n\t"
2294                            "movq %%mm1, 48(%%edi)   \n\t"
2295                            "subl $8, %%esi          \n\t"
2296                            "movq %%mm1, 56(%%edi)   \n\t"
2297                            "subl $64, %%edi         \n\t"
2298                            "subl $2, %%ecx          \n\t"
2299                            "jnz .loop4_pass0        \n\t"
2300                            "EMMS                    \n\t" /* DONE */
2301
2302                            : "=c" (dummy_value_c),        /* output regs (dummy) */
2303                              "=S" (dummy_value_S),
2304                              "=D" (dummy_value_D)
2305
2306                            : "1" (sptr),      /* esi      // input regs */
2307                              "2" (dp),        /* edi */
2308                              "0" (width_mmx)  /* ecx */
2309
2310 #if 0  /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2311                            : "%mm0", "%mm1"               /* clobber list */
2312 #endif
2313                         );
2314                      }
2315
2316                      sptr -= (width_mmx*4 - 4); /* sign fixed */
2317                      dp -= (width_mmx*32 - 4);  /* sign fixed */
2318                      for (i = width; i; i--)
2319                      {
2320                         png_byte v[8];
2321                         int j;
2322                         sptr -= 4;
2323                         png_memcpy(v, sptr, 4);
2324                         for (j = 0; j < png_pass_inc[pass]; j++)
2325                         {
2326                            dp -= 4;
2327                            png_memcpy(dp, v, 4);
2328                         }
2329                      }
2330                   }
2331                   else if (((pass == 2) || (pass == 3)) && width)
2332                   {
2333                      int width_mmx = ((width >> 1) << 1);
2334                      width -= width_mmx;        /* 0,1 pixels => 0,4 bytes */
2335                      if (width_mmx)
2336                      {
2337                         int dummy_value_c;  /* fix 'forbidden register spilled' */
2338                         int dummy_value_S;
2339                         int dummy_value_D;
2340
2341                         __asm__ __volatile__ (
2342                            "subl $4, %%esi          \n\t"
2343                            "subl $28, %%edi         \n\t"
2344
2345                         ".loop4_pass2:              \n\t"
2346                            "movq (%%esi), %%mm0     \n\t" /* 7 6 5 4 3 2 1 0 */
2347                            "movq %%mm0, %%mm1       \n\t" /* 7 6 5 4 3 2 1 0 */
2348                            "punpckldq %%mm0, %%mm0  \n\t" /* 3 2 1 0 3 2 1 0 */
2349                            "punpckhdq %%mm1, %%mm1  \n\t" /* 7 6 5 4 7 6 5 4 */
2350                            "movq %%mm0, (%%edi)     \n\t"
2351                            "movq %%mm0, 8(%%edi)    \n\t"
2352                            "movq %%mm1, 16(%%edi)   \n\t"
2353                            "movq %%mm1, 24(%%edi)   \n\t"
2354                            "subl $8, %%esi          \n\t"
2355                            "subl $32, %%edi         \n\t"
2356                            "subl $2, %%ecx          \n\t"
2357                            "jnz .loop4_pass2        \n\t"
2358                            "EMMS                    \n\t" /* DONE */
2359
2360                            : "=c" (dummy_value_c),        /* output regs (dummy) */
2361                              "=S" (dummy_value_S),
2362                              "=D" (dummy_value_D)
2363
2364                            : "1" (sptr),      /* esi      // input regs */
2365                              "2" (dp),        /* edi */
2366                              "0" (width_mmx)  /* ecx */
2367
2368 #if 0  /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2369                            : "%mm0", "%mm1"               /* clobber list */
2370 #endif
2371                         );
2372                      }
2373
2374                      sptr -= (width_mmx*4 - 4); /* sign fixed */
2375                      dp -= (width_mmx*16 - 4);  /* sign fixed */
2376                      for (i = width; i; i--)
2377                      {
2378                         png_byte v[8];
2379                         int j;
2380                         sptr -= 4;
2381                         png_memcpy(v, sptr, 4);
2382                         for (j = 0; j < png_pass_inc[pass]; j++)
2383                         {
2384                            dp -= 4;
2385                            png_memcpy(dp, v, 4);
2386                         }
2387                      }
2388                   }
2389                   else if (width)  /* pass == 4 or 5 */
2390                   {
2391                      int width_mmx = ((width >> 1) << 1) ;
2392                      width -= width_mmx;        /* 0,1 pixels => 0,4 bytes */
2393                      if (width_mmx)
2394                      {
2395                         int dummy_value_c;  /* fix 'forbidden register spilled' */
2396                         int dummy_value_S;
2397                         int dummy_value_D;
2398
2399                         __asm__ __volatile__ (
2400                            "subl $4, %%esi          \n\t"
2401                            "subl $12, %%edi         \n\t"
2402
2403                         ".loop4_pass4:              \n\t"
2404                            "movq (%%esi), %%mm0     \n\t" /* 7 6 5 4 3 2 1 0 */
2405                            "movq %%mm0, %%mm1       \n\t" /* 7 6 5 4 3 2 1 0 */
2406                            "punpckldq %%mm0, %%mm0  \n\t" /* 3 2 1 0 3 2 1 0 */
2407                            "punpckhdq %%mm1, %%mm1  \n\t" /* 7 6 5 4 7 6 5 4 */
2408                            "movq %%mm0, (%%edi)     \n\t"
2409                            "subl $8, %%esi          \n\t"
2410                            "movq %%mm1, 8(%%edi)    \n\t"
2411                            "subl $16, %%edi         \n\t"
2412                            "subl $2, %%ecx          \n\t"
2413                            "jnz .loop4_pass4        \n\t"
2414                            "EMMS                    \n\t" /* DONE */
2415
2416                            : "=c" (dummy_value_c),        /* output regs (dummy) */
2417                              "=S" (dummy_value_S),
2418                              "=D" (dummy_value_D)
2419
2420                            : "1" (sptr),      /* esi      // input regs */
2421                              "2" (dp),        /* edi */
2422                              "0" (width_mmx)  /* ecx */
2423
2424 #if 0  /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2425                            : "%mm0", "%mm1"               /* clobber list */
2426 #endif
2427                         );
2428                      }
2429
2430                      sptr -= (width_mmx*4 - 4); /* sign fixed */
2431                      dp -= (width_mmx*8 - 4);   /* sign fixed */
2432                      for (i = width; i; i--)
2433                      {
2434                         png_byte v[8];
2435                         int j;
2436                         sptr -= 4;
2437                         png_memcpy(v, sptr, 4);
2438                         for (j = 0; j < png_pass_inc[pass]; j++)
2439                         {
2440                            dp -= 4;
2441                            png_memcpy(dp, v, 4);
2442                         }
2443                      }
2444                   }
2445                } /* end of pixel_bytes == 4 */
2446
2447                //--------------------------------------------------------------
2448                else if (pixel_bytes == 8)
2449                {
2450 /* GRR TEST:  should work, but needs testing (special 64-bit version of rpng2?) */
2451                   /* GRR NOTE:  no need to combine passes here! */
2452                   if (((pass == 0) || (pass == 1)) && width)
2453                   {
2454                      int dummy_value_c;  /* fix 'forbidden register spilled' */
2455                      int dummy_value_S;
2456                      int dummy_value_D;
2457
2458                      /* source is 8-byte RRGGBBAA */
2459                      /* dest is 64-byte RRGGBBAA RRGGBBAA RRGGBBAA RRGGBBAA ... */
2460                      __asm__ __volatile__ (
2461                         "subl $56, %%edi         \n\t" /* start of last block */
2462
2463                      ".loop8_pass0:              \n\t"
2464                         "movq (%%esi), %%mm0     \n\t" /* 7 6 5 4 3 2 1 0 */
2465                         "movq %%mm0, (%%edi)     \n\t"
2466                         "movq %%mm0, 8(%%edi)    \n\t"
2467                         "movq %%mm0, 16(%%edi)   \n\t"
2468                         "movq %%mm0, 24(%%edi)   \n\t"
2469                         "movq %%mm0, 32(%%edi)   \n\t"
2470                         "movq %%mm0, 40(%%edi)   \n\t"
2471                         "movq %%mm0, 48(%%edi)   \n\t"
2472                         "subl $8, %%esi          \n\t"
2473                         "movq %%mm0, 56(%%edi)   \n\t"
2474                         "subl $64, %%edi         \n\t"
2475                         "decl %%ecx              \n\t"
2476                         "jnz .loop8_pass0        \n\t"
2477                         "EMMS                    \n\t" /* DONE */
2478
2479                         : "=c" (dummy_value_c),        /* output regs (dummy) */
2480                           "=S" (dummy_value_S),
2481                           "=D" (dummy_value_D)
2482
2483                         : "1" (sptr),      /* esi      // input regs */
2484                           "2" (dp),        /* edi */
2485                           "0" (width)      /* ecx */
2486
2487 #if 0  /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
2488                         : "%mm0"                       /* clobber list */
2489 #endif
2490                      );
2491                   }
2492                   else if (((pass == 2) || (pass == 3)) && width)
2493                   {
2494                      /* source is 8-byte RRGGBBAA */
2495                      /* dest is 32-byte RRGGBBAA RRGGBBAA RRGGBBAA RRGGBBAA */
2496                      /* (recall that expansion is _in place_:  sptr and dp */
2497                      /*  both point at locations within same row buffer) */
2498                      {
2499                         int dummy_value_c;  /* fix 'forbidden register spilled' */
2500                         int dummy_value_S;
2501                         int dummy_value_D;
2502
2503                         __asm__ __volatile__ (
2504                            "subl $24, %%edi         \n\t" /* start of last block */
2505
2506                         ".loop8_pass2:              \n\t"
2507                            "movq (%%esi), %%mm0     \n\t" /* 7 6 5 4 3 2 1 0 */
2508                            "movq %%mm0, (%%edi)     \n\t"
2509                            "movq %%mm0, 8(%%edi)    \n\t"
2510                            "movq %%mm0, 16(%%edi)   \n\t"
2511                            "subl $8, %%esi          \n\t"
2512                            "movq %%mm0, 24(%%edi)   \n\t"
2513                            "subl $32, %%edi         \n\t"
2514                            "decl %%ecx              \n\t"
2515                            "jnz .loop8_pass2        \n\t"
2516                            "EMMS                    \n\t" /* DONE */
2517
2518                            : "=c" (dummy_value_c),        /* output regs (dummy) */
2519                              "=S" (dummy_value_S),
2520                              "=D" (dummy_value_D)
2521
2522                            : "1" (sptr),      /* esi      // input regs */
2523                              "2" (dp),        /* edi */
2524                              "0" (width)      /* ecx */
2525
2526 #if 0  /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
2527                            : "%mm0"                       /* clobber list */
2528 #endif
2529                         );
2530                      }
2531                   }
2532                   else if (width)  /* pass == 4 or 5 */
2533                   {
2534                      /* source is 8-byte RRGGBBAA */
2535                      /* dest is 16-byte RRGGBBAA RRGGBBAA */
2536                      {
2537                         int dummy_value_c;  /* fix 'forbidden register spilled' */
2538                         int dummy_value_S;
2539                         int dummy_value_D;
2540
2541                         __asm__ __volatile__ (
2542                            "subl $8, %%edi          \n\t" /* start of last block */
2543
2544                         ".loop8_pass4:              \n\t"
2545                            "movq (%%esi), %%mm0     \n\t" /* 7 6 5 4 3 2 1 0 */
2546                            "movq %%mm0, (%%edi)     \n\t"
2547                            "subl $8, %%esi          \n\t"
2548                            "movq %%mm0, 8(%%edi)    \n\t"
2549                            "subl $16, %%edi         \n\t"
2550                            "decl %%ecx              \n\t"
2551                            "jnz .loop8_pass4        \n\t"
2552                            "EMMS                    \n\t" /* DONE */
2553
2554                            : "=c" (dummy_value_c),        /* output regs (dummy) */
2555                              "=S" (dummy_value_S),
2556                              "=D" (dummy_value_D)
2557
2558                            : "1" (sptr),      /* esi      // input regs */
2559                              "2" (dp),        /* edi */
2560                              "0" (width)      /* ecx */
2561
2562 #if 0  /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
2563                            : "%mm0"                       /* clobber list */
2564 #endif
2565                         );
2566                      }
2567                   }
2568
2569                } /* end of pixel_bytes == 8 */
2570
2571                //--------------------------------------------------------------
2572                else if (pixel_bytes == 6)
2573                {
2574                   for (i = width; i; i--)
2575                   {
2576                      png_byte v[8];
2577                      int j;
2578                      png_memcpy(v, sptr, 6);
2579                      for (j = 0; j < png_pass_inc[pass]; j++)
2580                      {
2581                         png_memcpy(dp, v, 6);
2582                         dp -= 6;
2583                      }
2584                      sptr -= 6;
2585                   }
2586                } /* end of pixel_bytes == 6 */
2587
2588                //--------------------------------------------------------------
2589                else
2590                {
2591                   for (i = width; i; i--)
2592                   {
2593                      png_byte v[8];
2594                      int j;
2595                      png_memcpy(v, sptr, pixel_bytes);
2596                      for (j = 0; j < png_pass_inc[pass]; j++)
2597                      {
2598                         png_memcpy(dp, v, pixel_bytes);
2599                         dp -= pixel_bytes;
2600                      }
2601                      sptr-= pixel_bytes;
2602                   }
2603                }
2604             } /* end of _mmx_supported ======================================== */
2605
2606             else /* MMX not supported:  use modified C code - takes advantage
2607                   *   of inlining of png_memcpy for a constant */
2608                  /* GRR 19991007:  does it?  or should pixel_bytes in each
2609                   *   block be replaced with immediate value (e.g., 1)? */
2610                  /* GRR 19991017:  replaced with constants in each case */
2611 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
2612             {
2613                if (pixel_bytes == 1)
2614                {
2615                   for (i = width; i; i--)
2616                   {
2617                      int j;
2618                      for (j = 0; j < png_pass_inc[pass]; j++)
2619                      {
2620                         *dp-- = *sptr;
2621                      }
2622                      --sptr;
2623                   }
2624                }
2625                else if (pixel_bytes == 3)
2626                {
2627                   for (i = width; i; i--)
2628                   {
2629                      png_byte v[8];
2630                      int j;
2631                      png_memcpy(v, sptr, 3);
2632                      for (j = 0; j < png_pass_inc[pass]; j++)
2633                      {
2634                         png_memcpy(dp, v, 3);
2635                         dp -= 3;
2636                      }
2637                      sptr -= 3;
2638                   }
2639                }
2640                else if (pixel_bytes == 2)
2641                {
2642                   for (i = width; i; i--)
2643                   {
2644                      png_byte v[8];
2645                      int j;
2646                      png_memcpy(v, sptr, 2);
2647                      for (j = 0; j < png_pass_inc[pass]; j++)
2648                      {
2649                         png_memcpy(dp, v, 2);
2650                         dp -= 2;
2651                      }
2652                      sptr -= 2;
2653                   }
2654                }
2655                else if (pixel_bytes == 4)
2656                {
2657                   for (i = width; i; i--)
2658                   {
2659                      png_byte v[8];
2660                      int j;
2661                      png_memcpy(v, sptr, 4);
2662                      for (j = 0; j < png_pass_inc[pass]; j++)
2663                      {
2664 #ifdef PNG_DEBUG
2665                         if (dp < row || dp+3 > row+png_ptr->row_buf_size)
2666                         {
2667                            printf("dp out of bounds: row=%d, dp=%d, rp=%d\n",
2668                              row, dp, row+png_ptr->row_buf_size);
2669                            printf("row_buf=%d\n",png_ptr->row_buf_size);
2670                         }
2671 #endif
2672                         png_memcpy(dp, v, 4);
2673                         dp -= 4;
2674                      }
2675                      sptr -= 4;
2676                   }
2677                }
2678                else if (pixel_bytes == 6)
2679                {
2680                   for (i = width; i; i--)
2681                   {
2682                      png_byte v[8];
2683                      int j;
2684                      png_memcpy(v, sptr, 6);
2685                      for (j = 0; j < png_pass_inc[pass]; j++)
2686                      {
2687                         png_memcpy(dp, v, 6);
2688                         dp -= 6;
2689                      }
2690                      sptr -= 6;
2691                   }
2692                }
2693                else if (pixel_bytes == 8)
2694                {
2695                   for (i = width; i; i--)
2696                   {
2697                      png_byte v[8];
2698                      int j;
2699                      png_memcpy(v, sptr, 8);
2700                      for (j = 0; j < png_pass_inc[pass]; j++)
2701                      {
2702                         png_memcpy(dp, v, 8);
2703                         dp -= 8;
2704                      }
2705                      sptr -= 8;
2706                   }
2707                }
2708                else     /* GRR:  should never be reached */
2709                {
2710                   for (i = width; i; i--)
2711                   {
2712                      png_byte v[8];
2713                      int j;
2714                      png_memcpy(v, sptr, pixel_bytes);
2715                      for (j = 0; j < png_pass_inc[pass]; j++)
2716                      {
2717                         png_memcpy(dp, v, pixel_bytes);
2718                         dp -= pixel_bytes;
2719                      }
2720                      sptr -= pixel_bytes;
2721                   }
2722                }
2723
2724             } /* end if (MMX not supported) */
2725             break;
2726          }
2727       } /* end switch (row_info->pixel_depth) */
2728
2729       row_info->width = final_width;
2730       row_info->rowbytes = ((final_width *
2731          (png_uint_32)row_info->pixel_depth + 7) >> 3);
2732    }
2733
2734 } /* end png_do_read_interlace() */
2735
2736 #endif /* PNG_HAVE_ASSEMBLER_READ_INTERLACE */
2737 #endif /* PNG_READ_INTERLACING_SUPPORTED */
2738
2739
2740
2741 #if defined(PNG_HAVE_ASSEMBLER_READ_FILTER_ROW)
2742 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
2743
2744 /* These variables are used by the MMX filter routines below.  They are declared */
2745 /* globally here to ensure alignment on 8-byte boundaries.  The inline assembler */
2746 /* refers to them by symbol name (e.g. "movq _LBCarryMask, %%mm5"). */
2747 union uAll {
2748    long long use;    /* 64-bit integer view; first member, so the brace initializers below set it */
2749    double  align;    /* never accessed in the visible code; presumably present only for alignment */
2750 } _LBCarryMask = {0x0101010101010101LL},  /* bit 0 of every byte: pand with it isolates each byte's lsb */
2751   _HBClearMask = {0x7f7f7f7f7f7f7f7fLL},  /* pand with it clears bit 7 of every byte after "psrlq $1" */
2752   _ActiveMask, _ActiveMask2, _ActiveMaskEnd, _ShiftBpp, _ShiftRem;  /* _ActiveMask, _ShiftBpp (bpp*8) and _ShiftRem (64 - bpp*8) are assigned per bpp case before use */
2753
2754 #ifdef PNG_THREAD_UNSAFE_OK
2755 /*===========================================================================*/
2756 /*                                                                           */
2757 /*           P N G _ R E A D _ F I L T E R _ R O W _ M M X _ A V G           */
2758 /*                                                                           */
2759 /*===========================================================================*/
2760
2761 /* Optimized code for PNG Average filter decoder */
2762
2763 static void /* PRIVATE */
2764 png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row,
2765                             png_bytep prev_row)
2766 {
2767    int bpp;
2768    int dummy_value_c;   /* fix 'forbidden register 2 (cx) was spilled' error */
2769    int dummy_value_S;
2770    int dummy_value_D;
2771
2772    bpp = (row_info->pixel_depth + 7) >> 3;  /* get # bytes per pixel */
2773    _FullLength  = row_info->rowbytes;       /* # of bytes to filter */
2774
2775    __asm__ __volatile__ (
2776       /* initialize address pointers and offset */
2777 #ifdef __PIC__
2778       "pushl %%ebx                 \n\t" /* save index to Global Offset Table */
2779 #endif
2780 /*pre "movl row, %%edi             \n\t" */ /* edi:  Avg(x) */
2781       "xorl %%ebx, %%ebx           \n\t" /* ebx:  x */
2782       "movl %%edi, %%edx           \n\t"
2783 /*pre "movl prev_row, %%esi        \n\t" */ /* esi:  Prior(x) */
2784 /*pre "subl bpp, %%edx             \n\t" */ /* (bpp is preloaded into ecx) */
2785       "subl %%ecx, %%edx           \n\t" /* edx:  Raw(x-bpp) */
2786
2787       "xorl %%eax,%%eax            \n\t"
2788
2789       /* Compute the Raw value for the first bpp bytes */
2790       /*    Raw(x) = Avg(x) + (Prior(x)/2) */
2791    "avg_rlp:                       \n\t"
2792       "movb (%%esi,%%ebx,),%%al    \n\t" /* load al with Prior(x) */
2793       "incl %%ebx                  \n\t"
2794       "shrb %%al                   \n\t" /* divide by 2 */
2795       "addb -1(%%edi,%%ebx,),%%al  \n\t" /* add Avg(x); -1 to offset inc ebx */
2796 /* pre "cmpl bpp, %%ebx             \n\t" */ /* (bpp is preloaded into ecx) */
2797       "cmpl %%ecx, %%ebx           \n\t"
2798       "movb %%al,-1(%%edi,%%ebx,)  \n\t" /* write Raw(x); -1 to offset inc ebx */
2799       "jb avg_rlp                  \n\t" /* mov does not affect flags */
2800
2801       /* get # of bytes to alignment */
2802       "movl %%edi, _dif            \n\t" /* take start of row */
2803       "addl %%ebx, _dif            \n\t" /* add bpp */
2804       "addl $0xf, _dif             \n\t" /* add 7+8 to incr past alignment bdry */
2805       "andl $0xfffffff8, _dif      \n\t" /* mask to alignment boundary */
2806       "subl %%edi, _dif            \n\t" /* subtract from start => value ebx at */
2807       "jz avg_go                   \n\t" /*  alignment */
2808
2809       /* fix alignment */
2810       /* Compute the Raw value for the bytes up to the alignment boundary */
2811       /*    Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2) */
2812       "xorl %%ecx, %%ecx           \n\t"
2813
2814    "avg_lp1:                       \n\t"
2815       "xorl %%eax, %%eax           \n\t"
2816       "movb (%%esi,%%ebx,), %%cl   \n\t" /* load cl with Prior(x) */
2817       "movb (%%edx,%%ebx,), %%al   \n\t" /* load al with Raw(x-bpp) */
2818       "addw %%cx, %%ax             \n\t"
2819       "incl %%ebx                  \n\t"
2820       "shrw %%ax                   \n\t" /* divide by 2 */
2821       "addb -1(%%edi,%%ebx,), %%al \n\t" /* add Avg(x); -1 to offset inc ebx */
2822       "cmpl _dif, %%ebx            \n\t" /* check if at alignment boundary */
2823       "movb %%al, -1(%%edi,%%ebx,) \n\t" /* write Raw(x); -1 to offset inc ebx */
2824       "jb avg_lp1                  \n\t" /* repeat until at alignment boundary */
2825
2826    "avg_go:                        \n\t"
2827       "movl _FullLength, %%eax     \n\t"
2828       "movl %%eax, %%ecx           \n\t"
2829       "subl %%ebx, %%eax           \n\t" /* subtract alignment fix */
2830       "andl $0x00000007, %%eax     \n\t" /* calc bytes over mult of 8 */
2831       "subl %%eax, %%ecx           \n\t" /* drop over bytes from original length */
2832       "movl %%ecx, _MMXLength      \n\t"
2833 #ifdef __PIC__
2834       "popl %%ebx                  \n\t" /* restore index to Global Offset Table */
2835 #endif
2836
2837       : "=c" (dummy_value_c),            /* output regs (dummy) */
2838         "=S" (dummy_value_S),
2839         "=D" (dummy_value_D)
2840
2841       : "0" (bpp),       /* ecx          // input regs */
2842         "1" (prev_row),  /* esi */
2843         "2" (row)        /* edi */
2844
2845       : "%eax", "%edx"                   /* clobber list */
2846 #ifndef __PIC__
2847       , "%ebx"
2848 #endif
2849       /* GRR: INCLUDE "memory" as clobbered? (_dif, _MMXLength) */
2850       /* (seems to work fine without...) */
2851    );
2852
2853    /* now do the math for the rest of the row */
2854    switch (bpp)
2855    {
2856       case 3:
2857       {
2858          _ActiveMask.use  = 0x0000000000ffffffLL;
2859          _ShiftBpp.use = 24;    /* == 3 * 8 */
2860          _ShiftRem.use = 40;    /* == 64 - 24 */
2861
2862          __asm__ __volatile__ (
2863             /* re-init address pointers and offset */
2864             "movq _ActiveMask, %%mm7      \n\t"
2865             "movl _dif, %%ecx             \n\t" /* ecx:  x = offset to */
2866             "movq _LBCarryMask, %%mm5     \n\t" /*  alignment boundary */
2867 /* preload  "movl row, %%edi              \n\t" // edi:  Avg(x) */
2868             "movq _HBClearMask, %%mm4     \n\t"
2869 /* preload  "movl prev_row, %%esi         \n\t" // esi:  Prior(x) */
2870
2871             /* prime the pump:  load the first Raw(x-bpp) data set */
2872             "movq -8(%%edi,%%ecx,), %%mm2 \n\t" /* load previous aligned 8 bytes */
2873                                                 /* (correct pos. in loop below) */
2874          "avg_3lp:                        \n\t"
2875             "movq (%%edi,%%ecx,), %%mm0   \n\t" /* load mm0 with Avg(x) */
2876             "movq %%mm5, %%mm3            \n\t"
2877             "psrlq _ShiftRem, %%mm2       \n\t" /* correct position Raw(x-bpp) */
2878                                                 /* data */
2879             "movq (%%esi,%%ecx,), %%mm1   \n\t" /* load mm1 with Prior(x) */
2880             "movq %%mm7, %%mm6            \n\t"
2881             "pand %%mm1, %%mm3            \n\t" /* get lsb for each prev_row byte */
2882             "psrlq $1, %%mm1              \n\t" /* divide prev_row bytes by 2 */
2883             "pand  %%mm4, %%mm1           \n\t" /* clear invalid bit 7 of each */
2884                                                 /* byte */
2885             "paddb %%mm1, %%mm0           \n\t" /* add (Prev_row/2) to Avg for */
2886                                                 /* each byte */
2887             /* add 1st active group (Raw(x-bpp)/2) to average with LBCarry */
2888             "movq %%mm3, %%mm1            \n\t" /* now use mm1 for getting */
2889                                                 /* LBCarrys */
2890             "pand %%mm2, %%mm1            \n\t" /* get LBCarrys for each byte */
2891                                                 /* where both */
2892                                /* lsb's were == 1 (only valid for active group) */
2893             "psrlq $1, %%mm2              \n\t" /* divide raw bytes by 2 */
2894             "pand  %%mm4, %%mm2           \n\t" /* clear invalid bit 7 of each */
2895                                                 /* byte */
2896             "paddb %%mm1, %%mm2           \n\t" /* add LBCarrys to (Raw(x-bpp)/2) */
2897                                                 /* for each byte */
2898             "pand %%mm6, %%mm2            \n\t" /* leave only Active Group 1 */
2899                                                 /* bytes to add to Avg */
2900             "paddb %%mm2, %%mm0           \n\t" /* add (Raw/2) + LBCarrys to */
2901                                                 /* Avg for each Active */
2902                                /*  byte */
2903             /* add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry */
2904             "psllq _ShiftBpp, %%mm6       \n\t" /* shift the mm6 mask to cover */
2905                                                 /* bytes 3-5 */
2906             "movq %%mm0, %%mm2            \n\t" /* mov updated Raws to mm2 */
2907             "psllq _ShiftBpp, %%mm2       \n\t" /* shift data to pos. correctly */
2908             "movq %%mm3, %%mm1            \n\t" /* now use mm1 for getting */
2909                                                 /* LBCarrys */
2910             "pand %%mm2, %%mm1            \n\t" /* get LBCarrys for each byte */
2911                                                 /* where both */
2912                                /* lsb's were == 1 (only valid for active group) */
2913             "psrlq $1, %%mm2              \n\t" /* divide raw bytes by 2 */
2914             "pand  %%mm4, %%mm2           \n\t" /* clear invalid bit 7 of each */
2915                                                 /* byte */
2916             "paddb %%mm1, %%mm2           \n\t" /* add LBCarrys to (Raw(x-bpp)/2) */
2917                                                 /* for each byte */
2918             "pand %%mm6, %%mm2            \n\t" /* leave only Active Group 2 */
2919                                                 /* bytes to add to Avg */
2920             "paddb %%mm2, %%mm0           \n\t" /* add (Raw/2) + LBCarrys to */
2921                                                 /* Avg for each Active */
2922                                /*  byte */
2923
2924             /* add 3rd active group (Raw(x-bpp)/2) to average with _LBCarry */
2925             "psllq _ShiftBpp, %%mm6       \n\t" /* shift mm6 mask to cover last */
2926                                                 /* two */
2927                                  /* bytes */
2928             "movq %%mm0, %%mm2            \n\t" /* mov updated Raws to mm2 */
2929             "psllq _ShiftBpp, %%mm2       \n\t" /* shift data to pos. correctly */
2930                               /* Data only needs to be shifted once here to */
2931                               /* get the correct x-bpp offset. */
2932             "movq %%mm3, %%mm1            \n\t" /* now use mm1 for getting */
2933                                                 /* LBCarrys */
2934             "pand %%mm2, %%mm1            \n\t" /* get LBCarrys for each byte */
2935                                                 /* where both */
2936                               /* lsb's were == 1 (only valid for active group) */
2937             "psrlq $1, %%mm2              \n\t" /* divide raw bytes by 2 */
2938             "pand  %%mm4, %%mm2           \n\t" /* clear invalid bit 7 of each */
2939                                                 /* byte */
2940             "paddb %%mm1, %%mm2           \n\t" /* add LBCarrys to (Raw(x-bpp)/2) */
2941                                                 /* for each byte */
2942             "pand %%mm6, %%mm2            \n\t" /* leave only Active Group 2 */
2943                                                 /* bytes to add to Avg */
2944             "addl $8, %%ecx               \n\t"
2945             "paddb %%mm2, %%mm0           \n\t" /* add (Raw/2) + LBCarrys to */
2946                                                 /* Avg for each Active */
2947                                                 /* byte */
2948             /* now ready to write back to memory */
2949             "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
2950             /* move updated Raw(x) to use as Raw(x-bpp) for next loop */
2951             "cmpl _MMXLength, %%ecx       \n\t"
2952             "movq %%mm0, %%mm2            \n\t" /* mov updated Raw(x) to mm2 */
2953             "jb avg_3lp                   \n\t"
2954
2955             : "=S" (dummy_value_S),             /* output regs (dummy) */
2956               "=D" (dummy_value_D)
2957
2958             : "0" (prev_row),  /* esi           // input regs */
2959               "1" (row)        /* edi */
2960
2961             : "%ecx"                            /* clobber list */
2962 #if 0  /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
2963             , "%mm0", "%mm1", "%mm2", "%mm3"
2964             , "%mm4", "%mm5", "%mm6", "%mm7"
2965 #endif
2966          );
2967       }
2968       break;  /* end 3 bpp */
2969
2970       case 6:
2971       case 4:
2972       //case 7:   /* who wrote this?  PNG doesn't support 5 or 7 bytes/pixel */
2973       //case 5:   /* GRR BOGUS */
2974       {
2975          _ActiveMask.use  = 0xffffffffffffffffLL; /* use shift below to clear */
2976                                                   /* appropriate inactive bytes */
2977          _ShiftBpp.use = bpp << 3;
2978          _ShiftRem.use = 64 - _ShiftBpp.use;
2979
2980          __asm__ __volatile__ (
2981             "movq _HBClearMask, %%mm4    \n\t"
2982
2983             /* re-init address pointers and offset */
2984             "movl _dif, %%ecx            \n\t" /* ecx:  x = offset to */
2985                                                /* alignment boundary */
2986
2987             /* load _ActiveMask and clear all bytes except for 1st active group */
2988             "movq _ActiveMask, %%mm7     \n\t"
2989 /* preload  "movl row, %%edi             \n\t" // edi:  Avg(x) */
2990             "psrlq _ShiftRem, %%mm7      \n\t"
2991 /* preload  "movl prev_row, %%esi        \n\t" // esi:  Prior(x) */
2992             "movq %%mm7, %%mm6           \n\t"
2993             "movq _LBCarryMask, %%mm5    \n\t"
2994             "psllq _ShiftBpp, %%mm6      \n\t" /* create mask for 2nd active */
2995                                                /* group */
2996
2997             /* prime the pump:  load the first Raw(x-bpp) data set */
2998             "movq -8(%%edi,%%ecx,), %%mm2 \n\t" /* load previous aligned 8 bytes */
2999                                           /* (we correct pos. in loop below) */
3000          "avg_4lp:                       \n\t"
3001             "movq (%%edi,%%ecx,), %%mm0  \n\t"
3002             "psrlq _ShiftRem, %%mm2      \n\t" /* shift data to pos. correctly */
3003             "movq (%%esi,%%ecx,), %%mm1  \n\t"
3004             /* add (Prev_row/2) to average */
3005             "movq %%mm5, %%mm3           \n\t"
3006             "pand %%mm1, %%mm3           \n\t" /* get lsb for each prev_row byte */
3007             "psrlq $1, %%mm1             \n\t" /* divide prev_row bytes by 2 */
3008             "pand  %%mm4, %%mm1          \n\t" /* clear invalid bit 7 of each */
3009                                                /* byte */
3010             "paddb %%mm1, %%mm0          \n\t" /* add (Prev_row/2) to Avg for */
3011                                                /* each byte */
3012             /* add 1st active group (Raw(x-bpp)/2) to average with _LBCarry */
3013             "movq %%mm3, %%mm1           \n\t" /* now use mm1 for getting */
3014                                                /* LBCarrys */
3015             "pand %%mm2, %%mm1           \n\t" /* get LBCarrys for each byte */
3016                                                /* where both */
3017                               /* lsb's were == 1 (only valid for active group) */
3018             "psrlq $1, %%mm2             \n\t" /* divide raw bytes by 2 */
3019             "pand  %%mm4, %%mm2          \n\t" /* clear invalid bit 7 of each */
3020                                                /* byte */
3021             "paddb %%mm1, %%mm2          \n\t" /* add LBCarrys to (Raw(x-bpp)/2) */
3022                                                /* for each byte */
3023             "pand %%mm7, %%mm2           \n\t" /* leave only Active Group 1 */
3024                                                /* bytes to add to Avg */
3025             "paddb %%mm2, %%mm0          \n\t" /* add (Raw/2) + LBCarrys to Avg */
3026                                                /* for each Active */
3027                               /* byte */
3028             /* add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry */
3029             "movq %%mm0, %%mm2           \n\t" /* mov updated Raws to mm2 */
3030             "psllq _ShiftBpp, %%mm2      \n\t" /* shift data to pos. correctly */
3031             "addl $8, %%ecx              \n\t"
3032             "movq %%mm3, %%mm1           \n\t" /* now use mm1 for getting */
3033                                                /* LBCarrys */
3034             "pand %%mm2, %%mm1           \n\t" /* get LBCarrys for each byte */
3035                                                /* where both */
3036                               /* lsb's were == 1 (only valid for active group) */
3037             "psrlq $1, %%mm2             \n\t" /* divide raw bytes by 2 */
3038             "pand  %%mm4, %%mm2          \n\t" /* clear invalid bit 7 of each */
3039                                                /* byte */
3040             "paddb %%mm1, %%mm2          \n\t" /* add LBCarrys to (Raw(x-bpp)/2) */
3041                                                /* for each byte */
3042             "pand %%mm6, %%mm2           \n\t" /* leave only Active Group 2 */
3043                                                /* bytes to add to Avg */
3044             "paddb %%mm2, %%mm0          \n\t" /* add (Raw/2) + LBCarrys to */
3045                                                /* Avg for each Active */
3046                               /* byte */
3047             "cmpl _MMXLength, %%ecx      \n\t"
3048             /* now ready to write back to memory */
3049             "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
3050             /* prep Raw(x-bpp) for next loop */
3051             "movq %%mm0, %%mm2           \n\t" /* mov updated Raws to mm2 */
3052             "jb avg_4lp                  \n\t"
3053
3054             : "=S" (dummy_value_S),            /* output regs (dummy) */
3055               "=D" (dummy_value_D)
3056
3057             : "0" (prev_row),  /* esi          // input regs */
3058               "1" (row)        /* edi */
3059
3060             : "%ecx"                           /* clobber list */
3061 #if 0  /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
3062             , "%mm0", "%mm1", "%mm2", "%mm3"
3063             , "%mm4", "%mm5", "%mm6", "%mm7"
3064 #endif
3065          );
3066       }
3067       break;  /* end 4,6 bpp */
3068
3069       case 2:
3070       {
3071          _ActiveMask.use  = 0x000000000000ffffLL;
3072          _ShiftBpp.use = 16;   /* == 2 * 8 */
3073          _ShiftRem.use = 48;   /* == 64 - 16 */
3074
3075          __asm__ __volatile__ (
3076             /* load _ActiveMask */
3077             "movq _ActiveMask, %%mm7     \n\t"
3078             /* re-init address pointers and offset */
3079             "movl _dif, %%ecx            \n\t" /* ecx:  x = offset to alignment */
3080                                                /* boundary */
3081             "movq _LBCarryMask, %%mm5    \n\t"
3082 /* preload  "movl row, %%edi             \n\t" // edi:  Avg(x) */
3083             "movq _HBClearMask, %%mm4    \n\t"
3084 /* preload  "movl prev_row, %%esi        \n\t" // esi:  Prior(x) */
3085
3086             /* prime the pump:  load the first Raw(x-bpp) data set */
3087             "movq -8(%%edi,%%ecx,), %%mm2 \n\t" /* load previous aligned 8 bytes */
3088                               /* (we correct pos. in loop below) */
3089          "avg_2lp:                       \n\t"
3090             "movq (%%edi,%%ecx,), %%mm0  \n\t"
3091             "psrlq _ShiftRem, %%mm2      \n\t" /* shift data to pos. correctly */
3092             "movq (%%esi,%%ecx,), %%mm1  \n\t" /*  (GRR BUGFIX:  was psllq) */
3093             /* add (Prev_row/2) to average */
3094             "movq %%mm5, %%mm3           \n\t"
3095             "pand %%mm1, %%mm3           \n\t" /* get lsb for each prev_row byte */
3096             "psrlq $1, %%mm1             \n\t" /* divide prev_row bytes by 2 */
3097             "pand  %%mm4, %%mm1          \n\t" /* clear invalid bit 7 of each */
3098                                                /* byte */
3099             "movq %%mm7, %%mm6           \n\t"
3100             "paddb %%mm1, %%mm0          \n\t" /* add (Prev_row/2) to Avg for */
3101                                                /* each byte */
3102
3103             /* add 1st active group (Raw(x-bpp)/2) to average with _LBCarry */
3104             "movq %%mm3, %%mm1           \n\t" /* now use mm1 for getting */
3105                                                /* LBCarrys */
3106             "pand %%mm2, %%mm1           \n\t" /* get LBCarrys for each byte */
3107                                                /* where both */
3108                                                /* lsb's were == 1 (only valid */
3109                                                /* for active group) */
3110             "psrlq $1, %%mm2             \n\t" /* divide raw bytes by 2 */
3111             "pand  %%mm4, %%mm2          \n\t" /* clear invalid bit 7 of each */
3112                                                /* byte */
3113             "paddb %%mm1, %%mm2          \n\t" /* add LBCarrys to (Raw(x-bpp)/2) */
3114                                                /* for each byte */
3115             "pand %%mm6, %%mm2           \n\t" /* leave only Active Group 1 */
3116                                                /* bytes to add to Avg */
3117             "paddb %%mm2, %%mm0          \n\t" /* add (Raw/2) + LBCarrys to Avg */
3118                                                /* for each Active byte */
3119
3120             /* add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry */
3121             "psllq _ShiftBpp, %%mm6      \n\t" /* shift the mm6 mask to cover */
3122                                                /* bytes 2 & 3 */
3123             "movq %%mm0, %%mm2           \n\t" /* mov updated Raws to mm2 */
3124             "psllq _ShiftBpp, %%mm2      \n\t" /* shift data to pos. correctly */
3125             "movq %%mm3, %%mm1           \n\t" /* now use mm1 for getting */
3126                                                /* LBCarrys */
3127             "pand %%mm2, %%mm1           \n\t" /* get LBCarrys for each byte */
3128                                                /* where both */
3129                                                /* lsb's were == 1 (only valid */
3130                                                /* for active group) */
3131             "psrlq $1, %%mm2             \n\t" /* divide raw bytes by 2 */
3132             "pand  %%mm4, %%mm2          \n\t" /* clear invalid bit 7 of each */
3133                                                /* byte */
3134             "paddb %%mm1, %%mm2          \n\t" /* add LBCarrys to (Raw(x-bpp)/2) */
3135                                                /* for each byte */
3136             "pand %%mm6, %%mm2           \n\t" /* leave only Active Group 2 */
3137                                                /* bytes to add to Avg */
3138             "paddb %%mm2, %%mm0          \n\t" /* add (Raw/2) + LBCarrys to */
3139                                                /* Avg for each Active byte */
3140
3141             /* add 3rd active group (Raw(x-bpp)/2) to average with _LBCarry */
3142             "psllq _ShiftBpp, %%mm6      \n\t" /* shift the mm6 mask to cover */
3143                                                /* bytes 4 & 5 */
3144             "movq %%mm0, %%mm2           \n\t" /* mov updated Raws to mm2 */
3145             "psllq _ShiftBpp, %%mm2      \n\t" /* shift data to pos. correctly */
3146             "movq %%mm3, %%mm1           \n\t" /* now use mm1 for getting */
3147                                                /* LBCarrys */
3148             "pand %%mm2, %%mm1           \n\t" /* get LBCarrys for each byte */
3149                                                /* where both lsb's were == 1 */
3150                                                /* (only valid for active group) */
3151             "psrlq $1, %%mm2             \n\t" /* divide raw bytes by 2 */
3152             "pand  %%mm4, %%mm2          \n\t" /* clear invalid bit 7 of each */
3153                                                /* byte */
3154             "paddb %%mm1, %%mm2          \n\t" /* add LBCarrys to (Raw(x-bpp)/2) */
3155                                                /* for each byte */
3156             "pand %%mm6, %%mm2           \n\t" /* leave only Active Group 2 */
3157                                                /* bytes to add to Avg */
3158             "paddb %%mm2, %%mm0          \n\t" /* add (Raw/2) + LBCarrys to */
3159                                                /* Avg for each Active byte */
3160
3161             /* add 4th active group (Raw(x-bpp)/2) to average with _LBCarry */
3162             "psllq _ShiftBpp, %%mm6      \n\t" /* shift the mm6 mask to cover */
3163                                                /* bytes 6 & 7 */
3164             "movq %%mm0, %%mm2           \n\t" /* mov updated Raws to mm2 */
3165             "psllq _ShiftBpp, %%mm2      \n\t" /* shift data to pos. correctly */
3166             "addl $8, %%ecx              \n\t"
3167             "movq %%mm3, %%mm1           \n\t" /* now use mm1 for getting */
3168                                                /* LBCarrys */
3169             "pand %%mm2, %%mm1           \n\t" /* get LBCarrys for each byte */
3170                                                /* where both */
3171                                                /* lsb's were == 1 (only valid */
3172                                                /* for active group) */
3173             "psrlq $1, %%mm2             \n\t" /* divide raw bytes by 2 */
3174             "pand  %%mm4, %%mm2          \n\t" /* clear invalid bit 7 of each */
3175                                                /* byte */
3176             "paddb %%mm1, %%mm2          \n\t" /* add LBCarrys to (Raw(x-bpp)/2) */
3177                                                /* for each byte */
3178             "pand %%mm6, %%mm2           \n\t" /* leave only Active Group 2 */
3179                                                /* bytes to add to Avg */
3180             "paddb %%mm2, %%mm0          \n\t" /* add (Raw/2) + LBCarrys to */
3181                                                /* Avg for each Active byte */
3182
3183             "cmpl _MMXLength, %%ecx      \n\t"
3184             /* now ready to write back to memory */
3185             "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
3186             /* prep Raw(x-bpp) for next loop */
3187             "movq %%mm0, %%mm2           \n\t" /* mov updated Raws to mm2 */
3188             "jb avg_2lp                  \n\t"
3189
3190             : "=S" (dummy_value_S),            /* output regs (dummy) */
3191               "=D" (dummy_value_D)
3192
3193             : "0" (prev_row),  /* esi          // input regs */
3194               "1" (row)        /* edi */
3195
3196             : "%ecx"                           /* clobber list */
3197 #if 0  /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
3198             , "%mm0", "%mm1", "%mm2", "%mm3"
3199             , "%mm4", "%mm5", "%mm6", "%mm7"
3200 #endif
3201          );
3202       }
3203       break;  /* end 2 bpp */
3204
3205       case 1:
3206       {
3207          __asm__ __volatile__ (
3208             /* re-init address pointers and offset */
3209 #ifdef __PIC__
3210             "pushl %%ebx                 \n\t" /* save Global Offset Table index */
3211 #endif
3212             "movl _dif, %%ebx            \n\t" /* ebx:  x = offset to alignment */
3213                                                /* boundary */
3214 /* preload  "movl row, %%edi             \n\t" // edi:  Avg(x) */
3215             "cmpl _FullLength, %%ebx     \n\t" /* test if offset at end of array */
3216             "jnb avg_1end                \n\t"
3217             /* do Avg decode for remaining bytes */
3218 /* preload  "movl prev_row, %%esi        \n\t" // esi:  Prior(x) */
3219             "movl %%edi, %%edx           \n\t"
3220 /* preload  "subl bpp, %%edx             \n\t" // (bpp is preloaded into ecx) */
3221             "subl %%ecx, %%edx           \n\t" /* edx:  Raw(x-bpp) */
3222             "xorl %%ecx, %%ecx           \n\t" /* zero ecx before using cl & cx */
3223                                                /*  in loop below */
3224          "avg_1lp:                       \n\t"
3225             /* Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2) */
3226             "xorl %%eax, %%eax           \n\t"
3227             "movb (%%esi,%%ebx,), %%cl   \n\t" /* load cl with Prior(x) */
3228             "movb (%%edx,%%ebx,), %%al   \n\t" /* load al with Raw(x-bpp) */
3229             "addw %%cx, %%ax             \n\t"
3230             "incl %%ebx                  \n\t"
3231             "shrw %%ax                   \n\t" /* divide by 2 */
3232             "addb -1(%%edi,%%ebx,), %%al \n\t" /* add Avg(x); -1 to offset */
3233                                                /* inc ebx */
3234             "cmpl _FullLength, %%ebx     \n\t" /* check if at end of array */
3235             "movb %%al, -1(%%edi,%%ebx,) \n\t" /* write back Raw(x); */
3236                          /* mov does not affect flags; -1 to offset inc ebx */
3237             "jb avg_1lp                  \n\t"
3238
3239          "avg_1end:                      \n\t"
3240 #ifdef __PIC__
3241             "popl %%ebx                  \n\t" /* Global Offset Table index */
3242 #endif
3243
3244             : "=c" (dummy_value_c),            /* output regs (dummy) */
3245               "=S" (dummy_value_S),
3246               "=D" (dummy_value_D)
3247
3248             : "0" (bpp),       /* ecx          // input regs */
3249               "1" (prev_row),  /* esi */
3250               "2" (row)        /* edi */
3251
3252             : "%eax", "%edx"                   /* clobber list */
3253 #ifndef __PIC__
3254             , "%ebx"
3255 #endif
3256          );
3257       }
3258       return;  /* end 1 bpp */
3259
3260       case 8:
3261       {
3262          __asm__ __volatile__ (
3263             /* re-init address pointers and offset */
3264             "movl _dif, %%ecx            \n\t" /* ecx:  x == offset to alignment */
3265             "movq _LBCarryMask, %%mm5    \n\t" /*            boundary */
3266 /* preload  "movl row, %%edi             \n\t" // edi:  Avg(x) */
3267             "movq _HBClearMask, %%mm4    \n\t"
3268 /* preload  "movl prev_row, %%esi        \n\t" // esi:  Prior(x) */
3269
3270             /* prime the pump:  load the first Raw(x-bpp) data set */
3271             "movq -8(%%edi,%%ecx,), %%mm2 \n\t" /* load previous aligned 8 bytes */
3272                                       /* (NO NEED to correct pos. in loop below) */
3273
3274          "avg_8lp:                       \n\t"
3275             "movq (%%edi,%%ecx,), %%mm0  \n\t"
3276             "movq %%mm5, %%mm3           \n\t"
3277             "movq (%%esi,%%ecx,), %%mm1  \n\t"
3278             "addl $8, %%ecx              \n\t"
3279             "pand %%mm1, %%mm3           \n\t" /* get lsb for each prev_row byte */
3280             "psrlq $1, %%mm1             \n\t" /* divide prev_row bytes by 2 */
3281             "pand %%mm2, %%mm3           \n\t" /* get LBCarrys for each byte */
3282                                                /*  where both lsb's were == 1 */
3283             "psrlq $1, %%mm2             \n\t" /* divide raw bytes by 2 */
3284             "pand  %%mm4, %%mm1          \n\t" /* clear invalid bit 7, each byte */
3285             "paddb %%mm3, %%mm0          \n\t" /* add LBCarrys to Avg, each byte */
3286             "pand  %%mm4, %%mm2          \n\t" /* clear invalid bit 7, each byte */
3287             "paddb %%mm1, %%mm0          \n\t" /* add (Prev_row/2) to Avg, each */
3288             "paddb %%mm2, %%mm0          \n\t" /* add (Raw/2) to Avg for each */
3289             "cmpl _MMXLength, %%ecx      \n\t"
3290             "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
3291             "movq %%mm0, %%mm2           \n\t" /* reuse as Raw(x-bpp) */
3292             "jb avg_8lp                  \n\t"
3293
3294             : "=S" (dummy_value_S),            /* output regs (dummy) */
3295               "=D" (dummy_value_D)
3296
3297             : "0" (prev_row),  /* esi          // input regs */
3298               "1" (row)        /* edi */
3299
3300             : "%ecx"                           /* clobber list */
3301 #if 0  /* %mm0, ..., %mm5 not supported by gcc 2.7.2.3 or egcs 1.1 */
3302             , "%mm0", "%mm1", "%mm2"
3303             , "%mm3", "%mm4", "%mm5"
3304 #endif
3305          );
3306       }
3307       break;  /* end 8 bpp */
3308
3309       default:                  /* bpp greater than 8 (!= 1,2,3,4,[5],6,[7],8) */
3310       {
3311
3312 #ifdef PNG_DEBUG
3313          /* GRR:  PRINT ERROR HERE:  SHOULD NEVER BE REACHED */
3314         png_debug(1,
3315         "Internal logic error in pnggccrd (png_read_filter_row_mmx_avg())\n");
3316 #endif
3317
3318 #if 0
3319         __asm__ __volatile__ (
3320             "movq _LBCarryMask, %%mm5    \n\t"
3321             /* re-init address pointers and offset */
3322             "movl _dif, %%ebx            \n\t" /* ebx:  x = offset to */
3323                                                /* alignment boundary */
3324             "movl row, %%edi             \n\t" /* edi:  Avg(x) */
3325             "movq _HBClearMask, %%mm4    \n\t"
3326             "movl %%edi, %%edx           \n\t"
3327             "movl prev_row, %%esi        \n\t" /* esi:  Prior(x) */
3328             "subl bpp, %%edx             \n\t" /* edx:  Raw(x-bpp) */
3329          "avg_Alp:                       \n\t"
3330             "movq (%%edi,%%ebx,), %%mm0  \n\t"
3331             "movq %%mm5, %%mm3           \n\t"
3332             "movq (%%esi,%%ebx,), %%mm1  \n\t"
3333             "pand %%mm1, %%mm3           \n\t" /* get lsb for each prev_row byte */
3334             "movq (%%edx,%%ebx,), %%mm2  \n\t"
3335             "psrlq $1, %%mm1             \n\t" /* divide prev_row bytes by 2 */
3336             "pand %%mm2, %%mm3           \n\t" /* get LBCarrys for each byte */
3337                                                /* where both lsb's were == 1 */
3338             "psrlq $1, %%mm2             \n\t" /* divide raw bytes by 2 */
3339             "pand  %%mm4, %%mm1          \n\t" /* clear invalid bit 7 of each */
3340                                                /* byte */
3341             "paddb %%mm3, %%mm0          \n\t" /* add LBCarrys to Avg for each */
3342                                                /* byte */
3343             "pand  %%mm4, %%mm2          \n\t" /* clear invalid bit 7 of each */
3344                                                /* byte */
3345             "paddb %%mm1, %%mm0          \n\t" /* add (Prev_row/2) to Avg for */
3346                                                /* each byte */
3347             "addl $8, %%ebx              \n\t"
3348             "paddb %%mm2, %%mm0          \n\t" /* add (Raw/2) to Avg for each */
3349                                                /* byte */
3350             "cmpl _MMXLength, %%ebx      \n\t"
3351             "movq %%mm0, -8(%%edi,%%ebx,) \n\t"
3352             "jb avg_Alp                  \n\t"
3353
3354             : /* FIXASM: output regs/vars go here, e.g.:  "=m" (memory_var) */
3355
3356             : /* FIXASM: input regs, e.g.:  "c" (count), "S" (src), "D" (dest) */
3357
3358             : "%ebx", "%edx", "%edi", "%esi" /* CHECKASM: clobber list */
3359          );
3360 #endif /* 0 - NEVER REACHED */
3361       }
3362       break;
3363
3364    } /* end switch (bpp) */
3365
3366    __asm__ __volatile__ (
3367       /* MMX acceleration complete; now do clean-up */
3368       /* check if any remaining bytes left to decode */
3369 #ifdef __PIC__
3370       "pushl %%ebx                 \n\t" /* save index to Global Offset Table */
3371 #endif
3372       "movl _MMXLength, %%ebx      \n\t" /* ebx:  x == offset bytes after MMX */
3373 /* pre "movl row, %%edi             \n\t" */ /* edi:  Avg(x) */
3374       "cmpl _FullLength, %%ebx     \n\t" /* test if offset at end of array */
3375       "jnb avg_end                 \n\t"
3376
3377       /* do Avg decode for remaining bytes */
3378 /*pre "movl prev_row, %%esi        \n\t" */ /* esi:  Prior(x) */
3379       "movl %%edi, %%edx           \n\t"
3380 /*pre "subl bpp, %%edx             \n\t" */ /* (bpp is preloaded into ecx) */
3381       "subl %%ecx, %%edx           \n\t" /* edx:  Raw(x-bpp) */
3382       "xorl %%ecx, %%ecx           \n\t" /* zero ecx before using cl & cx below */
3383
3384    "avg_lp2:                       \n\t"
3385       /* Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2) */
3386       "xorl %%eax, %%eax           \n\t"
3387       "movb (%%esi,%%ebx,), %%cl   \n\t" /* load cl with Prior(x) */
3388       "movb (%%edx,%%ebx,), %%al   \n\t" /* load al with Raw(x-bpp) */
3389       "addw %%cx, %%ax             \n\t"
3390       "incl %%ebx                  \n\t"
3391       "shrw %%ax                   \n\t" /* divide by 2 */
3392       "addb -1(%%edi,%%ebx,), %%al \n\t" /* add Avg(x); -1 to offset inc ebx */
3393       "cmpl _FullLength, %%ebx     \n\t" /* check if at end of array */
3394       "movb %%al, -1(%%edi,%%ebx,) \n\t" /* write back Raw(x) [mov does not */
3395       "jb avg_lp2                  \n\t" /*  affect flags; -1 to offset inc ebx] */
3396
3397    "avg_end:                       \n\t"
3398       "EMMS                        \n\t" /* end MMX; prep for poss. FP instrs. */
3399 #ifdef __PIC__
3400       "popl %%ebx                  \n\t" /* restore index to Global Offset Table */
3401 #endif
3402
3403       : "=c" (dummy_value_c),            /* output regs (dummy) */
3404         "=S" (dummy_value_S),
3405         "=D" (dummy_value_D)
3406
3407       : "0" (bpp),       /* ecx          // input regs */
3408         "1" (prev_row),  /* esi */
3409         "2" (row)        /* edi */
3410
3411       : "%eax", "%edx"                   /* clobber list */
3412 #ifndef __PIC__
3413       , "%ebx"
3414 #endif
3415    );
3416
3417 } /* end png_read_filter_row_mmx_avg() */
3418 #endif
3419
3420
3421
3422 #ifdef PNG_THREAD_UNSAFE_OK
3423 /*===========================================================================*/
3424 /*                                                                           */
3425 /*         P N G _ R E A D _ F I L T E R _ R O W _ M M X _ P A E T H         */
3426 /*                                                                           */
3427 /*===========================================================================*/
3428
3429 /* Optimized code for PNG Paeth filter decoder */
3430
3431 static void /* PRIVATE */
3432 png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
3433                               png_bytep prev_row)
3434 {
3435    int bpp;
3436    int dummy_value_c;   /* fix 'forbidden register 2 (cx) was spilled' error */
3437    int dummy_value_S;
3438    int dummy_value_D;
3439
3440    bpp = (row_info->pixel_depth + 7) >> 3; /* Get # bytes per pixel */
3441    _FullLength  = row_info->rowbytes; /* # of bytes to filter */
3442
3443    __asm__ __volatile__ (
3444 #ifdef __PIC__
3445       "pushl %%ebx                 \n\t" /* save index to Global Offset Table */
3446 #endif
3447       "xorl %%ebx, %%ebx           \n\t" /* ebx:  x offset */
3448 /*pre "movl row, %%edi             \n\t" */
3449       "xorl %%edx, %%edx           \n\t" /* edx:  x-bpp offset */
3450 /*pre "movl prev_row, %%esi        \n\t" */
3451       "xorl %%eax, %%eax           \n\t"
3452
3453       /* Compute the Raw value for the first bpp bytes */
3454       /* Note: the formula works out to be always */
3455       /*   Raw(x) = Paeth(x) + Prior(x)      where x < bpp */
3456    "paeth_rlp:                     \n\t"
3457       "movb (%%edi,%%ebx,), %%al   \n\t"
3458       "addb (%%esi,%%ebx,), %%al   \n\t"
3459       "incl %%ebx                  \n\t"
3460 /*pre "cmpl bpp, %%ebx             \n\t" (bpp is preloaded into ecx) */
3461       "cmpl %%ecx, %%ebx           \n\t"
3462       "movb %%al, -1(%%edi,%%ebx,) \n\t"
3463       "jb paeth_rlp                \n\t"
3464       /* get # of bytes to alignment */
3465       "movl %%edi, _dif            \n\t" /* take start of row */
3466       "addl %%ebx, _dif            \n\t" /* add bpp */
3467       "xorl %%ecx, %%ecx           \n\t"
3468       "addl $0xf, _dif             \n\t" /* add 7 + 8 to incr past alignment */
3469                                          /* boundary */
3470       "andl $0xfffffff8, _dif      \n\t" /* mask to alignment boundary */
3471       "subl %%edi, _dif            \n\t" /* subtract from start ==> value ebx */
3472                                          /* at alignment */
3473       "jz paeth_go                 \n\t"
3474       /* fix alignment */
3475
3476    "paeth_lp1:                     \n\t"
3477       "xorl %%eax, %%eax           \n\t"
3478       /* pav = p - a = (a + b - c) - a = b - c */
3479       "movb (%%esi,%%ebx,), %%al   \n\t" /* load Prior(x) into al */
3480       "movb (%%esi,%%edx,), %%cl   \n\t" /* load Prior(x-bpp) into cl */
3481       "subl %%ecx, %%eax           \n\t" /* subtract Prior(x-bpp) */
3482       "movl %%eax, _patemp         \n\t" /* Save pav for later use */
3483       "xorl %%eax, %%eax           \n\t"
3484       /* pbv = p - b = (a + b - c) - b = a - c */
3485       "movb (%%edi,%%edx,), %%al   \n\t" /* load Raw(x-bpp) into al */
3486       "subl %%ecx, %%eax           \n\t" /* subtract Prior(x-bpp) */
3487       "movl %%eax, %%ecx           \n\t"
3488       /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
3489       "addl _patemp, %%eax         \n\t" /* pcv = pav + pbv */
3490       /* pc = abs(pcv) */
3491       "testl $0x80000000, %%eax    \n\t"
3492       "jz paeth_pca                \n\t"
3493       "negl %%eax                  \n\t" /* reverse sign of neg values */
3494
3495    "paeth_pca:                     \n\t"
3496       "movl %%eax, _pctemp         \n\t" /* save pc for later use */
3497       /* pb = abs(pbv) */
3498       "testl $0x80000000, %%ecx    \n\t"
3499       "jz paeth_pba                \n\t"
3500       "negl %%ecx                  \n\t" /* reverse sign of neg values */
3501
3502    "paeth_pba:                     \n\t"
3503       "movl %%ecx, _pbtemp         \n\t" /* save pb for later use */
3504       /* pa = abs(pav) */
3505       "movl _patemp, %%eax         \n\t"
3506       "testl $0x80000000, %%eax    \n\t"
3507       "jz paeth_paa                \n\t"
3508       "negl %%eax                  \n\t" /* reverse sign of neg values */
3509
3510    "paeth_paa:                     \n\t"
3511       "movl %%eax, _patemp         \n\t" /* save pa for later use */
3512       /* test if pa <= pb */
3513       "cmpl %%ecx, %%eax           \n\t"
3514       "jna paeth_abb               \n\t"
3515       /* pa > pb; now test if pb <= pc */
3516       "cmpl _pctemp, %%ecx         \n\t"
3517       "jna paeth_bbc               \n\t"
3518       /* pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp) */
3519       "movb (%%esi,%%edx,), %%cl   \n\t" /* load Prior(x-bpp) into cl */
3520       "jmp paeth_paeth             \n\t"
3521
3522    "paeth_bbc:                     \n\t"
3523       /* pb <= pc; Raw(x) = Paeth(x) + Prior(x) */
3524       "movb (%%esi,%%ebx,), %%cl   \n\t" /* load Prior(x) into cl */
3525       "jmp paeth_paeth             \n\t"
3526
3527    "paeth_abb:                     \n\t"
3528       /* pa <= pb; now test if pa <= pc */
3529       "cmpl _pctemp, %%eax         \n\t"
3530       "jna paeth_abc               \n\t"
3531       /* pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp) */
3532       "movb (%%esi,%%edx,), %%cl   \n\t" /* load Prior(x-bpp) into cl */
3533       "jmp paeth_paeth             \n\t"
3534
3535    "paeth_abc:                     \n\t"
3536       /* pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp) */
3537       "movb (%%edi,%%edx,), %%cl   \n\t" /* load Raw(x-bpp) into cl */
3538
3539    "paeth_paeth:                   \n\t"
3540       "incl %%ebx                  \n\t"
3541       "incl %%edx                  \n\t"
3542       /* Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256 */
3543       "addb %%cl, -1(%%edi,%%ebx,) \n\t"
3544       "cmpl _dif, %%ebx            \n\t"
3545       "jb paeth_lp1                \n\t"
3546
3547    "paeth_go:                      \n\t"
3548       "movl _FullLength, %%ecx     \n\t"
3549       "movl %%ecx, %%eax           \n\t"
3550       "subl %%ebx, %%eax           \n\t" /* subtract alignment fix */
3551       "andl $0x00000007, %%eax     \n\t" /* calc bytes over mult of 8 */
3552       "subl %%eax, %%ecx           \n\t" /* drop over bytes from original length */
3553       "movl %%ecx, _MMXLength      \n\t"
3554 #ifdef __PIC__
3555       "popl %%ebx                  \n\t" /* restore index to Global Offset Table */
3556 #endif
3557
3558       : "=c" (dummy_value_c),            /* output regs (dummy) */
3559         "=S" (dummy_value_S),
3560         "=D" (dummy_value_D)
3561
3562       : "0" (bpp),       /* ecx          // input regs */
3563         "1" (prev_row),  /* esi */
3564         "2" (row)        /* edi */
3565
3566       : "%eax", "%edx"                   /* clobber list */
3567 #ifndef __PIC__
3568       , "%ebx"
3569 #endif
3570    );
3571
3572    /* now do the math for the rest of the row */
3573    switch (bpp)
3574    {
3575       case 3:
3576       {
3577          _ActiveMask.use = 0x0000000000ffffffLL;
3578          _ActiveMaskEnd.use = 0xffff000000000000LL;
3579          _ShiftBpp.use = 24;    /* == bpp(3) * 8 */
3580          _ShiftRem.use = 40;    /* == 64 - 24 */
3581
3582          __asm__ __volatile__ (
3583             "movl _dif, %%ecx            \n\t"
3584 /* preload  "movl row, %%edi             \n\t" */
3585 /* preload  "movl prev_row, %%esi        \n\t" */
3586             "pxor %%mm0, %%mm0           \n\t"
3587             /* prime the pump:  load the first Raw(x-bpp) data set */
3588             "movq -8(%%edi,%%ecx,), %%mm1 \n\t"
3589          "paeth_3lp:                     \n\t"
3590             "psrlq _ShiftRem, %%mm1      \n\t" /* shift last 3 bytes to 1st */
3591                                                /* 3 bytes */
3592             "movq (%%esi,%%ecx,), %%mm2  \n\t" /* load b=Prior(x) */
3593             "punpcklbw %%mm0, %%mm1      \n\t" /* unpack High bytes of a */
3594             "movq -8(%%esi,%%ecx,), %%mm3 \n\t" /* prep c=Prior(x-bpp) bytes */
3595             "punpcklbw %%mm0, %%mm2      \n\t" /* unpack High bytes of b */
3596             "psrlq _ShiftRem, %%mm3      \n\t" /* shift last 3 bytes to 1st */
3597                                                /* 3 bytes */
3598             /* pav = p - a = (a + b - c) - a = b - c */
3599             "movq %%mm2, %%mm4           \n\t"
3600             "punpcklbw %%mm0, %%mm3      \n\t" /* unpack High bytes of c */
3601             /* pbv = p - b = (a + b - c) - b = a - c */
3602             "movq %%mm1, %%mm5           \n\t"
3603             "psubw %%mm3, %%mm4          \n\t"
3604             "pxor %%mm7, %%mm7           \n\t"
3605             /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
3606             "movq %%mm4, %%mm6           \n\t"
3607             "psubw %%mm3, %%mm5          \n\t"
3608
3609             /* pa = abs(p-a) = abs(pav) */
3610             /* pb = abs(p-b) = abs(pbv) */
3611             /* pc = abs(p-c) = abs(pcv) */
3612             "pcmpgtw %%mm4, %%mm0        \n\t" /* create mask pav bytes < 0 */
3613             "paddw %%mm5, %%mm6          \n\t"
3614             "pand %%mm4, %%mm0           \n\t" /* only pav bytes < 0 in mm7 */
3615             "pcmpgtw %%mm5, %%mm7        \n\t" /* create mask pbv bytes < 0 */
3616             "psubw %%mm0, %%mm4          \n\t"
3617             "pand %%mm5, %%mm7           \n\t" /* only pbv bytes < 0 in mm0 */
3618             "psubw %%mm0, %%mm4          \n\t"
3619             "psubw %%mm7, %%mm5          \n\t"
3620             "pxor %%mm0, %%mm0           \n\t"
3621             "pcmpgtw %%mm6, %%mm0        \n\t" /* create mask pcv bytes < 0 */
3622             "pand %%mm6, %%mm0           \n\t" /* only pav bytes < 0 in mm7 */
3623             "psubw %%mm7, %%mm5          \n\t"
3624             "psubw %%mm0, %%mm6          \n\t"
3625             /*  test pa <= pb */
3626             "movq %%mm4, %%mm7           \n\t"
3627             "psubw %%mm0, %%mm6          \n\t"
3628             "pcmpgtw %%mm5, %%mm7        \n\t" /* pa > pb? */
3629             "movq %%mm7, %%mm0           \n\t"
3630             /* use mm7 mask to merge pa & pb */
3631             "pand %%mm7, %%mm5           \n\t"
3632             /* use mm0 mask copy to merge a & b */
3633             "pand %%mm0, %%mm2           \n\t"
3634             "pandn %%mm4, %%mm7          \n\t"
3635             "pandn %%mm1, %%mm0          \n\t"
3636             "paddw %%mm5, %%mm7          \n\t"
3637             "paddw %%mm2, %%mm0          \n\t"
3638             /*  test  ((pa <= pb)? pa:pb) <= pc */
3639             "pcmpgtw %%mm6, %%mm7        \n\t" /* pab > pc? */
3640             "pxor %%mm1, %%mm1           \n\t"
3641             "pand %%mm7, %%mm3           \n\t"
3642             "pandn %%mm0, %%mm7          \n\t"
3643             "paddw %%mm3, %%mm7          \n\t"
3644             "pxor %%mm0, %%mm0           \n\t"
3645             "packuswb %%mm1, %%mm7       \n\t"
3646             "movq (%%esi,%%ecx,), %%mm3  \n\t" /* load c=Prior(x-bpp) */
3647             "pand _ActiveMask, %%mm7     \n\t"
3648             "movq %%mm3, %%mm2           \n\t" /* load b=Prior(x) step 1 */
3649             "paddb (%%edi,%%ecx,), %%mm7 \n\t" /* add Paeth predictor with Raw(x) */
3650             "punpcklbw %%mm0, %%mm3      \n\t" /* unpack High bytes of c */
3651             "movq %%mm7, (%%edi,%%ecx,)  \n\t" /* write back updated value */
3652             "movq %%mm7, %%mm1           \n\t" /* now mm1 will be used as */
3653                                                /* Raw(x-bpp) */
3654             /* now do Paeth for 2nd set of bytes (3-5) */
3655             "psrlq _ShiftBpp, %%mm2      \n\t" /* load b=Prior(x) step 2 */
3656             "punpcklbw %%mm0, %%mm1      \n\t" /* unpack High bytes of a */
3657             "pxor %%mm7, %%mm7           \n\t"
3658             "punpcklbw %%mm0, %%mm2      \n\t" /* unpack High bytes of b */
3659             /* pbv = p - b = (a + b - c) - b = a - c */
3660             "movq %%mm1, %%mm5           \n\t"
3661             /* pav = p - a = (a + b - c) - a = b - c */
3662             "movq %%mm2, %%mm4           \n\t"
3663             "psubw %%mm3, %%mm5          \n\t"
3664             "psubw %%mm3, %%mm4          \n\t"
3665             /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = */
3666             /*       pav + pbv = pbv + pav */
3667             "movq %%mm5, %%mm6           \n\t"
3668             "paddw %%mm4, %%mm6          \n\t"
3669
3670             /* pa = abs(p-a) = abs(pav) */
3671             /* pb = abs(p-b) = abs(pbv) */
3672             /* pc = abs(p-c) = abs(pcv) */
3673             "pcmpgtw %%mm5, %%mm0        \n\t" /* create mask pbv bytes < 0 */
3674             "pcmpgtw %%mm4, %%mm7        \n\t" /* create mask pav bytes < 0 */
3675             "pand %%mm5, %%mm0           \n\t" /* only pbv bytes < 0 in mm0 */
3676             "pand %%mm4, %%mm7           \n\t" /* only pav bytes < 0 in mm7 */
3677             "psubw %%mm0, %%mm5          \n\t"
3678             "psubw %%mm7, %%mm4          \n\t"
3679             "psubw %%mm0, %%mm5          \n\t"
3680             "psubw %%mm7, %%mm4          \n\t"
3681             "pxor %%mm0, %%mm0           \n\t"
3682             "pcmpgtw %%mm6, %%mm0        \n\t" /* create mask pcv bytes < 0 */
3683             "pand %%mm6, %%mm0           \n\t" /* only pav bytes < 0 in mm7 */
3684             "psubw %%mm0, %%mm6          \n\t"
3685             /*  test pa <= pb */
3686             "movq %%mm4, %%mm7           \n\t"
3687             "psubw %%mm0, %%mm6          \n\t"
3688             "pcmpgtw %%mm5, %%mm7        \n\t" /* pa > pb? */
3689             "movq %%mm7, %%mm0           \n\t"
3690             /* use mm7 mask to merge pa & pb */
3691             "pand %%mm7, %%mm5           \n\t"
3692             /* use mm0 mask copy to merge a & b */
3693             "pand %%mm0, %%mm2           \n\t"
3694             "pandn %%mm4, %%mm7          \n\t"
3695             "pandn %%mm1, %%mm0          \n\t"
3696             "paddw %%mm5, %%mm7          \n\t"
3697             "paddw %%mm2, %%mm0          \n\t"
3698             /*  test  ((pa <= pb)? pa:pb) <= pc */
3699             "pcmpgtw %%mm6, %%mm7        \n\t" /* pab > pc? */
3700             "movq (%%esi,%%ecx,), %%mm2  \n\t" /* load b=Prior(x) */
3701             "pand %%mm7, %%mm3           \n\t"
3702             "pandn %%mm0, %%mm7          \n\t"
3703             "pxor %%mm1, %%mm1           \n\t"
3704             "paddw %%mm3, %%mm7          \n\t"
3705             "pxor %%mm0, %%mm0           \n\t"
3706             "packuswb %%mm1, %%mm7       \n\t"
3707             "movq %%mm2, %%mm3           \n\t" /* load c=Prior(x-bpp) step 1 */
3708             "pand _ActiveMask, %%mm7     \n\t"
3709             "punpckhbw %%mm0, %%mm2      \n\t" /* unpack High bytes of b */
3710             "psllq _ShiftBpp, %%mm7      \n\t" /* shift bytes to 2nd group of */
3711                                                /* 3 bytes */
3712              /* pav = p - a = (a + b - c) - a = b - c */
3713             "movq %%mm2, %%mm4           \n\t"
3714             "paddb (%%edi,%%ecx,), %%mm7 \n\t" /* add Paeth predictor with Raw(x) */
3715             "psllq _ShiftBpp, %%mm3      \n\t" /* load c=Prior(x-bpp) step 2 */
3716             "movq %%mm7, (%%edi,%%ecx,)  \n\t" /* write back updated value */
3717             "movq %%mm7, %%mm1           \n\t"
3718             "punpckhbw %%mm0, %%mm3      \n\t" /* unpack High bytes of c */
3719             "psllq _ShiftBpp, %%mm1      \n\t" /* shift bytes */
3720                                     /* now mm1 will be used as Raw(x-bpp) */
3721             /* now do Paeth for 3rd, and final, set of bytes (6-7) */
3722             "pxor %%mm7, %%mm7           \n\t"
3723             "punpckhbw %%mm0, %%mm1      \n\t" /* unpack High bytes of a */
3724             "psubw %%mm3, %%mm4          \n\t"
3725             /* pbv = p - b = (a + b - c) - b = a - c */
3726             "movq %%mm1, %%mm5           \n\t"
3727             /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
3728             "movq %%mm4, %%mm6           \n\t"
3729             "psubw %%mm3, %%mm5          \n\t"
3730             "pxor %%mm0, %%mm0           \n\t"
3731             "paddw %%mm5, %%mm6          \n\t"
3732
3733             /* pa = abs(p-a) = abs(pav) */
3734             /* pb = abs(p-b) = abs(pbv) */
3735             /* pc = abs(p-c) = abs(pcv) */
3736             "pcmpgtw %%mm4, %%mm0        \n\t" /* create mask pav bytes < 0 */
3737             "pcmpgtw %%mm5, %%mm7        \n\t" /* create mask pbv bytes < 0 */
3738             "pand %%mm4, %%mm0           \n\t" /* only pav bytes < 0 in mm7 */
3739             "pand %%mm5, %%mm7           \n\t" /* only pbv bytes < 0 in mm0 */
3740             "psubw %%mm0, %%mm4          \n\t"
3741             "psubw %%mm7, %%mm5          \n\t"
3742             "psubw %%mm0, %%mm4          \n\t"
3743             "psubw %%mm7, %%mm5          \n\t"
3744             "pxor %%mm0, %%mm0           \n\t"
3745             "pcmpgtw %%mm6, %%mm0        \n\t" /* create mask pcv bytes < 0 */
3746             "pand %%mm6, %%mm0           \n\t" /* only pav bytes < 0 in mm7 */
3747             "psubw %%mm0, %%mm6          \n\t"
3748             /*  test pa <= pb */
3749             "movq %%mm4, %%mm7           \n\t"
3750             "psubw %%mm0, %%mm6          \n\t"
3751             "pcmpgtw %%mm5, %%mm7        \n\t" /* pa > pb? */
3752             "movq %%mm7, %%mm0           \n\t"
3753             /* use mm0 mask copy to merge a & b */
3754             "pand %%mm0, %%mm2           \n\t"
3755             /* use mm7 mask to merge pa & pb */
3756             "pand %%mm7, %%mm5           \n\t"
3757             "pandn %%mm1, %%mm0          \n\t"
3758             "pandn %%mm4, %%mm7          \n\t"
3759             "paddw %%mm2, %%mm0          \n\t"
3760             "paddw %%mm5, %%mm7          \n\t"
3761             /*  test  ((pa <= pb)? pa:pb) <= pc */
3762             "pcmpgtw %%mm6, %%mm7        \n\t" /* pab > pc? */
3763             "pand %%mm7, %%mm3           \n\t"
3764             "pandn %%mm0, %%mm7          \n\t"
3765             "paddw %%mm3, %%mm7          \n\t"
3766             "pxor %%mm1, %%mm1           \n\t"
3767             "packuswb %%mm7, %%mm1       \n\t"
3768             /* step ecx to next set of 8 bytes and repeat loop til done */
3769             "addl $8, %%ecx              \n\t"
3770             "pand _ActiveMaskEnd, %%mm1  \n\t"
3771             "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" /* add Paeth predictor with */
3772                                                  /* Raw(x) */
3773
3774             "cmpl _MMXLength, %%ecx      \n\t"
3775             "pxor %%mm0, %%mm0           \n\t" /* pxor does not affect flags */
3776             "movq %%mm1, -8(%%edi,%%ecx,) \n\t" /* write back updated value */
3777                                  /* mm1 will be used as Raw(x-bpp) next loop */
3778                            /* mm3 ready to be used as Prior(x-bpp) next loop */
3779             "jb paeth_3lp                \n\t"
3780
3781             : "=S" (dummy_value_S),             /* output regs (dummy) */
3782               "=D" (dummy_value_D)
3783
3784             : "0" (prev_row),  /* esi           // input regs */
3785               "1" (row)        /* edi */
3786
3787             : "%ecx"                            /* clobber list */
3788 #if 0  /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
3789             , "%mm0", "%mm1", "%mm2", "%mm3"
3790             , "%mm4", "%mm5", "%mm6", "%mm7"
3791 #endif
3792          );
3793       }
3794       break;  /* end 3 bpp */
3795
3796       case 6:
3797       //case 7:   /* GRR BOGUS */
3798       //case 5:   /* GRR BOGUS */
3799       {
3800          _ActiveMask.use  = 0x00000000ffffffffLL;
3801          _ActiveMask2.use = 0xffffffff00000000LL;
3802          _ShiftBpp.use = bpp << 3;    /* == bpp * 8 */
3803          _ShiftRem.use = 64 - _ShiftBpp.use;
3804
3805          __asm__ __volatile__ (
3806             "movl _dif, %%ecx            \n\t"
3807 /* preload  "movl row, %%edi             \n\t" */
3808 /* preload  "movl prev_row, %%esi        \n\t" */
3809             /* prime the pump:  load the first Raw(x-bpp) data set */
3810             "movq -8(%%edi,%%ecx,), %%mm1 \n\t"
3811             "pxor %%mm0, %%mm0           \n\t"
3812
3813          "paeth_6lp:                     \n\t"
3814             /* must shift to position Raw(x-bpp) data */
3815             "psrlq _ShiftRem, %%mm1      \n\t"
3816             /* do first set of 4 bytes */
3817             "movq -8(%%esi,%%ecx,), %%mm3 \n\t" /* read c=Prior(x-bpp) bytes */
3818             "punpcklbw %%mm0, %%mm1      \n\t" /* unpack Low bytes of a */
3819             "movq (%%esi,%%ecx,), %%mm2  \n\t" /* load b=Prior(x) */
3820             "punpcklbw %%mm0, %%mm2      \n\t" /* unpack Low bytes of b */
3821             /* must shift to position Prior(x-bpp) data */
3822             "psrlq _ShiftRem, %%mm3      \n\t"
3823             /* pav = p - a = (a + b - c) - a = b - c */
3824             "movq %%mm2, %%mm4           \n\t"
3825             "punpcklbw %%mm0, %%mm3      \n\t" /* unpack Low bytes of c */
3826             /* pbv = p - b = (a + b - c) - b = a - c */
3827             "movq %%mm1, %%mm5           \n\t"
3828             "psubw %%mm3, %%mm4          \n\t"
3829             "pxor %%mm7, %%mm7           \n\t"
3830             /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
3831             "movq %%mm4, %%mm6           \n\t"
3832             "psubw %%mm3, %%mm5          \n\t"
3833             /* pa = abs(p-a) = abs(pav) */
3834             /* pb = abs(p-b) = abs(pbv) */
3835             /* pc = abs(p-c) = abs(pcv) */
3836             "pcmpgtw %%mm4, %%mm0        \n\t" /* create mask pav bytes < 0 */
3837             "paddw %%mm5, %%mm6          \n\t"
3838             "pand %%mm4, %%mm0           \n\t" /* only pav bytes < 0 in mm7 */
3839             "pcmpgtw %%mm5, %%mm7        \n\t" /* create mask pbv bytes < 0 */
3840             "psubw %%mm0, %%mm4          \n\t"
3841             "pand %%mm5, %%mm7           \n\t" /* only pbv bytes < 0 in mm0 */
3842             "psubw %%mm0, %%mm4          \n\t"
3843             "psubw %%mm7, %%mm5          \n\t"
3844             "pxor %%mm0, %%mm0           \n\t"
3845             "pcmpgtw %%mm6, %%mm0        \n\t" /* create mask pcv bytes < 0 */
3846             "pand %%mm6, %%mm0           \n\t" /* only pav bytes < 0 in mm7 */
3847             "psubw %%mm7, %%mm5          \n\t"
3848             "psubw %%mm0, %%mm6          \n\t"
3849             /*  test pa <= pb */
3850             "movq %%mm4, %%mm7           \n\t"
3851             "psubw %%mm0, %%mm6          \n\t"
3852             "pcmpgtw %%mm5, %%mm7        \n\t" /* pa > pb? */
3853             "movq %%mm7, %%mm0           \n\t"
3854             /* use mm7 mask to merge pa & pb */
3855             "pand %%mm7, %%mm5           \n\t"
3856             /* use mm0 mask copy to merge a & b */
3857             "pand %%mm0, %%mm2           \n\t"
3858             "pandn %%mm4, %%mm7          \n\t"
3859             "pandn %%mm1, %%mm0          \n\t"
3860             "paddw %%mm5, %%mm7          \n\t"
3861             "paddw %%mm2, %%mm0          \n\t"
3862             /*  test  ((pa <= pb)? pa:pb) <= pc */
3863             "pcmpgtw %%mm6, %%mm7        \n\t" /* pab > pc? */
3864             "pxor %%mm1, %%mm1           \n\t"
3865             "pand %%mm7, %%mm3           \n\t"
3866             "pandn %%mm0, %%mm7          \n\t"
3867             "paddw %%mm3, %%mm7          \n\t"
3868             "pxor %%mm0, %%mm0           \n\t"
3869             "packuswb %%mm1, %%mm7       \n\t"
3870             "movq -8(%%esi,%%ecx,), %%mm3 \n\t" /* load c=Prior(x-bpp) */
3871             "pand _ActiveMask, %%mm7     \n\t"
3872             "psrlq _ShiftRem, %%mm3      \n\t"
3873             "movq (%%esi,%%ecx,), %%mm2  \n\t" /* load b=Prior(x) step 1 */
3874             "paddb (%%edi,%%ecx,), %%mm7 \n\t" /* add Paeth predictor and Raw(x) */
3875             "movq %%mm2, %%mm6           \n\t"
3876             "movq %%mm7, (%%edi,%%ecx,)  \n\t" /* write back updated value */
3877             "movq -8(%%edi,%%ecx,), %%mm1 \n\t"
3878             "psllq _ShiftBpp, %%mm6      \n\t"
3879             "movq %%mm7, %%mm5           \n\t"
3880             "psrlq _ShiftRem, %%mm1      \n\t"
3881             "por %%mm6, %%mm3            \n\t"
3882             "psllq _ShiftBpp, %%mm5      \n\t"
3883             "punpckhbw %%mm0, %%mm3      \n\t" /* unpack High bytes of c */
3884             "por %%mm5, %%mm1            \n\t"
3885             /* do second set of 4 bytes */
3886             "punpckhbw %%mm0, %%mm2      \n\t" /* unpack High bytes of b */
3887             "punpckhbw %%mm0, %%mm1      \n\t" /* unpack High bytes of a */
3888             /* pav = p - a = (a + b - c) - a = b - c */
3889             "movq %%mm2, %%mm4           \n\t"
3890             /* pbv = p - b = (a + b - c) - b = a - c */
3891             "movq %%mm1, %%mm5           \n\t"
3892             "psubw %%mm3, %%mm4          \n\t"
3893             "pxor %%mm7, %%mm7           \n\t"
3894             /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
3895             "movq %%mm4, %%mm6           \n\t"
3896             "psubw %%mm3, %%mm5          \n\t"
3897             /* pa = abs(p-a) = abs(pav) */
3898             /* pb = abs(p-b) = abs(pbv) */
3899             /* pc = abs(p-c) = abs(pcv) */
3900             "pcmpgtw %%mm4, %%mm0        \n\t" /* create mask pav bytes < 0 */
3901             "paddw %%mm5, %%mm6          \n\t"
3902             "pand %%mm4, %%mm0           \n\t" /* only pav bytes < 0 in mm7 */
3903             "pcmpgtw %%mm5, %%mm7        \n\t" /* create mask pbv bytes < 0 */
3904             "psubw %%mm0, %%mm4          \n\t"
3905             "pand %%mm5, %%mm7           \n\t" /* only pbv bytes < 0 in mm0 */
3906             "psubw %%mm0, %%mm4          \n\t"
3907             "psubw %%mm7, %%mm5          \n\t"
3908             "pxor %%mm0, %%mm0           \n\t"
3909             "pcmpgtw %%mm6, %%mm0        \n\t" /* create mask pcv bytes < 0 */
3910             "pand %%mm6, %%mm0           \n\t" /* only pav bytes < 0 in mm7 */
3911             "psubw %%mm7, %%mm5          \n\t"
3912             "psubw %%mm0, %%mm6          \n\t"
3913             /*  test pa <= pb */
3914             "movq %%mm4, %%mm7           \n\t"
3915             "psubw %%mm0, %%mm6          \n\t"
3916             "pcmpgtw %%mm5, %%mm7        \n\t" /* pa > pb? */
3917             "movq %%mm7, %%mm0           \n\t"
3918             /* use mm7 mask to merge pa & pb */
3919             "pand %%mm7, %%mm5           \n\t"
3920             /* use mm0 mask copy to merge a & b */
3921             "pand %%mm0, %%mm2           \n\t"
3922             "pandn %%mm4, %%mm7          \n\t"
3923             "pandn %%mm1, %%mm0          \n\t"
3924             "paddw %%mm5, %%mm7          \n\t"
3925             "paddw %%mm2, %%mm0          \n\t"
3926             /*  test  ((pa <= pb)? pa:pb) <= pc */
3927             "pcmpgtw %%mm6, %%mm7        \n\t" /* pab > pc? */
3928             "pxor %%mm1, %%mm1           \n\t"
3929             "pand %%mm7, %%mm3           \n\t"
3930             "pandn %%mm0, %%mm7          \n\t"
3931             "pxor %%mm1, %%mm1           \n\t"
3932             "paddw %%mm3, %%mm7          \n\t"
3933             "pxor %%mm0, %%mm0           \n\t"
3934             /* step ecx to next set of 8 bytes and repeat loop til done */
3935             "addl $8, %%ecx              \n\t"
3936             "packuswb %%mm7, %%mm1       \n\t"
3937             "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" /* add Paeth predictor with Raw(x) */
3938             "cmpl _MMXLength, %%ecx      \n\t"
3939             "movq %%mm1, -8(%%edi,%%ecx,) \n\t" /* write back updated value */
3940                                 /* mm1 will be used as Raw(x-bpp) next loop */
3941             "jb paeth_6lp                \n\t"
3942
3943             : "=S" (dummy_value_S),             /* output regs (dummy) */
3944               "=D" (dummy_value_D)
3945
3946             : "0" (prev_row),  /* esi           // input regs */
3947               "1" (row)        /* edi */
3948
3949             : "%ecx"                            /* clobber list */
3950 #if 0  /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
3951             , "%mm0", "%mm1", "%mm2", "%mm3"
3952             , "%mm4", "%mm5", "%mm6", "%mm7"
3953 #endif
3954          );
3955       }
3956       break;  /* end 6 bpp */
3957
3958       case 4:
3959       {
3960          _ActiveMask.use  = 0x00000000ffffffffLL;
3961
3962          __asm__ __volatile__ (
3963             "movl _dif, %%ecx            \n\t"
3964 /* preload  "movl row, %%edi             \n\t" */
3965 /* preload  "movl prev_row, %%esi        \n\t" */
3966             "pxor %%mm0, %%mm0           \n\t"
3967             /* prime the pump:  load the first Raw(x-bpp) data set */
3968             "movq -8(%%edi,%%ecx,), %%mm1 \n\t" /* only time should need to read */
3969                                      /*  a=Raw(x-bpp) bytes */
3970          "paeth_4lp:                     \n\t"
3971             /* do first set of 4 bytes */
3972             "movq -8(%%esi,%%ecx,), %%mm3 \n\t" /* read c=Prior(x-bpp) bytes */
3973             "punpckhbw %%mm0, %%mm1      \n\t" /* unpack Low bytes of a */
3974             "movq (%%esi,%%ecx,), %%mm2  \n\t" /* load b=Prior(x) */
3975             "punpcklbw %%mm0, %%mm2      \n\t" /* unpack High bytes of b */
3976             /* pav = p - a = (a + b - c) - a = b - c */
3977             "movq %%mm2, %%mm4           \n\t"
3978             "punpckhbw %%mm0, %%mm3      \n\t" /* unpack High bytes of c */
3979             /* pbv = p - b = (a + b - c) - b = a - c */
3980             "movq %%mm1, %%mm5           \n\t"
3981             "psubw %%mm3, %%mm4          \n\t"
3982             "pxor %%mm7, %%mm7           \n\t"
3983             /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
3984             "movq %%mm4, %%mm6           \n\t"
3985             "psubw %%mm3, %%mm5          \n\t"
3986             /* pa = abs(p-a) = abs(pav) */
3987             /* pb = abs(p-b) = abs(pbv) */
3988             /* pc = abs(p-c) = abs(pcv) */
3989             "pcmpgtw %%mm4, %%mm0        \n\t" /* create mask pav bytes < 0 */
3990             "paddw %%mm5, %%mm6          \n\t"
3991             "pand %%mm4, %%mm0           \n\t" /* only pav bytes < 0 in mm7 */
3992             "pcmpgtw %%mm5, %%mm7        \n\t" /* create mask pbv bytes < 0 */
3993             "psubw %%mm0, %%mm4          \n\t"
3994             "pand %%mm5, %%mm7           \n\t" /* only pbv bytes < 0 in mm0 */
3995             "psubw %%mm0, %%mm4          \n\t"
3996             "psubw %%mm7, %%mm5          \n\t"
3997             "pxor %%mm0, %%mm0           \n\t"
3998             "pcmpgtw %%mm6, %%mm0        \n\t" /* create mask pcv bytes < 0 */
3999             "pand %%mm6, %%mm0           \n\t" /* only pav bytes < 0 in mm7 */
4000             "psubw %%mm7, %%mm5          \n\t"
4001             "psubw %%mm0, %%mm6          \n\t"
4002             /*  test pa <= pb */
4003             "movq %%mm4, %%mm7           \n\t"
4004             "psubw %%mm0, %%mm6          \n\t"
4005             "pcmpgtw %%mm5, %%mm7        \n\t" /* pa > pb? */
4006             "movq %%mm7, %%mm0           \n\t"
4007             /* use mm7 mask to merge pa & pb */
4008             "pand %%mm7, %%mm5           \n\t"
4009             /* use mm0 mask copy to merge a & b */
4010             "pand %%mm0, %%mm2           \n\t"
4011             "pandn %%mm4, %%mm7          \n\t"
4012             "pandn %%mm1, %%mm0          \n\t"
4013             "paddw %%mm5, %%mm7          \n\t"
4014             "paddw %%mm2, %%mm0          \n\t"
4015             /*  test  ((pa <= pb)? pa:pb) <= pc */
4016             "pcmpgtw %%mm6, %%mm7        \n\t" /* pab > pc? */
4017             "pxor %%mm1, %%mm1           \n\t"
4018             "pand %%mm7, %%mm3           \n\t"
4019             "pandn %%mm0, %%mm7          \n\t"
4020             "paddw %%mm3, %%mm7          \n\t"
4021             "pxor %%mm0, %%mm0           \n\t"
4022             "packuswb %%mm1, %%mm7       \n\t"
4023             "movq (%%esi,%%ecx,), %%mm3  \n\t" /* load c=Prior(x-bpp) */
4024             "pand _ActiveMask, %%mm7     \n\t"
4025             "movq %%mm3, %%mm2           \n\t" /* load b=Prior(x) step 1 */
4026             "paddb (%%edi,%%ecx,), %%mm7 \n\t" /* add Paeth predictor with Raw(x) */
4027             "punpcklbw %%mm0, %%mm3      \n\t" /* unpack High bytes of c */
4028             "movq %%mm7, (%%edi,%%ecx,)  \n\t" /* write back updated value */
4029             "movq %%mm7, %%mm1           \n\t" /* now mm1 will be used as Raw(x-bpp) */
4030             /* do second set of 4 bytes */
4031             "punpckhbw %%mm0, %%mm2      \n\t" /* unpack Low bytes of b */
4032             "punpcklbw %%mm0, %%mm1      \n\t" /* unpack Low bytes of a */
4033             /* pav = p - a = (a + b - c) - a = b - c */
4034             "movq %%mm2, %%mm4           \n\t"
4035             /* pbv = p - b = (a + b - c) - b = a - c */
4036             "movq %%mm1, %%mm5           \n\t"
4037             "psubw %%mm3, %%mm4          \n\t"
4038             "pxor %%mm7, %%mm7           \n\t"
4039             /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
4040             "movq %%mm4, %%mm6           \n\t"
4041             "psubw %%mm3, %%mm5          \n\t"
4042             /* pa = abs(p-a) = abs(pav) */
4043             /* pb = abs(p-b) = abs(pbv) */
4044             /* pc = abs(p-c) = abs(pcv) */
4045             "pcmpgtw %%mm4, %%mm0        \n\t" /* create mask pav bytes < 0 */
4046             "paddw %%mm5, %%mm6          \n\t"
4047             "pand %%mm4, %%mm0           \n\t" /* only pav bytes < 0 in mm7 */
4048             "pcmpgtw %%mm5, %%mm7        \n\t" /* create mask pbv bytes < 0 */
4049             "psubw %%mm0, %%mm4          \n\t"
4050             "pand %%mm5, %%mm7           \n\t" /* only pbv bytes < 0 in mm0 */
4051             "psubw %%mm0, %%mm4          \n\t"
4052             "psubw %%mm7, %%mm5          \n\t"
4053             "pxor %%mm0, %%mm0           \n\t"
4054             "pcmpgtw %%mm6, %%mm0        \n\t" /* create mask pcv bytes < 0 */
4055             "pand %%mm6, %%mm0           \n\t" /* only pav bytes < 0 in mm7 */
4056             "psubw %%mm7, %%mm5          \n\t"
4057             "psubw %%mm0, %%mm6          \n\t"
4058             /*  test pa <= pb */
4059             "movq %%mm4, %%mm7           \n\t"
4060             "psubw %%mm0, %%mm6          \n\t"
4061             "pcmpgtw %%mm5, %%mm7        \n\t" /* pa > pb? */
4062             "movq %%mm7, %%mm0           \n\t"
4063             /* use mm7 mask to merge pa & pb */
4064             "pand %%mm7, %%mm5           \n\t"
4065             /* use mm0 mask copy to merge a & b */
4066             "pand %%mm0, %%mm2           \n\t"
4067             "pandn %%mm4, %%mm7          \n\t"
4068             "pandn %%mm1, %%mm0          \n\t"
4069             "paddw %%mm5, %%mm7          \n\t"
4070             "paddw %%mm2, %%mm0          \n\t"
4071             /*  test  ((pa <= pb)? pa:pb) <= pc */
4072             "pcmpgtw %%mm6, %%mm7        \n\t" /* pab > pc? */
4073             "pxor %%mm1, %%mm1           \n\t"
4074             "pand %%mm7, %%mm3           \n\t"
4075             "pandn %%mm0, %%mm7          \n\t"
4076             "pxor %%mm1, %%mm1           \n\t"
4077             "paddw %%mm3, %%mm7          \n\t"
4078             "pxor %%mm0, %%mm0           \n\t"
4079             /* step ecx to next set of 8 bytes and repeat loop til done */
4080             "addl $8, %%ecx              \n\t"
4081             "packuswb %%mm7, %%mm1       \n\t"
4082             "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" /* add predictor with Raw(x) */
4083             "cmpl _MMXLength, %%ecx      \n\t"
4084             "movq %%mm1, -8(%%edi,%%ecx,) \n\t" /* write back updated value */
4085                                 /* mm1 will be used as Raw(x-bpp) next loop */
4086             "jb paeth_4lp                \n\t"
4087
4088             : "=S" (dummy_value_S),             /* output regs (dummy) */
4089               "=D" (dummy_value_D)
4090
4091             : "0" (prev_row),  /* esi           // input regs */
4092               "1" (row)        /* edi */
4093
4094             : "%ecx"                            /* clobber list */
4095 #if 0  /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
4096             , "%mm0", "%mm1", "%mm2", "%mm3"
4097             , "%mm4", "%mm5", "%mm6", "%mm7"
4098 #endif
4099          );
4100       }
4101       break;  /* end 4 bpp */
4102
4103       case 8:                          /* bpp == 8 */
4104       {
4105          _ActiveMask.use  = 0x00000000ffffffffLL;
4106
4107          __asm__ __volatile__ (
4108             "movl _dif, %%ecx            \n\t"
4109 /* preload  "movl row, %%edi             \n\t" */
4110 /* preload  "movl prev_row, %%esi        \n\t" */
4111             "pxor %%mm0, %%mm0           \n\t"
4112             /* prime the pump:  load the first Raw(x-bpp) data set */
4113             "movq -8(%%edi,%%ecx,), %%mm1 \n\t" /* only time should need to read */
4114                                        /*  a=Raw(x-bpp) bytes */
4115          "paeth_8lp:                     \n\t"
4116             /* do first set of 4 bytes */
4117             "movq -8(%%esi,%%ecx,), %%mm3 \n\t" /* read c=Prior(x-bpp) bytes */
4118             "punpcklbw %%mm0, %%mm1      \n\t" /* unpack Low bytes of a */
4119             "movq (%%esi,%%ecx,), %%mm2  \n\t" /* load b=Prior(x) */
4120             "punpcklbw %%mm0, %%mm2      \n\t" /* unpack Low bytes of b */
4121             /* pav = p - a = (a + b - c) - a = b - c */
4122             "movq %%mm2, %%mm4           \n\t"
4123             "punpcklbw %%mm0, %%mm3      \n\t" /* unpack Low bytes of c */
4124             /* pbv = p - b = (a + b - c) - b = a - c */
4125             "movq %%mm1, %%mm5           \n\t"
4126             "psubw %%mm3, %%mm4          \n\t"
4127             "pxor %%mm7, %%mm7           \n\t"
4128             /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
4129             "movq %%mm4, %%mm6           \n\t"
4130             "psubw %%mm3, %%mm5          \n\t"
4131             /* pa = abs(p-a) = abs(pav) */
4132             /* pb = abs(p-b) = abs(pbv) */
4133             /* pc = abs(p-c) = abs(pcv) */
4134             "pcmpgtw %%mm4, %%mm0        \n\t" /* create mask pav bytes < 0 */
4135             "paddw %%mm5, %%mm6          \n\t"
4136             "pand %%mm4, %%mm0           \n\t" /* only pav bytes < 0 in mm7 */
4137             "pcmpgtw %%mm5, %%mm7        \n\t" /* create mask pbv bytes < 0 */
4138             "psubw %%mm0, %%mm4          \n\t"
4139             "pand %%mm5, %%mm7           \n\t" /* only pbv bytes < 0 in mm0 */
4140             "psubw %%mm0, %%mm4          \n\t"
4141             "psubw %%mm7, %%mm5          \n\t"
4142             "pxor %%mm0, %%mm0           \n\t"
4143             "pcmpgtw %%mm6, %%mm0        \n\t" /* create mask pcv bytes < 0 */
4144             "pand %%mm6, %%mm0           \n\t" /* only pav bytes < 0 in mm7 */
4145             "psubw %%mm7, %%mm5          \n\t"
4146             "psubw %%mm0, %%mm6          \n\t"
4147             /*  test pa <= pb */
4148             "movq %%mm4, %%mm7           \n\t"
4149             "psubw %%mm0, %%mm6          \n\t"
4150             "pcmpgtw %%mm5, %%mm7        \n\t" /* pa > pb? */
4151             "movq %%mm7, %%mm0           \n\t"
4152             /* use mm7 mask to merge pa & pb */
4153             "pand %%mm7, %%mm5           \n\t"
4154             /* use mm0 mask copy to merge a & b */
4155             "pand %%mm0, %%mm2           \n\t"
4156             "pandn %%mm4, %%mm7          \n\t"
4157             "pandn %%mm1, %%mm0          \n\t"
4158             "paddw %%mm5, %%mm7          \n\t"
4159             "paddw %%mm2, %%mm0          \n\t"
4160             /*  test  ((pa <= pb)? pa:pb) <= pc */
4161             "pcmpgtw %%mm6, %%mm7        \n\t" /* pab > pc? */
4162             "pxor %%mm1, %%mm1           \n\t"
4163             "pand %%mm7, %%mm3           \n\t"
4164             "pandn %%mm0, %%mm7          \n\t"
4165             "paddw %%mm3, %%mm7          \n\t"
4166             "pxor %%mm0, %%mm0           \n\t"
4167             "packuswb %%mm1, %%mm7       \n\t"
4168             "movq -8(%%esi,%%ecx,), %%mm3 \n\t" /* read c=Prior(x-bpp) bytes */
4169             "pand _ActiveMask, %%mm7     \n\t"
4170             "movq (%%esi,%%ecx,), %%mm2  \n\t" /* load b=Prior(x) */
4171             "paddb (%%edi,%%ecx,), %%mm7 \n\t" /* add Paeth predictor with Raw(x) */
4172             "punpckhbw %%mm0, %%mm3      \n\t" /* unpack High bytes of c */
4173             "movq %%mm7, (%%edi,%%ecx,)  \n\t" /* write back updated value */
4174             "movq -8(%%edi,%%ecx,), %%mm1 \n\t" /* read a=Raw(x-bpp) bytes */
4175
4176             /* do second set of 4 bytes */
4177             "punpckhbw %%mm0, %%mm2      \n\t" /* unpack High bytes of b */
4178             "punpckhbw %%mm0, %%mm1      \n\t" /* unpack High bytes of a */
4179             /* pav = p - a = (a + b - c) - a = b - c */
4180             "movq %%mm2, %%mm4           \n\t"
4181             /* pbv = p - b = (a + b - c) - b = a - c */
4182             "movq %%mm1, %%mm5           \n\t"
4183             "psubw %%mm3, %%mm4          \n\t"
4184             "pxor %%mm7, %%mm7           \n\t"
4185             /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
4186             "movq %%mm4, %%mm6           \n\t"
4187             "psubw %%mm3, %%mm5          \n\t"
4188             /* pa = abs(p-a) = abs(pav) */
4189             /* pb = abs(p-b) = abs(pbv) */
4190             /* pc = abs(p-c) = abs(pcv) */
4191             "pcmpgtw %%mm4, %%mm0        \n\t" /* create mask pav bytes < 0 */
4192             "paddw %%mm5, %%mm6          \n\t"
4193             "pand %%mm4, %%mm0           \n\t" /* only pav bytes < 0 in mm7 */
4194             "pcmpgtw %%mm5, %%mm7        \n\t" /* create mask pbv bytes < 0 */
4195             "psubw %%mm0, %%mm4          \n\t"
4196             "pand %%mm5, %%mm7           \n\t" /* only pbv bytes < 0 in mm0 */
4197             "psubw %%mm0, %%mm4          \n\t"
4198             "psubw %%mm7, %%mm5          \n\t"
4199             "pxor %%mm0, %%mm0           \n\t"
4200             "pcmpgtw %%mm6, %%mm0        \n\t" /* create mask pcv bytes < 0 */
4201             "pand %%mm6, %%mm0           \n\t" /* only pav bytes < 0 in mm7 */
4202             "psubw %%mm7, %%mm5          \n\t"
4203             "psubw %%mm0, %%mm6          \n\t"
4204             /*  test pa <= pb */
4205             "movq %%mm4, %%mm7           \n\t"
4206             "psubw %%mm0, %%mm6          \n\t"
4207             "pcmpgtw %%mm5, %%mm7        \n\t" /* pa > pb? */
4208             "movq %%mm7, %%mm0           \n\t"
4209             /* use mm7 mask to merge pa & pb */
4210             "pand %%mm7, %%mm5           \n\t"
4211             /* use mm0 mask copy to merge a & b */
4212             "pand %%mm0, %%mm2           \n\t"
4213             "pandn %%mm4, %%mm7          \n\t"
4214             "pandn %%mm1, %%mm0          \n\t"
4215             "paddw %%mm5, %%mm7          \n\t"
4216             "paddw %%mm2, %%mm0          \n\t"
4217             /*  test  ((pa <= pb)? pa:pb) <= pc */
4218             "pcmpgtw %%mm6, %%mm7        \n\t" /* pab > pc? */
4219             "pxor %%mm1, %%mm1           \n\t"
4220             "pand %%mm7, %%mm3           \n\t"
4221             "pandn %%mm0, %%mm7          \n\t"
4222             "pxor %%mm1, %%mm1           \n\t"
4223             "paddw %%mm3, %%mm7          \n\t"
4224             "pxor %%mm0, %%mm0           \n\t"
4225             /* step ecx to next set of 8 bytes and repeat loop til done */
4226             "addl $8, %%ecx              \n\t"
4227             "packuswb %%mm7, %%mm1       \n\t"
4228             "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" /* add Paeth predictor with Raw(x) */
4229             "cmpl _MMXLength, %%ecx      \n\t"
4230             "movq %%mm1, -8(%%edi,%%ecx,) \n\t" /* write back updated value */
4231                             /* mm1 will be used as Raw(x-bpp) next loop */
4232             "jb paeth_8lp                \n\t"
4233
4234             : "=S" (dummy_value_S),             /* output regs (dummy) */
4235               "=D" (dummy_value_D)
4236
4237             : "0" (prev_row),  /* esi           // input regs */
4238               "1" (row)        /* edi */
4239
4240             : "%ecx"                            /* clobber list */
4241 #if 0  /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
4242             , "%mm0", "%mm1", "%mm2", "%mm3"
4243             , "%mm4", "%mm5", "%mm6", "%mm7"
4244 #endif
4245          );
4246       }
4247       break;  /* end 8 bpp */
4248
4249       case 1:                /* bpp = 1 */
4250       case 2:                /* bpp = 2 */
4251       default:               /* bpp > 8 */
4252       {
4253          __asm__ __volatile__ (
4254 #ifdef __PIC__
4255             "pushl %%ebx                 \n\t" /* save Global Offset Table index */
4256 #endif
4257             "movl _dif, %%ebx            \n\t"
4258             "cmpl _FullLength, %%ebx     \n\t"
4259             "jnb paeth_dend              \n\t"
4260
4261 /* preload  "movl row, %%edi             \n\t" */
4262 /* preload  "movl prev_row, %%esi        \n\t" */
4263             /* do Paeth decode for remaining bytes */
4264             "movl %%ebx, %%edx           \n\t"
4265 /* preload  "subl bpp, %%edx             \n\t" // (bpp is preloaded into ecx) */
4266             "subl %%ecx, %%edx           \n\t" /* edx = ebx - bpp */
4267             "xorl %%ecx, %%ecx           \n\t" /* zero ecx before using cl & cx */
4268
4269          "paeth_dlp:                     \n\t"
4270             "xorl %%eax, %%eax           \n\t"
4271             /* pav = p - a = (a + b - c) - a = b - c */
4272             "movb (%%esi,%%ebx,), %%al   \n\t" /* load Prior(x) into al */
4273             "movb (%%esi,%%edx,), %%cl   \n\t" /* load Prior(x-bpp) into cl */
4274             "subl %%ecx, %%eax           \n\t" /* subtract Prior(x-bpp) */
4275             "movl %%eax, _patemp         \n\t" /* Save pav for later use */
4276             "xorl %%eax, %%eax           \n\t"
4277             /* pbv = p - b = (a + b - c) - b = a - c */
4278             "movb (%%edi,%%edx,), %%al   \n\t" /* load Raw(x-bpp) into al */
4279             "subl %%ecx, %%eax           \n\t" /* subtract Prior(x-bpp) */
4280             "movl %%eax, %%ecx           \n\t"
4281             /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
4282             "addl _patemp, %%eax         \n\t" /* pcv = pav + pbv */
4283             /* pc = abs(pcv) */
4284             "testl $0x80000000, %%eax    \n\t"
4285             "jz paeth_dpca               \n\t"
4286             "negl %%eax                  \n\t" /* reverse sign of neg values */
4287
4288          "paeth_dpca:                    \n\t"
4289             "movl %%eax, _pctemp         \n\t" /* save pc for later use */
4290             /* pb = abs(pbv) */
4291             "testl $0x80000000, %%ecx    \n\t"
4292             "jz paeth_dpba               \n\t"
4293             "negl %%ecx                  \n\t" /* reverse sign of neg values */
4294
4295          "paeth_dpba:                    \n\t"
4296             "movl %%ecx, _pbtemp         \n\t" /* save pb for later use */
4297             /* pa = abs(pav) */
4298             "movl _patemp, %%eax         \n\t"
4299             "testl $0x80000000, %%eax    \n\t"
4300             "jz paeth_dpaa               \n\t"
4301             "negl %%eax                  \n\t" /* reverse sign of neg values */
4302
4303          "paeth_dpaa:                    \n\t"
4304             "movl %%eax, _patemp         \n\t" /* save pa for later use */
4305             /* test if pa <= pb */
4306             "cmpl %%ecx, %%eax           \n\t"
4307             "jna paeth_dabb              \n\t"
4308             /* pa > pb; now test if pb <= pc */
4309             "cmpl _pctemp, %%ecx         \n\t"
4310             "jna paeth_dbbc              \n\t"
4311             /* pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp) */
4312             "movb (%%esi,%%edx,), %%cl   \n\t" /* load Prior(x-bpp) into cl */
4313             "jmp paeth_dpaeth            \n\t"
4314
4315          "paeth_dbbc:                    \n\t"
4316             /* pb <= pc; Raw(x) = Paeth(x) + Prior(x) */
4317             "movb (%%esi,%%ebx,), %%cl   \n\t" /* load Prior(x) into cl */
4318             "jmp paeth_dpaeth            \n\t"
4319
4320          "paeth_dabb:                    \n\t"
4321             /* pa <= pb; now test if pa <= pc */
4322             "cmpl _pctemp, %%eax         \n\t"
4323             "jna paeth_dabc              \n\t"
4324             /* pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp) */
4325             "movb (%%esi,%%edx,), %%cl   \n\t" /* load Prior(x-bpp) into cl */
4326             "jmp paeth_dpaeth            \n\t"
4327
4328          "paeth_dabc:                    \n\t"
4329             /* pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp) */
4330             "movb (%%edi,%%edx,), %%cl   \n\t" /* load Raw(x-bpp) into cl */
4331
4332          "paeth_dpaeth:                  \n\t"
4333             "incl %%ebx                  \n\t"
4334             "incl %%edx                  \n\t"
4335             /* Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256 */
4336             "addb %%cl, -1(%%edi,%%ebx,) \n\t"
4337             "cmpl _FullLength, %%ebx     \n\t"
4338             "jb paeth_dlp                \n\t"
4339
4340          "paeth_dend:                    \n\t"
4341 #ifdef __PIC__
4342             "popl %%ebx                  \n\t" /* index to Global Offset Table */
4343 #endif
4344
4345             : "=c" (dummy_value_c),            /* output regs (dummy) */
4346               "=S" (dummy_value_S),
4347               "=D" (dummy_value_D)
4348
4349             : "0" (bpp),       /* ecx          // input regs */
4350               "1" (prev_row),  /* esi */
4351               "2" (row)        /* edi */
4352
4353             : "%eax", "%edx"                   /* clobber list */
4354 #ifndef __PIC__
4355             , "%ebx"
4356 #endif
4357          );
4358       }
4359       return;                   /* No need to go further with this one */
4360
4361    } /* end switch (bpp) */
4362
4363    __asm__ __volatile__ (
4364       /* MMX acceleration complete; now do clean-up */
4365       /* check if any remaining bytes left to decode */
4366 #ifdef __PIC__
4367       "pushl %%ebx                 \n\t" /* save index to Global Offset Table */
4368 #endif
4369       "movl _MMXLength, %%ebx      \n\t"
4370       "cmpl _FullLength, %%ebx     \n\t"
4371       "jnb paeth_end               \n\t"
4372 /*pre "movl row, %%edi             \n\t" */
4373 /*pre "movl prev_row, %%esi        \n\t" */
4374       /* do Paeth decode for remaining bytes */
4375       "movl %%ebx, %%edx           \n\t"
4376 /*pre "subl bpp, %%edx             \n\t" */ /* (bpp is preloaded into ecx) */
4377       "subl %%ecx, %%edx           \n\t" /* edx = ebx - bpp */
4378       "xorl %%ecx, %%ecx           \n\t" /* zero ecx before using cl & cx below */
4379
4380    "paeth_lp2:                     \n\t"
4381       "xorl %%eax, %%eax           \n\t"
4382       /* pav = p - a = (a + b - c) - a = b - c */
4383       "movb (%%esi,%%ebx,), %%al   \n\t" /* load Prior(x) into al */
4384       "movb (%%esi,%%edx,), %%cl   \n\t" /* load Prior(x-bpp) into cl */
4385       "subl %%ecx, %%eax           \n\t" /* subtract Prior(x-bpp) */
4386       "movl %%eax, _patemp         \n\t" /* Save pav for later use */
4387       "xorl %%eax, %%eax           \n\t"
4388       /* pbv = p - b = (a + b - c) - b = a - c */
4389       "movb (%%edi,%%edx,), %%al   \n\t" /* load Raw(x-bpp) into al */
4390       "subl %%ecx, %%eax           \n\t" /* subtract Prior(x-bpp) */
4391       "movl %%eax, %%ecx           \n\t"
4392       /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
4393       "addl _patemp, %%eax         \n\t" /* pcv = pav + pbv */
4394       /* pc = abs(pcv) */
4395       "testl $0x80000000, %%eax    \n\t"
4396       "jz paeth_pca2               \n\t"
4397       "negl %%eax                  \n\t" /* reverse sign of neg values */
4398
4399    "paeth_pca2:                    \n\t"
4400       "movl %%eax, _pctemp         \n\t" /* save pc for later use */
4401       /* pb = abs(pbv) */
4402       "testl $0x80000000, %%ecx    \n\t"
4403       "jz paeth_pba2               \n\t"
4404       "negl %%ecx                  \n\t" /* reverse sign of neg values */
4405
4406    "paeth_pba2:                    \n\t"
4407       "movl %%ecx, _pbtemp         \n\t" /* save pb for later use */
4408       /* pa = abs(pav) */
4409       "movl _patemp, %%eax         \n\t"
4410       "testl $0x80000000, %%eax    \n\t"
4411       "jz paeth_paa2               \n\t"
4412       "negl %%eax                  \n\t" /* reverse sign of neg values */
4413
4414    "paeth_paa2:                    \n\t"
4415       "movl %%eax, _patemp         \n\t" /* save pa for later use */
4416       /* test if pa <= pb */
4417       "cmpl %%ecx, %%eax           \n\t"
4418       "jna paeth_abb2              \n\t"
4419       /* pa > pb; now test if pb <= pc */
4420       "cmpl _pctemp, %%ecx         \n\t"
4421       "jna paeth_bbc2              \n\t"
4422       /* pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp) */
4423       "movb (%%esi,%%edx,), %%cl   \n\t" /* load Prior(x-bpp) into cl */
4424       "jmp paeth_paeth2            \n\t"
4425
4426    "paeth_bbc2:                    \n\t"
4427       /* pb <= pc; Raw(x) = Paeth(x) + Prior(x) */
4428       "movb (%%esi,%%ebx,), %%cl   \n\t" /* load Prior(x) into cl */
4429       "jmp paeth_paeth2            \n\t"
4430
4431    "paeth_abb2:                    \n\t"
4432       /* pa <= pb; now test if pa <= pc */
4433       "cmpl _pctemp, %%eax         \n\t"
4434       "jna paeth_abc2              \n\t"
4435       /* pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp) */
4436       "movb (%%esi,%%edx,), %%cl   \n\t" /* load Prior(x-bpp) into cl */
4437       "jmp paeth_paeth2            \n\t"
4438
4439    "paeth_abc2:                    \n\t"
4440       /* pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp) */
4441       "movb (%%edi,%%edx,), %%cl   \n\t" /* load Raw(x-bpp) into cl */
4442
4443    "paeth_paeth2:                  \n\t"
4444       "incl %%ebx                  \n\t"
4445       "incl %%edx                  \n\t"
4446       /* Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256 */
4447       "addb %%cl, -1(%%edi,%%ebx,) \n\t"
4448       "cmpl _FullLength, %%ebx     \n\t"
4449       "jb paeth_lp2                \n\t"
4450
4451    "paeth_end:                     \n\t"
4452       "EMMS                        \n\t" /* end MMX; prep for poss. FP instrs. */
4453 #ifdef __PIC__
4454       "popl %%ebx                  \n\t" /* restore index to Global Offset Table */
4455 #endif
4456
4457       : "=c" (dummy_value_c),            /* output regs (dummy) */
4458         "=S" (dummy_value_S),
4459         "=D" (dummy_value_D)
4460
4461       : "0" (bpp),       /* ecx          // input regs */
4462         "1" (prev_row),  /* esi */
4463         "2" (row)        /* edi */
4464
4465       : "%eax", "%edx"                   /* clobber list (no input regs!) */
4466 #ifndef __PIC__
4467       , "%ebx"
4468 #endif
4469    );
4470
4471 } /* end png_read_filter_row_mmx_paeth() */
4472 #endif
4473
4474
4475
4476
4477 #ifdef PNG_THREAD_UNSAFE_OK
4478 /*===========================================================================*/
4479 /*                                                                           */
4480 /*           P N G _ R E A D _ F I L T E R _ R O W _ M M X _ S U B           */
4481 /*                                                                           */
4482 /*===========================================================================*/
4483
4484 /* Optimized code for PNG Sub filter decoder.
      *
      * Reverses the Sub filter in place on one row:
      *
      *     Raw(x) = (Sub(x) + Raw(x-bpp)) mod 256      for every x >= bpp
      *
      * where bpp = (pixel_depth + 7) >> 3.  The first bpp bytes of the row are
      * left untouched.  With rp = row + bpp and lp = row, the work is done in
      * three stages, all indexing rp[]/lp[] by the same byte offset:
      *
      *   1. a byte loop over offsets [0, _dif), where _dif is chosen so that
      *      rp + _dif is 8-byte aligned (_dif is always in the range 8..15);
      *   2. a bpp-specific MMX loop over offsets [_dif, _MMXLength), eight
      *      bytes (or more) per pass;
      *   3. a byte loop over the leftover offsets [_MMXLength, _FullLength),
      *      followed by EMMS.
      *
      * bpp == 1 skips stages 2 and 3: the rest of the row is done with a plain
      * byte loop and the function returns early (no MMX state was touched).
      *
      * row_info  supplies pixel_depth and rowbytes for this row.
      * row       points at the first filtered byte of the row; modified in place.
      *
      * The function communicates with its asm blocks through the file-scope
      * variables _dif, _FullLength, _MMXLength, _ActiveMask, _ShiftBpp and
      * _ShiftRem, so it is not reentrant; it is only compiled when
      * PNG_THREAD_UNSAFE_OK is defined.
      *
      * NOTE(review): every loop below is bottom-tested, i.e. it runs at least
      * once whatever the row length is.  The caller appears to be expected to
      * pass only sufficiently long rows (or a padded row buffer) -- confirm.
      */
4485
4486 static void /* PRIVATE */
4487 png_read_filter_row_mmx_sub(png_row_infop row_info, png_bytep row)
4488 {
4489    int bpp;
4490    int dummy_value_a;
4491    int dummy_value_D;
4492
4493    bpp = (row_info->pixel_depth + 7) >> 3;   /* calc number of bytes per pixel */
4494    _FullLength = row_info->rowbytes - bpp;   /* number of bytes to filter */
4495
        /* Stage 1: byte loop up to the alignment boundary; also computes
         * _dif and _MMXLength for the stages that follow. */
4496    __asm__ __volatile__ (
4497 /*pre "movl row, %%edi             \n\t" */
4498       "movl %%edi, %%esi           \n\t" /* lp = row */
4499 /*pre "movl bpp, %%eax             \n\t" */
4500       "addl %%eax, %%edi           \n\t" /* rp = row + bpp */
4501 /*irr "xorl %%eax, %%eax           \n\t" */
4502       /* get # of bytes to alignment */
4503       "movl %%edi, _dif            \n\t" /* take start of row */
4504       "addl $0xf, _dif             \n\t" /* add 7 + 8 to incr past */
4505                                          /*  alignment boundary */
4506       "xorl %%ecx, %%ecx           \n\t"
4507       "andl $0xfffffff8, _dif      \n\t" /* mask to alignment boundary */
4508       "subl %%edi, _dif            \n\t" /* subtract from start ==> value */
4509       "jz sub_go                   \n\t" /*  ecx at alignment */
4510
4511    "sub_lp1:                       \n\t" /* fix alignment */
4512       "movb (%%esi,%%ecx,), %%al   \n\t"
4513       "addb %%al, (%%edi,%%ecx,)   \n\t"
4514       "incl %%ecx                  \n\t"
4515       "cmpl _dif, %%ecx            \n\t"
4516       "jb sub_lp1                  \n\t"
4517
           /* _MMXLength = _FullLength - ((_FullLength - ecx) & 7), where ecx is
            * the number of bytes already handled by the loop above. */
4518    "sub_go:                        \n\t"
4519       "movl _FullLength, %%eax     \n\t"
4520       "movl %%eax, %%edx           \n\t"
4521       "subl %%ecx, %%edx           \n\t" /* subtract alignment fix */
4522       "andl $0x00000007, %%edx     \n\t" /* calc bytes over mult of 8 */
4523       "subl %%edx, %%eax           \n\t" /* drop over bytes from length */
4524       "movl %%eax, _MMXLength      \n\t"
4525
4526       : "=a" (dummy_value_a),   /* 0      // output regs (dummy) */
4527         "=D" (dummy_value_D)    /* 1 */
4528
4529       : "0" (bpp),              /* eax    // input regs */
4530         "1" (row)               /* edi */
4531
           /* ebx is never touched by this block, so it is not declared as
            * clobbered (under __PIC__ it holds the Global Offset Table index).
            * "memory": the block rewrites the row and the file-scope work
            * variables behind the compiler's back. */
4532       : "%ecx", "%edx"                    /* clobber list */
4533       , "%esi", "memory"
4534
4535 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4536       , "%mm0", "%mm1", "%mm2", "%mm3"
4537       , "%mm4", "%mm5", "%mm6", "%mm7"
4538 #endif
4539    );
4540
4541    /* now do the math for the rest of the row */
4542    switch (bpp)
4543    {
4544       case 3:
4545       {
              /* Each 8-byte group is resolved in three steps; the masks select
               * bytes 3-5 (mm7) and bytes 6-7 (mm6 = mm7 << 24) so that every
               * byte only receives the Raw value bpp bytes to its left. */
4546          _ActiveMask.use  = 0x0000ffffff000000LL;
4547          _ShiftBpp.use = 24;       /* == 3 * 8 */
4548          _ShiftRem.use  = 40;      /* == 64 - 24 */
4549
4550          __asm__ __volatile__ (
4551 /* preload  "movl row, %%edi              \n\t" */
4552             "movq _ActiveMask, %%mm7       \n\t" /* load _ActiveMask for 2nd */
4553                                                 /*  active byte group */
4554             "movl %%edi, %%esi            \n\t" /* lp = row */
4555 /* preload  "movl bpp, %%eax              \n\t" */
4556             "addl %%eax, %%edi            \n\t" /* rp = row + bpp */
4557             "movq %%mm7, %%mm6            \n\t"
4558             "movl _dif, %%edx             \n\t"
4559             "psllq _ShiftBpp, %%mm6       \n\t" /* move mask in mm6 to cover */
4560                                                 /*  3rd active byte group */
4561             /* prime the pump:  load the first Raw(x-bpp) data set */
4562             "movq -8(%%edi,%%edx,), %%mm1 \n\t"
4563
4564          "sub_3lp:                        \n\t" /* shift data for adding first */
4565             "psrlq _ShiftRem, %%mm1       \n\t" /*  bpp bytes (no need for mask; */
4566                                                 /*  shift clears inactive bytes) */
4567             /* add 1st active group */
4568             "movq (%%edi,%%edx,), %%mm0   \n\t"
4569             "paddb %%mm1, %%mm0           \n\t"
4570
4571             /* add 2nd active group */
4572             "movq %%mm0, %%mm1            \n\t" /* mov updated Raws to mm1 */
4573             "psllq _ShiftBpp, %%mm1       \n\t" /* shift data to pos. correctly */
4574             "pand %%mm7, %%mm1            \n\t" /* mask to use 2nd active group */
4575             "paddb %%mm1, %%mm0           \n\t"
4576
4577             /* add 3rd active group */
4578             "movq %%mm0, %%mm1            \n\t" /* mov updated Raws to mm1 */
4579             "psllq _ShiftBpp, %%mm1       \n\t" /* shift data to pos. correctly */
4580             "pand %%mm6, %%mm1            \n\t" /* mask to use 3rd active group */
4581             "addl $8, %%edx               \n\t"
4582             "paddb %%mm1, %%mm0           \n\t"
4583
4584             "cmpl _MMXLength, %%edx       \n\t"
4585             "movq %%mm0, -8(%%edi,%%edx,) \n\t" /* write updated Raws to array */
4586             "movq %%mm0, %%mm1            \n\t" /* prep 1st add at top of loop */
4587             "jb sub_3lp                   \n\t"
4588
4589             : "=a" (dummy_value_a),   /* 0      // output regs (dummy) */
4590               "=D" (dummy_value_D)    /* 1 */
4591
4592             : "0" (bpp),              /* eax    // input regs */
4593               "1" (row)               /* edi */
4594
4595             : "%edx", "%esi", "memory"          /* clobber list */
4596 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4597             , "%mm0", "%mm1", "%mm6", "%mm7"
4598 #endif
4599          );
4600       }
4601       break;
4602
4603       case 1:
4604       {
              /* Plain byte loop over offsets [_dif, _FullLength); no MMX. */
4605          __asm__ __volatile__ (
4606             "movl _dif, %%edx            \n\t"
4607 /* preload  "movl row, %%edi             \n\t" */
4608             "cmpl _FullLength, %%edx     \n\t"
4609             "jnb sub_1end                \n\t"
4610             "movl %%edi, %%esi           \n\t" /* lp = row */
4611 /* preload  "movl bpp, %%eax             \n\t" */
4612             "addl %%eax, %%edi           \n\t" /* rp = row + bpp; must come */
4613             "xorl %%eax, %%eax           \n\t" /*  before eax (bpp) is cleared */
4614
4615          "sub_1lp:                       \n\t"
4616             "movb (%%esi,%%edx,), %%al   \n\t"
4617             "addb %%al, (%%edi,%%edx,)   \n\t"
4618             "incl %%edx                  \n\t"
4619             "cmpl _FullLength, %%edx     \n\t"
4620             "jb sub_1lp                  \n\t"
4621
4622          "sub_1end:                      \n\t"
4623
4624             : "=a" (dummy_value_a),   /* 0      // output regs (dummy) */
4625               "=D" (dummy_value_D)    /* 1 */
4626
4627             : "0" (bpp),              /* eax    // input regs */
4628               "1" (row)               /* edi */
4629
4630             : "%edx", "%esi", "memory"          /* clobber list */
4631          );
4632       }
           /* whole row is finished and no MMX register was used: skip the
            * tail loop and the EMMS at the end of the function */
4633       return;
4634
4635       case 6:
4636       case 4:
4637       //case 7:   /* GRR BOGUS */
4638       //case 5:   /* GRR BOGUS */
4639       {
              /* Two steps per 8-byte group: the shifts alone isolate the
               * bytes that are needed, so no mask register is used here. */
4640          _ShiftBpp.use = bpp << 3;
4641          _ShiftRem.use = 64 - _ShiftBpp.use;
4642
4643          __asm__ __volatile__ (
4644 /* preload  "movl row, %%edi              \n\t" */
4645             "movl _dif, %%edx             \n\t"
4646             "movl %%edi, %%esi            \n\t" /* lp = row */
4647 /* preload  "movl bpp, %%eax              \n\t" */
4648             "addl %%eax, %%edi            \n\t" /* rp = row + bpp */
4649
4650             /* prime the pump:  load the first Raw(x-bpp) data set */
4651             "movq -8(%%edi,%%edx,), %%mm1 \n\t"
4652
4653          "sub_4lp:                        \n\t" /* shift data for adding first */
4654             "psrlq _ShiftRem, %%mm1       \n\t" /*  bpp bytes (no need for mask; */
4655                                                 /*  shift clears inactive bytes) */
4656             "movq (%%edi,%%edx,), %%mm0   \n\t"
4657             "paddb %%mm1, %%mm0           \n\t"
4658
4659             /* add 2nd active group */
4660             "movq %%mm0, %%mm1            \n\t" /* mov updated Raws to mm1 */
4661             "psllq _ShiftBpp, %%mm1       \n\t" /* shift data to pos. correctly */
4662             "addl $8, %%edx               \n\t"
4663             "paddb %%mm1, %%mm0           \n\t"
4664
4665             "cmpl _MMXLength, %%edx       \n\t"
4666             "movq %%mm0, -8(%%edi,%%edx,) \n\t"
4667             "movq %%mm0, %%mm1            \n\t" /* prep 1st add at top of loop */
4668             "jb sub_4lp                   \n\t"
4669
4670             : "=a" (dummy_value_a),   /* 0      // output regs (dummy) */
4671               "=D" (dummy_value_D)    /* 1 */
4672
4673             : "0" (bpp),              /* eax    // input regs */
4674               "1" (row)               /* edi */
4675
4676             : "%edx", "%esi", "memory"          /* clobber list */
4677 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4678             , "%mm0", "%mm1"
4679 #endif
4680          );
4681       }
4682       break;
4683
4684       case 2:
4685       {
              /* Four steps per 8-byte group; mm7, mm6 and mm5 select bytes
               * 2-3, 4-5 and 6-7 respectively. */
4686          _ActiveMask.use = 0x00000000ffff0000LL;
4687          _ShiftBpp.use = 16;       /* == 2 * 8 */
4688          _ShiftRem.use = 48;       /* == 64 - 16 */
4689
4690          __asm__ __volatile__ (
4691             "movq _ActiveMask, %%mm7      \n\t" /* load _ActiveMask for 2nd */
4692                                                 /*  active byte group */
4693             "movl _dif, %%edx             \n\t"
4694             "movq %%mm7, %%mm6            \n\t"
4695 /* preload  "movl row, %%edi              \n\t" */
4696             "psllq _ShiftBpp, %%mm6       \n\t" /* move mask in mm6 to cover */
4697                                                 /*  3rd active byte group */
4698             "movl %%edi, %%esi            \n\t" /* lp = row */
4699             "movq %%mm6, %%mm5            \n\t"
4700 /* preload  "movl bpp, %%eax              \n\t" */
4701             "addl %%eax, %%edi            \n\t" /* rp = row + bpp */
4702             "psllq _ShiftBpp, %%mm5       \n\t" /* move mask in mm5 to cover */
4703                                                 /*  4th active byte group */
4704             /* prime the pump:  load the first Raw(x-bpp) data set */
4705             "movq -8(%%edi,%%edx,), %%mm1 \n\t"
4706
4707          "sub_2lp:                        \n\t" /* shift data for adding first */
4708             "psrlq _ShiftRem, %%mm1       \n\t" /*  bpp bytes (no need for mask; */
4709                                                 /*  shift clears inactive bytes) */
4710             /* add 1st active group */
4711             "movq (%%edi,%%edx,), %%mm0   \n\t"
4712             "paddb %%mm1, %%mm0           \n\t"
4713
4714             /* add 2nd active group */
4715             "movq %%mm0, %%mm1            \n\t" /* mov updated Raws to mm1 */
4716             "psllq _ShiftBpp, %%mm1       \n\t" /* shift data to pos. correctly */
4717             "pand %%mm7, %%mm1            \n\t" /* mask to use 2nd active group */
4718             "paddb %%mm1, %%mm0           \n\t"
4719
4720             /* add 3rd active group */
4721             "movq %%mm0, %%mm1            \n\t" /* mov updated Raws to mm1 */
4722             "psllq _ShiftBpp, %%mm1       \n\t" /* shift data to pos. correctly */
4723             "pand %%mm6, %%mm1            \n\t" /* mask to use 3rd active group */
4724             "paddb %%mm1, %%mm0           \n\t"
4725
4726             /* add 4th active group */
4727             "movq %%mm0, %%mm1            \n\t" /* mov updated Raws to mm1 */
4728             "psllq _ShiftBpp, %%mm1       \n\t" /* shift data to pos. correctly */
4729             "pand %%mm5, %%mm1            \n\t" /* mask to use 4th active group */
4730             "addl $8, %%edx               \n\t"
4731             "paddb %%mm1, %%mm0           \n\t"
4732             "cmpl _MMXLength, %%edx       \n\t"
4733             "movq %%mm0, -8(%%edi,%%edx,) \n\t" /* write updated Raws to array */
4734             "movq %%mm0, %%mm1            \n\t" /* prep 1st add at top of loop */
4735             "jb sub_2lp                   \n\t"
4736
4737             : "=a" (dummy_value_a),   /* 0      // output regs (dummy) */
4738               "=D" (dummy_value_D)    /* 1 */
4739
4740             : "0" (bpp),              /* eax    // input regs */
4741               "1" (row)               /* edi */
4742
4743             : "%edx", "%esi", "memory"          /* clobber list */
4744 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4745             , "%mm0", "%mm1", "%mm5", "%mm6", "%mm7"
4746 #endif
4747          );
4748       }
4749       break;
4750
4751       case 8:
4752       {
              /* bpp == 8: Raw(x-bpp) for one quadword is simply the previous
               * quadword, so no shifting or masking is needed.
               * NOTE(review): ecx is _MMXLength & 0x3f (always < 64) while edx
               * is >= 64 after the first pass, so the unrolled sub_8lp loop
               * runs exactly once (64 bytes) and sub_8lpA handles the rest
               * eight bytes at a time. */
4753          __asm__ __volatile__ (
4754 /* preload  "movl row, %%edi              \n\t" */
4755             "movl _dif, %%edx             \n\t"
4756             "movl %%edi, %%esi            \n\t" /* lp = row */
4757 /* preload  "movl bpp, %%eax              \n\t" */
4758             "addl %%eax, %%edi            \n\t" /* rp = row + bpp */
4759             "movl _MMXLength, %%ecx       \n\t"
4760
4761             /* prime the pump:  load the first Raw(x-bpp) data set */
4762             "movq -8(%%edi,%%edx,), %%mm7 \n\t"
4763             "andl $0x0000003f, %%ecx      \n\t" /* calc bytes over mult of 64 */
4764
4765          "sub_8lp:                        \n\t"
4766             "movq (%%edi,%%edx,), %%mm0   \n\t" /* load Sub(x) for 1st 8 bytes */
4767             "paddb %%mm7, %%mm0           \n\t"
4768             "movq 8(%%edi,%%edx,), %%mm1  \n\t" /* load Sub(x) for 2nd 8 bytes */
4769             "movq %%mm0, (%%edi,%%edx,)   \n\t" /* write Raw(x) for 1st 8 bytes */
4770
4771             /* Now mm0 will be used as Raw(x-bpp) for the 2nd group of 8 bytes. */
4772             /* This will be repeated for each group of 8 bytes with the 8th */
4773             /* group being used as the Raw(x-bpp) for the 1st group of the */
4774             /* next loop. */
4775
4776             "paddb %%mm0, %%mm1           \n\t"
4777             "movq 16(%%edi,%%edx,), %%mm2 \n\t" /* load Sub(x) for 3rd 8 bytes */
4778             "movq %%mm1, 8(%%edi,%%edx,)  \n\t" /* write Raw(x) for 2nd 8 bytes */
4779             "paddb %%mm1, %%mm2           \n\t"
4780             "movq 24(%%edi,%%edx,), %%mm3 \n\t" /* load Sub(x) for 4th 8 bytes */
4781             "movq %%mm2, 16(%%edi,%%edx,) \n\t" /* write Raw(x) for 3rd 8 bytes */
4782             "paddb %%mm2, %%mm3           \n\t"
4783             "movq 32(%%edi,%%edx,), %%mm4 \n\t" /* load Sub(x) for 5th 8 bytes */
4784             "movq %%mm3, 24(%%edi,%%edx,) \n\t" /* write Raw(x) for 4th 8 bytes */
4785             "paddb %%mm3, %%mm4           \n\t"
4786             "movq 40(%%edi,%%edx,), %%mm5 \n\t" /* load Sub(x) for 6th 8 bytes */
4787             "movq %%mm4, 32(%%edi,%%edx,) \n\t" /* write Raw(x) for 5th 8 bytes */
4788             "paddb %%mm4, %%mm5           \n\t"
4789             "movq 48(%%edi,%%edx,), %%mm6 \n\t" /* load Sub(x) for 7th 8 bytes */
4790             "movq %%mm5, 40(%%edi,%%edx,) \n\t" /* write Raw(x) for 6th 8 bytes */
4791             "paddb %%mm5, %%mm6           \n\t"
4792             "movq 56(%%edi,%%edx,), %%mm7 \n\t" /* load Sub(x) for 8th 8 bytes */
4793             "movq %%mm6, 48(%%edi,%%edx,) \n\t" /* write Raw(x) for 7th 8 bytes */
4794             "addl $64, %%edx              \n\t"
4795             "paddb %%mm6, %%mm7           \n\t"
4796             "cmpl %%ecx, %%edx            \n\t"
4797             "movq %%mm7, -8(%%edi,%%edx,) \n\t" /* write Raw(x) for 8th 8 bytes */
4798             "jb sub_8lp                   \n\t"
4799
4800             "cmpl _MMXLength, %%edx       \n\t"
4801             "jnb sub_8lt8                 \n\t"
4802
4803          "sub_8lpA:                       \n\t"
4804             "movq (%%edi,%%edx,), %%mm0   \n\t"
4805             "addl $8, %%edx               \n\t"
4806             "paddb %%mm7, %%mm0           \n\t"
4807             "cmpl _MMXLength, %%edx       \n\t"
4808             "movq %%mm0, -8(%%edi,%%edx,) \n\t" /* -8 to offset early addl edx */
4809             "movq %%mm0, %%mm7            \n\t" /* move calculated Raw(x) data */
4810                                                 /*  to mm7 to be new Raw(x-bpp) */
4811                                                 /*  for next loop */
4812             "jb sub_8lpA                  \n\t"
4813
4814          "sub_8lt8:                       \n\t"
4815
4816             : "=a" (dummy_value_a),   /* 0      // output regs (dummy) */
4817               "=D" (dummy_value_D)    /* 1 */
4818
4819             : "0" (bpp),              /* eax    // input regs */
4820               "1" (row)               /* edi */
4821
4822             : "%ecx", "%edx", "%esi", "memory"  /* clobber list */
4823 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4824             , "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7"
4825 #endif
4826          );
4827       }
4828       break;
4829
4830       default:                /* bpp greater than 8 bytes   GRR BOGUS */
4831       {
              /* Adds lp[] to rp[] a quadword at a time straight from memory,
               * which is only a valid Sub decode when bpp >= 8.  Because the
               * 5 and 7 labels above are commented out, those values would
               * also land here. */
4832          __asm__ __volatile__ (
4833             "movl _dif, %%edx             \n\t"
4834 /* preload  "movl row, %%edi              \n\t" */
4835             "movl %%edi, %%esi            \n\t" /* lp = row */
4836 /* preload  "movl bpp, %%eax              \n\t" */
4837             "addl %%eax, %%edi            \n\t" /* rp = row + bpp */
4838
4839          "sub_Alp:                        \n\t"
4840             "movq (%%edi,%%edx,), %%mm0   \n\t"
4841             "movq (%%esi,%%edx,), %%mm1   \n\t"
4842             "addl $8, %%edx               \n\t"
4843             "paddb %%mm1, %%mm0           \n\t"
4844             "cmpl _MMXLength, %%edx       \n\t"
4845             "movq %%mm0, -8(%%edi,%%edx,) \n\t" /* mov does not affect flags; */
4846                                                 /*  -8 to offset addl edx */
4847             "jb sub_Alp                   \n\t"
4848
4849             : "=a" (dummy_value_a),   /* 0      // output regs (dummy) */
4850               "=D" (dummy_value_D)    /* 1 */
4851
4852             : "0" (bpp),              /* eax    // input regs */
4853               "1" (row)               /* edi */
4854
4855             : "%edx", "%esi", "memory"          /* clobber list */
4856 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4857             , "%mm0", "%mm1"
4858 #endif
4859          );
4860       }
4861       break;
4862
4863    } /* end switch (bpp) */
4864
        /* Stage 3: byte loop over the leftover offsets [_MMXLength, _FullLength),
         * then leave MMX mode. */
4865    __asm__ __volatile__ (
4866       "movl _MMXLength, %%edx       \n\t"
4867 /* pre "movl row, %%edi              \n\t" */
4868       "cmpl _FullLength, %%edx      \n\t"
4869       "jnb sub_end                  \n\t"
4870
4871       "movl %%edi, %%esi            \n\t" /* lp = row */
4872 /* pre "movl bpp, %%eax              \n\t" */
4873       "addl %%eax, %%edi            \n\t" /* rp = row + bpp */
4874       "xorl %%eax, %%eax            \n\t"
4875
4876    "sub_lp2:                        \n\t"
4877       "movb (%%esi,%%edx,), %%al    \n\t"
4878       "addb %%al, (%%edi,%%edx,)    \n\t"
4879       "incl %%edx                   \n\t"
4880       "cmpl _FullLength, %%edx      \n\t"
4881       "jb sub_lp2                   \n\t"
4882
4883    "sub_end:                        \n\t"
4884       "EMMS                         \n\t" /* end MMX instructions */
4885
4886       : "=a" (dummy_value_a),   /* 0      // output regs (dummy) */
4887         "=D" (dummy_value_D)    /* 1 */
4888
4889       : "0" (bpp),              /* eax    // input regs */
4890         "1" (row)               /* edi */
4891
4892       : "%edx", "%esi", "memory"          /* clobber list */
4893    );
4894
4895 } /* end of png_read_filter_row_mmx_sub() */
4896 #endif
4897
4898
4899
4900
4901 /*===========================================================================*/
4902 /*                                                                           */
4903 /*            P N G _ R E A D _ F I L T E R _ R O W _ M M X _ U P            */
4904 /*                                                                           */
4905 /*===========================================================================*/
4906
4907 /* Optimized code for PNG Up filter decoder.
      *
      * Reverses the Up filter in place on one row:
      *
      *     Raw(x) = (Up(x) + Prior(x)) mod 256      for every byte of the row
      *
      * using a single asm block with ebx as the running byte index:
      *
      *   up_lp1   byte loop until row + ebx is 8-byte aligned (0..7 bytes);
      *   up_loop  unrolled MMX loop, 64 bytes per pass;
      *   up_lpA   MMX loop, 8 bytes per pass, for what is left over a
      *            multiple of 64;
      *   up_lp2   byte loop for the final (fewer than 8) bytes;
      *   up_end   EMMS.
      *
      * row_info  supplies rowbytes, the number of bytes to process.
      * row       current row; modified in place.
      * prev_row  the already decoded previous row; only read.
      *
      * NOTE(review): up_loop is bottom-tested and therefore always processes
      * at least 64 bytes after the alignment loop.  The caller appears to be
      * expected to pass only sufficiently long rows (or a padded row buffer)
      * -- confirm.
      */
4908
4909 static void /* PRIVATE */
4910 png_read_filter_row_mmx_up(png_row_infop row_info, png_bytep row,
4911                            png_bytep prev_row)
4912 {
4913    png_uint_32 len;
4914    int dummy_value_d;   /* fix 'forbidden register 3 (dx) was spilled' error */
4915    int dummy_value_S;
4916    int dummy_value_D;
4917
4918    len = row_info->rowbytes;              /* number of bytes to filter */
4919
4920    __asm__ __volatile__ (
4921 /* pre "movl row, %%edi              \n\t" */
4922       /* get # of bytes to alignment */
4923 #ifdef __PIC__
4924       "pushl %%ebx                  \n\t"
4925 #endif
4926       "movl %%edi, %%ecx            \n\t"
4927       "xorl %%ebx, %%ebx            \n\t"
4928       "addl $0x7, %%ecx             \n\t"
4929       "xorl %%eax, %%eax            \n\t"
4930       "andl $0xfffffff8, %%ecx      \n\t"
4931 /* pre "movl prev_row, %%esi         \n\t" */
4932       "subl %%edi, %%ecx            \n\t"
4933       "jz up_go                     \n\t"
4934
4935    "up_lp1:                         \n\t" /* fix alignment */
4936       "movb (%%edi,%%ebx,), %%al    \n\t"
4937       "addb (%%esi,%%ebx,), %%al    \n\t"
4938       "incl %%ebx                   \n\t"
4939       "cmpl %%ecx, %%ebx            \n\t"
4940       "movb %%al, -1(%%edi,%%ebx,)  \n\t" /* mov does not affect flags; -1 to */
4941       "jb up_lp1                    \n\t" /*  offset incl ebx */
4942
           /* ecx = len - ((len - ebx) & 0x3f): end index for the 64-byte loop;
            * edx keeps the number of bytes left over after it */
4943    "up_go:                          \n\t"
4944 /* pre "movl len, %%edx              \n\t" */
4945       "movl %%edx, %%ecx            \n\t"
4946       "subl %%ebx, %%edx            \n\t" /* subtract alignment fix */
4947       "andl $0x0000003f, %%edx      \n\t" /* calc bytes over mult of 64 */
4948       "subl %%edx, %%ecx            \n\t" /* drop over bytes from length */
4949
4950       /* unrolled loop - use all MMX registers and interleave to reduce */
4951       /* number of branch instructions (loops) and reduce partial stalls */
4952    "up_loop:                        \n\t"
4953       "movq (%%esi,%%ebx,), %%mm1   \n\t"
4954       "movq (%%edi,%%ebx,), %%mm0   \n\t"
4955       "movq 8(%%esi,%%ebx,), %%mm3  \n\t"
4956       "paddb %%mm1, %%mm0           \n\t"
4957       "movq 8(%%edi,%%ebx,), %%mm2  \n\t"
4958       "movq %%mm0, (%%edi,%%ebx,)   \n\t"
4959       "paddb %%mm3, %%mm2           \n\t"
4960       "movq 16(%%esi,%%ebx,), %%mm5 \n\t"
4961       "movq %%mm2, 8(%%edi,%%ebx,)  \n\t"
4962       "movq 16(%%edi,%%ebx,), %%mm4 \n\t"
4963       "movq 24(%%esi,%%ebx,), %%mm7 \n\t"
4964       "paddb %%mm5, %%mm4           \n\t"
4965       "movq 24(%%edi,%%ebx,), %%mm6 \n\t"
4966       "movq %%mm4, 16(%%edi,%%ebx,) \n\t"
4967       "paddb %%mm7, %%mm6           \n\t"
4968       "movq 32(%%esi,%%ebx,), %%mm1 \n\t"
4969       "movq %%mm6, 24(%%edi,%%ebx,) \n\t"
4970       "movq 32(%%edi,%%ebx,), %%mm0 \n\t"
4971       "movq 40(%%esi,%%ebx,), %%mm3 \n\t"
4972       "paddb %%mm1, %%mm0           \n\t"
4973       "movq 40(%%edi,%%ebx,), %%mm2 \n\t"
4974       "movq %%mm0, 32(%%edi,%%ebx,) \n\t"
4975       "paddb %%mm3, %%mm2           \n\t"
4976       "movq 48(%%esi,%%ebx,), %%mm5 \n\t"
4977       "movq %%mm2, 40(%%edi,%%ebx,) \n\t"
4978       "movq 48(%%edi,%%ebx,), %%mm4 \n\t"
4979       "movq 56(%%esi,%%ebx,), %%mm7 \n\t"
4980       "paddb %%mm5, %%mm4           \n\t"
4981       "movq 56(%%edi,%%ebx,), %%mm6 \n\t"
4982       "movq %%mm4, 48(%%edi,%%ebx,) \n\t"
4983       "addl $64, %%ebx              \n\t"
4984       "paddb %%mm7, %%mm6           \n\t"
4985       "cmpl %%ecx, %%ebx            \n\t"
4986       "movq %%mm6, -8(%%edi,%%ebx,) \n\t" /* (+56)movq does not affect flags; */
4987       "jb up_loop                   \n\t" /*  -8 to offset addl ebx */
4988
4989       "cmpl $0, %%edx               \n\t" /* test for bytes over mult of 64 */
4990       "jz up_end                    \n\t"
4991
4992       "cmpl $8, %%edx               \n\t" /* test for less than 8 bytes */
4993       "jb up_lt8                    \n\t" /*  [added by lcreeve@netins.net] */
4994
4995       "addl %%edx, %%ecx            \n\t"
4996       "andl $0x00000007, %%edx      \n\t" /* calc bytes over mult of 8 */
4997       "subl %%edx, %%ecx            \n\t" /* drop over bytes from length */
4998       "jz up_lt8                    \n\t"
4999
5000    "up_lpA:                         \n\t" /* use MMX regs to update 8 bytes sim. */
5001       "movq (%%esi,%%ebx,), %%mm1   \n\t"
5002       "movq (%%edi,%%ebx,), %%mm0   \n\t"
5003       "addl $8, %%ebx               \n\t"
5004       "paddb %%mm1, %%mm0           \n\t"
5005       "cmpl %%ecx, %%ebx            \n\t"
5006       "movq %%mm0, -8(%%edi,%%ebx,) \n\t" /* movq does not affect flags; -8 to */
5007       "jb up_lpA                    \n\t" /*  offset add ebx */
5008       "cmpl $0, %%edx               \n\t" /* test for bytes over mult of 8 */
5009       "jz up_end                    \n\t"
5010
5011    "up_lt8:                         \n\t"
5012       "xorl %%eax, %%eax            \n\t"
5013       "addl %%edx, %%ecx            \n\t" /* move over byte count into counter */
5014
5015    "up_lp2:                         \n\t" /* use x86 regs for remaining bytes */
5016       "movb (%%edi,%%ebx,), %%al    \n\t"
5017       "addb (%%esi,%%ebx,), %%al    \n\t"
5018       "incl %%ebx                   \n\t"
5019       "cmpl %%ecx, %%ebx            \n\t"
5020       "movb %%al, -1(%%edi,%%ebx,)  \n\t" /* mov does not affect flags; -1 to */
5021       "jb up_lp2                    \n\t" /*  offset inc ebx */
5022
5023    "up_end:                         \n\t"
5024       "EMMS                         \n\t" /* conversion of filtered row complete */
5025 #ifdef __PIC__
5026       "popl %%ebx                   \n\t"
5027 #endif
5028
5029       : "=d" (dummy_value_d),   /* 0      // output regs (dummy) */
5030         "=S" (dummy_value_S),   /* 1 */
5031         "=D" (dummy_value_D)    /* 2 */
5032
5033       : "0" (len),              /* edx    // input regs */
5034         "1" (prev_row),         /* esi */
5035         "2" (row)               /* edi */
5036
           /* "memory": the block rewrites the row behind the compiler's back.
            * ebx is declared clobbered only for non-PIC builds; under __PIC__
            * it is preserved by the pushl/popl pair above instead. */
5037       : "%eax", "%ecx", "memory"          /* clobber list (no input regs!) */
     #ifndef __PIC__
           , "%ebx"
     #endif
5038
5039 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
5040       , "%mm0", "%mm1", "%mm2", "%mm3"
5041       , "%mm4", "%mm5", "%mm6", "%mm7"
5042 #endif
5043    );
5044
5045 } /* end of png_read_filter_row_mmx_up() */
5046
5047 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
5048
5049
5050
5051
5052 /*===========================================================================*/
5053 /*                                                                           */
5054 /*                   P N G _ R E A D _ F I L T E R _ R O W                   */
5055 /*                                                                           */
5056 /*===========================================================================*/
5057
5058
5059 /* Optimized png_read_filter_row routines */
5060
5061 void /* PRIVATE */
5062 png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep
5063    row, png_bytep prev_row, int filter)
5064 {
5065 #ifdef PNG_DEBUG
5066    char filnm[10];
5067 #endif
5068
5069 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
5070 /* GRR:  these are superseded by png_ptr->asm_flags: */
5071 #define UseMMX_sub    1   /* GRR:  converted 20000730 */
5072 #define UseMMX_up     1   /* GRR:  converted 20000729 */
5073 #define UseMMX_avg    1   /* GRR:  converted 20000828 (+ 16-bit bugfix 20000916) */
5074 #define UseMMX_paeth  1   /* GRR:  converted 20000828 */
5075
5076    if (_mmx_supported == 2) {
5077        /* this should have happened in png_init_mmx_flags() already */
5078 #if !defined(PNG_1_0_X)
5079        png_warning(png_ptr, "asm_flags may not have been initialized");
5080 #endif
5081        png_mmx_support();
5082    }
5083 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
5084
5085 #ifdef PNG_DEBUG
5086    png_debug(1, "in png_read_filter_row (pnggccrd.c)\n");
5087    switch (filter)
5088    {
5089       case 0: sprintf(filnm, "none");
5090          break;
5091       case 1: sprintf(filnm, "sub-%s",
5092 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
5093 #if !defined(PNG_1_0_X)
5094         (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB)? "MMX" :
5095 #endif
5096 #endif
5097 "x86");
5098          break;
5099       case 2: sprintf(filnm, "up-%s",
5100 #ifdef PNG_ASSEMBLER_CODE_SUPPORTED
5101 #if !defined(PNG_1_0_X)
5102         (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP)? "MMX" :
5103 #endif
5104 #endif
5105  "x86");
5106          break;
5107       case 3: sprintf(filnm, "avg-%s",
5108 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
5109 #if !defined(PNG_1_0_X)
5110         (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG)? "MMX" :
5111 #endif
5112 #endif
5113  "x86");
5114          break;
5115       case 4: sprintf(filnm, "Paeth-%s",
5116 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
5117 #if !defined(PNG_1_0_X)
5118         (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH)? "MMX":
5119 #endif
5120 #endif
5121 "x86");
5122          break;
5123       default: sprintf(filnm, "unknw");
5124          break;
5125    }
5126    png_debug2(0, "row_number=%5ld, %5s, ", png_ptr->row_number, filnm);
5127    png_debug1(0, "row=0x%08lx, ", (unsigned long)row);
5128    png_debug2(0, "pixdepth=%2d, bytes=%d, ", (int)row_info->pixel_depth,
5129       (int)((row_info->pixel_depth + 7) >> 3));
5130    png_debug1(0,"rowbytes=%8ld\n", row_info->rowbytes);
5131 #endif /* PNG_DEBUG */
5132
5133    switch (filter)
5134    {
5135       case PNG_FILTER_VALUE_NONE:
5136          break;
5137
5138       case PNG_FILTER_VALUE_SUB:
5139 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
5140 #if !defined(PNG_1_0_X)
5141          if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB) &&
5142              (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
5143              (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
5144 #else
5145          if (_mmx_supported)
5146 #endif
5147          {
5148             png_read_filter_row_mmx_sub(row_info, row);
5149          }
5150          else
5151 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
5152          {
5153             png_uint_32 i;
5154             png_uint_32 istop = row_info->rowbytes;
5155             png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
5156             png_bytep rp = row + bpp;
5157             png_bytep lp = row;
5158
5159             for (i = bpp; i < istop; i++)
5160             {
5161                *rp = (png_byte)(((int)(*rp) + (int)(*lp++)) & 0xff);
5162                rp++;
5163             }
5164          }  /* end !UseMMX_sub */
5165          break;
5166
5167       case PNG_FILTER_VALUE_UP:
5168 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
5169 #if !defined(PNG_1_0_X)
5170          if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP) &&
5171              (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
5172              (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
5173 #else
5174          if (_mmx_supported)
5175 #endif
5176          {
5177             png_read_filter_row_mmx_up(row_info, row, prev_row);
5178          }
5179           else
5180 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
5181          {
5182             png_uint_32 i;
5183             png_uint_32 istop = row_info->rowbytes;
5184             png_bytep rp = row;
5185             png_bytep pp = prev_row;
5186
5187             for (i = 0; i < istop; ++i)
5188             {
5189                *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
5190                rp++;
5191             }
5192          }  /* end !UseMMX_up */
5193          break;
5194
5195       case PNG_FILTER_VALUE_AVG:
5196 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
5197 #if !defined(PNG_1_0_X)
5198          if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG) &&
5199              (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
5200              (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
5201 #else
5202          if (_mmx_supported)
5203 #endif
5204          {
5205             png_read_filter_row_mmx_avg(row_info, row, prev_row);
5206          }
5207          else
5208 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
5209          {
5210             png_uint_32 i;
5211             png_bytep rp = row;
5212             png_bytep pp = prev_row;
5213             png_bytep lp = row;
5214             png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
5215             png_uint_32 istop = row_info->rowbytes - bpp;
5216
5217             for (i = 0; i < bpp; i++)
5218             {
5219                *rp = (png_byte)(((int)(*rp) +
5220                   ((int)(*pp++) >> 1)) & 0xff);
5221                rp++;
5222             }
5223
5224             for (i = 0; i < istop; i++)
5225             {
5226                *rp = (png_byte)(((int)(*rp) +
5227                   ((int)(*pp++ + *lp++) >> 1)) & 0xff);
5228                rp++;
5229             }
5230          }  /* end !UseMMX_avg */
5231          break;
5232
5233       case PNG_FILTER_VALUE_PAETH:
5234 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
5235 #if !defined(PNG_1_0_X)
5236          if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH) &&
5237              (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
5238              (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
5239 #else
5240          if (_mmx_supported)
5241 #endif
5242          {
5243             png_read_filter_row_mmx_paeth(row_info, row, prev_row);
5244          }
5245          else
5246 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
5247          {
5248             png_uint_32 i;
5249             png_bytep rp = row;
5250             png_bytep pp = prev_row;
5251             png_bytep lp = row;
5252             png_bytep cp = prev_row;
5253             png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
5254             png_uint_32 istop = row_info->rowbytes - bpp;
5255
5256             for (i = 0; i < bpp; i++)
5257             {
5258                *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
5259                rp++;
5260             }
5261
5262             for (i = 0; i < istop; i++)   /* use leftover rp,pp */
5263             {
5264                int a, b, c, pa, pb, pc, p;
5265
5266                a = *lp++;
5267                b = *pp++;
5268                c = *cp++;
5269
5270                p = b - c;
5271                pc = a - c;
5272
5273 #ifdef PNG_USE_ABS
5274                pa = abs(p);
5275                pb = abs(pc);
5276                pc = abs(p + pc);
5277 #else
5278                pa = p < 0 ? -p : p;
5279                pb = pc < 0 ? -pc : pc;
5280                pc = (p + pc) < 0 ? -(p + pc) : p + pc;
5281 #endif
5282
5283                /*
5284                   if (pa <= pb && pa <= pc)
5285                      p = a;
5286                   else if (pb <= pc)
5287                      p = b;
5288                   else
5289                      p = c;
5290                 */
5291
5292                p = (pa <= pb && pa <= pc) ? a : (pb <= pc) ? b : c;
5293
5294                *rp = (png_byte)(((int)(*rp) + p) & 0xff);
5295                rp++;
5296             }
5297          }  /* end !UseMMX_paeth */
5298          break;
5299
5300       default:
5301          png_warning(png_ptr, "Ignoring bad row-filter type");
5302          *row=0;
5303          break;
5304    }
5305 }
5306
5307 #endif /* PNG_HAVE_ASSEMBLER_READ_FILTER_ROW */
5308
5309
5310 /*===========================================================================*/
5311 /*                                                                           */
5312 /*                      P N G _ M M X _ S U P P O R T                        */
5313 /*                                                                           */
5314 /*===========================================================================*/
5315
5316 /* GRR NOTES:  (1) the following code assumes 386 or better (pushfl/popfl)
5317  *             (2) all instructions compile with gcc 2.7.2.3 and later
5318  *             (3) the function is moved down here to prevent gcc from
5319  *                  inlining it in multiple places and then barfing be-
5320  *                  cause the ".NOT_SUPPORTED" label is multiply defined
5321  *             [is there a way to signal that a *single* function should
5322  *              not be inlined?  is there a way to modify the label for
5323  *              each inlined instance, e.g., by appending _1, _2, etc.?
5324  *              maybe if don't use leading "." in label name? (nope...sigh)]
5325  */
5326
5327 int PNGAPI
5328 png_mmx_support(void)
5329 {
5330 #if defined(PNG_MMX_CODE_SUPPORTED)
5331     __asm__ __volatile__ (
5332         "pushl %%ebx          \n\t"  /* ebx gets clobbered by CPUID instruction */
5333         "pushl %%ecx          \n\t"  /* so does ecx... */
5334         "pushl %%edx          \n\t"  /* ...and edx (but ecx & edx safe on Linux) */
5335 /*      ".byte  0x66          \n\t"  // convert 16-bit pushf to 32-bit pushfd */
5336 /*      "pushf                \n\t"  // 16-bit pushf */
5337         "pushfl               \n\t"  /* save Eflag to stack */
5338         "popl %%eax           \n\t"  /* get Eflag from stack into eax */
5339         "movl %%eax, %%ecx    \n\t"  /* make another copy of Eflag in ecx */
5340         "xorl $0x200000, %%eax \n\t" /* toggle ID bit in Eflag (i.e., bit 21) */
5341         "pushl %%eax          \n\t"  /* save modified Eflag back to stack */
5342 /*      ".byte  0x66          \n\t"  // convert 16-bit popf to 32-bit popfd */
5343 /*      "popf                 \n\t"  // 16-bit popf */
5344         "popfl                \n\t"  /* restore modified value to Eflag reg */
5345         "pushfl               \n\t"  /* save Eflag to stack */
5346         "popl %%eax           \n\t"  /* get Eflag from stack */
5347         "pushl %%ecx          \n\t"  /* save original Eflag to stack */
5348         "popfl                \n\t"  /* restore original Eflag */
5349         "xorl %%ecx, %%eax    \n\t"  /* compare new Eflag with original Eflag */
5350         "jz 0f                \n\t"  /* if same, CPUID instr. is not supported */
5351
5352         "xorl %%eax, %%eax    \n\t"  /* set eax to zero */
5353 /*      ".byte  0x0f, 0xa2    \n\t"  // CPUID instruction (two-byte opcode) */
5354         "cpuid                \n\t"  /* get the CPU identification info */
5355         "cmpl $1, %%eax       \n\t"  /* make sure eax return non-zero value */
5356         "jl 0f                \n\t"  /* if eax is zero, MMX is not supported */
5357
5358         "xorl %%eax, %%eax    \n\t"  /* set eax to zero and... */
5359         "incl %%eax           \n\t"  /* ...increment eax to 1.  This pair is */
5360                                      /* faster than the instruction "mov eax, 1" */
5361         "cpuid                \n\t"  /* get the CPU identification info again */
5362         "andl $0x800000, %%edx \n\t" /* mask out all bits but MMX bit (23) */
5363         "cmpl $0, %%edx       \n\t"  /* 0 = MMX not supported */
5364         "jz 0f                \n\t"  /* non-zero = yes, MMX IS supported */
5365
5366         "movl $1, %%eax       \n\t"  /* set return value to 1 */
5367         "jmp  1f              \n\t"  /* DONE:  have MMX support */
5368
5369     "0:                       \n\t"  /* .NOT_SUPPORTED: target label for jump instructions */
5370         "movl $0, %%eax       \n\t"  /* set return value to 0 */
5371     "1:                       \n\t"  /* .RETURN: target label for jump instructions */
5372         "movl %%eax, _mmx_supported \n\t" /* save in global static variable, too */
5373         "popl %%edx           \n\t"  /* restore edx */
5374         "popl %%ecx           \n\t"  /* restore ecx */
5375         "popl %%ebx           \n\t"  /* restore ebx */
5376
5377 /*      "ret                  \n\t"  // DONE:  no MMX support */
5378                                      /* (fall through to standard C "ret") */
5379
5380         :                            /* output list (none) */
5381
5382         :                            /* any variables used on input (none) */
5383
5384         : "%eax"                     /* clobber list */
5385 /*      , "%ebx", "%ecx", "%edx"     // GRR:  we handle these manually */
5386 /*      , "memory"   // if write to a variable gcc thought was in a reg */
5387 /*      , "cc"       // "condition codes" (flag bits) */
5388     );
5389 #else
5390     _mmx_supported = 0;
5391 #endif /* PNG_MMX_CODE_SUPPORTED */
5392
5393     return _mmx_supported;
5394 }
5395
5396
5397 #endif /* PNG_USE_PNGGCCRD */