/* pngvcrd.c - mixed C/assembler version of utilities to read a PNG file
 *
 * For Intel x86 CPU and Microsoft Visual C++ compiler
 *
 * libpng version 1.2.5rc3 - September 18, 2002
 * For conditions of distribution and use, see copyright notice in png.h
 * Copyright (c) 1998-2002 Glenn Randers-Pehrson
 * Copyright (c) 1998, Intel Corporation
 *
 * Contributed by Nirav Chhatrapati, Intel Corporation, 1998
 * Interface to libpng contributed by Gilles Vollant, 1999
 *
 *
 * In png_do_read_interlace() in libpng versions 1.0.3a through 1.0.4d,
 * a sign error in the post-MMX cleanup code for each pixel_depth resulted
 * in bad pixels at the beginning of some rows of some images, and also
 * (due to out-of-range memory reads and writes) caused heap corruption
 * when compiled with MSVC 6.0.  The error was fixed in version 1.0.4e.
 *
 * [png_read_filter_row_mmx_avg() bpp == 2 bugfix, GRR 20000916]
 *
 * [runtime MMX configuration, GRR 20010102]
 *
 */

#define PNG_INTERNAL
#include "png.h"

#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_USE_PNGVCRD)

static int mmx_supported = 2;


int PNGAPI
png_mmx_support(void)
{
   int mmx_supported_local = 0;
   _asm {
      push ebx          /* CPUID will trash these */
      push ecx
      push edx

      pushfd            /* save Eflag to stack */
      pop eax           /* get Eflag from stack into eax */
      mov ecx, eax      /* make another copy of Eflag in ecx */
      xor eax, 0x200000 /* toggle the ID bit in Eflag [i.e. bit(21)] */
      push eax          /* save modified Eflag back to stack */

      popfd             /* restore modified value back to Eflag reg */
      pushfd            /* save Eflag to stack */
      pop eax           /* get Eflag from stack */
      push ecx          /* save original Eflag to stack */
      popfd             /* restore original Eflag */
      xor eax, ecx      /* compare the new Eflag with the original Eflag */
      jz NOT_SUPPORTED  /* if they are the same, the CPUID instruction is */
                        /* not supported; skip the following instructions */
                        /* and jump to the NOT_SUPPORTED label */

      xor eax, eax      /* set eax to zero */

      _asm _emit 0x0f   /* CPUID instruction (two-byte opcode) */
      _asm _emit 0xa2

      cmp eax, 1        /* make sure eax returned a non-zero value */
      jl NOT_SUPPORTED  /* if eax is zero, mmx is not supported */

      xor eax, eax      /* set eax to zero */
      inc eax           /* now increment eax to 1.  This instruction is */
                        /* faster than the instruction "mov eax, 1" */

      _asm _emit 0x0f   /* CPUID instruction */
      _asm _emit 0xa2

      and edx, 0x00800000  /* mask out all bits but the MMX bit (bit 23) */
      cmp edx, 0           /* 0 = mmx not supported */
      jz NOT_SUPPORTED     /* non-zero = yes, mmx IS supported */

      mov mmx_supported_local, 1  /* set return value to 1 */

NOT_SUPPORTED:
      mov eax, mmx_supported_local  /* move return value to eax */
      pop edx           /* CPUID trashed these */
      pop ecx
      pop ebx
   }

   /* mmx_supported_local = 0; */ /* test code to force the "no MMX" path */
   /* printf("MMX : %u (1=MMX supported)\n", mmx_supported_local); */

   mmx_supported = mmx_supported_local;
   return mmx_supported_local;
}
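
/* A minimal modern-compiler sketch (not in the original source) of the same
 * check, assuming MSVC's <intrin.h> __cpuid intrinsic is available -- it was
 * not when this file was written, hence the hand-emitted opcodes above.
 * Disabled so it cannot affect the build; the helper name is hypothetical.
 */
#if 0
#include <intrin.h>
static int png_mmx_support_sketch(void)
{
   int regs[4];         /* eax, ebx, ecx, edx */
   __cpuid(regs, 0);    /* leaf 0: eax returns the highest supported leaf */
   if (regs[0] < 1)
      return 0;
   __cpuid(regs, 1);    /* leaf 1: feature flags */
   return (regs[3] & (1 << 23)) ? 1 : 0;   /* edx bit 23 == MMX */
}
#endif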

/* Combines the row recently read in with the previous row.
   This routine takes care of alpha and transparency if requested.
   This routine also handles the two methods of progressive display
   of interlaced images, depending on the mask value.
   The mask value describes which pixels are to be combined with
   the row.  The pattern always repeats every 8 pixels, so just 8
   bits are needed.  A one indicates the pixel is to be combined; a
   zero indicates the pixel is to be skipped.  This is in addition
   to any alpha or transparency value associated with the pixel.  If
   you want all pixels to be combined, pass 0xff (255) in mask. */

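/* Illustrative sketch (not part of libpng): the mask semantics described
 * above, written out for the simple 8-bits-per-pixel case.  Bit 7 of the
 * mask corresponds to the first pixel of each 8-pixel group; for Adam7
 * pass 0, for example, only every eighth pixel is present, i.e. mask 0x80.
 * Disabled illustrative code; the helper name is hypothetical.
 */
#if 0
static void combine_row_8bpp_sketch(unsigned char *dp,
   const unsigned char *sp, unsigned long width, int mask)
{
   unsigned long i;
   int m = 0x80;                      /* bit 7 <-> first pixel of group */
   for (i = 0; i < width; i++)
   {
      if (m & mask)                   /* a 1 bit means "combine this pixel" */
         dp[i] = sp[i];
      m = (m == 1) ? 0x80 : (m >> 1); /* pattern repeats every 8 pixels */
   }
}
#endif
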
/* Use this routine for the x86 platform - it uses a faster MMX routine
   if the machine supports MMX. */

void /* PRIVATE */
png_combine_row(png_structp png_ptr, png_bytep row, int mask)
{
#ifdef PNG_USE_LOCAL_ARRAYS
   const int png_pass_inc[7] = {8, 8, 4, 4, 2, 2, 1};
#endif

   png_debug(1,"in png_combine_row_asm\n");

   if (mmx_supported == 2) {
       /* this should have happened in png_init_mmx_flags() already */
       png_warning(png_ptr, "asm_flags may not have been initialized");
       png_mmx_support();
   }

   if (mask == 0xff)
   {
      png_memcpy(row, png_ptr->row_buf + 1,
       (png_size_t)((png_ptr->width * png_ptr->row_info.pixel_depth + 7) >> 3));
   }
   /* GRR:  add "else if (mask == 0)" case?
    *       or does png_combine_row() not even get called in that case? */
   else
   {
      switch (png_ptr->row_info.pixel_depth)
      {
         case 1:
         {
            png_bytep sp;
            png_bytep dp;
            int s_inc, s_start, s_end;
            int m;
            int shift;
            png_uint_32 i;

            sp = png_ptr->row_buf + 1;
            dp = row;
            m = 0x80;
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
            if (png_ptr->transformations & PNG_PACKSWAP)
            {
               s_start = 0;
               s_end = 7;
               s_inc = 1;
            }
            else
#endif
            {
               s_start = 7;
               s_end = 0;
               s_inc = -1;
            }

            shift = s_start;

            for (i = 0; i < png_ptr->width; i++)
            {
               if (m & mask)
               {
                  int value;

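                  /* Note: 0x7f7f >> (7 - shift), masked to one byte, equals
                   * ~(1 << shift); it clears just the destination bit that
                   * is about to be replaced (e.g. shift == 5 gives 0xdf). */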
                  value = (*sp >> shift) & 0x1;
                  *dp &= (png_byte)((0x7f7f >> (7 - shift)) & 0xff);
                  *dp |= (png_byte)(value << shift);
               }

               if (shift == s_end)
               {
                  shift = s_start;
                  sp++;
                  dp++;
               }
               else
                  shift += s_inc;

               if (m == 1)
                  m = 0x80;
               else
                  m >>= 1;
            }
            break;
         }

         case 2:
         {
            png_bytep sp;
            png_bytep dp;
            int s_start, s_end, s_inc;
            int m;
            int shift;
            png_uint_32 i;
            int value;

            sp = png_ptr->row_buf + 1;
            dp = row;
            m = 0x80;
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
            if (png_ptr->transformations & PNG_PACKSWAP)
            {
               s_start = 0;
               s_end = 6;
               s_inc = 2;
            }
            else
#endif
            {
               s_start = 6;
               s_end = 0;
               s_inc = -2;
            }

            shift = s_start;

            for (i = 0; i < png_ptr->width; i++)
            {
               if (m & mask)
               {
                  value = (*sp >> shift) & 0x3;
                  *dp &= (png_byte)((0x3f3f >> (6 - shift)) & 0xff);
                  *dp |= (png_byte)(value << shift);
               }

               if (shift == s_end)
               {
                  shift = s_start;
                  sp++;
                  dp++;
               }
               else
                  shift += s_inc;
               if (m == 1)
                  m = 0x80;
               else
                  m >>= 1;
            }
            break;
         }

         case 4:
         {
            png_bytep sp;
            png_bytep dp;
            int s_start, s_end, s_inc;
            int m;
            int shift;
            png_uint_32 i;
            int value;

            sp = png_ptr->row_buf + 1;
            dp = row;
            m = 0x80;
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
            if (png_ptr->transformations & PNG_PACKSWAP)
            {
               s_start = 0;
               s_end = 4;
               s_inc = 4;
            }
            else
#endif
            {
               s_start = 4;
               s_end = 0;
               s_inc = -4;
            }
            shift = s_start;

            for (i = 0; i < png_ptr->width; i++)
            {
               if (m & mask)
               {
                  value = (*sp >> shift) & 0xf;
                  *dp &= (png_byte)((0xf0f >> (4 - shift)) & 0xff);
                  *dp |= (png_byte)(value << shift);
               }

               if (shift == s_end)
               {
                  shift = s_start;
                  sp++;
                  dp++;
               }
               else
                  shift += s_inc;
               if (m == 1)
                  m = 0x80;
               else
                  m >>= 1;
            }
            break;
         }

         case 8:
         {
            png_bytep srcptr;
            png_bytep dstptr;
            png_uint_32 len;
            int m;
            int diff, unmask;

            __int64 mask0=0x0102040810204080;

            if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
                /* && mmx_supported */ )
            {
               srcptr = png_ptr->row_buf + 1;
               dstptr = row;
               m = 0x80;
               unmask = ~mask;
               len = png_ptr->width &~7;   /* reduce to multiple of 8 */
               diff = png_ptr->width & 7;  /* amount lost */

               _asm
               {
                  movd mm7, unmask     /* load bit pattern */
                  psubb mm6,mm6        /* zero mm6 */
                  punpcklbw mm7,mm7
                  punpcklwd mm7,mm7
                  punpckldq mm7,mm7    /* fill register with 8 masks */

                  movq mm0,mask0

                  pand mm0,mm7         /* nonzero if keep byte */
                  pcmpeqb mm0,mm6      /* bytes that were 0 -> 0xff, others -> 0 */

                  mov ecx,len          /* load length of line (pixels) */
                  mov esi,srcptr       /* load source */
                  mov ebx,dstptr       /* load dest */
                  cmp ecx,0            /* lcr */
                  je mainloop8end

mainloop8:
                  movq mm4,[esi]
                  pand mm4,mm0
                  movq mm6,mm0
                  pandn mm6,[ebx]
                  por mm4,mm6
                  movq [ebx],mm4

                  add esi,8            /* inc by 8 bytes processed */
                  add ebx,8
                  sub ecx,8            /* dec by 8 pixels processed */

                  ja mainloop8
mainloop8end:

                  mov ecx,diff
                  cmp ecx,0
                  jz end8

                  mov edx,mask
                  sal edx,24           /* make low byte the high byte */

secondloop8:
                  sal edx,1            /* move high bit to CF */
                  jnc skip8            /* if CF = 0 */
                  mov al,[esi]
                  mov [ebx],al
skip8:
                  inc esi
                  inc ebx

                  dec ecx
                  jnz secondloop8
end8:
                  emms
               }
            }
            else /* mmx not supported - use modified C routine */
            {
               register unsigned int incr1, initial_val, final_val;
               png_size_t pixel_bytes;
               png_uint_32 i;
               register int disp = png_pass_inc[png_ptr->pass];
               int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};

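               /* e.g. Adam7 pass 1 covers pixels 4, 12, 20, ...:
                * offset_table[1] == 4 gives the first pixel of the pass
                * and png_pass_inc[1] == 8 the stride between pixels. */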
               pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
               srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
                  pixel_bytes;
               dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
               initial_val = offset_table[png_ptr->pass]*pixel_bytes;
               final_val = png_ptr->width*pixel_bytes;
               incr1 = (disp)*pixel_bytes;
               for (i = initial_val; i < final_val; i += incr1)
               {
                  png_memcpy(dstptr, srcptr, pixel_bytes);
                  srcptr += incr1;
                  dstptr += incr1;
               }
            } /* end of else */

            break;
         } /* end 8 bpp */
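
         /* Note (sketch, not in the original): the movd/punpck sequence
          * above broadcasts ~mask into all 8 bytes of mm7; mask0 holds 0x80
          * in byte 0 down to 0x01 in byte 7, so after the pand and pcmpeqb
          * each byte of mm0 is 0xff exactly when mask selects that pixel,
          * and the loop computes dst = (src & mm0) | (dst & ~mm0).  The
          * 16-, 24-, 32-, and 48-bpp cases below build 2, 3, 4, and 6 such
          * byte masks per 8-pixel group in the same way. */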

         case 16:
         {
            png_bytep srcptr;
            png_bytep dstptr;
            png_uint_32 len;
            int unmask, diff;
            __int64 mask1=0x0101020204040808,
                    mask0=0x1010202040408080;

            if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
                /* && mmx_supported */ )
            {
               srcptr = png_ptr->row_buf + 1;
               dstptr = row;

               unmask = ~mask;
               len = (png_ptr->width)&~7;
               diff = (png_ptr->width)&7;
               _asm
               {
                  movd mm7, unmask     /* load bit pattern */
                  psubb mm6,mm6        /* zero mm6 */
                  punpcklbw mm7,mm7
                  punpcklwd mm7,mm7
                  punpckldq mm7,mm7    /* fill register with 8 masks */

                  movq mm0,mask0
                  movq mm1,mask1

                  pand mm0,mm7
                  pand mm1,mm7

                  pcmpeqb mm0,mm6
                  pcmpeqb mm1,mm6

                  mov ecx,len          /* load length of line */
                  mov esi,srcptr       /* load source */
                  mov ebx,dstptr       /* load dest */
                  cmp ecx,0            /* lcr */
                  jz mainloop16end

mainloop16:
                  movq mm4,[esi]
                  pand mm4,mm0
                  movq mm6,mm0
                  movq mm7,[ebx]
                  pandn mm6,mm7
                  por mm4,mm6
                  movq [ebx],mm4

                  movq mm5,[esi+8]
                  pand mm5,mm1
                  movq mm7,mm1
                  movq mm6,[ebx+8]
                  pandn mm7,mm6
                  por mm5,mm7
                  movq [ebx+8],mm5

                  add esi,16           /* inc by 16 bytes processed */
                  add ebx,16
                  sub ecx,8            /* dec by 8 pixels processed */

                  ja mainloop16

mainloop16end:
                  mov ecx,diff
                  cmp ecx,0
                  jz end16

                  mov edx,mask
                  sal edx,24           /* make low byte the high byte */
secondloop16:
                  sal edx,1            /* move high bit to CF */
                  jnc skip16           /* if CF = 0 */
                  mov ax,[esi]
                  mov [ebx],ax
skip16:
                  add esi,2
                  add ebx,2

                  dec ecx
                  jnz secondloop16
end16:
                  emms
               }
            }
            else /* mmx not supported - use modified C routine */
            {
               register unsigned int incr1, initial_val, final_val;
               png_size_t pixel_bytes;
               png_uint_32 i;
               register int disp = png_pass_inc[png_ptr->pass];
               int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};

               pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
               srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
                  pixel_bytes;
               dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
               initial_val = offset_table[png_ptr->pass]*pixel_bytes;
               final_val = png_ptr->width*pixel_bytes;
               incr1 = (disp)*pixel_bytes;
               for (i = initial_val; i < final_val; i += incr1)
               {
                  png_memcpy(dstptr, srcptr, pixel_bytes);
                  srcptr += incr1;
                  dstptr += incr1;
               }
            } /* end of else */

            break;
         } /* end 16 bpp */

         case 24:
         {
            png_bytep srcptr;
            png_bytep dstptr;
            png_uint_32 len;
            int unmask, diff;

            __int64 mask2=0x0101010202020404,  /* 24bpp */
                    mask1=0x0408080810101020,
                    mask0=0x2020404040808080;

            srcptr = png_ptr->row_buf + 1;
            dstptr = row;

            unmask = ~mask;
            len = (png_ptr->width)&~7;
            diff = (png_ptr->width)&7;

            if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
                /* && mmx_supported */ )
            {
               _asm
               {
                  movd mm7, unmask     /* load bit pattern */
                  psubb mm6,mm6        /* zero mm6 */
                  punpcklbw mm7,mm7
                  punpcklwd mm7,mm7
                  punpckldq mm7,mm7    /* fill register with 8 masks */

                  movq mm0,mask0
                  movq mm1,mask1
                  movq mm2,mask2

                  pand mm0,mm7
                  pand mm1,mm7
                  pand mm2,mm7

                  pcmpeqb mm0,mm6
                  pcmpeqb mm1,mm6
                  pcmpeqb mm2,mm6

                  mov ecx,len          /* load length of line */
                  mov esi,srcptr       /* load source */
                  mov ebx,dstptr       /* load dest */
                  cmp ecx,0
                  jz mainloop24end

mainloop24:
                  movq mm4,[esi]
                  pand mm4,mm0
                  movq mm6,mm0
                  movq mm7,[ebx]
                  pandn mm6,mm7
                  por mm4,mm6
                  movq [ebx],mm4


                  movq mm5,[esi+8]
                  pand mm5,mm1
                  movq mm7,mm1
                  movq mm6,[ebx+8]
                  pandn mm7,mm6
                  por mm5,mm7
                  movq [ebx+8],mm5

                  movq mm6,[esi+16]
                  pand mm6,mm2
                  movq mm4,mm2
                  movq mm7,[ebx+16]
                  pandn mm4,mm7
                  por mm6,mm4
                  movq [ebx+16],mm6

                  add esi,24           /* inc by 24 bytes processed */
                  add ebx,24
                  sub ecx,8            /* dec by 8 pixels processed */

                  ja mainloop24

mainloop24end:
                  mov ecx,diff
                  cmp ecx,0
                  jz end24

                  mov edx,mask
                  sal edx,24           /* make low byte the high byte */
secondloop24:
                  sal edx,1            /* move high bit to CF */
                  jnc skip24           /* if CF = 0 */
                  mov ax,[esi]
                  mov [ebx],ax
                  xor eax,eax
                  mov al,[esi+2]
                  mov [ebx+2],al
skip24:
                  add esi,3
                  add ebx,3

                  dec ecx
                  jnz secondloop24

end24:
                  emms
               }
            }
            else /* mmx not supported - use modified C routine */
            {
               register unsigned int incr1, initial_val, final_val;
               png_size_t pixel_bytes;
               png_uint_32 i;
               register int disp = png_pass_inc[png_ptr->pass];
               int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};

               pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
               srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
                  pixel_bytes;
               dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
               initial_val = offset_table[png_ptr->pass]*pixel_bytes;
               final_val = png_ptr->width*pixel_bytes;
               incr1 = (disp)*pixel_bytes;
               for (i = initial_val; i < final_val; i += incr1)
               {
                  png_memcpy(dstptr, srcptr, pixel_bytes);
                  srcptr += incr1;
                  dstptr += incr1;
               }
            } /* end of else */

            break;
         } /* end 24 bpp */

         case 32:
         {
            png_bytep srcptr;
            png_bytep dstptr;
            png_uint_32 len;
            int unmask, diff;

            __int64 mask3=0x0101010102020202,  /* 32bpp */
                    mask2=0x0404040408080808,
                    mask1=0x1010101020202020,
                    mask0=0x4040404080808080;

            srcptr = png_ptr->row_buf + 1;
            dstptr = row;

            unmask = ~mask;
            len = (png_ptr->width)&~7;
            diff = (png_ptr->width)&7;

            if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
                /* && mmx_supported */ )
            {
               _asm
               {
                  movd mm7, unmask     /* load bit pattern */
                  psubb mm6,mm6        /* zero mm6 */
                  punpcklbw mm7,mm7
                  punpcklwd mm7,mm7
                  punpckldq mm7,mm7    /* fill register with 8 masks */

                  movq mm0,mask0
                  movq mm1,mask1
                  movq mm2,mask2
                  movq mm3,mask3

                  pand mm0,mm7
                  pand mm1,mm7
                  pand mm2,mm7
                  pand mm3,mm7

                  pcmpeqb mm0,mm6
                  pcmpeqb mm1,mm6
                  pcmpeqb mm2,mm6
                  pcmpeqb mm3,mm6

                  mov ecx,len          /* load length of line */
                  mov esi,srcptr       /* load source */
                  mov ebx,dstptr       /* load dest */

                  cmp ecx,0            /* lcr */
                  jz mainloop32end

mainloop32:
                  movq mm4,[esi]
                  pand mm4,mm0
                  movq mm6,mm0
                  movq mm7,[ebx]
                  pandn mm6,mm7
                  por mm4,mm6
                  movq [ebx],mm4

                  movq mm5,[esi+8]
                  pand mm5,mm1
                  movq mm7,mm1
                  movq mm6,[ebx+8]
                  pandn mm7,mm6
                  por mm5,mm7
                  movq [ebx+8],mm5

                  movq mm6,[esi+16]
                  pand mm6,mm2
                  movq mm4,mm2
                  movq mm7,[ebx+16]
                  pandn mm4,mm7
                  por mm6,mm4
                  movq [ebx+16],mm6

                  movq mm7,[esi+24]
                  pand mm7,mm3
                  movq mm5,mm3
                  movq mm4,[ebx+24]
                  pandn mm5,mm4
                  por mm7,mm5
                  movq [ebx+24],mm7

                  add esi,32           /* inc by 32 bytes processed */
                  add ebx,32
                  sub ecx,8            /* dec by 8 pixels processed */

                  ja mainloop32

mainloop32end:
                  mov ecx,diff
                  cmp ecx,0
                  jz end32

                  mov edx,mask
                  sal edx,24           /* make low byte the high byte */
secondloop32:
                  sal edx,1            /* move high bit to CF */
                  jnc skip32           /* if CF = 0 */
                  mov eax,[esi]
                  mov [ebx],eax
skip32:
                  add esi,4
                  add ebx,4

                  dec ecx
                  jnz secondloop32

end32:
                  emms
               }
            }
            else /* mmx not supported - use modified C routine */
            {
               register unsigned int incr1, initial_val, final_val;
               png_size_t pixel_bytes;
               png_uint_32 i;
               register int disp = png_pass_inc[png_ptr->pass];
               int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};

               pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
               srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
                  pixel_bytes;
               dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
               initial_val = offset_table[png_ptr->pass]*pixel_bytes;
               final_val = png_ptr->width*pixel_bytes;
               incr1 = (disp)*pixel_bytes;
               for (i = initial_val; i < final_val; i += incr1)
               {
                  png_memcpy(dstptr, srcptr, pixel_bytes);
                  srcptr += incr1;
                  dstptr += incr1;
               }
            } /* end of else */

            break;
         } /* end 32 bpp */

         case 48:
         {
            png_bytep srcptr;
            png_bytep dstptr;
            png_uint_32 len;
            int unmask, diff;

            __int64 mask5=0x0101010101010202,
                    mask4=0x0202020204040404,
                    mask3=0x0404080808080808,
                    mask2=0x1010101010102020,
                    mask1=0x2020202040404040,
                    mask0=0x4040808080808080;

            if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
                /* && mmx_supported */ )
            {
               srcptr = png_ptr->row_buf + 1;
               dstptr = row;

               unmask = ~mask;
               len = (png_ptr->width)&~7;
               diff = (png_ptr->width)&7;
               _asm
               {
                  movd mm7, unmask     /* load bit pattern */
                  psubb mm6,mm6        /* zero mm6 */
                  punpcklbw mm7,mm7
                  punpcklwd mm7,mm7
                  punpckldq mm7,mm7    /* fill register with 8 masks */

                  movq mm0,mask0
                  movq mm1,mask1
                  movq mm2,mask2
                  movq mm3,mask3
                  movq mm4,mask4
                  movq mm5,mask5

                  pand mm0,mm7
                  pand mm1,mm7
                  pand mm2,mm7
                  pand mm3,mm7
                  pand mm4,mm7
                  pand mm5,mm7

                  pcmpeqb mm0,mm6
                  pcmpeqb mm1,mm6
                  pcmpeqb mm2,mm6
                  pcmpeqb mm3,mm6
                  pcmpeqb mm4,mm6
                  pcmpeqb mm5,mm6

                  mov ecx,len          /* load length of line */
                  mov esi,srcptr       /* load source */
                  mov ebx,dstptr       /* load dest */

                  cmp ecx,0
                  jz mainloop48end

mainloop48:
                  movq mm7,[esi]
                  pand mm7,mm0
                  movq mm6,mm0
                  pandn mm6,[ebx]
                  por mm7,mm6
                  movq [ebx],mm7

                  movq mm6,[esi+8]
                  pand mm6,mm1
                  movq mm7,mm1
                  pandn mm7,[ebx+8]
                  por mm6,mm7
                  movq [ebx+8],mm6

                  movq mm6,[esi+16]
                  pand mm6,mm2
                  movq mm7,mm2
                  pandn mm7,[ebx+16]
                  por mm6,mm7
                  movq [ebx+16],mm6

                  movq mm7,[esi+24]
                  pand mm7,mm3
                  movq mm6,mm3
                  pandn mm6,[ebx+24]
                  por mm7,mm6
                  movq [ebx+24],mm7

                  movq mm6,[esi+32]
                  pand mm6,mm4
                  movq mm7,mm4
                  pandn mm7,[ebx+32]
                  por mm6,mm7
                  movq [ebx+32],mm6

                  movq mm7,[esi+40]
                  pand mm7,mm5
                  movq mm6,mm5
                  pandn mm6,[ebx+40]
                  por mm7,mm6
                  movq [ebx+40],mm7

                  add esi,48           /* inc by 48 bytes processed */
                  add ebx,48
                  sub ecx,8            /* dec by 8 pixels processed */

                  ja mainloop48
mainloop48end:

                  mov ecx,diff
                  cmp ecx,0
                  jz end48

                  mov edx,mask
                  sal edx,24           /* make low byte the high byte */

secondloop48:
                  sal edx,1            /* move high bit to CF */
                  jnc skip48           /* if CF = 0 */
                  mov eax,[esi]
                  mov [ebx],eax
skip48:
                  add esi,4
                  add ebx,4

                  dec ecx
                  jnz secondloop48

end48:
                  emms
               }
            }
            else /* mmx not supported - use modified C routine */
            {
               register unsigned int incr1, initial_val, final_val;
               png_size_t pixel_bytes;
               png_uint_32 i;
               register int disp = png_pass_inc[png_ptr->pass];
               int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};

               pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
               srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
                  pixel_bytes;
               dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
               initial_val = offset_table[png_ptr->pass]*pixel_bytes;
               final_val = png_ptr->width*pixel_bytes;
               incr1 = (disp)*pixel_bytes;
               for (i = initial_val; i < final_val; i += incr1)
               {
                  png_memcpy(dstptr, srcptr, pixel_bytes);
                  srcptr += incr1;
                  dstptr += incr1;
               }
            } /* end of else */

            break;
         } /* end 48 bpp */

         default:
         {
            png_bytep sptr;
            png_bytep dp;
            png_size_t pixel_bytes;
            int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
            unsigned int i;
            register int disp = png_pass_inc[png_ptr->pass];  /* get the offset */
            register unsigned int incr1, initial_val, final_val;

            pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
            sptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
               pixel_bytes;
            dp = row + offset_table[png_ptr->pass]*pixel_bytes;
            initial_val = offset_table[png_ptr->pass]*pixel_bytes;
            final_val = png_ptr->width*pixel_bytes;
            incr1 = (disp)*pixel_bytes;
            for (i = initial_val; i < final_val; i += incr1)
            {
               png_memcpy(dp, sptr, pixel_bytes);
               sptr += incr1;
               dp += incr1;
            }
            break;
         }
      } /* end switch (png_ptr->row_info.pixel_depth) */
   } /* end if (non-trivial mask) */

} /* end png_combine_row() */


#if defined(PNG_READ_INTERLACING_SUPPORTED)

void /* PRIVATE */
png_do_read_interlace(png_structp png_ptr)
{
   png_row_infop row_info = &(png_ptr->row_info);
   png_bytep row = png_ptr->row_buf + 1;
   int pass = png_ptr->pass;
   png_uint_32 transformations = png_ptr->transformations;
#ifdef PNG_USE_LOCAL_ARRAYS
   const int png_pass_inc[7] = {8, 8, 4, 4, 2, 2, 1};
#endif

   png_debug(1,"in png_do_read_interlace\n");

   if (mmx_supported == 2) {
       /* this should have happened in png_init_mmx_flags() already */
       png_warning(png_ptr, "asm_flags may not have been initialized");
       png_mmx_support();
   }

   if (row != NULL && row_info != NULL)
   {
      png_uint_32 final_width;

      final_width = row_info->width * png_pass_inc[pass];
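      /* e.g. Adam7 pass 0 replicates each pixel 8 times, so a width-10
       * row becomes an 80-pixel row here. */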

      switch (row_info->pixel_depth)
      {
         case 1:
         {
            png_bytep sp, dp;
            int sshift, dshift;
            int s_start, s_end, s_inc;
            png_byte v;
            png_uint_32 i;
            int j;

            sp = row + (png_size_t)((row_info->width - 1) >> 3);
            dp = row + (png_size_t)((final_width - 1) >> 3);
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
            if (transformations & PNG_PACKSWAP)
            {
               sshift = (int)((row_info->width + 7) & 7);
               dshift = (int)((final_width + 7) & 7);
               s_start = 7;
               s_end = 0;
               s_inc = -1;
            }
            else
#endif
            {
               sshift = 7 - (int)((row_info->width + 7) & 7);
               dshift = 7 - (int)((final_width + 7) & 7);
               s_start = 0;
               s_end = 7;
               s_inc = 1;
            }

            for (i = row_info->width; i; i--)
            {
               v = (png_byte)((*sp >> sshift) & 0x1);
               for (j = 0; j < png_pass_inc[pass]; j++)
               {
                  *dp &= (png_byte)((0x7f7f >> (7 - dshift)) & 0xff);
                  *dp |= (png_byte)(v << dshift);
                  if (dshift == s_end)
                  {
                     dshift = s_start;
                     dp--;
                  }
                  else
                     dshift += s_inc;
               }
               if (sshift == s_end)
               {
                  sshift = s_start;
                  sp--;
               }
               else
                  sshift += s_inc;
            }
            break;
         }

         case 2:
         {
            png_bytep sp, dp;
            int sshift, dshift;
            int s_start, s_end, s_inc;
            png_uint_32 i;

            sp = row + (png_size_t)((row_info->width - 1) >> 2);
            dp = row + (png_size_t)((final_width - 1) >> 2);
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
            if (transformations & PNG_PACKSWAP)
            {
               sshift = (png_size_t)(((row_info->width + 3) & 3) << 1);
               dshift = (png_size_t)(((final_width + 3) & 3) << 1);
               s_start = 6;
               s_end = 0;
               s_inc = -2;
            }
            else
#endif
            {
               sshift = (png_size_t)((3 - ((row_info->width + 3) & 3)) << 1);
               dshift = (png_size_t)((3 - ((final_width + 3) & 3)) << 1);
               s_start = 0;
               s_end = 6;
               s_inc = 2;
            }

            for (i = row_info->width; i; i--)
            {
               png_byte v;
               int j;

               v = (png_byte)((*sp >> sshift) & 0x3);
               for (j = 0; j < png_pass_inc[pass]; j++)
               {
                  *dp &= (png_byte)((0x3f3f >> (6 - dshift)) & 0xff);
                  *dp |= (png_byte)(v << dshift);
                  if (dshift == s_end)
                  {
                     dshift = s_start;
                     dp--;
                  }
                  else
                     dshift += s_inc;
               }
               if (sshift == s_end)
               {
                  sshift = s_start;
                  sp--;
               }
               else
                  sshift += s_inc;
            }
            break;
         }

         case 4:
         {
            png_bytep sp, dp;
            int sshift, dshift;
            int s_start, s_end, s_inc;
            png_uint_32 i;

            sp = row + (png_size_t)((row_info->width - 1) >> 1);
            dp = row + (png_size_t)((final_width - 1) >> 1);
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
            if (transformations & PNG_PACKSWAP)
            {
               sshift = (png_size_t)(((row_info->width + 1) & 1) << 2);
               dshift = (png_size_t)(((final_width + 1) & 1) << 2);
               s_start = 4;
               s_end = 0;
               s_inc = -4;
            }
            else
#endif
            {
               sshift = (png_size_t)((1 - ((row_info->width + 1) & 1)) << 2);
               dshift = (png_size_t)((1 - ((final_width + 1) & 1)) << 2);
               s_start = 0;
               s_end = 4;
               s_inc = 4;
            }

            for (i = row_info->width; i; i--)
            {
               png_byte v;
               int j;

               v = (png_byte)((*sp >> sshift) & 0xf);
               for (j = 0; j < png_pass_inc[pass]; j++)
               {
                  *dp &= (png_byte)((0xf0f >> (4 - dshift)) & 0xff);
                  *dp |= (png_byte)(v << dshift);
                  if (dshift == s_end)
                  {
                     dshift = s_start;
                     dp--;
                  }
                  else
                     dshift += s_inc;
               }
               if (sshift == s_end)
               {
                  sshift = s_start;
                  sp--;
               }
               else
                  sshift += s_inc;
            }
            break;
         }

         default:  /* This is the place where the routine is modified */
         {
            __int64 const4 = 0x0000000000FFFFFF;
            /* __int64 const5 = 0x000000FFFFFF0000; */  /* unused */
            __int64 const6 = 0x00000000000000FF;
            png_bytep sptr, dp;
            png_uint_32 i;
            png_size_t pixel_bytes;
            int width = row_info->width;

            pixel_bytes = (row_info->pixel_depth >> 3);

            sptr = row + (width - 1) * pixel_bytes;
            dp = row + (final_width - 1) * pixel_bytes;
            /* New code by Nirav Chhatrapati - Intel Corporation */
            /* sign fix by GRR */
            /* NOTE:  there is NO MMX code for 48-bit and 64-bit images */

            /* use MMX routine if machine supports it */
            if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_INTERLACE)
                /* && mmx_supported */ )
            {
               if (pixel_bytes == 3)
               {
                  if (((pass == 0) || (pass == 1)) && width)
                  {
                     _asm
                     {
                        mov esi, sptr
                        mov edi, dp
                        mov ecx, width
                        sub edi, 21   /* (png_pass_inc[pass] - 1)*pixel_bytes */
loop_pass0:
                        movd mm0, [esi]     ; X X X X X v2 v1 v0
                        pand mm0, const4    ; 0 0 0 0 0 v2 v1 v0
                        movq mm1, mm0       ; 0 0 0 0 0 v2 v1 v0
                        psllq mm0, 16       ; 0 0 0 v2 v1 v0 0 0
                        movq mm2, mm0       ; 0 0 0 v2 v1 v0 0 0
                        psllq mm0, 24       ; v2 v1 v0 0 0 0 0 0
                        psrlq mm1, 8        ; 0 0 0 0 0 0 v2 v1
                        por mm0, mm2        ; v2 v1 v0 v2 v1 v0 0 0
                        por mm0, mm1        ; v2 v1 v0 v2 v1 v0 v2 v1
                        movq mm3, mm0       ; v2 v1 v0 v2 v1 v0 v2 v1
                        psllq mm0, 16       ; v0 v2 v1 v0 v2 v1 0 0
                        movq mm4, mm3       ; v2 v1 v0 v2 v1 v0 v2 v1
                        punpckhdq mm3, mm0  ; v0 v2 v1 v0 v2 v1 v0 v2
                        movq [edi+16], mm4
                        psrlq mm0, 32       ; 0 0 0 0 v0 v2 v1 v0
                        movq [edi+8], mm3
                        punpckldq mm0, mm4  ; v1 v0 v2 v1 v0 v2 v1 v0
                        sub esi, 3
                        movq [edi], mm0
                        sub edi, 24
                        /* sub esi, 3 */
                        dec ecx
                        jnz loop_pass0
                        EMMS
                     }
                  }
                  else if (((pass == 2) || (pass == 3)) && width)
                  {
                     _asm
                     {
                        mov esi, sptr
                        mov edi, dp
                        mov ecx, width
                        sub edi, 9   /* (png_pass_inc[pass] - 1)*pixel_bytes */
loop_pass2:
                        movd mm0, [esi]     ; X X X X X v2 v1 v0
                        pand mm0, const4    ; 0 0 0 0 0 v2 v1 v0
                        movq mm1, mm0       ; 0 0 0 0 0 v2 v1 v0
                        psllq mm0, 16       ; 0 0 0 v2 v1 v0 0 0
                        movq mm2, mm0       ; 0 0 0 v2 v1 v0 0 0
                        psllq mm0, 24       ; v2 v1 v0 0 0 0 0 0
                        psrlq mm1, 8        ; 0 0 0 0 0 0 v2 v1
                        por mm0, mm2        ; v2 v1 v0 v2 v1 v0 0 0
                        por mm0, mm1        ; v2 v1 v0 v2 v1 v0 v2 v1
                        movq [edi+4], mm0   ; move to memory
                        psrlq mm0, 16       ; 0 0 v2 v1 v0 v2 v1 v0
                        movd [edi], mm0     ; move to memory
                        sub esi, 3
                        sub edi, 12
                        dec ecx
                        jnz loop_pass2
                        EMMS
                     }
                  }
                  else if (width)  /* && ((pass == 4) || (pass == 5)) */
                  {
                     int width_mmx = ((width >> 1) << 1) - 8;
                     if (width_mmx < 0)
                        width_mmx = 0;
                     width -= width_mmx;   /* 8 or 9 pix, 24 or 27 bytes */
                     if (width_mmx)
                     {
                        _asm
                        {
                           mov esi, sptr
                           mov edi, dp
                           mov ecx, width_mmx
                           sub esi, 3
                           sub edi, 9
loop_pass4:
                           movq mm0, [esi]     ; X X v2 v1 v0 v5 v4 v3
                           movq mm7, mm0       ; X X v2 v1 v0 v5 v4 v3
                           movq mm6, mm0       ; X X v2 v1 v0 v5 v4 v3
                           psllq mm0, 24       ; v1 v0 v5 v4 v3 0 0 0
                           pand mm7, const4    ; 0 0 0 0 0 v5 v4 v3
                           psrlq mm6, 24       ; 0 0 0 X X v2 v1 v0
                           por mm0, mm7        ; v1 v0 v5 v4 v3 v5 v4 v3
                           movq mm5, mm6       ; 0 0 0 X X v2 v1 v0
                           psllq mm6, 8        ; 0 0 X X v2 v1 v0 0
                           movq [edi], mm0     ; move quad to memory
                           psrlq mm5, 16       ; 0 0 0 0 0 X X v2
                           pand mm5, const6    ; 0 0 0 0 0 0 0 v2
                           por mm6, mm5        ; 0 0 X X v2 v1 v0 v2
                           movd [edi+8], mm6   ; move double to memory
                           sub esi, 6
                           sub edi, 12
                           sub ecx, 2
                           jnz loop_pass4
                           EMMS
                        }
                     }

                     sptr -= width_mmx*3;
                     dp -= width_mmx*6;
                     for (i = width; i; i--)
                     {
                        png_byte v[8];
                        int j;

                        png_memcpy(v, sptr, 3);
                        for (j = 0; j < png_pass_inc[pass]; j++)
                        {
                           png_memcpy(dp, v, 3);
                           dp -= 3;
                        }
                        sptr -= 3;
                     }
                  }
               } /* end of pixel_bytes == 3 */

               else if (pixel_bytes == 1)
               {
                  if (((pass == 0) || (pass == 1)) && width)
                  {
                     int width_mmx = ((width >> 2) << 2);
                     width -= width_mmx;
                     if (width_mmx)
                     {
                        _asm
                        {
                           mov esi, sptr
                           mov edi, dp
                           mov ecx, width_mmx
                           sub edi, 31
                           sub esi, 3
loop1_pass0:
                           movd mm0, [esi]     ; X X X X v0 v1 v2 v3
                           movq mm1, mm0       ; X X X X v0 v1 v2 v3
                           punpcklbw mm0, mm0  ; v0 v0 v1 v1 v2 v2 v3 v3
                           movq mm2, mm0       ; v0 v0 v1 v1 v2 v2 v3 v3
                           punpcklwd mm0, mm0  ; v2 v2 v2 v2 v3 v3 v3 v3
                           movq mm3, mm0       ; v2 v2 v2 v2 v3 v3 v3 v3
                           punpckldq mm0, mm0  ; v3 v3 v3 v3 v3 v3 v3 v3
                           punpckhdq mm3, mm3  ; v2 v2 v2 v2 v2 v2 v2 v2
                           movq [edi], mm0     ; move to memory v3
                           punpckhwd mm2, mm2  ; v0 v0 v0 v0 v1 v1 v1 v1
                           movq [edi+8], mm3   ; move to memory v2
                           movq mm4, mm2       ; v0 v0 v0 v0 v1 v1 v1 v1
                           punpckldq mm2, mm2  ; v1 v1 v1 v1 v1 v1 v1 v1
                           punpckhdq mm4, mm4  ; v0 v0 v0 v0 v0 v0 v0 v0
                           movq [edi+16], mm2  ; move to memory v1
                           movq [edi+24], mm4  ; move to memory v0
                           sub esi, 4
                           sub edi, 32
                           sub ecx, 4
                           jnz loop1_pass0
                           EMMS
                        }
                     }

                     sptr -= width_mmx;
                     dp -= width_mmx*8;
                     for (i = width; i; i--)
                     {
                        int j;

                        /* I simplified this part in version 1.0.4e
                         * here and in several other instances where
                         * pixel_bytes == 1 -- GR-P
                         *
                         * Original code:
                         *
                         * png_byte v[8];
                         * png_memcpy(v, sptr, pixel_bytes);
                         * for (j = 0; j < png_pass_inc[pass]; j++)
                         * {
                         *    png_memcpy(dp, v, pixel_bytes);
                         *    dp -= pixel_bytes;
                         * }
                         * sptr -= pixel_bytes;
                         *
                         * Replacement code is in the next three lines:
                         */

                        for (j = 0; j < png_pass_inc[pass]; j++)
                           *dp-- = *sptr;
                        sptr--;
                     }
                  }
                  else if (((pass == 2) || (pass == 3)) && width)
                  {
                     int width_mmx = ((width >> 2) << 2);
                     width -= width_mmx;
                     if (width_mmx)
                     {
                        _asm
                        {
                           mov esi, sptr
                           mov edi, dp
                           mov ecx, width_mmx
                           sub edi, 15
                           sub esi, 3
loop1_pass2:
                           movd mm0, [esi]     ; X X X X v0 v1 v2 v3
                           punpcklbw mm0, mm0  ; v0 v0 v1 v1 v2 v2 v3 v3
                           movq mm1, mm0       ; v0 v0 v1 v1 v2 v2 v3 v3
                           punpcklwd mm0, mm0  ; v2 v2 v2 v2 v3 v3 v3 v3
                           punpckhwd mm1, mm1  ; v0 v0 v0 v0 v1 v1 v1 v1
                           movq [edi], mm0     ; move to memory v2 and v3
                           sub esi, 4
                           movq [edi+8], mm1   ; move to memory v1 and v0
                           sub edi, 16
                           sub ecx, 4
                           jnz loop1_pass2
                           EMMS
                        }
                     }

                     sptr -= width_mmx;
                     dp -= width_mmx*4;
                     for (i = width; i; i--)
                     {
                        int j;

                        for (j = 0; j < png_pass_inc[pass]; j++)
                        {
                           *dp-- = *sptr;
                        }
                        sptr--;
                     }
                  }
                  else if (width)  /* && ((pass == 4) || (pass == 5)) */
                  {
                     int width_mmx = ((width >> 3) << 3);
                     width -= width_mmx;
                     if (width_mmx)
                     {
                        _asm
                        {
                           mov esi, sptr
                           mov edi, dp
                           mov ecx, width_mmx
                           sub edi, 15
                           sub esi, 7
loop1_pass4:
                           movq mm0, [esi]     ; v0 v1 v2 v3 v4 v5 v6 v7
                           movq mm1, mm0       ; v0 v1 v2 v3 v4 v5 v6 v7
                           punpcklbw mm0, mm0  ; v4 v4 v5 v5 v6 v6 v7 v7
                           /* movq mm1, mm0    ; v0 v0 v1 v1 v2 v2 v3 v3 */
                           punpckhbw mm1, mm1  ; v0 v0 v1 v1 v2 v2 v3 v3
                           movq [edi+8], mm1   ; move to memory v0 v1 v2 and v3
                           sub esi, 8
                           movq [edi], mm0     ; move to memory v4 v5 v6 and v7
                           /* sub esi, 4 */
                           sub edi, 16
                           sub ecx, 8
                           jnz loop1_pass4
                           EMMS
                        }
                     }

                     sptr -= width_mmx;
                     dp -= width_mmx*2;
                     for (i = width; i; i--)
                     {
                        int j;

                        for (j = 0; j < png_pass_inc[pass]; j++)
                        {
                           *dp-- = *sptr;
                        }
                        sptr--;
                     }
                  }
               } /* end of pixel_bytes == 1 */

               else if (pixel_bytes == 2)
               {
                  if (((pass == 0) || (pass == 1)) && width)
                  {
                     int width_mmx = ((width >> 1) << 1);
                     width -= width_mmx;
                     if (width_mmx)
                     {
                        _asm
                        {
                           mov esi, sptr
                           mov edi, dp
                           mov ecx, width_mmx
                           sub esi, 2
                           sub edi, 30
loop2_pass0:
                           movd mm0, [esi]     ; X X X X v1 v0 v3 v2
                           punpcklwd mm0, mm0  ; v1 v0 v1 v0 v3 v2 v3 v2
                           movq mm1, mm0       ; v1 v0 v1 v0 v3 v2 v3 v2
                           punpckldq mm0, mm0  ; v3 v2 v3 v2 v3 v2 v3 v2
                           punpckhdq mm1, mm1  ; v1 v0 v1 v0 v1 v0 v1 v0
                           movq [edi], mm0
                           movq [edi + 8], mm0
                           movq [edi + 16], mm1
                           movq [edi + 24], mm1
                           sub esi, 4
                           sub edi, 32
                           sub ecx, 2
                           jnz loop2_pass0
                           EMMS
                        }
                     }

                     sptr -= (width_mmx*2 - 2);  /* sign fixed */
                     dp -= (width_mmx*16 - 2);   /* sign fixed */
                     for (i = width; i; i--)
                     {
                        png_byte v[8];
                        int j;
                        sptr -= 2;
                        png_memcpy(v, sptr, 2);
                        for (j = 0; j < png_pass_inc[pass]; j++)
                        {
                           dp -= 2;
                           png_memcpy(dp, v, 2);
                        }
                     }
                  }
                  else if (((pass == 2) || (pass == 3)) && width)
                  {
                     int width_mmx = ((width >> 1) << 1) ;
                     width -= width_mmx;
                     if (width_mmx)
                     {
                        _asm
                        {
                           mov esi, sptr
                           mov edi, dp
                           mov ecx, width_mmx
                           sub esi, 2
                           sub edi, 14
loop2_pass2:
                           movd mm0, [esi]     ; X X X X v1 v0 v3 v2
                           punpcklwd mm0, mm0  ; v1 v0 v1 v0 v3 v2 v3 v2
                           movq mm1, mm0       ; v1 v0 v1 v0 v3 v2 v3 v2
                           punpckldq mm0, mm0  ; v3 v2 v3 v2 v3 v2 v3 v2
                           punpckhdq mm1, mm1  ; v1 v0 v1 v0 v1 v0 v1 v0
                           movq [edi], mm0
                           sub esi, 4
                           movq [edi + 8], mm1
                           /* sub esi, 4 */
                           sub edi, 16
                           sub ecx, 2
                           jnz loop2_pass2
                           EMMS
                        }
                     }

                     sptr -= (width_mmx*2 - 2);  /* sign fixed */
                     dp -= (width_mmx*8 - 2);    /* sign fixed */
                     for (i = width; i; i--)
                     {
                        png_byte v[8];
                        int j;
                        sptr -= 2;
                        png_memcpy(v, sptr, 2);
                        for (j = 0; j < png_pass_inc[pass]; j++)
                        {
                           dp -= 2;
                           png_memcpy(dp, v, 2);
                        }
                     }
                  }
                  else if (width)  /* pass == 4 or 5 */
                  {
                     int width_mmx = ((width >> 1) << 1) ;
                     width -= width_mmx;
                     if (width_mmx)
                     {
                        _asm
                        {
                           mov esi, sptr
                           mov edi, dp
                           mov ecx, width_mmx
                           sub esi, 2
                           sub edi, 6
loop2_pass4:
                           movd mm0, [esi]     ; X X X X v1 v0 v3 v2
                           punpcklwd mm0, mm0  ; v1 v0 v1 v0 v3 v2 v3 v2
                           sub esi, 4
                           movq [edi], mm0
                           sub edi, 8
                           sub ecx, 2
                           jnz loop2_pass4
                           EMMS
                        }
                     }

                     sptr -= (width_mmx*2 - 2);  /* sign fixed */
                     dp -= (width_mmx*4 - 2);    /* sign fixed */
                     for (i = width; i; i--)
                     {
                        png_byte v[8];
                        int j;
                        sptr -= 2;
                        png_memcpy(v, sptr, 2);
                        for (j = 0; j < png_pass_inc[pass]; j++)
                        {
                           dp -= 2;
                           png_memcpy(dp, v, 2);
                        }
                     }
                  }
               } /* end of pixel_bytes == 2 */

               else if (pixel_bytes == 4)
               {
                  if (((pass == 0) || (pass == 1)) && width)
                  {
                     int width_mmx = ((width >> 1) << 1) ;
                     width -= width_mmx;
                     if (width_mmx)
                     {
                        _asm
                        {
                           mov esi, sptr
                           mov edi, dp
                           mov ecx, width_mmx
                           sub esi, 4
                           sub edi, 60
loop4_pass0:
                           movq mm0, [esi]     ; v3 v2 v1 v0 v7 v6 v5 v4
                           movq mm1, mm0       ; v3 v2 v1 v0 v7 v6 v5 v4
                           punpckldq mm0, mm0  ; v7 v6 v5 v4 v7 v6 v5 v4
                           punpckhdq mm1, mm1  ; v3 v2 v1 v0 v3 v2 v1 v0
                           movq [edi], mm0
                           movq [edi + 8], mm0
                           movq [edi + 16], mm0
                           movq [edi + 24], mm0
                           movq [edi + 32], mm1
                           movq [edi + 40], mm1
                           movq [edi + 48], mm1
                           sub esi, 8
                           movq [edi + 56], mm1
                           sub edi, 64
                           sub ecx, 2
                           jnz loop4_pass0
                           EMMS
                        }
                     }

                     sptr -= (width_mmx*4 - 4);  /* sign fixed */
                     dp -= (width_mmx*32 - 4);   /* sign fixed */
                     for (i = width; i; i--)
                     {
                        png_byte v[8];
                        int j;
                        sptr -= 4;
                        png_memcpy(v, sptr, 4);
                        for (j = 0; j < png_pass_inc[pass]; j++)
                        {
                           dp -= 4;
                           png_memcpy(dp, v, 4);
                        }
                     }
                  }
                  else if (((pass == 2) || (pass == 3)) && width)
                  {
                     int width_mmx = ((width >> 1) << 1) ;
                     width -= width_mmx;
                     if (width_mmx)
                     {
                        _asm
                        {
                           mov esi, sptr
                           mov edi, dp
                           mov ecx, width_mmx
                           sub esi, 4
                           sub edi, 28
loop4_pass2:
                           movq mm0, [esi]     ; v3 v2 v1 v0 v7 v6 v5 v4
                           movq mm1, mm0       ; v3 v2 v1 v0 v7 v6 v5 v4
                           punpckldq mm0, mm0  ; v7 v6 v5 v4 v7 v6 v5 v4
                           punpckhdq mm1, mm1  ; v3 v2 v1 v0 v3 v2 v1 v0
                           movq [edi], mm0
                           movq [edi + 8], mm0
                           movq [edi + 16], mm1
                           movq [edi + 24], mm1
                           sub esi, 8
                           sub edi, 32
                           sub ecx, 2
                           jnz loop4_pass2
                           EMMS
                        }
                     }

                     sptr -= (width_mmx*4 - 4);  /* sign fixed */
                     dp -= (width_mmx*16 - 4);   /* sign fixed */
                     for (i = width; i; i--)
                     {
                        png_byte v[8];
                        int j;
                        sptr -= 4;
                        png_memcpy(v, sptr, 4);
                        for (j = 0; j < png_pass_inc[pass]; j++)
                        {
                           dp -= 4;
                           png_memcpy(dp, v, 4);
                        }
                     }
                  }
                  else if (width)  /* pass == 4 or 5 */
                  {
                     int width_mmx = ((width >> 1) << 1) ;
                     width -= width_mmx;
                     if (width_mmx)
                     {
                        _asm
                        {
                           mov esi, sptr
                           mov edi, dp
                           mov ecx, width_mmx
                           sub esi, 4
                           sub edi, 12
loop4_pass4:
                           movq mm0, [esi]     ; v3 v2 v1 v0 v7 v6 v5 v4
                           movq mm1, mm0       ; v3 v2 v1 v0 v7 v6 v5 v4
                           punpckldq mm0, mm0  ; v7 v6 v5 v4 v7 v6 v5 v4
                           punpckhdq mm1, mm1  ; v3 v2 v1 v0 v3 v2 v1 v0
                           movq [edi], mm0
                           sub esi, 8
                           movq [edi + 8], mm1
                           sub edi, 16
                           sub ecx, 2
                           jnz loop4_pass4
                           EMMS
                        }
                     }

                     sptr -= (width_mmx*4 - 4);  /* sign fixed */
                     dp -= (width_mmx*8 - 4);    /* sign fixed */
                     for (i = width; i; i--)
                     {
                        png_byte v[8];
                        int j;
                        sptr -= 4;
                        png_memcpy(v, sptr, 4);
                        for (j = 0; j < png_pass_inc[pass]; j++)
                        {
                           dp -= 4;
                           png_memcpy(dp, v, 4);
                        }
                     }
                  }

               } /* end of pixel_bytes == 4 */

               else if (pixel_bytes == 6)
               {
                  for (i = width; i; i--)
                  {
                     png_byte v[8];
                     int j;
                     png_memcpy(v, sptr, 6);
                     for (j = 0; j < png_pass_inc[pass]; j++)
                     {
                        png_memcpy(dp, v, 6);
                        dp -= 6;
                     }
                     sptr -= 6;
                  }
               } /* end of pixel_bytes == 6 */

               else
               {
                  for (i = width; i; i--)
                  {
                     png_byte v[8];
                     int j;
                     png_memcpy(v, sptr, pixel_bytes);
                     for (j = 0; j < png_pass_inc[pass]; j++)
                     {
                        png_memcpy(dp, v, pixel_bytes);
                        dp -= pixel_bytes;
                     }
                     sptr -= pixel_bytes;
                  }
               }
            } /* end of mmx_supported */

            else /* MMX not supported:  use modified C code - takes advantage
                  * of inlining of memcpy for a constant */
            {
               if (pixel_bytes == 1)
               {
                  for (i = width; i; i--)
                  {
                     int j;
                     for (j = 0; j < png_pass_inc[pass]; j++)
                        *dp-- = *sptr;
                     sptr--;
                  }
               }
               else if (pixel_bytes == 3)
               {
                  for (i = width; i; i--)
                  {
                     png_byte v[8];
                     int j;
                     png_memcpy(v, sptr, pixel_bytes);
                     for (j = 0; j < png_pass_inc[pass]; j++)
                     {
                        png_memcpy(dp, v, pixel_bytes);
                        dp -= pixel_bytes;
                     }
                     sptr -= pixel_bytes;
                  }
               }
               else if (pixel_bytes == 2)
               {
                  for (i = width; i; i--)
                  {
                     png_byte v[8];
                     int j;
                     png_memcpy(v, sptr, pixel_bytes);
                     for (j = 0; j < png_pass_inc[pass]; j++)
                     {
                        png_memcpy(dp, v, pixel_bytes);
                        dp -= pixel_bytes;
                     }
                     sptr -= pixel_bytes;
                  }
               }
               else if (pixel_bytes == 4)
               {
                  for (i = width; i; i--)
                  {
                     png_byte v[8];
                     int j;
                     png_memcpy(v, sptr, pixel_bytes);
                     for (j = 0; j < png_pass_inc[pass]; j++)
                     {
                        png_memcpy(dp, v, pixel_bytes);
                        dp -= pixel_bytes;
                     }
                     sptr -= pixel_bytes;
                  }
               }
               else if (pixel_bytes == 6)
               {
                  for (i = width; i; i--)
                  {
                     png_byte v[8];
                     int j;
                     png_memcpy(v, sptr, pixel_bytes);
                     for (j = 0; j < png_pass_inc[pass]; j++)
                     {
                        png_memcpy(dp, v, pixel_bytes);
                        dp -= pixel_bytes;
                     }
                     sptr -= pixel_bytes;
                  }
               }
               else
               {
                  for (i = width; i; i--)
                  {
                     png_byte v[8];
                     int j;
                     png_memcpy(v, sptr, pixel_bytes);
                     for (j = 0; j < png_pass_inc[pass]; j++)
                     {
                        png_memcpy(dp, v, pixel_bytes);
                        dp -= pixel_bytes;
                     }
                     sptr -= pixel_bytes;
                  }
               }

            } /* end of MMX not supported */
            break;
         }
      } /* end switch (row_info->pixel_depth) */

      row_info->width = final_width;
      row_info->rowbytes = ((final_width *
         (png_uint_32)row_info->pixel_depth + 7) >> 3);
   }

}
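
/* Illustrative sketch (not part of libpng): the in-place, right-to-left
 * pixel replication that png_do_read_interlace performs, written in plain
 * C.  Disabled illustrative code; the helper name is hypothetical, and
 * "repeat" corresponds to png_pass_inc[pass].
 */
#if 0
static void expand_row_sketch(unsigned char *row, unsigned long width,
   unsigned int pixel_bytes, unsigned int repeat)
{
   unsigned char *sptr = row + (width - 1) * pixel_bytes;
   unsigned char *dp = row + (width * repeat - 1) * pixel_bytes;
   unsigned long i;
   unsigned int j, b;

   for (i = 0; i < width; i++)
   {
      for (j = 0; j < repeat; j++)
      {
         for (b = 0; b < pixel_bytes; b++)  /* copy one pixel */
            dp[b] = sptr[b];
         dp -= pixel_bytes;                 /* move left one output pixel */
      }
      sptr -= pixel_bytes;                  /* move left one input pixel */
   }
}
#endif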

#endif /* PNG_READ_INTERLACING_SUPPORTED */


/* These variables are utilized in the functions below.  They are declared */
/* globally here to ensure alignment on 8-byte boundaries. */

union uAll {
   __int64 use;
   double  align;
} LBCarryMask = {0x0101010101010101},
  HBClearMask = {0x7f7f7f7f7f7f7f7f},
  ActiveMask, ActiveMask2, ActiveMaskEnd, ShiftBpp, ShiftRem;

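/* Illustrative sketch (not part of libpng): the Average defilter that the
 * MMX routine below implements, written in plain C.  For the first bpp
 * bytes there is no Raw(x-bpp), so only Prior(x)/2 is added.  Disabled
 * illustrative code; the helper name is hypothetical.
 */
#if 0
static void avg_defilter_sketch(unsigned char *row,
   const unsigned char *prev_row, unsigned long rowbytes, int bpp)
{
   unsigned long x;
   for (x = 0; x < rowbytes; x++)
   {
      int raw_prev = (x < (unsigned long)bpp) ? 0 : row[x - bpp];
      /* Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x)) / 2), modulo 256 */
      row[x] = (unsigned char)(row[x] + ((raw_prev + prev_row[x]) >> 1));
   }
}
#endif
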
/* Optimized code for PNG Average filter decoder */
void /* PRIVATE */
png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row
   , png_bytep prev_row)
{
   int bpp;
   png_uint_32 FullLength;
   png_uint_32 MMXLength;
   /* png_uint_32 len; */
   int diff;

   bpp = (row_info->pixel_depth + 7) >> 3;  /* Get # bytes per pixel */
   FullLength = row_info->rowbytes;         /* # of bytes to filter */
   _asm {
      /* Init address pointers and offset */
      mov edi, row          /* edi ==> Avg(x) */
      xor ebx, ebx          /* ebx ==> x */
      mov edx, edi
      mov esi, prev_row     /* esi ==> Prior(x) */
      sub edx, bpp          /* edx ==> Raw(x-bpp) */

      xor eax, eax
      /* Compute the Raw value for the first bpp bytes */
      /* Raw(x) = Avg(x) + (Prior(x)/2) */
davgrlp:
      mov al, [esi + ebx]   /* Load al with Prior(x) */
      inc ebx
      shr al, 1             /* divide by 2 */
      add al, [edi+ebx-1]   /* Add Avg(x); -1 to offset inc ebx */
      cmp ebx, bpp
      mov [edi+ebx-1], al   /* Write back Raw(x); */
                            /* mov does not affect flags; -1 to offset inc ebx */
      jb davgrlp
      /* get # of bytes to alignment */
      mov diff, edi         /* take start of row */
      add diff, ebx         /* add bpp */
      add diff, 0xf         /* add 7 + 8 to incr past alignment boundary */
      and diff, 0xfffffff8  /* mask to alignment boundary */
      sub diff, edi         /* subtract from start ==> value ebx at alignment */
      jz davggo
      /* fix alignment */
      /* Compute the Raw value for the bytes up to the alignment boundary */
      /* Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2) */
      xor ecx, ecx
davglp1:
      xor eax, eax
      mov cl, [esi + ebx]   /* load cl with Prior(x) */
      mov al, [edx + ebx]   /* load al with Raw(x-bpp) */
      add ax, cx
      inc ebx
      shr ax, 1             /* divide by 2 */
      add al, [edi+ebx-1]   /* Add Avg(x); -1 to offset inc ebx */
      cmp ebx, diff         /* Check if at alignment boundary */
      mov [edi+ebx-1], al   /* Write back Raw(x); */
                            /* mov does not affect flags; -1 to offset inc ebx */
      jb davglp1            /* Repeat until at alignment boundary */
davggo:
      mov eax, FullLength
      mov ecx, eax
      sub eax, ebx          /* subtract alignment fix */
      and eax, 0x00000007   /* calc bytes over mult of 8 */
      sub ecx, eax          /* drop over bytes from original length */
      mov MMXLength, ecx
   } /* end _asm block */
   /* Now do the math for the rest of the row */
   switch ( bpp )
   {
      case 3:
      {
         ActiveMask.use = 0x0000000000ffffff;
         ShiftBpp.use = 24;   /* == 3 * 8 */
         ShiftRem.use = 40;   /* == 64 - 24 */
         _asm {
            /* Re-init address pointers and offset */
            movq mm7, ActiveMask
            mov ebx, diff        /* ebx ==> x = offset to alignment boundary */
            movq mm5, LBCarryMask
            mov edi, row         /* edi ==> Avg(x) */
            movq mm4, HBClearMask
            mov esi, prev_row    /* esi ==> Prior(x) */
            /* PRIME the pump (load the first Raw(x-bpp) data set */
            movq mm2, [edi + ebx - 8]  /* Load previous aligned 8 bytes */
                                       /* (we correct position in loop below) */
davg3lp:
            movq mm0, [edi + ebx]  /* Load mm0 with Avg(x) */
            /* Add (Prev_row/2) to Average */
            movq mm3, mm5
            psrlq mm2, ShiftRem    /* Correct position Raw(x-bpp) data */
            movq mm1, [esi + ebx]  /* Load mm1 with Prior(x) */
            movq mm6, mm7
            pand mm3, mm1        /* get lsb for each prev_row byte */
            psrlq mm1, 1         /* divide prev_row bytes by 2 */
            pand mm1, mm4        /* clear invalid bit 7 of each byte */
            paddb mm0, mm1       /* add (Prev_row/2) to Avg for each byte */
            /* Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry */
            movq mm1, mm3        /* now use mm1 for getting LBCarrys */
            pand mm1, mm2        /* get LBCarrys for each byte where both */
                                 /* lsb's were == 1 (Only valid for active group) */
            psrlq mm2, 1         /* divide raw bytes by 2 */
            pand mm2, mm4        /* clear invalid bit 7 of each byte */
            paddb mm2, mm1       /* add LBCarrys to (Raw(x-bpp)/2) for each byte */
            pand mm2, mm6        /* Leave only Active Group 1 bytes to add to Avg */
            paddb mm0, mm2       /* add (Raw/2) + LBCarrys to Avg for each Active */
                                 /* byte */
            /* Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry */
            psllq mm6, ShiftBpp  /* shift the mm6 mask to cover bytes 3-5 */
            movq mm2, mm0        /* mov updated Raws to mm2 */
            psllq mm2, ShiftBpp  /* shift data to position correctly */
            movq mm1, mm3        /* now use mm1 for getting LBCarrys */
            pand mm1, mm2        /* get LBCarrys for each byte where both */
                                 /* lsb's were == 1 (Only valid for active group) */
            psrlq mm2, 1         /* divide raw bytes by 2 */
            pand mm2, mm4        /* clear invalid bit 7 of each byte */
            paddb mm2, mm1       /* add LBCarrys to (Raw(x-bpp)/2) for each byte */
            pand mm2, mm6        /* Leave only Active Group 2 bytes to add to Avg */
            paddb mm0, mm2       /* add (Raw/2) + LBCarrys to Avg for each Active */
                                 /* byte */

            /* Add 3rd active group (Raw(x-bpp)/2) to Average with LBCarry */
            psllq mm6, ShiftBpp  /* shift the mm6 mask to cover the last two */
                                 /* bytes */
            movq mm2, mm0        /* mov updated Raws to mm2 */
            psllq mm2, ShiftBpp  /* shift data to position correctly */
                                 /* Data only needs to be shifted once here to */
                                 /* get the correct x-bpp offset. */
            movq mm1, mm3        /* now use mm1 for getting LBCarrys */
            pand mm1, mm2        /* get LBCarrys for each byte where both */
                                 /* lsb's were == 1 (Only valid for active group) */
            psrlq mm2, 1         /* divide raw bytes by 2 */
            pand mm2, mm4        /* clear invalid bit 7 of each byte */
            paddb mm2, mm1       /* add LBCarrys to (Raw(x-bpp)/2) for each byte */
            pand mm2, mm6        /* Leave only Active Group 3 bytes to add to Avg */
            add ebx, 8
            paddb mm0, mm2       /* add (Raw/2) + LBCarrys to Avg for each Active */
                                 /* byte */

            /* Now ready to write back to memory */
            movq [edi + ebx - 8], mm0
            /* Move updated Raw(x) to use as Raw(x-bpp) for next loop */
            cmp ebx, MMXLength
            movq mm2, mm0        /* mov updated Raw(x) to mm2 */
            jb davg3lp
         } /* end _asm block */
      }
      break;

      case 6:
      case 4:
      case 7:
      case 5:
      {
         ActiveMask.use = 0xffffffffffffffff;  /* use shift below to clear */
                                               /* appropriate inactive bytes */
         ShiftBpp.use = bpp << 3;
         ShiftRem.use = 64 - ShiftBpp.use;
         _asm {
            movq mm4, HBClearMask
            /* Re-init address pointers and offset */
            mov ebx, diff        /* ebx ==> x = offset to alignment boundary */
            /* Load ActiveMask and clear all bytes except for 1st active group */
            movq mm7, ActiveMask
            mov edi, row         /* edi ==> Avg(x) */
            psrlq mm7, ShiftRem
            mov esi, prev_row    /* esi ==> Prior(x) */
            movq mm6, mm7
            movq mm5, LBCarryMask
            psllq mm6, ShiftBpp  /* Create mask for 2nd active group */
            /* PRIME the pump (load the first Raw(x-bpp) data set */
            movq mm2, [edi + ebx - 8]  /* Load previous aligned 8 bytes */
                                       /* (we correct position in loop below) */
davg4lp:
            movq mm0, [edi + ebx]
            psrlq mm2, ShiftRem  /* shift data to position correctly */
            movq mm1, [esi + ebx]
            /* Add (Prev_row/2) to Average */
            movq mm3, mm5
            pand mm3, mm1        /* get lsb for each prev_row byte */
            psrlq mm1, 1         /* divide prev_row bytes by 2 */
            pand mm1, mm4        /* clear invalid bit 7 of each byte */
            paddb mm0, mm1       /* add (Prev_row/2) to Avg for each byte */
            /* Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry */
            movq mm1, mm3        /* now use mm1 for getting LBCarrys */
            pand mm1, mm2        /* get LBCarrys for each byte where both */
                                 /* lsb's were == 1 (Only valid for active group) */
            psrlq mm2, 1         /* divide raw bytes by 2 */
            pand mm2, mm4        /* clear invalid bit 7 of each byte */
            paddb mm2, mm1       /* add LBCarrys to (Raw(x-bpp)/2) for each byte */
            pand mm2, mm7        /* Leave only Active Group 1 bytes to add to Avg */
            paddb mm0, mm2       /* add (Raw/2) + LBCarrys to Avg for each Active */
                                 /* byte */
            /* Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry */
            movq mm2, mm0        /* mov updated Raws to mm2 */
            psllq mm2, ShiftBpp  /* shift data to position correctly */
            add ebx, 8
            movq mm1, mm3        /* now use mm1 for getting LBCarrys */
            pand mm1, mm2        /* get LBCarrys for each byte where both */
                                 /* lsb's were == 1 (Only valid for active group) */
            psrlq mm2, 1         /* divide raw bytes by 2 */
            pand mm2, mm4        /* clear invalid bit 7 of each byte */
            paddb mm2, mm1       /* add LBCarrys to (Raw(x-bpp)/2) for each byte */
            pand mm2, mm6        /* Leave only Active Group 2 bytes to add to Avg */
            paddb mm0, mm2       /* add (Raw/2) + LBCarrys to Avg for each Active */
                                 /* byte */
            cmp ebx, MMXLength
            /* Now ready to write back to memory */
            movq [edi + ebx - 8], mm0
            /* Prep Raw(x-bpp) for next loop */
            movq mm2, mm0        /* mov updated Raws to mm2 */
            jb davg4lp
         } /* end _asm block */
      }
      break;
2109 case 2:
2110 {
2111 ActiveMask.use = 0x000000000000ffff;
7f88f624
VZ
2112 ShiftBpp.use = 16; /* == 2 * 8 [BUGFIX] */
2113 ShiftRem.use = 48; /* == 64 - 16 [BUGFIX] */
c6b71bff 2114 _asm {
7f88f624 2115 /* Load ActiveMask */
c6b71bff 2116 movq mm7, ActiveMask
7f88f624
VZ
2117 /* Re-init address pointers and offset */
2118 mov ebx, diff /* ebx ==> x = offset to alignment boundary */
c6b71bff 2119 movq mm5, LBCarryMask
7f88f624 2120 mov edi, row /* edi ==> Avg(x) */
c6b71bff 2121 movq mm4, HBClearMask
7f88f624
VZ
2122 mov esi, prev_row /* esi ==> Prior(x) */
2123 /* PRIME the pump (load the first Raw(x-bpp) data set */
2124 movq mm2, [edi + ebx - 8] /* Load previous aligned 8 bytes */
2125 /* (we correct position in loop below) */
c6b71bff
GD
2126davg2lp:
2127 movq mm0, [edi + ebx]
7f88f624 2128 psrlq mm2, ShiftRem /* shift data to position correctly [BUGFIX] */
c6b71bff 2129 movq mm1, [esi + ebx]
7f88f624 2130 /* Add (Prev_row/2) to Average */
c6b71bff 2131 movq mm3, mm5
7f88f624
VZ
2132 pand mm3, mm1 /* get lsb for each prev_row byte */
2133 psrlq mm1, 1 /* divide prev_row bytes by 2 */
2134 pand mm1, mm4 /* clear invalid bit 7 of each byte */
c6b71bff 2135 movq mm6, mm7
7f88f624
VZ
2136 paddb mm0, mm1 /* add (Prev_row/2) to Avg for each byte */
2137 /* Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry */
2138 movq mm1, mm3 /* now use mm1 for getting LBCarrys */
2139 pand mm1, mm2 /* get LBCarrys for each byte where both */
2140 /* lsb's were == 1 (Only valid for active group) */
2141 psrlq mm2, 1 /* divide raw bytes by 2 */
2142 pand mm2, mm4 /* clear invalid bit 7 of each byte */
2143 paddb mm2, mm1 /* add LBCarrys to (Raw(x-bpp)/2) for each byte */
2144 pand mm2, mm6 /* Leave only Active Group 1 bytes to add to Avg */
2145 paddb mm0, mm2 /* add (Raw/2) + LBCarrys to Avg for each Active byte */
2146 /* Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry */
2147 psllq mm6, ShiftBpp /* shift the mm6 mask to cover bytes 2 & 3 */
2148 movq mm2, mm0 /* mov updated Raws to mm2 */
2149 psllq mm2, ShiftBpp /* shift data to position correctly */
2150 movq mm1, mm3 /* now use mm1 for getting LBCarrys */
2151 pand mm1, mm2 /* get LBCarrys for each byte where both */
2152 /* lsb's were == 1 (Only valid for active group) */
2153 psrlq mm2, 1 /* divide raw bytes by 2 */
2154 pand mm2, mm4 /* clear invalid bit 7 of each byte */
2155 paddb mm2, mm1 /* add LBCarrys to (Raw(x-bpp)/2) for each byte */
2156 pand mm2, mm6 /* Leave only Active Group 2 bytes to add to Avg */
2157 paddb mm0, mm2 /* add (Raw/2) + LBCarrys to Avg for each Active byte */
2158
2159 /* Add 3rd active group (Raw(x-bpp)/2) to Average with LBCarry */
2160 psllq mm6, ShiftBpp /* shift the mm6 mask to cover bytes 4 & 5 */
2161 movq mm2, mm0 /* mov updated Raws to mm2 */
2162 psllq mm2, ShiftBpp /* shift data to position correctly */
2163 /* Data only needs to be shifted once here to */
2164 /* get the correct x-bpp offset. */
2165 movq mm1, mm3 /* now use mm1 for getting LBCarrys */
2166 pand mm1, mm2 /* get LBCarrys for each byte where both */
2167 /* lsb's were == 1 (Only valid for active group) */
2168 psrlq mm2, 1 /* divide raw bytes by 2 */
2169 pand mm2, mm4 /* clear invalid bit 7 of each byte */
2170 paddb mm2, mm1 /* add LBCarrys to (Raw(x-bpp)/2) for each byte */
2171 pand mm2, mm6 /* Leave only Active Group 3 bytes to add to Avg */
2172 paddb mm0, mm2 /* add (Raw/2) + LBCarrys to Avg for each Active byte */
2173
2174 /* Add 4th active group (Raw(x-bpp)/2) to Average with LBCarry */
2175 psllq mm6, ShiftBpp /* shift the mm6 mask to cover bytes 6 & 7 */
2176 movq mm2, mm0 /* mov updated Raws to mm2 */
2177 psllq mm2, ShiftBpp /* shift data to position correctly */
2178 /* Data only needs to be shifted once here to */
2179 /* get the correct x-bpp offset. */
2180 add ebx, 8
2181 movq mm1, mm3 /* now use mm1 for getting LBCarrys */
2182 pand mm1, mm2 /* get LBCarrys for each byte where both */
2183 /* lsb's were == 1 (Only valid for active group) */
2184 psrlq mm2, 1 /* divide raw bytes by 2 */
2185 pand mm2, mm4 /* clear invalid bit 7 of each byte */
2186 paddb mm2, mm1 /* add LBCarrys to (Raw(x-bpp)/2) for each byte */
2187 pand mm2, mm6 /* Leave only Active Group 4 bytes to add to Avg */
2188 paddb mm0, mm2 /* add (Raw/2) + LBCarrys to Avg for each Active byte */
2189
2190 cmp ebx, MMXLength
2191 /* Now ready to write back to memory */
2192 movq [edi + ebx - 8], mm0
2193 /* Prep Raw(x-bpp) for next loop */
2194 movq mm2, mm0 /* mov updated Raws to mm2 */
2195 jb davg2lp
2196 } /* end _asm block */
2197 }
2198 break;
2199
2200 case 1: /* bpp == 1 */
2201 {
2202 _asm {
2203 /* Re-init address pointers and offset */
2204 mov ebx, diff /* ebx ==> x = offset to alignment boundary */
2205 mov edi, row /* edi ==> Avg(x) */
2206 cmp ebx, FullLength /* Test if offset at end of array */
2207 jnb davg1end
2208 /* Do Avg decode for remaining bytes */
2209 mov esi, prev_row /* esi ==> Prior(x) */
2210 mov edx, edi
2211 xor ecx, ecx /* zero ecx before using cl & cx in loop below */
2212 sub edx, bpp /* edx ==> Raw(x-bpp) */
2213 davg1lp:
2214 /* Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2) */
2215 xor eax, eax
2216 mov cl, [esi + ebx] /* load cl with Prior(x) */
2217 mov al, [edx + ebx] /* load al with Raw(x-bpp) */
2218 add ax, cx
2219 inc ebx
2220 shr ax, 1 /* divide by 2 */
2221 add al, [edi+ebx-1] /* Add Avg(x); -1 to offset inc ebx */
2222 cmp ebx, FullLength /* Check if at end of array */
2223 mov [edi+ebx-1], al /* Write back Raw(x); */
2224 /* mov does not affect flags; -1 to offset inc ebx */
2225 jb davg1lp
2226 davg1end:
2227 } /* end _asm block */
2228 }
2229 return;
2230
2231 case 8: /* bpp == 8 */
2232 {
2233 _asm {
2234 /* Re-init address pointers and offset */
2235 mov ebx, diff /* ebx ==> x = offset to alignment boundary */
2236 movq mm5, LBCarryMask
2237 mov edi, row /* edi ==> Avg(x) */
2238 movq mm4, HBClearMask
2239 mov esi, prev_row /* esi ==> Prior(x) */
2240 /* PRIME the pump (load the first Raw(x-bpp) data set) */
2241 movq mm2, [edi + ebx - 8] /* Load previous aligned 8 bytes */
2242 /* (NO NEED to correct position in loop below) */
2243 davg8lp:
2244 movq mm0, [edi + ebx]
2245 movq mm3, mm5
2246 movq mm1, [esi + ebx]
2247 add ebx, 8
2248 pand mm3, mm1 /* get lsb for each prev_row byte */
2249 psrlq mm1, 1 /* divide prev_row bytes by 2 */
2250 pand mm3, mm2 /* get LBCarrys for each byte where both */
2251 /* lsb's were == 1 */
2252 psrlq mm2, 1 /* divide raw bytes by 2 */
2253 pand mm1, mm4 /* clear invalid bit 7 of each byte */
2254 paddb mm0, mm3 /* add LBCarrys to Avg for each byte */
2255 pand mm2, mm4 /* clear invalid bit 7 of each byte */
2256 paddb mm0, mm1 /* add (Prev_row/2) to Avg for each byte */
2257 paddb mm0, mm2 /* add (Raw/2) to Avg for each byte */
2258 cmp ebx, MMXLength
2259 movq [edi + ebx - 8], mm0
2260 movq mm2, mm0 /* reuse as Raw(x-bpp) */
2261 jb davg8lp
2262 } /* end _asm block */
2263 }
2264 break;
2265 default: /* bpp greater than 8 */
2266 {
2267 _asm {
2268 movq mm5, LBCarryMask
2269 /* Re-init address pointers and offset */
2270 mov ebx, diff /* ebx ==> x = offset to alignment boundary */
2271 mov edi, row /* edi ==> Avg(x) */
2272 movq mm4, HBClearMask
2273 mov edx, edi
2274 mov esi, prev_row /* esi ==> Prior(x) */
2275 sub edx, bpp /* edx ==> Raw(x-bpp) */
2276 davgAlp:
2277 movq mm0, [edi + ebx]
2278 movq mm3, mm5
2279 movq mm1, [esi + ebx]
2280 pand mm3, mm1 /* get lsb for each prev_row byte */
2281 movq mm2, [edx + ebx]
2282 psrlq mm1, 1 /* divide prev_row bytes by 2 */
2283 pand mm3, mm2 /* get LBCarrys for each byte where both */
2284 /* lsb's were == 1 */
2285 psrlq mm2, 1 /* divide raw bytes by 2 */
2286 pand mm1, mm4 /* clear invalid bit 7 of each byte */
2287 paddb mm0, mm3 /* add LBCarrys to Avg for each byte */
2288 pand mm2, mm4 /* clear invalid bit 7 of each byte */
2289 paddb mm0, mm1 /* add (Prev_row/2) to Avg for each byte */
2290 add ebx, 8
2291 paddb mm0, mm2 /* add (Raw/2) to Avg for each byte */
2292 cmp ebx, MMXLength
2293 movq [edi + ebx - 8], mm0
2294 jb davgAlp
2295 } /* end _asm block */
2296 }
2297 break;
2298 } /* end switch ( bpp ) */
2299
2300 _asm {
2301 /* MMX acceleration complete now do clean-up */
2302 /* Check if any remaining bytes left to decode */
2303 mov ebx, MMXLength /* ebx ==> x = offset bytes remaining after MMX */
2304 mov edi, row /* edi ==> Avg(x) */
2305 cmp ebx, FullLength /* Test if offset at end of array */
2306 jnb davgend
2307 /* Do Avg decode for remaining bytes */
2308 mov esi, prev_row /* esi ==> Prior(x) */
2309 mov edx, edi
2310 xor ecx, ecx /* zero ecx before using cl & cx in loop below */
2311 sub edx, bpp /* edx ==> Raw(x-bpp) */
2312 davglp2:
2313 /* Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2) */
2314 xor eax, eax
2315 mov cl, [esi + ebx] /* load cl with Prior(x) */
2316 mov al, [edx + ebx] /* load al with Raw(x-bpp) */
2317 add ax, cx
2318 inc ebx
2319 shr ax, 1 /* divide by 2 */
2320 add al, [edi+ebx-1] /* Add Avg(x); -1 to offset inc ebx */
2321 cmp ebx, FullLength /* Check if at end of array */
2322 mov [edi+ebx-1], al /* Write back Raw(x); */
2323 /* mov does not affect flags; -1 to offset inc ebx */
2324 jb davglp2
2325 davgend:
2326 emms /* End MMX instructions; prep for possible FP instrs. */
2327 } /* end _asm block */
2328}
2329
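/* A scalar reference for the Avg decode above (a sketch, not part of the
   build).  The MMX code averages each byte pair without widening to 16
   bits by using the identity
      (a + b) / 2  ==  (a >> 1) + (b >> 1) + (a & b & 1);
   the LBCarry masks recover the carry out of the two low bits:

      png_uint_32 i;
      int bpp = (row_info->pixel_depth + 7) >> 3;
      for (i = 0; i < (png_uint_32)bpp; i++)      // Raw(x-bpp) == 0 here
         row[i] = (png_byte)(row[i] + (prev_row[i] >> 1));
      for ( ; i < row_info->rowbytes; i++)        // general case
         row[i] = (png_byte)(row[i] + ((row[i-bpp] >> 1) +
            (prev_row[i] >> 1) + (row[i-bpp] & prev_row[i] & 1)));
 */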
2330 /* Optimized code for PNG Paeth filter decoder */
2331void /* PRIVATE */
2332png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
2333 png_bytep prev_row)
2334{
2335 png_uint_32 FullLength;
2336 png_uint_32 MMXLength;
2337 /*png_uint_32 len; */
2338 int bpp;
2339 int diff;
2340 /*int ptemp; */
2341 int patemp, pbtemp, pctemp;
2342
2343 bpp = (row_info->pixel_depth + 7) >> 3; /* Get # bytes per pixel */
2344 FullLength = row_info->rowbytes; /* # of bytes to filter */
2345 _asm
2346 {
2347 xor ebx, ebx /* ebx ==> x offset */
2348 mov edi, row
2349 xor edx, edx /* edx ==> x-bpp offset */
2350 mov esi, prev_row
2351 xor eax, eax
2352
2353 /* Compute the Raw value for the first bpp bytes */
2354 /* Note: the formula works out to be always */
2355 /* Paeth(x) = Raw(x) + Prior(x) where x < bpp */
2356 dpthrlp:
2357 mov al, [edi + ebx]
2358 add al, [esi + ebx]
2359 inc ebx
2360 cmp ebx, bpp
2361 mov [edi + ebx - 1], al
2362 jb dpthrlp
2363 /* get # of bytes to alignment */
2364 mov diff, edi /* take start of row */
2365 add diff, ebx /* add bpp */
2366 xor ecx, ecx
2367 add diff, 0xf /* add 7 + 8 to incr past alignment boundary */
2368 and diff, 0xfffffff8 /* mask to alignment boundary */
2369 sub diff, edi /* subtract from start ==> value ebx at alignment */
2370 jz dpthgo
2371 /* fix alignment */
2372 dpthlp1:
2373 xor eax, eax
2374 /* pav = p - a = (a + b - c) - a = b - c */
2375 mov al, [esi + ebx] /* load Prior(x) into al */
2376 mov cl, [esi + edx] /* load Prior(x-bpp) into cl */
2377 sub eax, ecx /* subtract Prior(x-bpp) */
2378 mov patemp, eax /* Save pav for later use */
2379 xor eax, eax
2380 /* pbv = p - b = (a + b - c) - b = a - c */
2381 mov al, [edi + edx] /* load Raw(x-bpp) into al */
2382 sub eax, ecx /* subtract Prior(x-bpp) */
2383 mov ecx, eax
2384 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
2385 add eax, patemp /* pcv = pav + pbv */
2386 /* pc = abs(pcv) */
2387 test eax, 0x80000000
2388 jz dpthpca
2389 neg eax /* reverse sign of neg values */
2390 dpthpca:
2391 mov pctemp, eax /* save pc for later use */
2392 /* pb = abs(pbv) */
2393 test ecx, 0x80000000
2394 jz dpthpba
2395 neg ecx /* reverse sign of neg values */
2396 dpthpba:
2397 mov pbtemp, ecx /* save pb for later use */
2398 /* pa = abs(pav) */
2399 mov eax, patemp
2400 test eax, 0x80000000
2401 jz dpthpaa
2402 neg eax /* reverse sign of neg values */
2403 dpthpaa:
2404 mov patemp, eax /* save pa for later use */
2405 /* test if pa <= pb */
2406 cmp eax, ecx
2407 jna dpthabb
2408 /* pa > pb; now test if pb <= pc */
2409 cmp ecx, pctemp
2410 jna dpthbbc
2411 /* pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp) */
2412 mov cl, [esi + edx] /* load Prior(x-bpp) into cl */
2413 jmp dpthpaeth
2414 dpthbbc:
2415 /* pb <= pc; Raw(x) = Paeth(x) + Prior(x) */
2416 mov cl, [esi + ebx] /* load Prior(x) into cl */
2417 jmp dpthpaeth
2418 dpthabb:
2419 /* pa <= pb; now test if pa <= pc */
2420 cmp eax, pctemp
2421 jna dpthabc
2422 /* pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp) */
2423 mov cl, [esi + edx] /* load Prior(x-bpp) into cl */
2424 jmp dpthpaeth
2425 dpthabc:
2426 /* pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp) */
2427 mov cl, [edi + edx] /* load Raw(x-bpp) into cl */
2428 dpthpaeth:
2429 inc ebx
2430 inc edx
2431 /* Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256 */
2432 add [edi + ebx - 1], cl
2433 cmp ebx, diff
2434 jb dpthlp1
2435 dpthgo:
2436 mov ecx, FullLength
2437 mov eax, ecx
2438 sub eax, ebx /* subtract alignment fix */
2439 and eax, 0x00000007 /* calc bytes over mult of 8 */
2440 sub ecx, eax /* drop over bytes from original length */
2441 mov MMXLength, ecx
2442 } /* end _asm block */
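/* Worked example of the alignment math above, using a hypothetical row
   address: if row == 0x10005 and bpp == 3, then after the first loop
   ebx == 3 and row + ebx == 0x10008; adding 0xf and masking with
   0xfffffff8 gives 0x10010, so diff == 0xb.  Bytes 3..0xa are decoded
   one at a time and the MMX loop starts on an 8-byte boundary.  With
   FullLength == 100, (100 - 11) & 7 == 1, so MMXLength == 99 and the
   last byte is left for the cleanup loop below. */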
2443 /* Now do the math for the rest of the row */
2444 switch ( bpp )
2445 {
2446 case 3:
2447 {
2448 ActiveMask.use = 0x0000000000ffffff;
2449 ActiveMaskEnd.use = 0xffff000000000000;
2450 ShiftBpp.use = 24; /* == bpp(3) * 8 */
2451 ShiftRem.use = 40; /* == 64 - 24 */
2452 _asm
2453 {
2454 mov ebx, diff
2455 mov edi, row
2456 mov esi, prev_row
2457 pxor mm0, mm0
2458 /* PRIME the pump (load the first Raw(x-bpp) data set) */
2459 movq mm1, [edi+ebx-8]
2460 dpth3lp:
2461 psrlq mm1, ShiftRem /* shift last 3 bytes to 1st 3 bytes */
2462 movq mm2, [esi + ebx] /* load b=Prior(x) */
2463 punpcklbw mm1, mm0 /* Unpack Low bytes of a */
2464 movq mm3, [esi+ebx-8] /* Prep c=Prior(x-bpp) bytes */
2465 punpcklbw mm2, mm0 /* Unpack Low bytes of b */
2466 psrlq mm3, ShiftRem /* shift last 3 bytes to 1st 3 bytes */
2467 /* pav = p - a = (a + b - c) - a = b - c */
2468 movq mm4, mm2
2469 punpcklbw mm3, mm0 /* Unpack Low bytes of c */
2470 /* pbv = p - b = (a + b - c) - b = a - c */
2471 movq mm5, mm1
2472 psubw mm4, mm3
2473 pxor mm7, mm7
2474 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
2475 movq mm6, mm4
2476 psubw mm5, mm3
2477
2478 /* pa = abs(p-a) = abs(pav) */
2479 /* pb = abs(p-b) = abs(pbv) */
2480 /* pc = abs(p-c) = abs(pcv) */
2481 pcmpgtw mm0, mm4 /* Create mask pav bytes < 0 */
2482 paddw mm6, mm5
2483 pand mm0, mm4 /* Only pav bytes < 0 in mm0 */
2484 pcmpgtw mm7, mm5 /* Create mask pbv bytes < 0 */
2485 psubw mm4, mm0
2486 pand mm7, mm5 /* Only pbv bytes < 0 in mm7 */
2487 psubw mm4, mm0
2488 psubw mm5, mm7
2489 pxor mm0, mm0
2490 pcmpgtw mm0, mm6 /* Create mask pcv bytes < 0 */
2491 pand mm0, mm6 /* Only pcv bytes < 0 in mm0 */
2492 psubw mm5, mm7
2493 psubw mm6, mm0
2494 /* test pa <= pb */
2495 movq mm7, mm4
2496 psubw mm6, mm0
2497 pcmpgtw mm7, mm5 /* pa > pb? */
2498 movq mm0, mm7
2499 /* use mm7 mask to merge pa & pb */
2500 pand mm5, mm7
2501 /* use mm0 mask copy to merge a & b */
2502 pand mm2, mm0
2503 pandn mm7, mm4
2504 pandn mm0, mm1
2505 paddw mm7, mm5
2506 paddw mm0, mm2
2507 /* test ((pa <= pb)? pa:pb) <= pc */
2508 pcmpgtw mm7, mm6 /* pab > pc? */
2509 pxor mm1, mm1
2510 pand mm3, mm7
2511 pandn mm7, mm0
2512 paddw mm7, mm3
2513 pxor mm0, mm0
2514 packuswb mm7, mm1
2515 movq mm3, [esi + ebx] /* load c=Prior(x-bpp) */
2516 pand mm7, ActiveMask
2517 movq mm2, mm3 /* load b=Prior(x) step 1 */
2518 paddb mm7, [edi + ebx] /* add Paeth predictor with Raw(x) */
2519 punpcklbw mm3, mm0 /* Unpack Low bytes of c */
2520 movq [edi + ebx], mm7 /* write back updated value */
2521 movq mm1, mm7 /* Now mm1 will be used as Raw(x-bpp) */
2522 /* Now do Paeth for 2nd set of bytes (3-5) */
2523 psrlq mm2, ShiftBpp /* load b=Prior(x) step 2 */
2524 punpcklbw mm1, mm0 /* Unpack Low bytes of a */
2525 pxor mm7, mm7
2526 punpcklbw mm2, mm0 /* Unpack Low bytes of b */
2527 /* pbv = p - b = (a + b - c) - b = a - c */
c6b71bff 2528 movq mm5, mm1
2529 /* pav = p - a = (a + b - c) - a = b - c */
2530 movq mm4, mm2
2531 psubw mm5, mm3
2532 psubw mm4, mm3
2533 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = */
2534 /* pav + pbv = pbv + pav */
2535 movq mm6, mm5
2536 paddw mm6, mm4
2537
2538 /* pa = abs(p-a) = abs(pav) */
2539 /* pb = abs(p-b) = abs(pbv) */
2540 /* pc = abs(p-c) = abs(pcv) */
2541 pcmpgtw mm0, mm5 /* Create mask pbv bytes < 0 */
2542 pcmpgtw mm7, mm4 /* Create mask pav bytes < 0 */
2543 pand mm0, mm5 /* Only pbv bytes < 0 in mm0 */
2544 pand mm7, mm4 /* Only pav bytes < 0 in mm7 */
2545 psubw mm5, mm0
2546 psubw mm4, mm7
2547 psubw mm5, mm0
2548 psubw mm4, mm7
2549 pxor mm0, mm0
2550 pcmpgtw mm0, mm6 /* Create mask pcv bytes < 0 */
2551 pand mm0, mm6 /* Only pcv bytes < 0 in mm0 */
2552 psubw mm6, mm0
2553 /* test pa <= pb */
2554 movq mm7, mm4
2555 psubw mm6, mm0
2556 pcmpgtw mm7, mm5 /* pa > pb? */
2557 movq mm0, mm7
2558 /* use mm7 mask to merge pa & pb */
2559 pand mm5, mm7
2560 /* use mm0 mask copy to merge a & b */
2561 pand mm2, mm0
2562 pandn mm7, mm4
2563 pandn mm0, mm1
2564 paddw mm7, mm5
2565 paddw mm0, mm2
2566 /* test ((pa <= pb)? pa:pb) <= pc */
2567 pcmpgtw mm7, mm6 /* pab > pc? */
2568 movq mm2, [esi + ebx] /* load b=Prior(x) */
2569 pand mm3, mm7
2570 pandn mm7, mm0
2571 pxor mm1, mm1
2572 paddw mm7, mm3
2573 pxor mm0, mm0
2574 packuswb mm7, mm1
2575 movq mm3, mm2 /* load c=Prior(x-bpp) step 1 */
2576 pand mm7, ActiveMask
2577 punpckhbw mm2, mm0 /* Unpack High bytes of b */
2578 psllq mm7, ShiftBpp /* Shift bytes to 2nd group of 3 bytes */
2579 /* pav = p - a = (a + b - c) - a = b - c */
c6b71bff 2580 movq mm4, mm2
2581 paddb mm7, [edi + ebx] /* add Paeth predictor with Raw(x) */
2582 psllq mm3, ShiftBpp /* load c=Prior(x-bpp) step 2 */
2583 movq [edi + ebx], mm7 /* write back updated value */
c6b71bff 2584 movq mm1, mm7
2585 punpckhbw mm3, mm0 /* Unpack High bytes of c */
2586 psllq mm1, ShiftBpp /* Shift bytes */
2587 /* Now mm1 will be used as Raw(x-bpp) */
2588 /* Now do Paeth for 3rd, and final, set of bytes (6-7) */
c6b71bff 2589 pxor mm7, mm7
2590 punpckhbw mm1, mm0 /* Unpack High bytes of a */
2591 psubw mm4, mm3
2592 /* pbv = p - b = (a + b - c) - b = a - c */
2593 movq mm5, mm1
2594 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
2595 movq mm6, mm4
2596 psubw mm5, mm3
2597 pxor mm0, mm0
2598 paddw mm6, mm5
2599
2600 /* pa = abs(p-a) = abs(pav) */
2601 /* pb = abs(p-b) = abs(pbv) */
2602 /* pc = abs(p-c) = abs(pcv) */
2603 pcmpgtw mm0, mm4 /* Create mask pav bytes < 0 */
2604 pcmpgtw mm7, mm5 /* Create mask pbv bytes < 0 */
2605 pand mm0, mm4 /* Only pav bytes < 0 in mm0 */
2606 pand mm7, mm5 /* Only pbv bytes < 0 in mm7 */
c6b71bff
GD
2607 psubw mm4, mm0
2608 psubw mm5, mm7
2609 psubw mm4, mm0
2610 psubw mm5, mm7
2611 pxor mm0, mm0
2612 pcmpgtw mm0, mm6 /* Create mask pcv bytes < 0 */
2613 pand mm0, mm6 /* Only pcv bytes < 0 in mm0 */
2614 psubw mm6, mm0
2615 /* test pa <= pb */
2616 movq mm7, mm4
2617 psubw mm6, mm0
2618 pcmpgtw mm7, mm5 /* pa > pb? */
2619 movq mm0, mm7
2620 /* use mm0 mask copy to merge a & b */
2621 pand mm2, mm0
2622 /* use mm7 mask to merge pa & pb */
2623 pand mm5, mm7
2624 pandn mm0, mm1
2625 pandn mm7, mm4
2626 paddw mm0, mm2
2627 paddw mm7, mm5
2628 /* test ((pa <= pb)? pa:pb) <= pc */
2629 pcmpgtw mm7, mm6 /* pab > pc? */
2630 pand mm3, mm7
2631 pandn mm7, mm0
2632 paddw mm7, mm3
2633 pxor mm1, mm1
2634 packuswb mm1, mm7
2635 /* Step ebx to next set of 8 bytes and repeat loop til done */
2636 add ebx, 8
2637 pand mm1, ActiveMaskEnd
2638 paddb mm1, [edi + ebx - 8] /* add Paeth predictor with Raw(x) */
2639
2640 cmp ebx, MMXLength
2641 pxor mm0, mm0 /* pxor does not affect flags */
2642 movq [edi + ebx - 8], mm1 /* write back updated value */
2643 /* mm1 will be used as Raw(x-bpp) next loop */
2644 /* mm3 ready to be used as Prior(x-bpp) next loop */
2645 jb dpth3lp
2646 } /* end _asm block */
2647 }
2648 break;
2649
2650 case 6:
2651 case 7:
2652 case 5:
2653 {
2654 ActiveMask.use = 0x00000000ffffffff;
2655 ActiveMask2.use = 0xffffffff00000000;
2656 ShiftBpp.use = bpp << 3; /* == bpp * 8 */
2657 ShiftRem.use = 64 - ShiftBpp.use;
2658 _asm
2659 {
2660 mov ebx, diff
2661 mov edi, row
2662 mov esi, prev_row
2663 /* PRIME the pump (load the first Raw(x-bpp) data set) */
2664 movq mm1, [edi+ebx-8]
2665 pxor mm0, mm0
2666 dpth6lp:
2667 /* Must shift to position Raw(x-bpp) data */
2668 psrlq mm1, ShiftRem
2669 /* Do first set of 4 bytes */
2670 movq mm3, [esi+ebx-8] /* read c=Prior(x-bpp) bytes */
2671 punpcklbw mm1, mm0 /* Unpack Low bytes of a */
2672 movq mm2, [esi + ebx] /* load b=Prior(x) */
2673 punpcklbw mm2, mm0 /* Unpack Low bytes of b */
2674 /* Must shift to position Prior(x-bpp) data */
2675 psrlq mm3, ShiftRem
2676 /* pav = p - a = (a + b - c) - a = b - c */
2677 movq mm4, mm2
2678 punpcklbw mm3, mm0 /* Unpack Low bytes of c */
2679 /* pbv = p - b = (a + b - c) - b = a - c */
2680 movq mm5, mm1
2681 psubw mm4, mm3
2682 pxor mm7, mm7
2683 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
2684 movq mm6, mm4
2685 psubw mm5, mm3
2686 /* pa = abs(p-a) = abs(pav) */
2687 /* pb = abs(p-b) = abs(pbv) */
2688 /* pc = abs(p-c) = abs(pcv) */
2689 pcmpgtw mm0, mm4 /* Create mask pav bytes < 0 */
2690 paddw mm6, mm5
2691 pand mm0, mm4 /* Only pav bytes < 0 in mm0 */
2692 pcmpgtw mm7, mm5 /* Create mask pbv bytes < 0 */
2693 psubw mm4, mm0
2694 pand mm7, mm5 /* Only pbv bytes < 0 in mm7 */
2695 psubw mm4, mm0
2696 psubw mm5, mm7
2697 pxor mm0, mm0
2698 pcmpgtw mm0, mm6 /* Create mask pcv bytes < 0 */
2699 pand mm0, mm6 /* Only pcv bytes < 0 in mm0 */
2700 psubw mm5, mm7
2701 psubw mm6, mm0
2702 /* test pa <= pb */
2703 movq mm7, mm4
2704 psubw mm6, mm0
2705 pcmpgtw mm7, mm5 /* pa > pb? */
2706 movq mm0, mm7
2707 /* use mm7 mask to merge pa & pb */
2708 pand mm5, mm7
2709 /* use mm0 mask copy to merge a & b */
2710 pand mm2, mm0
2711 pandn mm7, mm4
2712 pandn mm0, mm1
2713 paddw mm7, mm5
2714 paddw mm0, mm2
2715 /* test ((pa <= pb)? pa:pb) <= pc */
2716 pcmpgtw mm7, mm6 /* pab > pc? */
2717 pxor mm1, mm1
2718 pand mm3, mm7
2719 pandn mm7, mm0
2720 paddw mm7, mm3
2721 pxor mm0, mm0
2722 packuswb mm7, mm1
2723 movq mm3, [esi + ebx - 8] /* load c=Prior(x-bpp) */
2724 pand mm7, ActiveMask
2725 psrlq mm3, ShiftRem
2726 movq mm2, [esi + ebx] /* load b=Prior(x) step 1 */
2727 paddb mm7, [edi + ebx] /* add Paeth predictor with Raw(x) */
c6b71bff 2728 movq mm6, mm2
2729 movq [edi + ebx], mm7 /* write back updated value */
2730 movq mm1, [edi+ebx-8]
2731 psllq mm6, ShiftBpp
2732 movq mm5, mm7
2733 psrlq mm1, ShiftRem
2734 por mm3, mm6
2735 psllq mm5, ShiftBpp
2736 punpckhbw mm3, mm0 /* Unpack High bytes of c */
2737 por mm1, mm5
2738 /* Do second set of 4 bytes */
2739 punpckhbw mm2, mm0 /* Unpack High bytes of b */
2740 punpckhbw mm1, mm0 /* Unpack High bytes of a */
2741 /* pav = p - a = (a + b - c) - a = b - c */
c6b71bff 2742 movq mm4, mm2
2743 /* pbv = p - b = (a + b - c) - b = a - c */
2744 movq mm5, mm1
2745 psubw mm4, mm3
2746 pxor mm7, mm7
2747 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
2748 movq mm6, mm4
2749 psubw mm5, mm3
2750 /* pa = abs(p-a) = abs(pav) */
2751 /* pb = abs(p-b) = abs(pbv) */
2752 /* pc = abs(p-c) = abs(pcv) */
2753 pcmpgtw mm0, mm4 /* Create mask pav bytes < 0 */
2754 paddw mm6, mm5
2755 pand mm0, mm4 /* Only pav bytes < 0 in mm0 */
2756 pcmpgtw mm7, mm5 /* Create mask pbv bytes < 0 */
2757 psubw mm4, mm0
2758 pand mm7, mm5 /* Only pbv bytes < 0 in mm7 */
2759 psubw mm4, mm0
2760 psubw mm5, mm7
2761 pxor mm0, mm0
2762 pcmpgtw mm0, mm6 /* Create mask pcv bytes < 0 */
2763 pand mm0, mm6 /* Only pcv bytes < 0 in mm0 */
2764 psubw mm5, mm7
2765 psubw mm6, mm0
2766 /* test pa <= pb */
2767 movq mm7, mm4
2768 psubw mm6, mm0
2769 pcmpgtw mm7, mm5 /* pa > pb? */
2770 movq mm0, mm7
2771 /* use mm7 mask to merge pa & pb */
2772 pand mm5, mm7
2773 /* use mm0 mask copy to merge a & b */
2774 pand mm2, mm0
2775 pandn mm7, mm4
2776 pandn mm0, mm1
2777 paddw mm7, mm5
2778 paddw mm0, mm2
2779 /* test ((pa <= pb)? pa:pb) <= pc */
2780 pcmpgtw mm7, mm6 /* pab > pc? */
2781 pxor mm1, mm1
2782 pand mm3, mm7
2783 pandn mm7, mm0
2784 pxor mm1, mm1
2785 paddw mm7, mm3
2786 pxor mm0, mm0
2787 /* Step ebx to next set of 8 bytes and repeat loop til done */
2788 add ebx, 8
2789 packuswb mm1, mm7
2790 paddb mm1, [edi + ebx - 8] /* add Paeth predictor with Raw(x) */
2791 cmp ebx, MMXLength
2792 movq [edi + ebx - 8], mm1 /* write back updated value */
2793 /* mm1 will be used as Raw(x-bpp) next loop */
2794 jb dpth6lp
2795 } /* end _asm block */
2796 }
2797 break;
2798
2799 case 4:
2800 {
2801 ActiveMask.use = 0x00000000ffffffff;
2802 _asm {
2803 mov ebx, diff
2804 mov edi, row
2805 mov esi, prev_row
2806 pxor mm0, mm0
2807 /* PRIME the pump (load the first Raw(x-bpp) data set) */
2808 movq mm1, [edi+ebx-8] /* Only time should need to read */
2809 /* a=Raw(x-bpp) bytes */
2810 dpth4lp:
2811 /* Do first set of 4 bytes */
2812 movq mm3, [esi+ebx-8] /* read c=Prior(x-bpp) bytes */
2813 punpckhbw mm1, mm0 /* Unpack High bytes of a */
2814 movq mm2, [esi + ebx] /* load b=Prior(x) */
2815 punpcklbw mm2, mm0 /* Unpack Low bytes of b */
2816 /* pav = p - a = (a + b - c) - a = b - c */
c6b71bff 2817 movq mm4, mm2
2818 punpckhbw mm3, mm0 /* Unpack High bytes of c */
2819 /* pbv = p - b = (a + b - c) - b = a - c */
2820 movq mm5, mm1
2821 psubw mm4, mm3
2822 pxor mm7, mm7
2823 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
2824 movq mm6, mm4
2825 psubw mm5, mm3
2826 /* pa = abs(p-a) = abs(pav) */
2827 /* pb = abs(p-b) = abs(pbv) */
2828 /* pc = abs(p-c) = abs(pcv) */
2829 pcmpgtw mm0, mm4 /* Create mask pav bytes < 0 */
2830 paddw mm6, mm5
2831 pand mm0, mm4 /* Only pav bytes < 0 in mm0 */
2832 pcmpgtw mm7, mm5 /* Create mask pbv bytes < 0 */
2833 psubw mm4, mm0
2834 pand mm7, mm5 /* Only pbv bytes < 0 in mm7 */
2835 psubw mm4, mm0
2836 psubw mm5, mm7
2837 pxor mm0, mm0
2838 pcmpgtw mm0, mm6 /* Create mask pcv bytes < 0 */
2839 pand mm0, mm6 /* Only pcv bytes < 0 in mm0 */
2840 psubw mm5, mm7
2841 psubw mm6, mm0
2842 /* test pa <= pb */
2843 movq mm7, mm4
2844 psubw mm6, mm0
2845 pcmpgtw mm7, mm5 /* pa > pb? */
2846 movq mm0, mm7
2847 /* use mm7 mask to merge pa & pb */
2848 pand mm5, mm7
2849 /* use mm0 mask copy to merge a & b */
2850 pand mm2, mm0
2851 pandn mm7, mm4
2852 pandn mm0, mm1
2853 paddw mm7, mm5
2854 paddw mm0, mm2
2855 /* test ((pa <= pb)? pa:pb) <= pc */
2856 pcmpgtw mm7, mm6 /* pab > pc? */
2857 pxor mm1, mm1
2858 pand mm3, mm7
2859 pandn mm7, mm0
2860 paddw mm7, mm3
2861 pxor mm0, mm0
2862 packuswb mm7, mm1
2863 movq mm3, [esi + ebx] /* load c=Prior(x-bpp) */
2864 pand mm7, ActiveMask
2865 movq mm2, mm3 /* load b=Prior(x) step 1 */
2866 paddb mm7, [edi + ebx] /* add Paeth predictor with Raw(x) */
2867 punpcklbw mm3, mm0 /* Unpack Low bytes of c */
2868 movq [edi + ebx], mm7 /* write back updated value */
2869 movq mm1, mm7 /* Now mm1 will be used as Raw(x-bpp) */
2870 /* Do second set of 4 bytes */
2871 punpckhbw mm2, mm0 /* Unpack High bytes of b */
2872 punpcklbw mm1, mm0 /* Unpack Low bytes of a */
2873 /* pav = p - a = (a + b - c) - a = b - c */
c6b71bff 2874 movq mm4, mm2
2875 /* pbv = p - b = (a + b - c) - b = a - c */
2876 movq mm5, mm1
2877 psubw mm4, mm3
2878 pxor mm7, mm7
2879 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
2880 movq mm6, mm4
2881 psubw mm5, mm3
2882 /* pa = abs(p-a) = abs(pav) */
2883 /* pb = abs(p-b) = abs(pbv) */
2884 /* pc = abs(p-c) = abs(pcv) */
2885 pcmpgtw mm0, mm4 /* Create mask pav bytes < 0 */
2886 paddw mm6, mm5
2887 pand mm0, mm4 /* Only pav bytes < 0 in mm0 */
2888 pcmpgtw mm7, mm5 /* Create mask pbv bytes < 0 */
2889 psubw mm4, mm0
2890 pand mm7, mm5 /* Only pbv bytes < 0 in mm7 */
2891 psubw mm4, mm0
2892 psubw mm5, mm7
2893 pxor mm0, mm0
2894 pcmpgtw mm0, mm6 /* Create mask pcv bytes < 0 */
2895 pand mm0, mm6 /* Only pcv bytes < 0 in mm0 */
2896 psubw mm5, mm7
2897 psubw mm6, mm0
2898 /* test pa <= pb */
2899 movq mm7, mm4
2900 psubw mm6, mm0
2901 pcmpgtw mm7, mm5 /* pa > pb? */
2902 movq mm0, mm7
2903 /* use mm7 mask to merge pa & pb */
2904 pand mm5, mm7
2905 /* use mm0 mask copy to merge a & b */
2906 pand mm2, mm0
2907 pandn mm7, mm4
2908 pandn mm0, mm1
2909 paddw mm7, mm5
2910 paddw mm0, mm2
2911 /* test ((pa <= pb)? pa:pb) <= pc */
2912 pcmpgtw mm7, mm6 /* pab > pc? */
2913 pxor mm1, mm1
2914 pand mm3, mm7
2915 pandn mm7, mm0
2916 pxor mm1, mm1
2917 paddw mm7, mm3
2918 pxor mm0, mm0
2919 /* Step ebx to next set of 8 bytes and repeat loop til done */
2920 add ebx, 8
2921 packuswb mm1, mm7
2922 paddb mm1, [edi + ebx - 8] /* add Paeth predictor with Raw(x) */
2923 cmp ebx, MMXLength
2924 movq [edi + ebx - 8], mm1 /* write back updated value */
2925 /* mm1 will be used as Raw(x-bpp) next loop */
2926 jb dpth4lp
2927 } /* end _asm block */
2928 }
2929 break;
2930 case 8: /* bpp == 8 */
2931 {
2932 ActiveMask.use = 0x00000000ffffffff;
2933 _asm {
2934 mov ebx, diff
2935 mov edi, row
2936 mov esi, prev_row
2937 pxor mm0, mm0
2938 /* PRIME the pump (load the first Raw(x-bpp) data set) */
2939 movq mm1, [edi+ebx-8] /* Only time should need to read */
2940 /* a=Raw(x-bpp) bytes */
2941 dpth8lp:
2942 /* Do first set of 4 bytes */
2943 movq mm3, [esi+ebx-8] /* read c=Prior(x-bpp) bytes */
2944 punpcklbw mm1, mm0 /* Unpack Low bytes of a */
2945 movq mm2, [esi + ebx] /* load b=Prior(x) */
2946 punpcklbw mm2, mm0 /* Unpack Low bytes of b */
2947 /* pav = p - a = (a + b - c) - a = b - c */
c6b71bff 2948 movq mm4, mm2
2949 punpcklbw mm3, mm0 /* Unpack Low bytes of c */
2950 /* pbv = p - b = (a + b - c) - b = a - c */
2951 movq mm5, mm1
2952 psubw mm4, mm3
2953 pxor mm7, mm7
2954 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
2955 movq mm6, mm4
2956 psubw mm5, mm3
2957 /* pa = abs(p-a) = abs(pav) */
2958 /* pb = abs(p-b) = abs(pbv) */
2959 /* pc = abs(p-c) = abs(pcv) */
2960 pcmpgtw mm0, mm4 /* Create mask pav bytes < 0 */
2961 paddw mm6, mm5
2962 pand mm0, mm4 /* Only pav bytes < 0 in mm0 */
2963 pcmpgtw mm7, mm5 /* Create mask pbv bytes < 0 */
2964 psubw mm4, mm0
2965 pand mm7, mm5 /* Only pbv bytes < 0 in mm7 */
2966 psubw mm4, mm0
2967 psubw mm5, mm7
2968 pxor mm0, mm0
2969 pcmpgtw mm0, mm6 /* Create mask pcv bytes < 0 */
2970 pand mm0, mm6 /* Only pcv bytes < 0 in mm0 */
2971 psubw mm5, mm7
2972 psubw mm6, mm0
2973 /* test pa <= pb */
2974 movq mm7, mm4
2975 psubw mm6, mm0
2976 pcmpgtw mm7, mm5 /* pa > pb? */
2977 movq mm0, mm7
2978 /* use mm7 mask to merge pa & pb */
2979 pand mm5, mm7
2980 /* use mm0 mask copy to merge a & b */
2981 pand mm2, mm0
2982 pandn mm7, mm4
2983 pandn mm0, mm1
2984 paddw mm7, mm5
2985 paddw mm0, mm2
2986 /* test ((pa <= pb)? pa:pb) <= pc */
2987 pcmpgtw mm7, mm6 /* pab > pc? */
2988 pxor mm1, mm1
2989 pand mm3, mm7
2990 pandn mm7, mm0
2991 paddw mm7, mm3
2992 pxor mm0, mm0
2993 packuswb mm7, mm1
2994 movq mm3, [esi+ebx-8] /* read c=Prior(x-bpp) bytes */
2995 pand mm7, ActiveMask
2996 movq mm2, [esi + ebx] /* load b=Prior(x) */
2997 paddb mm7, [edi + ebx] /* add Paeth predictor with Raw(x) */
2998 punpckhbw mm3, mm0 /* Unpack High bytes of c */
2999 movq [edi + ebx], mm7 /* write back updated value */
3000 movq mm1, [edi+ebx-8] /* read a=Raw(x-bpp) bytes */
3001
3002 /* Do second set of 4 bytes */
3003 punpckhbw mm2, mm0 /* Unpack High bytes of b */
3004 punpckhbw mm1, mm0 /* Unpack High bytes of a */
3005 /* pav = p - a = (a + b - c) - a = b - c */
c6b71bff 3006 movq mm4, mm2
3007 /* pbv = p - b = (a + b - c) - b = a - c */
3008 movq mm5, mm1
3009 psubw mm4, mm3
3010 pxor mm7, mm7
3011 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
3012 movq mm6, mm4
3013 psubw mm5, mm3
3014 /* pa = abs(p-a) = abs(pav) */
3015 /* pb = abs(p-b) = abs(pbv) */
3016 /* pc = abs(p-c) = abs(pcv) */
3017 pcmpgtw mm0, mm4 /* Create mask pav bytes < 0 */
3018 paddw mm6, mm5
3019 pand mm0, mm4 /* Only pav bytes < 0 in mm0 */
3020 pcmpgtw mm7, mm5 /* Create mask pbv bytes < 0 */
3021 psubw mm4, mm0
3022 pand mm7, mm5 /* Only pbv bytes < 0 in mm7 */
3023 psubw mm4, mm0
3024 psubw mm5, mm7
3025 pxor mm0, mm0
3026 pcmpgtw mm0, mm6 /* Create mask pcv bytes < 0 */
3027 pand mm0, mm6 /* Only pcv bytes < 0 in mm0 */
3028 psubw mm5, mm7
3029 psubw mm6, mm0
3030 /* test pa <= pb */
3031 movq mm7, mm4
3032 psubw mm6, mm0
3033 pcmpgtw mm7, mm5 /* pa > pb? */
3034 movq mm0, mm7
3035 /* use mm7 mask to merge pa & pb */
3036 pand mm5, mm7
3037 /* use mm0 mask copy to merge a & b */
3038 pand mm2, mm0
3039 pandn mm7, mm4
3040 pandn mm0, mm1
3041 paddw mm7, mm5
3042 paddw mm0, mm2
3043 /* test ((pa <= pb)? pa:pb) <= pc */
3044 pcmpgtw mm7, mm6 /* pab > pc? */
3045 pxor mm1, mm1
3046 pand mm3, mm7
3047 pandn mm7, mm0
3048 pxor mm1, mm1
3049 paddw mm7, mm3
3050 pxor mm0, mm0
3051 /* Step ebx to next set of 8 bytes and repeat loop til done */
3052 add ebx, 8
3053 packuswb mm1, mm7
3054 paddb mm1, [edi + ebx - 8] /* add Paeth predictor with Raw(x) */
3055 cmp ebx, MMXLength
3056 movq [edi + ebx - 8], mm1 /* write back updated value */
3057 /* mm1 will be used as Raw(x-bpp) next loop */
3058 jb dpth8lp
3059 } /* end _asm block */
3060 }
3061 break;
3062
3063 case 1: /* bpp = 1 */
3064 case 2: /* bpp = 2 */
3065 default: /* bpp > 8 */
3066 {
3067 _asm {
3068 mov ebx, diff
3069 cmp ebx, FullLength
3070 jnb dpthdend
3071 mov edi, row
3072 mov esi, prev_row
3073 /* Do Paeth decode for remaining bytes */
3074 mov edx, ebx
3075 xor ecx, ecx /* zero ecx before using cl & cx in loop below */
3076 sub edx, bpp /* Set edx = ebx - bpp */
3077 dpthdlp:
3078 xor eax, eax
3079 /* pav = p - a = (a + b - c) - a = b - c */
3080 mov al, [esi + ebx] /* load Prior(x) into al */
3081 mov cl, [esi + edx] /* load Prior(x-bpp) into cl */
3082 sub eax, ecx /* subtract Prior(x-bpp) */
3083 mov patemp, eax /* Save pav for later use */
3084 xor eax, eax
3085 /* pbv = p - b = (a + b - c) - b = a - c */
3086 mov al, [edi + edx] /* load Raw(x-bpp) into al */
3087 sub eax, ecx /* subtract Prior(x-bpp) */
3088 mov ecx, eax
3089 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
3090 add eax, patemp /* pcv = pav + pbv */
3091 /* pc = abs(pcv) */
3092 test eax, 0x80000000
3093 jz dpthdpca
3094 neg eax /* reverse sign of neg values */
3095 dpthdpca:
3096 mov pctemp, eax /* save pc for later use */
3097 /* pb = abs(pbv) */
3098 test ecx, 0x80000000
3099 jz dpthdpba
3100 neg ecx /* reverse sign of neg values */
3101 dpthdpba:
3102 mov pbtemp, ecx /* save pb for later use */
3103 /* pa = abs(pav) */
3104 mov eax, patemp
3105 test eax, 0x80000000
3106 jz dpthdpaa
3107 neg eax /* reverse sign of neg values */
3108 dpthdpaa:
3109 mov patemp, eax /* save pa for later use */
3110 /* test if pa <= pb */
3111 cmp eax, ecx
3112 jna dpthdabb
3113 /* pa > pb; now test if pb <= pc */
3114 cmp ecx, pctemp
3115 jna dpthdbbc
3116 /* pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp) */
3117 mov cl, [esi + edx] /* load Prior(x-bpp) into cl */
3118 jmp dpthdpaeth
3119 dpthdbbc:
3120 /* pb <= pc; Raw(x) = Paeth(x) + Prior(x) */
3121 mov cl, [esi + ebx] /* load Prior(x) into cl */
3122 jmp dpthdpaeth
3123 dpthdabb:
3124 /* pa <= pb; now test if pa <= pc */
3125 cmp eax, pctemp
3126 jna dpthdabc
3127 /* pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp) */
3128 mov cl, [esi + edx] /* load Prior(x-bpp) into cl */
3129 jmp dpthdpaeth
3130 dpthdabc:
3131 /* pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp) */
3132 mov cl, [edi + edx] /* load Raw(x-bpp) into cl */
3133 dpthdpaeth:
3134 inc ebx
3135 inc edx
3136 /* Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256 */
3137 add [edi + ebx - 1], cl
3138 cmp ebx, FullLength
3139 jb dpthdlp
3140 dpthdend:
3141 } /* end _asm block */
3142 }
3143 return; /* No need to go further with this one */
3144 } /* end switch ( bpp ) */
3145 _asm
3146 {
3147 /* MMX acceleration complete now do clean-up */
3148 /* Check if any remaining bytes left to decode */
3149 mov ebx, MMXLength
3150 cmp ebx, FullLength
3151 jnb dpthend
3152 mov edi, row
3153 mov esi, prev_row
3154 /* Do Paeth decode for remaining bytes */
3155 mov edx, ebx
3156 xor ecx, ecx /* zero ecx before using cl & cx in loop below */
3157 sub edx, bpp /* Set edx = ebx - bpp */
3158 dpthlp2:
3159 xor eax, eax
3160 /* pav = p - a = (a + b - c) - a = b - c */
3161 mov al, [esi + ebx] /* load Prior(x) into al */
3162 mov cl, [esi + edx] /* load Prior(x-bpp) into cl */
3163 sub eax, ecx /* subtract Prior(x-bpp) */
3164 mov patemp, eax /* Save pav for later use */
3165 xor eax, eax
3166 /* pbv = p - b = (a + b - c) - b = a - c */
3167 mov al, [edi + edx] /* load Raw(x-bpp) into al */
3168 sub eax, ecx /* subtract Prior(x-bpp) */
3169 mov ecx, eax
3170 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
3171 add eax, patemp /* pcv = pav + pbv */
3172 /* pc = abs(pcv) */
3173 test eax, 0x80000000
3174 jz dpthpca2
3175 neg eax /* reverse sign of neg values */
3176 dpthpca2:
3177 mov pctemp, eax /* save pc for later use */
3178 /* pb = abs(pbv) */
3179 test ecx, 0x80000000
3180 jz dpthpba2
3181 neg ecx /* reverse sign of neg values */
3182 dpthpba2:
3183 mov pbtemp, ecx /* save pb for later use */
3184 /* pa = abs(pav) */
3185 mov eax, patemp
3186 test eax, 0x80000000
3187 jz dpthpaa2
3188 neg eax /* reverse sign of neg values */
3189 dpthpaa2:
3190 mov patemp, eax /* save pa for later use */
3191 /* test if pa <= pb */
3192 cmp eax, ecx
3193 jna dpthabb2
3194 /* pa > pb; now test if pb <= pc */
3195 cmp ecx, pctemp
3196 jna dpthbbc2
3197 /* pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp) */
3198 mov cl, [esi + edx] /* load Prior(x-bpp) into cl */
3199 jmp dpthpaeth2
3200 dpthbbc2:
3201 /* pb <= pc; Raw(x) = Paeth(x) + Prior(x) */
3202 mov cl, [esi + ebx] /* load Prior(x) into cl */
3203 jmp dpthpaeth2
3204 dpthabb2:
3205 /* pa <= pb; now test if pa <= pc */
3206 cmp eax, pctemp
3207 jna dpthabc2
3208 /* pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp) */
3209 mov cl, [esi + edx] /* load Prior(x-bpp) into cl */
3210 jmp dpthpaeth2
3211 dpthabc2:
3212 /* pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp) */
3213 mov cl, [edi + edx] /* load Raw(x-bpp) into cl */
3214 dpthpaeth2:
3215 inc ebx
3216 inc edx
3217 /* Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256 */
3218 add [edi + ebx - 1], cl
3219 cmp ebx, FullLength
3220 jb dpthlp2
3221 dpthend:
3222 emms /* End MMX instructions; prep for possible FP instrs. */
3223 } /* end _asm block */
3224}
3225
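/* The branch ladders above implement the standard Paeth predictor from
   the PNG specification; a scalar sketch (not compiled) for reference:

      static int
      paeth_predictor(int a, int b, int c)  // a=left, b=above, c=upper-left
      {
         int p  = a + b - c;
         int pa = abs(p - a);               // == abs(b - c)
         int pb = abs(p - b);               // == abs(a - c)
         int pc = abs(p - c);               // == abs((a - c) + (b - c))
         if (pa <= pb && pa <= pc) return a;
         if (pb <= pc)             return b;
         return c;
      }
 */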
3226 /* Optimized code for PNG Sub filter decoder */
3227void /* PRIVATE */
3228png_read_filter_row_mmx_sub(png_row_infop row_info, png_bytep row)
3229{
3230 /*int test; */
3231 int bpp;
3232 png_uint_32 FullLength;
3233 png_uint_32 MMXLength;
3234 int diff;
3235
3236 bpp = (row_info->pixel_depth + 7) >> 3; /* Get # bytes per pixel */
3237 FullLength = row_info->rowbytes - bpp; /* # of bytes to filter */
3238 _asm {
3239 mov edi, row
3240 mov esi, edi /* lp = row */
3241 add edi, bpp /* rp = row + bpp */
c6b71bff 3242 xor eax, eax
3243 /* get # of bytes to alignment */
3244 mov diff, edi /* take start of row */
3245 add diff, 0xf /* add 7 + 8 to incr past */
3246 /* alignment boundary */
c6b71bff 3247 xor ebx, ebx
3248 and diff, 0xfffffff8 /* mask to alignment boundary */
3249 sub diff, edi /* subtract from start ==> value */
3250 /* ebx at alignment */
3251 jz dsubgo
3252 /* fix alignment */
3253 dsublp1:
3254 mov al, [esi+ebx]
3255 add [edi+ebx], al
3256 inc ebx
3257 cmp ebx, diff
3258 jb dsublp1
3259 dsubgo:
3260 mov ecx, FullLength
3261 mov edx, ecx
3262 sub edx, ebx /* subtract alignment fix */
3263 and edx, 0x00000007 /* calc bytes over mult of 8 */
3264 sub ecx, edx /* drop over bytes from length */
c6b71bff 3265 mov MMXLength, ecx
3266 } /* end _asm block */
3267
3268 /* Now do the math for the rest of the row */
3269 switch ( bpp )
3270 {
3271 case 3:
3272 {
3273 ActiveMask.use = 0x0000ffffff000000;
3274 ShiftBpp.use = 24; /* == 3 * 8 */
3275 ShiftRem.use = 40; /* == 64 - 24 */
3276 _asm {
3277 mov edi, row
3278 movq mm7, ActiveMask /* Load ActiveMask for 2nd active byte group */
3279 mov esi, edi /* lp = row */
3280 add edi, bpp /* rp = row + bpp */
c6b71bff
GD
3281 movq mm6, mm7
3282 mov ebx, diff
3283 psllq mm6, ShiftBpp /* Move mask in mm6 to cover 3rd active */
3284 /* byte group */
3285 /* PRIME the pump (load the first Raw(x-bpp) data set) */
3286 movq mm1, [edi+ebx-8]
3287 dsub3lp:
3288 psrlq mm1, ShiftRem /* Shift data for adding 1st bpp bytes */
3289 /* no need for mask; shift clears inactive bytes */
3290 /* Add 1st active group */
c6b71bff
GD
3291 movq mm0, [edi+ebx]
3292 paddb mm0, mm1
3293 /* Add 2nd active group */
3294 movq mm1, mm0 /* mov updated Raws to mm1 */
3295 psllq mm1, ShiftBpp /* shift data to position correctly */
3296 pand mm1, mm7 /* mask to use only 2nd active group */
c6b71bff 3297 paddb mm0, mm1
3298 /* Add 3rd active group */
3299 movq mm1, mm0 /* mov updated Raws to mm1 */
3300 psllq mm1, ShiftBpp /* shift data to position correctly */
3301 pand mm1, mm6 /* mask to use only 3rd active group */
3302 add ebx, 8
3303 paddb mm0, mm1
3304 cmp ebx, MMXLength
3305 movq [edi+ebx-8], mm0 /* Write updated Raws back to array */
3306 /* Prep for doing 1st add at top of loop */
3307 movq mm1, mm0
3308 jb dsub3lp
3309 } /* end _asm block */
3310 }
3311 break;
3312
3313 case 1:
3314 {
3315 /* Placed here just in case this is a duplicate of the */
3316 /* non-MMX code for the SUB filter in png_read_filter_row below */
3317 //
3318 /* png_bytep rp; */
3319 /* png_bytep lp; */
3320 /* png_uint_32 i; */
3321 /* bpp = (row_info->pixel_depth + 7) >> 3; */
3322 /* for (i = (png_uint_32)bpp, rp = row + bpp, lp = row; */
3323 /* i < row_info->rowbytes; i++, rp++, lp++) */
3324 /* { */
3325 /* *rp = (png_byte)(((int)(*rp) + (int)(*lp)) & 0xff); */
3326 /* } */
3327 _asm {
3328 mov ebx, diff
3329 mov edi, row
3330 cmp ebx, FullLength
3331 jnb dsub1end
3332 mov esi, edi /* lp = row */
3333 xor eax, eax
3334 add edi, bpp /* rp = row + bpp */
3335 dsub1lp:
3336 mov al, [esi+ebx]
3337 add [edi+ebx], al
3338 inc ebx
3339 cmp ebx, FullLength
3340 jb dsub1lp
3341 dsub1end:
3342 } /* end _asm block */
3343 }
3344 return;
3345
3346 case 6:
3347 case 7:
3348 case 4:
3349 case 5:
3350 {
3351 ShiftBpp.use = bpp << 3;
3352 ShiftRem.use = 64 - ShiftBpp.use;
3353 _asm {
3354 mov edi, row
3355 mov ebx, diff
3356 mov esi, edi /* lp = row */
3357 add edi, bpp /* rp = row + bpp */
3358 /* PRIME the pump (load the first Raw(x-bpp) data set) */
3359 movq mm1, [edi+ebx-8]
3360 dsub4lp:
3361 psrlq mm1, ShiftRem /* Shift data for adding 1st bpp bytes */
3362 /* no need for mask; shift clears inactive bytes */
c6b71bff
GD
3363 movq mm0, [edi+ebx]
3364 paddb mm0, mm1
3365 /* Add 2nd active group */
3366 movq mm1, mm0 /* mov updated Raws to mm1 */
3367 psllq mm1, ShiftBpp /* shift data to position correctly */
3368 /* there is no need for any mask */
3369 /* since shift clears inactive bits/bytes */
3370 add ebx, 8
3371 paddb mm0, mm1
3372 cmp ebx, MMXLength
3373 movq [edi+ebx-8], mm0
3374 movq mm1, mm0 /* Prep for doing 1st add at top of loop */
3375 jb dsub4lp
3376 } /* end _asm block */
3377 }
3378 break;
3379
3380 case 2:
3381 {
3382 ActiveMask.use = 0x00000000ffff0000;
3383 ShiftBpp.use = 16; /* == 2 * 8 */
3384 ShiftRem.use = 48; /* == 64 - 16 */
3385 _asm {
3386 movq mm7, ActiveMask /* Load ActiveMask for 2nd active byte group */
3387 mov ebx, diff
3388 movq mm6, mm7
3389 mov edi, row
3390 psllq mm6, ShiftBpp /* Move mask in mm6 to cover 3rd active */
3391 /* byte group */
3392 mov esi, edi /* lp = row */
c6b71bff 3393 movq mm5, mm6
3394 add edi, bpp /* rp = row + bpp */
3395 psllq mm5, ShiftBpp /* Move mask in mm5 to cover 4th active */
3396 /* byte group */
3397 /* PRIME the pump (load the first Raw(x-bpp) data set) */
3398 movq mm1, [edi+ebx-8]
3399 dsub2lp:
3400 /* Add 1st active group */
3401 psrlq mm1, ShiftRem /* Shift data for adding 1st bpp bytes */
3402 /* no need for mask; shift clears inactive */
3403 /* bytes */
3404 movq mm0, [edi+ebx]
3405 paddb mm0, mm1
3406 /* Add 2nd active group */
3407 movq mm1, mm0 /* mov updated Raws to mm1 */
3408 psllq mm1, ShiftBpp /* shift data to position correctly */
3409 pand mm1, mm7 /* mask to use only 2nd active group */
c6b71bff 3410 paddb mm0, mm1
3411 /* Add 3rd active group */
3412 movq mm1, mm0 /* mov updated Raws to mm1 */
3413 psllq mm1, ShiftBpp /* shift data to position correctly */
3414 pand mm1, mm6 /* mask to use only 3rd active group */
c6b71bff 3415 paddb mm0, mm1
3416 /* Add 4th active group */
3417 movq mm1, mm0 /* mov updated Raws to mm1 */
3418 psllq mm1, ShiftBpp /* shift data to position correctly */
3419 pand mm1, mm5 /* mask to use only 4th active group */
3420 add ebx, 8
3421 paddb mm0, mm1
3422 cmp ebx, MMXLength
3423 movq [edi+ebx-8], mm0 /* Write updated Raws back to array */
3424 movq mm1, mm0 /* Prep for doing 1st add at top of loop */
3425 jb dsub2lp
3426 } /* end _asm block */
3427 }
3428 break;
3429 case 8:
3430 {
3431 _asm {
3432 mov edi, row
3433 mov ebx, diff
3434 mov esi, edi /* lp = row */
3435 add edi, bpp /* rp = row + bpp */
c6b71bff 3436 mov ecx, MMXLength
3437 movq mm7, [edi+ebx-8] /* PRIME the pump (load the first */
3438 /* Raw(x-bpp) data set) */
3439 and ecx, 0x0000003f /* calc bytes over mult of 64 */
3440 dsub8lp:
3441 movq mm0, [edi+ebx] /* Load Sub(x) for 1st 8 bytes */
c6b71bff 3442 paddb mm0, mm7
3443 movq mm1, [edi+ebx+8] /* Load Sub(x) for 2nd 8 bytes */
3444 movq [edi+ebx], mm0 /* Write Raw(x) for 1st 8 bytes */
3445 /* Now mm0 will be used as Raw(x-bpp) for */
3446 /* the 2nd group of 8 bytes. This will be */
3447 /* repeated for each group of 8 bytes with */
3448 /* the 8th group being used as the Raw(x-bpp) */
3449 /* for the 1st group of the next loop. */
c6b71bff 3450 paddb mm1, mm0
3451 movq mm2, [edi+ebx+16] /* Load Sub(x) for 3rd 8 bytes */
3452 movq [edi+ebx+8], mm1 /* Write Raw(x) for 2nd 8 bytes */
c6b71bff 3453 paddb mm2, mm1
3454 movq mm3, [edi+ebx+24] /* Load Sub(x) for 4th 8 bytes */
3455 movq [edi+ebx+16], mm2 /* Write Raw(x) for 3rd 8 bytes */
c6b71bff 3456 paddb mm3, mm2
3457 movq mm4, [edi+ebx+32] /* Load Sub(x) for 5th 8 bytes */
3458 movq [edi+ebx+24], mm3 /* Write Raw(x) for 4th 8 bytes */
c6b71bff 3459 paddb mm4, mm3
3460 movq mm5, [edi+ebx+40] /* Load Sub(x) for 6th 8 bytes */
3461 movq [edi+ebx+32], mm4 /* Write Raw(x) for 5th 8 bytes */
c6b71bff 3462 paddb mm5, mm4
3463 movq mm6, [edi+ebx+48] /* Load Sub(x) for 7th 8 bytes */
3464 movq [edi+ebx+40], mm5 /* Write Raw(x) for 6th 8 bytes */
c6b71bff 3465 paddb mm6, mm5
3466 movq mm7, [edi+ebx+56] /* Load Sub(x) for 8th 8 bytes */
3467 movq [edi+ebx+48], mm6 /* Write Raw(x) for 7th 8 bytes */
3468 add ebx, 64
3469 paddb mm7, mm6
3470 cmp ebx, ecx
3471 movq [edi+ebx-8], mm7 /* Write Raw(x) for 8th 8 bytes */
3472 jb dsub8lp
3473 cmp ebx, MMXLength
3474 jnb dsub8lt8
3475 dsub8lpA:
3476 movq mm0, [edi+ebx]
3477 add ebx, 8
3478 paddb mm0, mm7
3479 cmp ebx, MMXLength
3480 movq [edi+ebx-8], mm0 /* use -8 to offset early add to ebx */
3481 movq mm7, mm0 /* Move calculated Raw(x) data to mm1 to */
3482 /* be the new Raw(x-bpp) for the next loop */
c6b71bff
GD
3483 jb dsub8lpA
3484 dsub8lt8:
3485 } /* end _asm block */
3486 }
3487 break;
3488
3489 default: /* bpp greater than 8 bytes */
3490 {
3491 _asm {
3492 mov ebx, diff
3493 mov edi, row
3494 mov esi, edi /* lp = row */
3495 add edi, bpp /* rp = row + bpp */
3496 dsubAlp:
3497 movq mm0, [edi+ebx]
3498 movq mm1, [esi+ebx]
3499 add ebx, 8
3500 paddb mm0, mm1
3501 cmp ebx, MMXLength
3502 movq [edi+ebx-8], mm0 /* mov does not affect flags; -8 to offset */
3503 /* add ebx */
3504 jb dsubAlp
3505 } /* end _asm block */
3506 }
3507 break;
3508
3509 } /* end switch ( bpp ) */
3510
3511 _asm {
3512 mov ebx, MMXLength
3513 mov edi, row
3514 cmp ebx, FullLength
3515 jnb dsubend
3516 mov esi, edi /* lp = row */
3517 xor eax, eax
3518 add edi, bpp /* rp = row + bpp */
3519 dsublp2:
3520 mov al, [esi+ebx]
3521 add [edi+ebx], al
3522 inc ebx
3523 cmp ebx, FullLength
3524 jb dsublp2
3525 dsubend:
3526 emms /* End MMX instructions; prep for possible FP instrs. */
3527 } /* end _asm block */
3528}
3529
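/* A scalar reference for the Sub decode above (a sketch, not compiled).
   Each output byte is a running sum of the filtered bytes, which is why
   the unrolled bpp == 8 loop must chain every quadword add off the
   previously decoded quadword:

      png_uint_32 i;
      int bpp = (row_info->pixel_depth + 7) >> 3;
      for (i = (png_uint_32)bpp; i < row_info->rowbytes; i++)
         row[i] = (png_byte)(row[i] + row[i - bpp]);
 */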
3530 /* Optimized code for PNG Up filter decoder */
3531void /* PRIVATE */
3532png_read_filter_row_mmx_up(png_row_infop row_info, png_bytep row,
3533 png_bytep prev_row)
3534{
3535 png_uint_32 len;
3536 len = row_info->rowbytes; /* # of bytes to filter */
3537 _asm {
3538 mov edi, row
3539 /* get # of bytes to alignment */
3540 mov ecx, edi
3541 xor ebx, ebx
3542 add ecx, 0x7
3543 xor eax, eax
3544 and ecx, 0xfffffff8
3545 mov esi, prev_row
3546 sub ecx, edi
3547 jz dupgo
3548 /* fix alignment */
3549 duplp1:
3550 mov al, [edi+ebx]
3551 add al, [esi+ebx]
3552 inc ebx
3553 cmp ebx, ecx
3554 mov [edi + ebx-1], al /* mov does not affect flags; -1 to offset inc ebx */
3555 jb duplp1
3556 dupgo:
3557 mov ecx, len
3558 mov edx, ecx
3559 sub edx, ebx /* subtract alignment fix */
3560 and edx, 0x0000003f /* calc bytes over mult of 64 */
3561 sub ecx, edx /* drop over bytes from length */
3562 /* Unrolled loop - use all MMX registers and interleave to reduce */
3563 /* number of branch instructions (loops) and reduce partial stalls */
3564 duploop:
3565 movq mm1, [esi+ebx]
3566 movq mm0, [edi+ebx]
3567 movq mm3, [esi+ebx+8]
3568 paddb mm0, mm1
3569 movq mm2, [edi+ebx+8]
3570 movq [edi+ebx], mm0
3571 paddb mm2, mm3
3572 movq mm5, [esi+ebx+16]
3573 movq [edi+ebx+8], mm2
3574 movq mm4, [edi+ebx+16]
3575 movq mm7, [esi+ebx+24]
3576 paddb mm4, mm5
3577 movq mm6, [edi+ebx+24]
3578 movq [edi+ebx+16], mm4
3579 paddb mm6, mm7
3580 movq mm1, [esi+ebx+32]
3581 movq [edi+ebx+24], mm6
3582 movq mm0, [edi+ebx+32]
3583 movq mm3, [esi+ebx+40]
3584 paddb mm0, mm1
3585 movq mm2, [edi+ebx+40]
3586 movq [edi+ebx+32], mm0
3587 paddb mm2, mm3
3588 movq mm5, [esi+ebx+48]
3589 movq [edi+ebx+40], mm2
3590 movq mm4, [edi+ebx+48]
3591 movq mm7, [esi+ebx+56]
3592 paddb mm4, mm5
3593 movq mm6, [edi+ebx+56]
3594 movq [edi+ebx+48], mm4
3595 add ebx, 64
3596 paddb mm6, mm7
3597 cmp ebx, ecx
3598 movq [edi+ebx-8], mm6 /* (+56)movq does not affect flags; */
3599 /* -8 to offset add ebx */
3600 jb duploop
3601
3602 cmp edx, 0 /* Test for bytes over mult of 64 */
3603 jz dupend
3604
3605
3606 /* 2 lines added by lcreeve@netins.net */
3607 /* (mail 11 Jul 98 in png-implement list) */
3608 cmp edx, 8 /* test for less than 8 bytes */
3609 jb duplt8
3610
3611
3612 add ecx, edx
3613 and edx, 0x00000007 /* calc bytes over mult of 8 */
3614 sub ecx, edx /* drop over bytes from length */
3615 jz duplt8
3616 /* Loop using MMX registers mm0 & mm1 to update 8 bytes simultaneously */
3617 duplpA:
3618 movq mm1, [esi+ebx]
3619 movq mm0, [edi+ebx]
3620 add ebx, 8
3621 paddb mm0, mm1
3622 cmp ebx, ecx
3623 movq [edi+ebx-8], mm0 /* movq does not affect flags; -8 to offset add ebx */
3624 jb duplpA
3625 cmp edx, 0 /* Test for bytes over mult of 8 */
3626 jz dupend
3627 duplt8:
3628 xor eax, eax
3629 add ecx, edx /* move over byte count into counter */
3630 /* Loop using x86 registers to update remaining bytes */
3631 duplp2:
3632 mov al, [edi + ebx]
3633 add al, [esi + ebx]
3634 inc ebx
3635 cmp ebx, ecx
3636 mov [edi + ebx-1], al /* mov does not affect flags; -1 to offset inc ebx */
3637 jb duplp2
3638 dupend:
3639 /* Conversion of filtered row completed */
3640 emms /* End MMX instructions; prep for possible FP instrs. */
3641 } /* end _asm block */
3642}
3643
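/* Unlike Sub and Paeth, the Up filter has no intra-row dependency, which
   is what permits the deep eight-register interleaving above; a scalar
   sketch (not compiled):

      png_uint_32 i;
      for (i = 0; i < row_info->rowbytes; i++)
         row[i] = (png_byte)(row[i] + prev_row[i]);
 */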
3644
3645 /* Optimized png_read_filter_row routines */
3646void /* PRIVATE */
3647png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep
3648 row, png_bytep prev_row, int filter)
3649{
3650#ifdef PNG_DEBUG
3651 char filnm[10];
3652#endif
3653
3654 if (mmx_supported == 2) {
3655 /* this should have happened in png_init_mmx_flags() already */
3656 png_warning(png_ptr, "asm_flags may not have been initialized");
3657 png_mmx_support();
3658 }
3659
3660#ifdef PNG_DEBUG
3661 png_debug(1, "in png_read_filter_row\n");
3662 switch (filter)
3663 {
3664 case 0: sprintf(filnm, "none");
3665 break;
3666 case 1: sprintf(filnm, "sub-%s",
3667 (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB)? "MMX" : "x86");
3668 break;
3669 case 2: sprintf(filnm, "up-%s",
3670 (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP)? "MMX" : "x86");
3671 break;
3672 case 3: sprintf(filnm, "avg-%s",
3673 (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG)? "MMX" : "x86");
3674 break;
3675 case 4: sprintf(filnm, "Paeth-%s",
3676 (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH)? "MMX":"x86");
3677 break;
3678 default: sprintf(filnm, "unknw");
3679 break;
3680 }
3681 png_debug2(0,"row=%5d, %s, ", png_ptr->row_number, filnm);
3682 png_debug2(0, "pd=%2d, b=%d, ", (int)row_info->pixel_depth,
3683 (int)((row_info->pixel_depth + 7) >> 3));
3684 png_debug1(0,"len=%8d, ", row_info->rowbytes);
3685#endif /* PNG_DEBUG */
3686
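   /* Each filter case below takes the MMX path only when its
    * PNG_ASM_FLAG_MMX_READ_FILTER_* bit is set and the row meets both the
    * bit-depth and rowbytes thresholds; otherwise the portable C
    * implementation of the same filter runs.
    */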
   switch (filter)
   {
      case PNG_FILTER_VALUE_NONE:
         break;

      case PNG_FILTER_VALUE_SUB:
      {
         if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB) &&
             (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
             (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
         {
            png_read_filter_row_mmx_sub(row_info, row);
         }
         else
         {
            png_uint_32 i;
            png_uint_32 istop = row_info->rowbytes;
            png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
            png_bytep rp = row + bpp;
            png_bytep lp = row;

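            /* The first bpp bytes have no pixel to their left, so the Sub
             * filter leaves them unchanged; decoding starts at i = bpp.
             */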
            for (i = bpp; i < istop; i++)
            {
               *rp = (png_byte)(((int)(*rp) + (int)(*lp++)) & 0xff);
               rp++;
            }
         }
         break;
      }

      case PNG_FILTER_VALUE_UP:
      {
         if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP) &&
             (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
             (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
         {
            png_read_filter_row_mmx_up(row_info, row, prev_row);
         }
         else
         {
            png_uint_32 i;
            png_uint_32 istop = row_info->rowbytes;
            png_bytep rp = row;
            png_bytep pp = prev_row;

            for (i = 0; i < istop; ++i)
            {
               *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
               rp++;
            }
         }
         break;
      }

      case PNG_FILTER_VALUE_AVG:
      {
         if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG) &&
             (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
             (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
         {
            png_read_filter_row_mmx_avg(row_info, row, prev_row);
         }
         else
         {
            png_uint_32 i;
            png_bytep rp = row;
            png_bytep pp = prev_row;
            png_bytep lp = row;
            png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
            png_uint_32 istop = row_info->rowbytes - bpp;

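            /* The first bpp bytes have no left neighbor, so the Average
             * filter degenerates to adding prev/2 for them.
             */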
            for (i = 0; i < bpp; i++)
            {
               *rp = (png_byte)(((int)(*rp) +
                  ((int)(*pp++) >> 1)) & 0xff);
               rp++;
            }

            for (i = 0; i < istop; i++)
            {
               *rp = (png_byte)(((int)(*rp) +
                  ((int)(*pp++ + *lp++) >> 1)) & 0xff);
               rp++;
            }
         }
         break;
      }

      case PNG_FILTER_VALUE_PAETH:
      {
         if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH) &&
             (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
             (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
         {
            png_read_filter_row_mmx_paeth(row_info, row, prev_row);
         }
         else
         {
            png_uint_32 i;
            png_bytep rp = row;
            png_bytep pp = prev_row;
            png_bytep lp = row;
            png_bytep cp = prev_row;
            png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
            png_uint_32 istop = row_info->rowbytes - bpp;

            for (i = 0; i < bpp; i++)
            {
               *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
               rp++;
            }

            for (i = 0; i < istop; i++)   /* use leftover rp,pp */
            {
               int a, b, c, pa, pb, pc, p;

               a = *lp++;
               b = *pp++;
               c = *cp++;

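               /* Paeth predictor: with p0 = a + b - c, the distances are
                * pa = |p0 - a| = |b - c|, pb = |p0 - b| = |a - c|, and
                * pc = |p0 - c| = |(b - c) + (a - c)|, so only the two
                * differences computed below are needed.
                */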
               p = b - c;
               pc = a - c;

#ifdef PNG_USE_ABS
               pa = abs(p);
               pb = abs(pc);
               pc = abs(p + pc);
#else
               pa = p < 0 ? -p : p;
               pb = pc < 0 ? -pc : pc;
               pc = (p + pc) < 0 ? -(p + pc) : p + pc;
#endif

               /*
                  if (pa <= pb && pa <= pc)
                     p = a;
                  else if (pb <= pc)
                     p = b;
                  else
                     p = c;
                */

               p = (pa <= pb && pa <= pc) ? a : (pb <= pc) ? b : c;

               *rp = (png_byte)(((int)(*rp) + p) & 0xff);
               rp++;
            }
         }
         break;
      }

      default:
         png_warning(png_ptr, "Ignoring bad row filter type");
         *row = 0;
         break;
   }
}
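
/* A usage sketch (never compiled) for steering the dispatch above from an
 * application: the accessor names png_set_mmx_thresholds(), png_set_asm_flags(),
 * and png_get_asm_flags() are assumed from png.h of this assembler-enabled
 * libpng branch; verify them against your copy before relying on this.
 */
#if 0
   /* Use the C fallbacks for rows narrower than 512 bytes: */
   png_set_mmx_thresholds(png_ptr, 8, 512);

   /* Or disable only the MMX Paeth routine: */
   png_set_asm_flags(png_ptr,
      png_get_asm_flags(png_ptr) & ~PNG_ASM_FLAG_MMX_READ_FILTER_PAETH);
#endif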

#endif /* PNG_ASSEMBLER_CODE_SUPPORTED && PNG_USE_PNGVCRD */