]> git.saurik.com Git - wxWidgets.git/blame - src/png/pngvcrd.c
reduce black background flicker a bit - there is still some left sometimes initially...
[wxWidgets.git] / src / png / pngvcrd.c
CommitLineData
c6b71bff
GD
1/* pngvcrd.c - mixed C/assembler version of utilities to read a PNG file
2 *
3 * For Intel x86 CPU and Microsoft Visual C++ compiler
4 *
706b8807 5 * libpng version 1.2.7 - September 12, 2004
c6b71bff 6 * For conditions of distribution and use, see copyright notice in png.h
5b02c8a1 7 * Copyright (c) 1998-2004 Glenn Randers-Pehrson
c6b71bff
GD
8 * Copyright (c) 1998, Intel Corporation
9 *
10 * Contributed by Nirav Chhatrapati, Intel Corporation, 1998
11 * Interface to libpng contributed by Gilles Vollant, 1999
12 *
13 *
14 * In png_do_read_interlace() in libpng versions 1.0.3a through 1.0.4d,
15 * a sign error in the post-MMX cleanup code for each pixel_depth resulted
16 * in bad pixels at the beginning of some rows of some images, and also
17 * (due to out-of-range memory reads and writes) caused heap corruption
18 * when compiled with MSVC 6.0. The error was fixed in version 1.0.4e.
19 *
20 * [png_read_filter_row_mmx_avg() bpp == 2 bugfix, GRR 20000916]
21 *
22 * [runtime MMX configuration, GRR 20010102]
23 *
24 */
25
26#define PNG_INTERNAL
27#include "png.h"
28
29#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_USE_PNGVCRD)
30
31static int mmx_supported=2;
32
33
34int PNGAPI
35png_mmx_support(void)
36{
37 int mmx_supported_local = 0;
38 _asm {
7f88f624 39 push ebx /*CPUID will trash these */
c6b71bff
GD
40 push ecx
41 push edx
42
7f88f624
VZ
43 pushfd /*Save Eflag to stack */
44 pop eax /*Get Eflag from stack into eax */
45 mov ecx, eax /*Make another copy of Eflag in ecx */
46 xor eax, 0x200000 /*Toggle ID bit in Eflag [i.e. bit(21)] */
47 push eax /*Save modified Eflag back to stack */
48
49 popfd /*Restored modified value back to Eflag reg */
50 pushfd /*Save Eflag to stack */
51 pop eax /*Get Eflag from stack */
52 push ecx /* save original Eflag to stack */
53 popfd /* restore original Eflag */
54 xor eax, ecx /*Compare the new Eflag with the original Eflag */
55 jz NOT_SUPPORTED /*If the same, CPUID instruction is not supported, */
56 /*skip following instructions and jump to */
57 /*NOT_SUPPORTED label */
58
59 xor eax, eax /*Set eax to zero */
60
61 _asm _emit 0x0f /*CPUID instruction (two bytes opcode) */
c6b71bff
GD
62 _asm _emit 0xa2
63
7f88f624
VZ
64 cmp eax, 1 /*make sure eax return non-zero value */
65 jl NOT_SUPPORTED /*If eax is zero, mmx not supported */
c6b71bff 66
7f88f624
VZ
67 xor eax, eax /*set eax to zero */
68 inc eax /*Now increment eax to 1. This instruction is */
69 /*faster than the instruction "mov eax, 1" */
c6b71bff 70
7f88f624 71 _asm _emit 0x0f /*CPUID instruction */
c6b71bff
GD
72 _asm _emit 0xa2
73
7f88f624
VZ
74 and edx, 0x00800000 /*mask out all bits but mmx bit(24) */
75 cmp edx, 0 /* 0 = mmx not supported */
76 jz NOT_SUPPORTED /* non-zero = Yes, mmx IS supported */
c6b71bff 77
7f88f624 78 mov mmx_supported_local, 1 /*set return value to 1 */
c6b71bff
GD
79
80NOT_SUPPORTED:
7f88f624
VZ
81 mov eax, mmx_supported_local /*move return value to eax */
82 pop edx /*CPUID trashed these */
c6b71bff
GD
83 pop ecx
84 pop ebx
85 }
86
7f88f624
VZ
87 /*mmx_supported_local=0; // test code for force don't support MMX */
88 /*printf("MMX : %u (1=MMX supported)\n",mmx_supported_local); */
c6b71bff
GD
89
90 mmx_supported = mmx_supported_local;
91 return mmx_supported_local;
92}
93
94/* Combines the row recently read in with the previous row.
95 This routine takes care of alpha and transparency if requested.
96 This routine also handles the two methods of progressive display
97 of interlaced images, depending on the mask value.
98 The mask value describes which pixels are to be combined with
99 the row. The pattern always repeats every 8 pixels, so just 8
100 bits are needed. A one indicates the pixel is to be combined; a
101 zero indicates the pixel is to be skipped. This is in addition
102 to any alpha or transparency value associated with the pixel. If
103 you want all pixels to be combined, pass 0xff (255) in mask. */
104
105/* Use this routine for x86 platform - uses faster MMX routine if machine
106 supports MMX */
107
108void /* PRIVATE */
109png_combine_row(png_structp png_ptr, png_bytep row, int mask)
110{
111#ifdef PNG_USE_LOCAL_ARRAYS
112 const int png_pass_inc[7] = {8, 8, 4, 4, 2, 2, 1};
113#endif
114
115 png_debug(1,"in png_combine_row_asm\n");
116
117 if (mmx_supported == 2) {
5b02c8a1 118#if !defined(PNG_1_0_X)
c6b71bff
GD
119 /* this should have happened in png_init_mmx_flags() already */
120 png_warning(png_ptr, "asm_flags may not have been initialized");
5b02c8a1 121#endif
c6b71bff
GD
122 png_mmx_support();
123 }
124
125 if (mask == 0xff)
126 {
127 png_memcpy(row, png_ptr->row_buf + 1,
5b02c8a1
VS
128 (png_size_t)PNG_ROWBYTES(png_ptr->row_info.pixel_depth,
129 png_ptr->width));
c6b71bff
GD
130 }
131 /* GRR: add "else if (mask == 0)" case?
132 * or does png_combine_row() not even get called in that case? */
133 else
134 {
135 switch (png_ptr->row_info.pixel_depth)
136 {
137 case 1:
138 {
139 png_bytep sp;
140 png_bytep dp;
141 int s_inc, s_start, s_end;
142 int m;
143 int shift;
144 png_uint_32 i;
145
146 sp = png_ptr->row_buf + 1;
147 dp = row;
148 m = 0x80;
149#if defined(PNG_READ_PACKSWAP_SUPPORTED)
150 if (png_ptr->transformations & PNG_PACKSWAP)
151 {
152 s_start = 0;
153 s_end = 7;
154 s_inc = 1;
155 }
156 else
157#endif
158 {
159 s_start = 7;
160 s_end = 0;
161 s_inc = -1;
162 }
163
164 shift = s_start;
165
166 for (i = 0; i < png_ptr->width; i++)
167 {
168 if (m & mask)
169 {
170 int value;
171
172 value = (*sp >> shift) & 0x1;
173 *dp &= (png_byte)((0x7f7f >> (7 - shift)) & 0xff);
174 *dp |= (png_byte)(value << shift);
175 }
176
177 if (shift == s_end)
178 {
179 shift = s_start;
180 sp++;
181 dp++;
182 }
183 else
184 shift += s_inc;
185
186 if (m == 1)
187 m = 0x80;
188 else
189 m >>= 1;
190 }
191 break;
192 }
193
194 case 2:
195 {
196 png_bytep sp;
197 png_bytep dp;
198 int s_start, s_end, s_inc;
199 int m;
200 int shift;
201 png_uint_32 i;
202 int value;
203
204 sp = png_ptr->row_buf + 1;
205 dp = row;
206 m = 0x80;
207#if defined(PNG_READ_PACKSWAP_SUPPORTED)
208 if (png_ptr->transformations & PNG_PACKSWAP)
209 {
210 s_start = 0;
211 s_end = 6;
212 s_inc = 2;
213 }
214 else
215#endif
216 {
217 s_start = 6;
218 s_end = 0;
219 s_inc = -2;
220 }
221
222 shift = s_start;
223
224 for (i = 0; i < png_ptr->width; i++)
225 {
226 if (m & mask)
227 {
228 value = (*sp >> shift) & 0x3;
229 *dp &= (png_byte)((0x3f3f >> (6 - shift)) & 0xff);
230 *dp |= (png_byte)(value << shift);
231 }
232
233 if (shift == s_end)
234 {
235 shift = s_start;
236 sp++;
237 dp++;
238 }
239 else
240 shift += s_inc;
241 if (m == 1)
242 m = 0x80;
243 else
244 m >>= 1;
245 }
246 break;
247 }
248
249 case 4:
250 {
251 png_bytep sp;
252 png_bytep dp;
253 int s_start, s_end, s_inc;
254 int m;
255 int shift;
256 png_uint_32 i;
257 int value;
258
259 sp = png_ptr->row_buf + 1;
260 dp = row;
261 m = 0x80;
262#if defined(PNG_READ_PACKSWAP_SUPPORTED)
263 if (png_ptr->transformations & PNG_PACKSWAP)
264 {
265 s_start = 0;
266 s_end = 4;
267 s_inc = 4;
268 }
269 else
270#endif
271 {
272 s_start = 4;
273 s_end = 0;
274 s_inc = -4;
275 }
276 shift = s_start;
277
278 for (i = 0; i < png_ptr->width; i++)
279 {
280 if (m & mask)
281 {
282 value = (*sp >> shift) & 0xf;
283 *dp &= (png_byte)((0xf0f >> (4 - shift)) & 0xff);
284 *dp |= (png_byte)(value << shift);
285 }
286
287 if (shift == s_end)
288 {
289 shift = s_start;
290 sp++;
291 dp++;
292 }
293 else
294 shift += s_inc;
295 if (m == 1)
296 m = 0x80;
297 else
298 m >>= 1;
299 }
300 break;
301 }
302
303 case 8:
304 {
305 png_bytep srcptr;
306 png_bytep dstptr;
307 png_uint_32 len;
308 int m;
309 int diff, unmask;
310
311 __int64 mask0=0x0102040810204080;
312
5b02c8a1 313#if !defined(PNG_1_0_X)
c6b71bff
GD
314 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
315 /* && mmx_supported */ )
5b02c8a1
VS
316#else
317 if (mmx_supported)
318#endif
c6b71bff
GD
319 {
320 srcptr = png_ptr->row_buf + 1;
321 dstptr = row;
322 m = 0x80;
323 unmask = ~mask;
7f88f624
VZ
324 len = png_ptr->width &~7; /*reduce to multiple of 8 */
325 diff = png_ptr->width & 7; /*amount lost */
c6b71bff
GD
326
327 _asm
328 {
7f88f624
VZ
329 movd mm7, unmask /*load bit pattern */
330 psubb mm6,mm6 /*zero mm6 */
c6b71bff
GD
331 punpcklbw mm7,mm7
332 punpcklwd mm7,mm7
7f88f624 333 punpckldq mm7,mm7 /*fill register with 8 masks */
c6b71bff
GD
334
335 movq mm0,mask0
336
7f88f624
VZ
337 pand mm0,mm7 /*nonzero if keep byte */
338 pcmpeqb mm0,mm6 /*zeros->1s, v versa */
c6b71bff 339
7f88f624
VZ
340 mov ecx,len /*load length of line (pixels) */
341 mov esi,srcptr /*load source */
342 mov ebx,dstptr /*load dest */
343 cmp ecx,0 /*lcr */
c6b71bff
GD
344 je mainloop8end
345
346mainloop8:
347 movq mm4,[esi]
348 pand mm4,mm0
349 movq mm6,mm0
350 pandn mm6,[ebx]
351 por mm4,mm6
352 movq [ebx],mm4
353
7f88f624 354 add esi,8 /*inc by 8 bytes processed */
c6b71bff 355 add ebx,8
7f88f624 356 sub ecx,8 /*dec by 8 pixels processed */
c6b71bff
GD
357
358 ja mainloop8
359mainloop8end:
360
361 mov ecx,diff
362 cmp ecx,0
363 jz end8
364
365 mov edx,mask
7f88f624 366 sal edx,24 /*make low byte the high byte */
c6b71bff
GD
367
368secondloop8:
7f88f624
VZ
369 sal edx,1 /*move high bit to CF */
370 jnc skip8 /*if CF = 0 */
c6b71bff
GD
371 mov al,[esi]
372 mov [ebx],al
373skip8:
374 inc esi
375 inc ebx
376
377 dec ecx
378 jnz secondloop8
379end8:
380 emms
381 }
382 }
383 else /* mmx not supported - use modified C routine */
384 {
385 register unsigned int incr1, initial_val, final_val;
386 png_size_t pixel_bytes;
387 png_uint_32 i;
388 register int disp = png_pass_inc[png_ptr->pass];
389 int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
390
391 pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
392 srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
393 pixel_bytes;
394 dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
395 initial_val = offset_table[png_ptr->pass]*pixel_bytes;
396 final_val = png_ptr->width*pixel_bytes;
397 incr1 = (disp)*pixel_bytes;
398 for (i = initial_val; i < final_val; i += incr1)
399 {
400 png_memcpy(dstptr, srcptr, pixel_bytes);
401 srcptr += incr1;
402 dstptr += incr1;
403 }
404 } /* end of else */
405
406 break;
7f88f624 407 } /* end 8 bpp */
c6b71bff
GD
408
409 case 16:
410 {
411 png_bytep srcptr;
412 png_bytep dstptr;
413 png_uint_32 len;
414 int unmask, diff;
415 __int64 mask1=0x0101020204040808,
416 mask0=0x1010202040408080;
417
5b02c8a1 418#if !defined(PNG_1_0_X)
c6b71bff
GD
419 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
420 /* && mmx_supported */ )
5b02c8a1
VS
421#else
422 if (mmx_supported)
423#endif
c6b71bff
GD
424 {
425 srcptr = png_ptr->row_buf + 1;
426 dstptr = row;
427
428 unmask = ~mask;
429 len = (png_ptr->width)&~7;
430 diff = (png_ptr->width)&7;
431 _asm
432 {
7f88f624
VZ
433 movd mm7, unmask /*load bit pattern */
434 psubb mm6,mm6 /*zero mm6 */
c6b71bff
GD
435 punpcklbw mm7,mm7
436 punpcklwd mm7,mm7
7f88f624 437 punpckldq mm7,mm7 /*fill register with 8 masks */
c6b71bff
GD
438
439 movq mm0,mask0
440 movq mm1,mask1
441
442 pand mm0,mm7
443 pand mm1,mm7
444
445 pcmpeqb mm0,mm6
446 pcmpeqb mm1,mm6
447
7f88f624
VZ
448 mov ecx,len /*load length of line */
449 mov esi,srcptr /*load source */
450 mov ebx,dstptr /*load dest */
451 cmp ecx,0 /*lcr */
c6b71bff
GD
452 jz mainloop16end
453
454mainloop16:
455 movq mm4,[esi]
456 pand mm4,mm0
457 movq mm6,mm0
458 movq mm7,[ebx]
459 pandn mm6,mm7
460 por mm4,mm6
461 movq [ebx],mm4
462
463 movq mm5,[esi+8]
464 pand mm5,mm1
465 movq mm7,mm1
466 movq mm6,[ebx+8]
467 pandn mm7,mm6
468 por mm5,mm7
469 movq [ebx+8],mm5
470
7f88f624 471 add esi,16 /*inc by 16 bytes processed */
c6b71bff 472 add ebx,16
7f88f624 473 sub ecx,8 /*dec by 8 pixels processed */
c6b71bff
GD
474
475 ja mainloop16
476
477mainloop16end:
478 mov ecx,diff
479 cmp ecx,0
480 jz end16
481
482 mov edx,mask
7f88f624 483 sal edx,24 /*make low byte the high byte */
c6b71bff 484secondloop16:
7f88f624
VZ
485 sal edx,1 /*move high bit to CF */
486 jnc skip16 /*if CF = 0 */
c6b71bff
GD
487 mov ax,[esi]
488 mov [ebx],ax
489skip16:
490 add esi,2
491 add ebx,2
492
493 dec ecx
494 jnz secondloop16
495end16:
496 emms
497 }
498 }
499 else /* mmx not supported - use modified C routine */
500 {
501 register unsigned int incr1, initial_val, final_val;
502 png_size_t pixel_bytes;
503 png_uint_32 i;
504 register int disp = png_pass_inc[png_ptr->pass];
505 int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
506
507 pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
508 srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
509 pixel_bytes;
510 dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
511 initial_val = offset_table[png_ptr->pass]*pixel_bytes;
512 final_val = png_ptr->width*pixel_bytes;
513 incr1 = (disp)*pixel_bytes;
514 for (i = initial_val; i < final_val; i += incr1)
515 {
516 png_memcpy(dstptr, srcptr, pixel_bytes);
517 srcptr += incr1;
518 dstptr += incr1;
519 }
520 } /* end of else */
521
522 break;
7f88f624 523 } /* end 16 bpp */
c6b71bff
GD
524
525 case 24:
526 {
527 png_bytep srcptr;
528 png_bytep dstptr;
529 png_uint_32 len;
530 int unmask, diff;
531
7f88f624 532 __int64 mask2=0x0101010202020404, /*24bpp */
c6b71bff
GD
533 mask1=0x0408080810101020,
534 mask0=0x2020404040808080;
535
536 srcptr = png_ptr->row_buf + 1;
537 dstptr = row;
538
539 unmask = ~mask;
540 len = (png_ptr->width)&~7;
541 diff = (png_ptr->width)&7;
542
5b02c8a1 543#if !defined(PNG_1_0_X)
c6b71bff
GD
544 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
545 /* && mmx_supported */ )
5b02c8a1
VS
546#else
547 if (mmx_supported)
548#endif
c6b71bff
GD
549 {
550 _asm
551 {
7f88f624
VZ
552 movd mm7, unmask /*load bit pattern */
553 psubb mm6,mm6 /*zero mm6 */
c6b71bff
GD
554 punpcklbw mm7,mm7
555 punpcklwd mm7,mm7
7f88f624 556 punpckldq mm7,mm7 /*fill register with 8 masks */
c6b71bff
GD
557
558 movq mm0,mask0
559 movq mm1,mask1
560 movq mm2,mask2
561
562 pand mm0,mm7
563 pand mm1,mm7
564 pand mm2,mm7
565
566 pcmpeqb mm0,mm6
567 pcmpeqb mm1,mm6
568 pcmpeqb mm2,mm6
569
7f88f624
VZ
570 mov ecx,len /*load length of line */
571 mov esi,srcptr /*load source */
572 mov ebx,dstptr /*load dest */
c6b71bff
GD
573 cmp ecx,0
574 jz mainloop24end
575
576mainloop24:
577 movq mm4,[esi]
578 pand mm4,mm0
579 movq mm6,mm0
580 movq mm7,[ebx]
581 pandn mm6,mm7
582 por mm4,mm6
583 movq [ebx],mm4
584
585
586 movq mm5,[esi+8]
587 pand mm5,mm1
588 movq mm7,mm1
589 movq mm6,[ebx+8]
590 pandn mm7,mm6
591 por mm5,mm7
592 movq [ebx+8],mm5
593
594 movq mm6,[esi+16]
595 pand mm6,mm2
596 movq mm4,mm2
597 movq mm7,[ebx+16]
598 pandn mm4,mm7
599 por mm6,mm4
600 movq [ebx+16],mm6
601
7f88f624 602 add esi,24 /*inc by 24 bytes processed */
c6b71bff 603 add ebx,24
7f88f624 604 sub ecx,8 /*dec by 8 pixels processed */
c6b71bff
GD
605
606 ja mainloop24
607
608mainloop24end:
609 mov ecx,diff
610 cmp ecx,0
611 jz end24
612
613 mov edx,mask
7f88f624 614 sal edx,24 /*make low byte the high byte */
c6b71bff 615secondloop24:
7f88f624
VZ
616 sal edx,1 /*move high bit to CF */
617 jnc skip24 /*if CF = 0 */
c6b71bff
GD
618 mov ax,[esi]
619 mov [ebx],ax
620 xor eax,eax
621 mov al,[esi+2]
622 mov [ebx+2],al
623skip24:
624 add esi,3
625 add ebx,3
626
627 dec ecx
628 jnz secondloop24
629
630end24:
631 emms
632 }
633 }
634 else /* mmx not supported - use modified C routine */
635 {
636 register unsigned int incr1, initial_val, final_val;
637 png_size_t pixel_bytes;
638 png_uint_32 i;
639 register int disp = png_pass_inc[png_ptr->pass];
640 int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
641
642 pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
643 srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
644 pixel_bytes;
645 dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
646 initial_val = offset_table[png_ptr->pass]*pixel_bytes;
647 final_val = png_ptr->width*pixel_bytes;
648 incr1 = (disp)*pixel_bytes;
649 for (i = initial_val; i < final_val; i += incr1)
650 {
651 png_memcpy(dstptr, srcptr, pixel_bytes);
652 srcptr += incr1;
653 dstptr += incr1;
654 }
655 } /* end of else */
656
657 break;
7f88f624 658 } /* end 24 bpp */
c6b71bff
GD
659
660 case 32:
661 {
662 png_bytep srcptr;
663 png_bytep dstptr;
664 png_uint_32 len;
665 int unmask, diff;
666
7f88f624 667 __int64 mask3=0x0101010102020202, /*32bpp */
c6b71bff
GD
668 mask2=0x0404040408080808,
669 mask1=0x1010101020202020,
670 mask0=0x4040404080808080;
671
672 srcptr = png_ptr->row_buf + 1;
673 dstptr = row;
674
675 unmask = ~mask;
676 len = (png_ptr->width)&~7;
677 diff = (png_ptr->width)&7;
678
5b02c8a1 679#if !defined(PNG_1_0_X)
c6b71bff
GD
680 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
681 /* && mmx_supported */ )
5b02c8a1
VS
682#else
683 if (mmx_supported)
684#endif
c6b71bff
GD
685 {
686 _asm
687 {
7f88f624
VZ
688 movd mm7, unmask /*load bit pattern */
689 psubb mm6,mm6 /*zero mm6 */
c6b71bff
GD
690 punpcklbw mm7,mm7
691 punpcklwd mm7,mm7
7f88f624 692 punpckldq mm7,mm7 /*fill register with 8 masks */
c6b71bff
GD
693
694 movq mm0,mask0
695 movq mm1,mask1
696 movq mm2,mask2
697 movq mm3,mask3
698
699 pand mm0,mm7
700 pand mm1,mm7
701 pand mm2,mm7
702 pand mm3,mm7
703
704 pcmpeqb mm0,mm6
705 pcmpeqb mm1,mm6
706 pcmpeqb mm2,mm6
707 pcmpeqb mm3,mm6
708
7f88f624
VZ
709 mov ecx,len /*load length of line */
710 mov esi,srcptr /*load source */
711 mov ebx,dstptr /*load dest */
c6b71bff 712
7f88f624 713 cmp ecx,0 /*lcr */
c6b71bff
GD
714 jz mainloop32end
715
716mainloop32:
717 movq mm4,[esi]
718 pand mm4,mm0
719 movq mm6,mm0
720 movq mm7,[ebx]
721 pandn mm6,mm7
722 por mm4,mm6
723 movq [ebx],mm4
724
725 movq mm5,[esi+8]
726 pand mm5,mm1
727 movq mm7,mm1
728 movq mm6,[ebx+8]
729 pandn mm7,mm6
730 por mm5,mm7
731 movq [ebx+8],mm5
732
733 movq mm6,[esi+16]
734 pand mm6,mm2
735 movq mm4,mm2
736 movq mm7,[ebx+16]
737 pandn mm4,mm7
738 por mm6,mm4
739 movq [ebx+16],mm6
740
741 movq mm7,[esi+24]
742 pand mm7,mm3
743 movq mm5,mm3
744 movq mm4,[ebx+24]
745 pandn mm5,mm4
746 por mm7,mm5
747 movq [ebx+24],mm7
748
7f88f624 749 add esi,32 /*inc by 32 bytes processed */
c6b71bff 750 add ebx,32
7f88f624 751 sub ecx,8 /*dec by 8 pixels processed */
c6b71bff
GD
752
753 ja mainloop32
754
755mainloop32end:
756 mov ecx,diff
757 cmp ecx,0
758 jz end32
759
760 mov edx,mask
7f88f624 761 sal edx,24 /*make low byte the high byte */
c6b71bff 762secondloop32:
7f88f624
VZ
763 sal edx,1 /*move high bit to CF */
764 jnc skip32 /*if CF = 0 */
c6b71bff
GD
765 mov eax,[esi]
766 mov [ebx],eax
767skip32:
768 add esi,4
769 add ebx,4
770
771 dec ecx
772 jnz secondloop32
773
774end32:
775 emms
776 }
777 }
778 else /* mmx _not supported - Use modified C routine */
779 {
780 register unsigned int incr1, initial_val, final_val;
781 png_size_t pixel_bytes;
782 png_uint_32 i;
783 register int disp = png_pass_inc[png_ptr->pass];
784 int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
785
786 pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
787 srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
788 pixel_bytes;
789 dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
790 initial_val = offset_table[png_ptr->pass]*pixel_bytes;
791 final_val = png_ptr->width*pixel_bytes;
792 incr1 = (disp)*pixel_bytes;
793 for (i = initial_val; i < final_val; i += incr1)
794 {
795 png_memcpy(dstptr, srcptr, pixel_bytes);
796 srcptr += incr1;
797 dstptr += incr1;
798 }
799 } /* end of else */
800
801 break;
7f88f624 802 } /* end 32 bpp */
c6b71bff
GD
803
804 case 48:
805 {
806 png_bytep srcptr;
807 png_bytep dstptr;
808 png_uint_32 len;
809 int unmask, diff;
810
811 __int64 mask5=0x0101010101010202,
812 mask4=0x0202020204040404,
813 mask3=0x0404080808080808,
814 mask2=0x1010101010102020,
815 mask1=0x2020202040404040,
816 mask0=0x4040808080808080;
817
5b02c8a1 818#if !defined(PNG_1_0_X)
c6b71bff
GD
819 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
820 /* && mmx_supported */ )
5b02c8a1
VS
821#else
822 if (mmx_supported)
823#endif
c6b71bff
GD
824 {
825 srcptr = png_ptr->row_buf + 1;
826 dstptr = row;
827
828 unmask = ~mask;
829 len = (png_ptr->width)&~7;
830 diff = (png_ptr->width)&7;
831 _asm
832 {
7f88f624
VZ
833 movd mm7, unmask /*load bit pattern */
834 psubb mm6,mm6 /*zero mm6 */
c6b71bff
GD
835 punpcklbw mm7,mm7
836 punpcklwd mm7,mm7
7f88f624 837 punpckldq mm7,mm7 /*fill register with 8 masks */
c6b71bff
GD
838
839 movq mm0,mask0
840 movq mm1,mask1
841 movq mm2,mask2
842 movq mm3,mask3
843 movq mm4,mask4
844 movq mm5,mask5
845
846 pand mm0,mm7
847 pand mm1,mm7
848 pand mm2,mm7
849 pand mm3,mm7
850 pand mm4,mm7
851 pand mm5,mm7
852
853 pcmpeqb mm0,mm6
854 pcmpeqb mm1,mm6
855 pcmpeqb mm2,mm6
856 pcmpeqb mm3,mm6
857 pcmpeqb mm4,mm6
858 pcmpeqb mm5,mm6
859
7f88f624
VZ
860 mov ecx,len /*load length of line */
861 mov esi,srcptr /*load source */
862 mov ebx,dstptr /*load dest */
c6b71bff
GD
863
864 cmp ecx,0
865 jz mainloop48end
866
867mainloop48:
868 movq mm7,[esi]
869 pand mm7,mm0
870 movq mm6,mm0
871 pandn mm6,[ebx]
872 por mm7,mm6
873 movq [ebx],mm7
874
875 movq mm6,[esi+8]
876 pand mm6,mm1
877 movq mm7,mm1
878 pandn mm7,[ebx+8]
879 por mm6,mm7
880 movq [ebx+8],mm6
881
882 movq mm6,[esi+16]
883 pand mm6,mm2
884 movq mm7,mm2
885 pandn mm7,[ebx+16]
886 por mm6,mm7
887 movq [ebx+16],mm6
888
889 movq mm7,[esi+24]
890 pand mm7,mm3
891 movq mm6,mm3
892 pandn mm6,[ebx+24]
893 por mm7,mm6
894 movq [ebx+24],mm7
895
896 movq mm6,[esi+32]
897 pand mm6,mm4
898 movq mm7,mm4
899 pandn mm7,[ebx+32]
900 por mm6,mm7
901 movq [ebx+32],mm6
902
903 movq mm7,[esi+40]
904 pand mm7,mm5
905 movq mm6,mm5
906 pandn mm6,[ebx+40]
907 por mm7,mm6
908 movq [ebx+40],mm7
909
7f88f624 910 add esi,48 /*inc by 32 bytes processed */
c6b71bff 911 add ebx,48
7f88f624 912 sub ecx,8 /*dec by 8 pixels processed */
c6b71bff
GD
913
914 ja mainloop48
915mainloop48end:
916
917 mov ecx,diff
918 cmp ecx,0
919 jz end48
920
921 mov edx,mask
7f88f624 922 sal edx,24 /*make low byte the high byte */
c6b71bff
GD
923
924secondloop48:
7f88f624
VZ
925 sal edx,1 /*move high bit to CF */
926 jnc skip48 /*if CF = 0 */
c6b71bff
GD
927 mov eax,[esi]
928 mov [ebx],eax
929skip48:
930 add esi,4
931 add ebx,4
932
933 dec ecx
934 jnz secondloop48
935
936end48:
937 emms
938 }
939 }
940 else /* mmx _not supported - Use modified C routine */
941 {
942 register unsigned int incr1, initial_val, final_val;
943 png_size_t pixel_bytes;
944 png_uint_32 i;
945 register int disp = png_pass_inc[png_ptr->pass];
946 int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
947
948 pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
949 srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
950 pixel_bytes;
951 dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
952 initial_val = offset_table[png_ptr->pass]*pixel_bytes;
953 final_val = png_ptr->width*pixel_bytes;
954 incr1 = (disp)*pixel_bytes;
955 for (i = initial_val; i < final_val; i += incr1)
956 {
957 png_memcpy(dstptr, srcptr, pixel_bytes);
958 srcptr += incr1;
959 dstptr += incr1;
960 }
961 } /* end of else */
962
963 break;
7f88f624 964 } /* end 48 bpp */
c6b71bff
GD
965
966 default:
967 {
968 png_bytep sptr;
969 png_bytep dp;
970 png_size_t pixel_bytes;
971 int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
972 unsigned int i;
7f88f624 973 register int disp = png_pass_inc[png_ptr->pass]; /* get the offset */
c6b71bff
GD
974 register unsigned int incr1, initial_val, final_val;
975
976 pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
977 sptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
978 pixel_bytes;
979 dp = row + offset_table[png_ptr->pass]*pixel_bytes;
980 initial_val = offset_table[png_ptr->pass]*pixel_bytes;
981 final_val = png_ptr->width*pixel_bytes;
982 incr1 = (disp)*pixel_bytes;
983 for (i = initial_val; i < final_val; i += incr1)
984 {
985 png_memcpy(dp, sptr, pixel_bytes);
986 sptr += incr1;
987 dp += incr1;
988 }
989 break;
990 }
991 } /* end switch (png_ptr->row_info.pixel_depth) */
992 } /* end if (non-trivial mask) */
993
994} /* end png_combine_row() */
995
996
997#if defined(PNG_READ_INTERLACING_SUPPORTED)
998
999void /* PRIVATE */
1000png_do_read_interlace(png_structp png_ptr)
1001{
1002 png_row_infop row_info = &(png_ptr->row_info);
1003 png_bytep row = png_ptr->row_buf + 1;
1004 int pass = png_ptr->pass;
1005 png_uint_32 transformations = png_ptr->transformations;
1006#ifdef PNG_USE_LOCAL_ARRAYS
1007 const int png_pass_inc[7] = {8, 8, 4, 4, 2, 2, 1};
1008#endif
1009
1010 png_debug(1,"in png_do_read_interlace\n");
1011
1012 if (mmx_supported == 2) {
5b02c8a1 1013#if !defined(PNG_1_0_X)
c6b71bff
GD
1014 /* this should have happened in png_init_mmx_flags() already */
1015 png_warning(png_ptr, "asm_flags may not have been initialized");
5b02c8a1 1016#endif
c6b71bff
GD
1017 png_mmx_support();
1018 }
1019
1020 if (row != NULL && row_info != NULL)
1021 {
1022 png_uint_32 final_width;
1023
1024 final_width = row_info->width * png_pass_inc[pass];
1025
1026 switch (row_info->pixel_depth)
1027 {
1028 case 1:
1029 {
1030 png_bytep sp, dp;
1031 int sshift, dshift;
1032 int s_start, s_end, s_inc;
1033 png_byte v;
1034 png_uint_32 i;
1035 int j;
1036
1037 sp = row + (png_size_t)((row_info->width - 1) >> 3);
1038 dp = row + (png_size_t)((final_width - 1) >> 3);
1039#if defined(PNG_READ_PACKSWAP_SUPPORTED)
1040 if (transformations & PNG_PACKSWAP)
1041 {
1042 sshift = (int)((row_info->width + 7) & 7);
1043 dshift = (int)((final_width + 7) & 7);
1044 s_start = 7;
1045 s_end = 0;
1046 s_inc = -1;
1047 }
1048 else
1049#endif
1050 {
1051 sshift = 7 - (int)((row_info->width + 7) & 7);
1052 dshift = 7 - (int)((final_width + 7) & 7);
1053 s_start = 0;
1054 s_end = 7;
1055 s_inc = 1;
1056 }
1057
1058 for (i = row_info->width; i; i--)
1059 {
1060 v = (png_byte)((*sp >> sshift) & 0x1);
1061 for (j = 0; j < png_pass_inc[pass]; j++)
1062 {
1063 *dp &= (png_byte)((0x7f7f >> (7 - dshift)) & 0xff);
1064 *dp |= (png_byte)(v << dshift);
1065 if (dshift == s_end)
1066 {
1067 dshift = s_start;
1068 dp--;
1069 }
1070 else
1071 dshift += s_inc;
1072 }
1073 if (sshift == s_end)
1074 {
1075 sshift = s_start;
1076 sp--;
1077 }
1078 else
1079 sshift += s_inc;
1080 }
1081 break;
1082 }
1083
1084 case 2:
1085 {
1086 png_bytep sp, dp;
1087 int sshift, dshift;
1088 int s_start, s_end, s_inc;
1089 png_uint_32 i;
1090
1091 sp = row + (png_size_t)((row_info->width - 1) >> 2);
1092 dp = row + (png_size_t)((final_width - 1) >> 2);
1093#if defined(PNG_READ_PACKSWAP_SUPPORTED)
1094 if (transformations & PNG_PACKSWAP)
1095 {
1096 sshift = (png_size_t)(((row_info->width + 3) & 3) << 1);
1097 dshift = (png_size_t)(((final_width + 3) & 3) << 1);
1098 s_start = 6;
1099 s_end = 0;
1100 s_inc = -2;
1101 }
1102 else
1103#endif
1104 {
1105 sshift = (png_size_t)((3 - ((row_info->width + 3) & 3)) << 1);
1106 dshift = (png_size_t)((3 - ((final_width + 3) & 3)) << 1);
1107 s_start = 0;
1108 s_end = 6;
1109 s_inc = 2;
1110 }
1111
1112 for (i = row_info->width; i; i--)
1113 {
1114 png_byte v;
1115 int j;
1116
1117 v = (png_byte)((*sp >> sshift) & 0x3);
1118 for (j = 0; j < png_pass_inc[pass]; j++)
1119 {
1120 *dp &= (png_byte)((0x3f3f >> (6 - dshift)) & 0xff);
1121 *dp |= (png_byte)(v << dshift);
1122 if (dshift == s_end)
1123 {
1124 dshift = s_start;
1125 dp--;
1126 }
1127 else
1128 dshift += s_inc;
1129 }
1130 if (sshift == s_end)
1131 {
1132 sshift = s_start;
1133 sp--;
1134 }
1135 else
1136 sshift += s_inc;
1137 }
1138 break;
1139 }
1140
1141 case 4:
1142 {
1143 png_bytep sp, dp;
1144 int sshift, dshift;
1145 int s_start, s_end, s_inc;
1146 png_uint_32 i;
1147
1148 sp = row + (png_size_t)((row_info->width - 1) >> 1);
1149 dp = row + (png_size_t)((final_width - 1) >> 1);
1150#if defined(PNG_READ_PACKSWAP_SUPPORTED)
1151 if (transformations & PNG_PACKSWAP)
1152 {
1153 sshift = (png_size_t)(((row_info->width + 1) & 1) << 2);
1154 dshift = (png_size_t)(((final_width + 1) & 1) << 2);
1155 s_start = 4;
1156 s_end = 0;
1157 s_inc = -4;
1158 }
1159 else
1160#endif
1161 {
1162 sshift = (png_size_t)((1 - ((row_info->width + 1) & 1)) << 2);
1163 dshift = (png_size_t)((1 - ((final_width + 1) & 1)) << 2);
1164 s_start = 0;
1165 s_end = 4;
1166 s_inc = 4;
1167 }
1168
1169 for (i = row_info->width; i; i--)
1170 {
1171 png_byte v;
1172 int j;
1173
1174 v = (png_byte)((*sp >> sshift) & 0xf);
1175 for (j = 0; j < png_pass_inc[pass]; j++)
1176 {
1177 *dp &= (png_byte)((0xf0f >> (4 - dshift)) & 0xff);
1178 *dp |= (png_byte)(v << dshift);
1179 if (dshift == s_end)
1180 {
1181 dshift = s_start;
1182 dp--;
1183 }
1184 else
1185 dshift += s_inc;
1186 }
1187 if (sshift == s_end)
1188 {
1189 sshift = s_start;
1190 sp--;
1191 }
1192 else
1193 sshift += s_inc;
1194 }
1195 break;
1196 }
1197
7f88f624 1198 default: /* This is the place where the routine is modified */
c6b71bff
GD
1199 {
1200 __int64 const4 = 0x0000000000FFFFFF;
7f88f624 1201 /* __int64 const5 = 0x000000FFFFFF0000; // unused... */
c6b71bff
GD
1202 __int64 const6 = 0x00000000000000FF;
1203 png_bytep sptr, dp;
1204 png_uint_32 i;
1205 png_size_t pixel_bytes;
1206 int width = row_info->width;
1207
1208 pixel_bytes = (row_info->pixel_depth >> 3);
1209
1210 sptr = row + (width - 1) * pixel_bytes;
1211 dp = row + (final_width - 1) * pixel_bytes;
7f88f624
VZ
1212 /* New code by Nirav Chhatrapati - Intel Corporation */
1213 /* sign fix by GRR */
1214 /* NOTE: there is NO MMX code for 48-bit and 64-bit images */
c6b71bff 1215
5b02c8a1
VS
1216 // use MMX routine if machine supports it
1217#if !defined(PNG_1_0_X)
c6b71bff
GD
1218 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_INTERLACE)
1219 /* && mmx_supported */ )
5b02c8a1
VS
1220#else
1221 if (mmx_supported)
1222#endif
c6b71bff
GD
1223 {
1224 if (pixel_bytes == 3)
1225 {
1226 if (((pass == 0) || (pass == 1)) && width)
1227 {
1228 _asm
1229 {
1230 mov esi, sptr
1231 mov edi, dp
1232 mov ecx, width
7f88f624 1233 sub edi, 21 /* (png_pass_inc[pass] - 1)*pixel_bytes */
c6b71bff
GD
1234loop_pass0:
1235 movd mm0, [esi] ; X X X X X v2 v1 v0
1236 pand mm0, const4 ; 0 0 0 0 0 v2 v1 v0
1237 movq mm1, mm0 ; 0 0 0 0 0 v2 v1 v0
1238 psllq mm0, 16 ; 0 0 0 v2 v1 v0 0 0
1239 movq mm2, mm0 ; 0 0 0 v2 v1 v0 0 0
1240 psllq mm0, 24 ; v2 v1 v0 0 0 0 0 0
1241 psrlq mm1, 8 ; 0 0 0 0 0 0 v2 v1
1242 por mm0, mm2 ; v2 v1 v0 v2 v1 v0 0 0
1243 por mm0, mm1 ; v2 v1 v0 v2 v1 v0 v2 v1
1244 movq mm3, mm0 ; v2 v1 v0 v2 v1 v0 v2 v1
1245 psllq mm0, 16 ; v0 v2 v1 v0 v2 v1 0 0
1246 movq mm4, mm3 ; v2 v1 v0 v2 v1 v0 v2 v1
1247 punpckhdq mm3, mm0 ; v0 v2 v1 v0 v2 v1 v0 v2
1248 movq [edi+16] , mm4
1249 psrlq mm0, 32 ; 0 0 0 0 v0 v2 v1 v0
1250 movq [edi+8] , mm3
1251 punpckldq mm0, mm4 ; v1 v0 v2 v1 v0 v2 v1 v0
1252 sub esi, 3
1253 movq [edi], mm0
1254 sub edi, 24
7f88f624 1255 /*sub esi, 3 */
c6b71bff
GD
1256 dec ecx
1257 jnz loop_pass0
1258 EMMS
1259 }
1260 }
1261 else if (((pass == 2) || (pass == 3)) && width)
1262 {
1263 _asm
1264 {
1265 mov esi, sptr
1266 mov edi, dp
1267 mov ecx, width
7f88f624 1268 sub edi, 9 /* (png_pass_inc[pass] - 1)*pixel_bytes */
c6b71bff
GD
1269loop_pass2:
1270 movd mm0, [esi] ; X X X X X v2 v1 v0
1271 pand mm0, const4 ; 0 0 0 0 0 v2 v1 v0
1272 movq mm1, mm0 ; 0 0 0 0 0 v2 v1 v0
1273 psllq mm0, 16 ; 0 0 0 v2 v1 v0 0 0
1274 movq mm2, mm0 ; 0 0 0 v2 v1 v0 0 0
1275 psllq mm0, 24 ; v2 v1 v0 0 0 0 0 0
1276 psrlq mm1, 8 ; 0 0 0 0 0 0 v2 v1
1277 por mm0, mm2 ; v2 v1 v0 v2 v1 v0 0 0
1278 por mm0, mm1 ; v2 v1 v0 v2 v1 v0 v2 v1
1279 movq [edi+4], mm0 ; move to memory
1280 psrlq mm0, 16 ; 0 0 v2 v1 v0 v2 v1 v0
1281 movd [edi], mm0 ; move to memory
1282 sub esi, 3
1283 sub edi, 12
1284 dec ecx
1285 jnz loop_pass2
1286 EMMS
1287 }
1288 }
1289 else if (width) /* && ((pass == 4) || (pass == 5)) */
1290 {
1291 int width_mmx = ((width >> 1) << 1) - 8;
1292 if (width_mmx < 0)
1293 width_mmx = 0;
7f88f624 1294 width -= width_mmx; /* 8 or 9 pix, 24 or 27 bytes */
c6b71bff
GD
1295 if (width_mmx)
1296 {
1297 _asm
1298 {
1299 mov esi, sptr
1300 mov edi, dp
1301 mov ecx, width_mmx
1302 sub esi, 3
1303 sub edi, 9
1304loop_pass4:
1305 movq mm0, [esi] ; X X v2 v1 v0 v5 v4 v3
1306 movq mm7, mm0 ; X X v2 v1 v0 v5 v4 v3
1307 movq mm6, mm0 ; X X v2 v1 v0 v5 v4 v3
1308 psllq mm0, 24 ; v1 v0 v5 v4 v3 0 0 0
1309 pand mm7, const4 ; 0 0 0 0 0 v5 v4 v3
1310 psrlq mm6, 24 ; 0 0 0 X X v2 v1 v0
1311 por mm0, mm7 ; v1 v0 v5 v4 v3 v5 v4 v3
1312 movq mm5, mm6 ; 0 0 0 X X v2 v1 v0
1313 psllq mm6, 8 ; 0 0 X X v2 v1 v0 0
1314 movq [edi], mm0 ; move quad to memory
1315 psrlq mm5, 16 ; 0 0 0 0 0 X X v2
1316 pand mm5, const6 ; 0 0 0 0 0 0 0 v2
1317 por mm6, mm5 ; 0 0 X X v2 v1 v0 v2
1318 movd [edi+8], mm6 ; move double to memory
1319 sub esi, 6
1320 sub edi, 12
1321 sub ecx, 2
1322 jnz loop_pass4
1323 EMMS
1324 }
1325 }
1326
1327 sptr -= width_mmx*3;
1328 dp -= width_mmx*6;
1329 for (i = width; i; i--)
1330 {
1331 png_byte v[8];
1332 int j;
1333
1334 png_memcpy(v, sptr, 3);
1335 for (j = 0; j < png_pass_inc[pass]; j++)
1336 {
1337 png_memcpy(dp, v, 3);
1338 dp -= 3;
1339 }
1340 sptr -= 3;
1341 }
1342 }
1343 } /* end of pixel_bytes == 3 */
1344
1345 else if (pixel_bytes == 1)
1346 {
1347 if (((pass == 0) || (pass == 1)) && width)
1348 {
1349 int width_mmx = ((width >> 2) << 2);
1350 width -= width_mmx;
1351 if (width_mmx)
1352 {
1353 _asm
1354 {
1355 mov esi, sptr
1356 mov edi, dp
1357 mov ecx, width_mmx
1358 sub edi, 31
1359 sub esi, 3
1360loop1_pass0:
1361 movd mm0, [esi] ; X X X X v0 v1 v2 v3
1362 movq mm1, mm0 ; X X X X v0 v1 v2 v3
1363 punpcklbw mm0, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3
1364 movq mm2, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3
1365 punpcklwd mm0, mm0 ; v2 v2 v2 v2 v3 v3 v3 v3
1366 movq mm3, mm0 ; v2 v2 v2 v2 v3 v3 v3 v3
1367 punpckldq mm0, mm0 ; v3 v3 v3 v3 v3 v3 v3 v3
1368 punpckhdq mm3, mm3 ; v2 v2 v2 v2 v2 v2 v2 v2
1369 movq [edi], mm0 ; move to memory v3
1370 punpckhwd mm2, mm2 ; v0 v0 v0 v0 v1 v1 v1 v1
1371 movq [edi+8], mm3 ; move to memory v2
1372 movq mm4, mm2 ; v0 v0 v0 v0 v1 v1 v1 v1
1373 punpckldq mm2, mm2 ; v1 v1 v1 v1 v1 v1 v1 v1
1374 punpckhdq mm4, mm4 ; v0 v0 v0 v0 v0 v0 v0 v0
1375 movq [edi+16], mm2 ; move to memory v1
1376 movq [edi+24], mm4 ; move to memory v0
1377 sub esi, 4
1378 sub edi, 32
1379 sub ecx, 4
1380 jnz loop1_pass0
1381 EMMS
1382 }
1383 }
1384
1385 sptr -= width_mmx;
1386 dp -= width_mmx*8;
1387 for (i = width; i; i--)
1388 {
1389 int j;
1390
1391 /* I simplified this part in version 1.0.4e
1392 * here and in several other instances where
1393 * pixel_bytes == 1 -- GR-P
1394 *
1395 * Original code:
1396 *
1397 * png_byte v[8];
1398 * png_memcpy(v, sptr, pixel_bytes);
1399 * for (j = 0; j < png_pass_inc[pass]; j++)
1400 * {
1401 * png_memcpy(dp, v, pixel_bytes);
1402 * dp -= pixel_bytes;
1403 * }
1404 * sptr -= pixel_bytes;
1405 *
1406 * Replacement code is in the next three lines:
1407 */
1408
1409 for (j = 0; j < png_pass_inc[pass]; j++)
1410 *dp-- = *sptr;
1411 sptr--;
1412 }
1413 }
1414 else if (((pass == 2) || (pass == 3)) && width)
1415 {
1416 int width_mmx = ((width >> 2) << 2);
1417 width -= width_mmx;
1418 if (width_mmx)
1419 {
1420 _asm
1421 {
1422 mov esi, sptr
1423 mov edi, dp
1424 mov ecx, width_mmx
1425 sub edi, 15
1426 sub esi, 3
1427loop1_pass2:
1428 movd mm0, [esi] ; X X X X v0 v1 v2 v3
1429 punpcklbw mm0, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3
1430 movq mm1, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3
1431 punpcklwd mm0, mm0 ; v2 v2 v2 v2 v3 v3 v3 v3
1432 punpckhwd mm1, mm1 ; v0 v0 v0 v0 v1 v1 v1 v1
1433 movq [edi], mm0 ; move to memory v2 and v3
1434 sub esi, 4
1435 movq [edi+8], mm1 ; move to memory v1 and v0
1436 sub edi, 16
1437 sub ecx, 4
1438 jnz loop1_pass2
1439 EMMS
1440 }
1441 }
1442
1443 sptr -= width_mmx;
1444 dp -= width_mmx*4;
1445 for (i = width; i; i--)
1446 {
1447 int j;
1448
1449 for (j = 0; j < png_pass_inc[pass]; j++)
1450 {
1451 *dp-- = *sptr;
1452 }
1453 sptr --;
1454 }
1455 }
1456 else if (width) /* && ((pass == 4) || (pass == 5))) */
1457 {
1458 int width_mmx = ((width >> 3) << 3);
1459 width -= width_mmx;
1460 if (width_mmx)
1461 {
1462 _asm
1463 {
1464 mov esi, sptr
1465 mov edi, dp
1466 mov ecx, width_mmx
1467 sub edi, 15
1468 sub esi, 7
1469loop1_pass4:
1470 movq mm0, [esi] ; v0 v1 v2 v3 v4 v5 v6 v7
1471 movq mm1, mm0 ; v0 v1 v2 v3 v4 v5 v6 v7
1472 punpcklbw mm0, mm0 ; v4 v4 v5 v5 v6 v6 v7 v7
7f88f624 1473 /*movq mm1, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3 */
c6b71bff
GD
1474 punpckhbw mm1, mm1 ;v0 v0 v1 v1 v2 v2 v3 v3
1475 movq [edi+8], mm1 ; move to memory v0 v1 v2 and v3
1476 sub esi, 8
1477 movq [edi], mm0 ; move to memory v4 v5 v6 and v7
7f88f624 1478 /*sub esi, 4 */
c6b71bff
GD
1479 sub edi, 16
1480 sub ecx, 8
1481 jnz loop1_pass4
1482 EMMS
1483 }
1484 }
1485
1486 sptr -= width_mmx;
1487 dp -= width_mmx*2;
1488 for (i = width; i; i--)
1489 {
1490 int j;
1491
1492 for (j = 0; j < png_pass_inc[pass]; j++)
1493 {
1494 *dp-- = *sptr;
1495 }
1496 sptr --;
1497 }
1498 }
1499 } /* end of pixel_bytes == 1 */
1500
1501 else if (pixel_bytes == 2)
1502 {
1503 if (((pass == 0) || (pass == 1)) && width)
1504 {
1505 int width_mmx = ((width >> 1) << 1);
1506 width -= width_mmx;
1507 if (width_mmx)
1508 {
1509 _asm
1510 {
1511 mov esi, sptr
1512 mov edi, dp
1513 mov ecx, width_mmx
1514 sub esi, 2
1515 sub edi, 30
1516loop2_pass0:
1517 movd mm0, [esi] ; X X X X v1 v0 v3 v2
1518 punpcklwd mm0, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2
1519 movq mm1, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2
1520 punpckldq mm0, mm0 ; v3 v2 v3 v2 v3 v2 v3 v2
1521 punpckhdq mm1, mm1 ; v1 v0 v1 v0 v1 v0 v1 v0
1522 movq [edi], mm0
1523 movq [edi + 8], mm0
1524 movq [edi + 16], mm1
1525 movq [edi + 24], mm1
1526 sub esi, 4
1527 sub edi, 32
1528 sub ecx, 2
1529 jnz loop2_pass0
1530 EMMS
1531 }
1532 }
1533
7f88f624
VZ
1534 sptr -= (width_mmx*2 - 2); /* sign fixed */
1535 dp -= (width_mmx*16 - 2); /* sign fixed */
c6b71bff
GD
1536 for (i = width; i; i--)
1537 {
1538 png_byte v[8];
1539 int j;
1540 sptr -= 2;
1541 png_memcpy(v, sptr, 2);
1542 for (j = 0; j < png_pass_inc[pass]; j++)
1543 {
1544 dp -= 2;
1545 png_memcpy(dp, v, 2);
1546 }
1547 }
1548 }
1549 else if (((pass == 2) || (pass == 3)) && width)
1550 {
1551 int width_mmx = ((width >> 1) << 1) ;
1552 width -= width_mmx;
1553 if (width_mmx)
1554 {
1555 _asm
1556 {
1557 mov esi, sptr
1558 mov edi, dp
1559 mov ecx, width_mmx
1560 sub esi, 2
1561 sub edi, 14
1562loop2_pass2:
1563 movd mm0, [esi] ; X X X X v1 v0 v3 v2
1564 punpcklwd mm0, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2
1565 movq mm1, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2
1566 punpckldq mm0, mm0 ; v3 v2 v3 v2 v3 v2 v3 v2
1567 punpckhdq mm1, mm1 ; v1 v0 v1 v0 v1 v0 v1 v0
1568 movq [edi], mm0
1569 sub esi, 4
1570 movq [edi + 8], mm1
7f88f624 1571 /*sub esi, 4 */
c6b71bff
GD
1572 sub edi, 16
1573 sub ecx, 2
1574 jnz loop2_pass2
1575 EMMS
1576 }
1577 }
1578
7f88f624
VZ
1579 sptr -= (width_mmx*2 - 2); /* sign fixed */
1580 dp -= (width_mmx*8 - 2); /* sign fixed */
c6b71bff
GD
1581 for (i = width; i; i--)
1582 {
1583 png_byte v[8];
1584 int j;
1585 sptr -= 2;
1586 png_memcpy(v, sptr, 2);
1587 for (j = 0; j < png_pass_inc[pass]; j++)
1588 {
1589 dp -= 2;
1590 png_memcpy(dp, v, 2);
1591 }
1592 }
1593 }
7f88f624 1594 else if (width) /* pass == 4 or 5 */
c6b71bff
GD
1595 {
1596 int width_mmx = ((width >> 1) << 1) ;
1597 width -= width_mmx;
1598 if (width_mmx)
1599 {
1600 _asm
1601 {
1602 mov esi, sptr
1603 mov edi, dp
1604 mov ecx, width_mmx
1605 sub esi, 2
1606 sub edi, 6
1607loop2_pass4:
1608 movd mm0, [esi] ; X X X X v1 v0 v3 v2
1609 punpcklwd mm0, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2
1610 sub esi, 4
1611 movq [edi], mm0
1612 sub edi, 8
1613 sub ecx, 2
1614 jnz loop2_pass4
1615 EMMS
1616 }
1617 }
1618
7f88f624
VZ
1619 sptr -= (width_mmx*2 - 2); /* sign fixed */
1620 dp -= (width_mmx*4 - 2); /* sign fixed */
c6b71bff
GD
1621 for (i = width; i; i--)
1622 {
1623 png_byte v[8];
1624 int j;
1625 sptr -= 2;
1626 png_memcpy(v, sptr, 2);
1627 for (j = 0; j < png_pass_inc[pass]; j++)
1628 {
1629 dp -= 2;
1630 png_memcpy(dp, v, 2);
1631 }
1632 }
1633 }
1634 } /* end of pixel_bytes == 2 */
1635
1636 else if (pixel_bytes == 4)
1637 {
1638 if (((pass == 0) || (pass == 1)) && width)
1639 {
1640 int width_mmx = ((width >> 1) << 1) ;
1641 width -= width_mmx;
1642 if (width_mmx)
1643 {
1644 _asm
1645 {
1646 mov esi, sptr
1647 mov edi, dp
1648 mov ecx, width_mmx
1649 sub esi, 4
1650 sub edi, 60
1651loop4_pass0:
1652 movq mm0, [esi] ; v3 v2 v1 v0 v7 v6 v5 v4
1653 movq mm1, mm0 ; v3 v2 v1 v0 v7 v6 v5 v4
1654 punpckldq mm0, mm0 ; v7 v6 v5 v4 v7 v6 v5 v4
1655 punpckhdq mm1, mm1 ; v3 v2 v1 v0 v3 v2 v1 v0
1656 movq [edi], mm0
1657 movq [edi + 8], mm0
1658 movq [edi + 16], mm0
1659 movq [edi + 24], mm0
1660 movq [edi+32], mm1
1661 movq [edi + 40], mm1
1662 movq [edi+ 48], mm1
1663 sub esi, 8
1664 movq [edi + 56], mm1
1665 sub edi, 64
1666 sub ecx, 2
1667 jnz loop4_pass0
1668 EMMS
1669 }
1670 }
1671
7f88f624
VZ
1672 sptr -= (width_mmx*4 - 4); /* sign fixed */
1673 dp -= (width_mmx*32 - 4); /* sign fixed */
c6b71bff
GD
1674 for (i = width; i; i--)
1675 {
1676 png_byte v[8];
1677 int j;
1678 sptr -= 4;
1679 png_memcpy(v, sptr, 4);
1680 for (j = 0; j < png_pass_inc[pass]; j++)
1681 {
1682 dp -= 4;
1683 png_memcpy(dp, v, 4);
1684 }
1685 }
1686 }
1687 else if (((pass == 2) || (pass == 3)) && width)
1688 {
1689 int width_mmx = ((width >> 1) << 1) ;
1690 width -= width_mmx;
1691 if (width_mmx)
1692 {
1693 _asm
1694 {
1695 mov esi, sptr
1696 mov edi, dp
1697 mov ecx, width_mmx
1698 sub esi, 4
1699 sub edi, 28
1700loop4_pass2:
1701 movq mm0, [esi] ; v3 v2 v1 v0 v7 v6 v5 v4
1702 movq mm1, mm0 ; v3 v2 v1 v0 v7 v6 v5 v4
1703 punpckldq mm0, mm0 ; v7 v6 v5 v4 v7 v6 v5 v4
1704 punpckhdq mm1, mm1 ; v3 v2 v1 v0 v3 v2 v1 v0
1705 movq [edi], mm0
1706 movq [edi + 8], mm0
1707 movq [edi+16], mm1
1708 movq [edi + 24], mm1
1709 sub esi, 8
1710 sub edi, 32
1711 sub ecx, 2
1712 jnz loop4_pass2
1713 EMMS
1714 }
1715 }
1716
7f88f624
VZ
1717 sptr -= (width_mmx*4 - 4); /* sign fixed */
1718 dp -= (width_mmx*16 - 4); /* sign fixed */
c6b71bff
GD
1719 for (i = width; i; i--)
1720 {
1721 png_byte v[8];
1722 int j;
1723 sptr -= 4;
1724 png_memcpy(v, sptr, 4);
1725 for (j = 0; j < png_pass_inc[pass]; j++)
1726 {
1727 dp -= 4;
1728 png_memcpy(dp, v, 4);
1729 }
1730 }
1731 }
7f88f624 1732 else if (width) /* pass == 4 or 5 */
c6b71bff
GD
1733 {
1734 int width_mmx = ((width >> 1) << 1) ;
1735 width -= width_mmx;
1736 if (width_mmx)
1737 {
1738 _asm
1739 {
1740 mov esi, sptr
1741 mov edi, dp
1742 mov ecx, width_mmx
1743 sub esi, 4
1744 sub edi, 12
1745loop4_pass4:
1746 movq mm0, [esi] ; v3 v2 v1 v0 v7 v6 v5 v4
1747 movq mm1, mm0 ; v3 v2 v1 v0 v7 v6 v5 v4
1748 punpckldq mm0, mm0 ; v7 v6 v5 v4 v7 v6 v5 v4
1749 punpckhdq mm1, mm1 ; v3 v2 v1 v0 v3 v2 v1 v0
1750 movq [edi], mm0
1751 sub esi, 8
1752 movq [edi + 8], mm1
1753 sub edi, 16
1754 sub ecx, 2
1755 jnz loop4_pass4
1756 EMMS
1757 }
1758 }
1759
7f88f624
VZ
1760 sptr -= (width_mmx*4 - 4); /* sign fixed */
1761 dp -= (width_mmx*8 - 4); /* sign fixed */
c6b71bff
GD
1762 for (i = width; i; i--)
1763 {
1764 png_byte v[8];
1765 int j;
1766 sptr -= 4;
1767 png_memcpy(v, sptr, 4);
1768 for (j = 0; j < png_pass_inc[pass]; j++)
1769 {
1770 dp -= 4;
1771 png_memcpy(dp, v, 4);
1772 }
1773 }
1774 }
1775
1776 } /* end of pixel_bytes == 4 */
1777
1778 else if (pixel_bytes == 6)
1779 {
1780 for (i = width; i; i--)
1781 {
1782 png_byte v[8];
1783 int j;
1784 png_memcpy(v, sptr, 6);
1785 for (j = 0; j < png_pass_inc[pass]; j++)
1786 {
1787 png_memcpy(dp, v, 6);
1788 dp -= 6;
1789 }
1790 sptr -= 6;
1791 }
1792 } /* end of pixel_bytes == 6 */
1793
1794 else
1795 {
1796 for (i = width; i; i--)
1797 {
1798 png_byte v[8];
1799 int j;
1800 png_memcpy(v, sptr, pixel_bytes);
1801 for (j = 0; j < png_pass_inc[pass]; j++)
1802 {
1803 png_memcpy(dp, v, pixel_bytes);
1804 dp -= pixel_bytes;
1805 }
1806 sptr-= pixel_bytes;
1807 }
1808 }
1809 } /* end of mmx_supported */
1810
1811 else /* MMX not supported: use modified C code - takes advantage
1812 * of inlining of memcpy for a constant */
1813 {
1814 if (pixel_bytes == 1)
1815 {
1816 for (i = width; i; i--)
1817 {
1818 int j;
1819 for (j = 0; j < png_pass_inc[pass]; j++)
1820 *dp-- = *sptr;
1821 sptr--;
1822 }
1823 }
1824 else if (pixel_bytes == 3)
1825 {
1826 for (i = width; i; i--)
1827 {
1828 png_byte v[8];
1829 int j;
1830 png_memcpy(v, sptr, pixel_bytes);
1831 for (j = 0; j < png_pass_inc[pass]; j++)
1832 {
1833 png_memcpy(dp, v, pixel_bytes);
1834 dp -= pixel_bytes;
1835 }
1836 sptr -= pixel_bytes;
1837 }
1838 }
1839 else if (pixel_bytes == 2)
1840 {
1841 for (i = width; i; i--)
1842 {
1843 png_byte v[8];
1844 int j;
1845 png_memcpy(v, sptr, pixel_bytes);
1846 for (j = 0; j < png_pass_inc[pass]; j++)
1847 {
1848 png_memcpy(dp, v, pixel_bytes);
1849 dp -= pixel_bytes;
1850 }
1851 sptr -= pixel_bytes;
1852 }
1853 }
1854 else if (pixel_bytes == 4)
1855 {
1856 for (i = width; i; i--)
1857 {
1858 png_byte v[8];
1859 int j;
1860 png_memcpy(v, sptr, pixel_bytes);
1861 for (j = 0; j < png_pass_inc[pass]; j++)
1862 {
1863 png_memcpy(dp, v, pixel_bytes);
1864 dp -= pixel_bytes;
1865 }
1866 sptr -= pixel_bytes;
1867 }
1868 }
1869 else if (pixel_bytes == 6)
1870 {
1871 for (i = width; i; i--)
1872 {
1873 png_byte v[8];
1874 int j;
1875 png_memcpy(v, sptr, pixel_bytes);
1876 for (j = 0; j < png_pass_inc[pass]; j++)
1877 {
1878 png_memcpy(dp, v, pixel_bytes);
1879 dp -= pixel_bytes;
1880 }
1881 sptr -= pixel_bytes;
1882 }
1883 }
1884 else
1885 {
1886 for (i = width; i; i--)
1887 {
1888 png_byte v[8];
1889 int j;
1890 png_memcpy(v, sptr, pixel_bytes);
1891 for (j = 0; j < png_pass_inc[pass]; j++)
1892 {
1893 png_memcpy(dp, v, pixel_bytes);
1894 dp -= pixel_bytes;
1895 }
1896 sptr -= pixel_bytes;
1897 }
1898 }
1899
1900 } /* end of MMX not supported */
1901 break;
1902 }
1903 } /* end switch (row_info->pixel_depth) */
1904
1905 row_info->width = final_width;
5b02c8a1
VS
1906
1907 row_info->rowbytes = PNG_ROWBYTES(row_info->pixel_depth,final_width);
c6b71bff
GD
1908 }
1909
1910}
1911
1912#endif /* PNG_READ_INTERLACING_SUPPORTED */
1913
1914
7f88f624
VZ
1915/* These variables are utilized in the functions below. They are declared */
1916/* globally here to ensure alignment on 8-byte boundaries. */
c6b71bff
GD
1917
1918union uAll {
1919 __int64 use;
1920 double align;
1921} LBCarryMask = {0x0101010101010101},
1922 HBClearMask = {0x7f7f7f7f7f7f7f7f},
1923 ActiveMask, ActiveMask2, ActiveMaskEnd, ShiftBpp, ShiftRem;
1924
1925
7f88f624 1926/* Optimized code for PNG Average filter decoder */
c6b71bff
GD
1927void /* PRIVATE */
1928png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row
1929 , png_bytep prev_row)
1930{
1931 int bpp;
1932 png_uint_32 FullLength;
1933 png_uint_32 MMXLength;
7f88f624 1934 /*png_uint_32 len; */
c6b71bff
GD
1935 int diff;
1936
7f88f624
VZ
1937 bpp = (row_info->pixel_depth + 7) >> 3; /* Get # bytes per pixel */
1938 FullLength = row_info->rowbytes; /* # of bytes to filter */
c6b71bff 1939 _asm {
7f88f624
VZ
1940 /* Init address pointers and offset */
1941 mov edi, row /* edi ==> Avg(x) */
1942 xor ebx, ebx /* ebx ==> x */
c6b71bff 1943 mov edx, edi
7f88f624
VZ
1944 mov esi, prev_row /* esi ==> Prior(x) */
1945 sub edx, bpp /* edx ==> Raw(x-bpp) */
c6b71bff
GD
1946
1947 xor eax, eax
7f88f624
VZ
1948 /* Compute the Raw value for the first bpp bytes */
1949 /* Raw(x) = Avg(x) + (Prior(x)/2) */
c6b71bff 1950davgrlp:
7f88f624 1951 mov al, [esi + ebx] /* Load al with Prior(x) */
c6b71bff 1952 inc ebx
7f88f624
VZ
1953 shr al, 1 /* divide by 2 */
1954 add al, [edi+ebx-1] /* Add Avg(x); -1 to offset inc ebx */
c6b71bff 1955 cmp ebx, bpp
7f88f624
VZ
1956 mov [edi+ebx-1], al /* Write back Raw(x); */
1957 /* mov does not affect flags; -1 to offset inc ebx */
c6b71bff 1958 jb davgrlp
7f88f624
VZ
1959 /* get # of bytes to alignment */
1960 mov diff, edi /* take start of row */
1961 add diff, ebx /* add bpp */
1962 add diff, 0xf /* add 7 + 8 to incr past alignment boundary */
1963 and diff, 0xfffffff8 /* mask to alignment boundary */
1964 sub diff, edi /* subtract from start ==> value ebx at alignment */
c6b71bff 1965 jz davggo
7f88f624
VZ
1966 /* fix alignment */
1967 /* Compute the Raw value for the bytes upto the alignment boundary */
1968 /* Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2) */
c6b71bff
GD
1969 xor ecx, ecx
1970davglp1:
1971 xor eax, eax
7f88f624
VZ
1972 mov cl, [esi + ebx] /* load cl with Prior(x) */
1973 mov al, [edx + ebx] /* load al with Raw(x-bpp) */
c6b71bff
GD
1974 add ax, cx
1975 inc ebx
7f88f624
VZ
1976 shr ax, 1 /* divide by 2 */
1977 add al, [edi+ebx-1] /* Add Avg(x); -1 to offset inc ebx */
1978 cmp ebx, diff /* Check if at alignment boundary */
1979 mov [edi+ebx-1], al /* Write back Raw(x); */
1980 /* mov does not affect flags; -1 to offset inc ebx */
1981 jb davglp1 /* Repeat until at alignment boundary */
c6b71bff
GD
1982davggo:
1983 mov eax, FullLength
1984 mov ecx, eax
7f88f624
VZ
1985 sub eax, ebx /* subtract alignment fix */
1986 and eax, 0x00000007 /* calc bytes over mult of 8 */
1987 sub ecx, eax /* drop over bytes from original length */
c6b71bff 1988 mov MMXLength, ecx
7f88f624
VZ
1989 } /* end _asm block */
1990 /* Now do the math for the rest of the row */
c6b71bff
GD
1991 switch ( bpp )
1992 {
1993 case 3:
1994 {
1995 ActiveMask.use = 0x0000000000ffffff;
7f88f624
VZ
1996 ShiftBpp.use = 24; /* == 3 * 8 */
1997 ShiftRem.use = 40; /* == 64 - 24 */
c6b71bff 1998 _asm {
7f88f624 1999 /* Re-init address pointers and offset */
c6b71bff 2000 movq mm7, ActiveMask
7f88f624 2001 mov ebx, diff /* ebx ==> x = offset to alignment boundary */
c6b71bff 2002 movq mm5, LBCarryMask
7f88f624 2003 mov edi, row /* edi ==> Avg(x) */
c6b71bff 2004 movq mm4, HBClearMask
7f88f624
VZ
2005 mov esi, prev_row /* esi ==> Prior(x) */
2006 /* PRIME the pump (load the first Raw(x-bpp) data set */
2007 movq mm2, [edi + ebx - 8] /* Load previous aligned 8 bytes */
2008 /* (we correct position in loop below) */
c6b71bff 2009davg3lp:
7f88f624
VZ
2010 movq mm0, [edi + ebx] /* Load mm0 with Avg(x) */
2011 /* Add (Prev_row/2) to Average */
c6b71bff 2012 movq mm3, mm5
7f88f624
VZ
2013 psrlq mm2, ShiftRem /* Correct position Raw(x-bpp) data */
2014 movq mm1, [esi + ebx] /* Load mm1 with Prior(x) */
c6b71bff 2015 movq mm6, mm7
7f88f624
VZ
2016 pand mm3, mm1 /* get lsb for each prev_row byte */
2017 psrlq mm1, 1 /* divide prev_row bytes by 2 */
2018 pand mm1, mm4 /* clear invalid bit 7 of each byte */
2019 paddb mm0, mm1 /* add (Prev_row/2) to Avg for each byte */
2020 /* Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry */
2021 movq mm1, mm3 /* now use mm1 for getting LBCarrys */
2022 pand mm1, mm2 /* get LBCarrys for each byte where both */
2023 /* lsb's were == 1 (Only valid for active group) */
2024 psrlq mm2, 1 /* divide raw bytes by 2 */
2025 pand mm2, mm4 /* clear invalid bit 7 of each byte */
2026 paddb mm2, mm1 /* add LBCarrys to (Raw(x-bpp)/2) for each byte */
2027 pand mm2, mm6 /* Leave only Active Group 1 bytes to add to Avg */
2028 paddb mm0, mm2 /* add (Raw/2) + LBCarrys to Avg for each Active */
2029 /* byte */
2030 /* Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry */
2031 psllq mm6, ShiftBpp /* shift the mm6 mask to cover bytes 3-5 */
2032 movq mm2, mm0 /* mov updated Raws to mm2 */
2033 psllq mm2, ShiftBpp /* shift data to position correctly */
2034 movq mm1, mm3 /* now use mm1 for getting LBCarrys */
2035 pand mm1, mm2 /* get LBCarrys for each byte where both */
2036 /* lsb's were == 1 (Only valid for active group) */
2037 psrlq mm2, 1 /* divide raw bytes by 2 */
2038 pand mm2, mm4 /* clear invalid bit 7 of each byte */
2039 paddb mm2, mm1 /* add LBCarrys to (Raw(x-bpp)/2) for each byte */
2040 pand mm2, mm6 /* Leave only Active Group 2 bytes to add to Avg */
2041 paddb mm0, mm2 /* add (Raw/2) + LBCarrys to Avg for each Active */
2042 /* byte */
2043
2044 /* Add 3rd active group (Raw(x-bpp)/2) to Average with LBCarry */
2045 psllq mm6, ShiftBpp /* shift the mm6 mask to cover the last two */
2046 /* bytes */
2047 movq mm2, mm0 /* mov updated Raws to mm2 */
2048 psllq mm2, ShiftBpp /* shift data to position correctly */
2049 /* Data only needs to be shifted once here to */
2050 /* get the correct x-bpp offset. */
2051 movq mm1, mm3 /* now use mm1 for getting LBCarrys */
2052 pand mm1, mm2 /* get LBCarrys for each byte where both */
2053 /* lsb's were == 1 (Only valid for active group) */
2054 psrlq mm2, 1 /* divide raw bytes by 2 */
2055 pand mm2, mm4 /* clear invalid bit 7 of each byte */
2056 paddb mm2, mm1 /* add LBCarrys to (Raw(x-bpp)/2) for each byte */
2057 pand mm2, mm6 /* Leave only Active Group 2 bytes to add to Avg */
c6b71bff 2058 add ebx, 8
7f88f624
VZ
2059 paddb mm0, mm2 /* add (Raw/2) + LBCarrys to Avg for each Active */
2060 /* byte */
c6b71bff 2061
7f88f624 2062 /* Now ready to write back to memory */
c6b71bff 2063 movq [edi + ebx - 8], mm0
7f88f624 2064 /* Move updated Raw(x) to use as Raw(x-bpp) for next loop */
c6b71bff 2065 cmp ebx, MMXLength
7f88f624 2066 movq mm2, mm0 /* mov updated Raw(x) to mm2 */
c6b71bff 2067 jb davg3lp
7f88f624 2068 } /* end _asm block */
c6b71bff
GD
2069 }
2070 break;
2071
2072 case 6:
2073 case 4:
2074 case 7:
2075 case 5:
2076 {
7f88f624
VZ
2077 ActiveMask.use = 0xffffffffffffffff; /* use shift below to clear */
2078 /* appropriate inactive bytes */
c6b71bff
GD
2079 ShiftBpp.use = bpp << 3;
2080 ShiftRem.use = 64 - ShiftBpp.use;
2081 _asm {
2082 movq mm4, HBClearMask
7f88f624
VZ
2083 /* Re-init address pointers and offset */
2084 mov ebx, diff /* ebx ==> x = offset to alignment boundary */
2085 /* Load ActiveMask and clear all bytes except for 1st active group */
c6b71bff 2086 movq mm7, ActiveMask
7f88f624 2087 mov edi, row /* edi ==> Avg(x) */
c6b71bff 2088 psrlq mm7, ShiftRem
7f88f624 2089 mov esi, prev_row /* esi ==> Prior(x) */
c6b71bff
GD
2090 movq mm6, mm7
2091 movq mm5, LBCarryMask
7f88f624
VZ
2092 psllq mm6, ShiftBpp /* Create mask for 2nd active group */
2093 /* PRIME the pump (load the first Raw(x-bpp) data set */
2094 movq mm2, [edi + ebx - 8] /* Load previous aligned 8 bytes */
2095 /* (we correct position in loop below) */
c6b71bff
GD
2096davg4lp:
2097 movq mm0, [edi + ebx]
7f88f624 2098 psrlq mm2, ShiftRem /* shift data to position correctly */
c6b71bff 2099 movq mm1, [esi + ebx]
7f88f624 2100 /* Add (Prev_row/2) to Average */
c6b71bff 2101 movq mm3, mm5
7f88f624
VZ
2102 pand mm3, mm1 /* get lsb for each prev_row byte */
2103 psrlq mm1, 1 /* divide prev_row bytes by 2 */
2104 pand mm1, mm4 /* clear invalid bit 7 of each byte */
2105 paddb mm0, mm1 /* add (Prev_row/2) to Avg for each byte */
2106 /* Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry */
2107 movq mm1, mm3 /* now use mm1 for getting LBCarrys */
2108 pand mm1, mm2 /* get LBCarrys for each byte where both */
2109 /* lsb's were == 1 (Only valid for active group) */
2110 psrlq mm2, 1 /* divide raw bytes by 2 */
2111 pand mm2, mm4 /* clear invalid bit 7 of each byte */
2112 paddb mm2, mm1 /* add LBCarrys to (Raw(x-bpp)/2) for each byte */
2113 pand mm2, mm7 /* Leave only Active Group 1 bytes to add to Avg */
2114 paddb mm0, mm2 /* add (Raw/2) + LBCarrys to Avg for each Active */
2115 /* byte */
2116 /* Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry */
2117 movq mm2, mm0 /* mov updated Raws to mm2 */
2118 psllq mm2, ShiftBpp /* shift data to position correctly */
c6b71bff 2119 add ebx, 8
7f88f624
VZ
2120 movq mm1, mm3 /* now use mm1 for getting LBCarrys */
2121 pand mm1, mm2 /* get LBCarrys for each byte where both */
2122 /* lsb's were == 1 (Only valid for active group) */
2123 psrlq mm2, 1 /* divide raw bytes by 2 */
2124 pand mm2, mm4 /* clear invalid bit 7 of each byte */
2125 paddb mm2, mm1 /* add LBCarrys to (Raw(x-bpp)/2) for each byte */
2126 pand mm2, mm6 /* Leave only Active Group 2 bytes to add to Avg */
2127 paddb mm0, mm2 /* add (Raw/2) + LBCarrys to Avg for each Active */
2128 /* byte */
c6b71bff 2129 cmp ebx, MMXLength
7f88f624 2130 /* Now ready to write back to memory */
c6b71bff 2131 movq [edi + ebx - 8], mm0
7f88f624
VZ
2132 /* Prep Raw(x-bpp) for next loop */
2133 movq mm2, mm0 /* mov updated Raws to mm2 */
c6b71bff 2134 jb davg4lp
7f88f624 2135 } /* end _asm block */
c6b71bff
GD
2136 }
2137 break;
2138 case 2:
2139 {
2140 ActiveMask.use = 0x000000000000ffff;
7f88f624
VZ
2141 ShiftBpp.use = 16; /* == 2 * 8 [BUGFIX] */
2142 ShiftRem.use = 48; /* == 64 - 16 [BUGFIX] */
c6b71bff 2143 _asm {
7f88f624 2144 /* Load ActiveMask */
c6b71bff 2145 movq mm7, ActiveMask
7f88f624
VZ
2146 /* Re-init address pointers and offset */
2147 mov ebx, diff /* ebx ==> x = offset to alignment boundary */
c6b71bff 2148 movq mm5, LBCarryMask
7f88f624 2149 mov edi, row /* edi ==> Avg(x) */
c6b71bff 2150 movq mm4, HBClearMask
7f88f624
VZ
2151 mov esi, prev_row /* esi ==> Prior(x) */
2152 /* PRIME the pump (load the first Raw(x-bpp) data set */
2153 movq mm2, [edi + ebx - 8] /* Load previous aligned 8 bytes */
2154 /* (we correct position in loop below) */
c6b71bff
GD
2155davg2lp:
2156 movq mm0, [edi + ebx]
7f88f624 2157 psrlq mm2, ShiftRem /* shift data to position correctly [BUGFIX] */
c6b71bff 2158 movq mm1, [esi + ebx]
7f88f624 2159 /* Add (Prev_row/2) to Average */
c6b71bff 2160 movq mm3, mm5
7f88f624
VZ
2161 pand mm3, mm1 /* get lsb for each prev_row byte */
2162 psrlq mm1, 1 /* divide prev_row bytes by 2 */
2163 pand mm1, mm4 /* clear invalid bit 7 of each byte */
c6b71bff 2164 movq mm6, mm7
7f88f624
VZ
2165 paddb mm0, mm1 /* add (Prev_row/2) to Avg for each byte */
2166 /* Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry */
2167 movq mm1, mm3 /* now use mm1 for getting LBCarrys */
2168 pand mm1, mm2 /* get LBCarrys for each byte where both */
2169 /* lsb's were == 1 (Only valid for active group) */
2170 psrlq mm2, 1 /* divide raw bytes by 2 */
2171 pand mm2, mm4 /* clear invalid bit 7 of each byte */
2172 paddb mm2, mm1 /* add LBCarrys to (Raw(x-bpp)/2) for each byte */
2173 pand mm2, mm6 /* Leave only Active Group 1 bytes to add to Avg */
2174 paddb mm0, mm2 /* add (Raw/2) + LBCarrys to Avg for each Active byte */
2175 /* Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry */
2176 psllq mm6, ShiftBpp /* shift the mm6 mask to cover bytes 2 & 3 */
2177 movq mm2, mm0 /* mov updated Raws to mm2 */
2178 psllq mm2, ShiftBpp /* shift data to position correctly */
2179 movq mm1, mm3 /* now use mm1 for getting LBCarrys */
2180 pand mm1, mm2 /* get LBCarrys for each byte where both */
2181 /* lsb's were == 1 (Only valid for active group) */
2182 psrlq mm2, 1 /* divide raw bytes by 2 */
2183 pand mm2, mm4 /* clear invalid bit 7 of each byte */
2184 paddb mm2, mm1 /* add LBCarrys to (Raw(x-bpp)/2) for each byte */
2185 pand mm2, mm6 /* Leave only Active Group 2 bytes to add to Avg */
2186 paddb mm0, mm2 /* add (Raw/2) + LBCarrys to Avg for each Active byte */
2187
2188 /* Add rdd active group (Raw(x-bpp)/2) to Average with LBCarry */
2189 psllq mm6, ShiftBpp /* shift the mm6 mask to cover bytes 4 & 5 */
2190 movq mm2, mm0 /* mov updated Raws to mm2 */
2191 psllq mm2, ShiftBpp /* shift data to position correctly */
2192 /* Data only needs to be shifted once here to */
2193 /* get the correct x-bpp offset. */
2194 movq mm1, mm3 /* now use mm1 for getting LBCarrys */
2195 pand mm1, mm2 /* get LBCarrys for each byte where both */
2196 /* lsb's were == 1 (Only valid for active group) */
2197 psrlq mm2, 1 /* divide raw bytes by 2 */
2198 pand mm2, mm4 /* clear invalid bit 7 of each byte */
2199 paddb mm2, mm1 /* add LBCarrys to (Raw(x-bpp)/2) for each byte */
2200 pand mm2, mm6 /* Leave only Active Group 2 bytes to add to Avg */
2201 paddb mm0, mm2 /* add (Raw/2) + LBCarrys to Avg for each Active byte */
2202
2203 /* Add 4th active group (Raw(x-bpp)/2) to Average with LBCarry */
2204 psllq mm6, ShiftBpp /* shift the mm6 mask to cover bytes 6 & 7 */
2205 movq mm2, mm0 /* mov updated Raws to mm2 */
2206 psllq mm2, ShiftBpp /* shift data to position correctly */
2207 /* Data only needs to be shifted once here to */
2208 /* get the correct x-bpp offset. */
c6b71bff 2209 add ebx, 8
7f88f624
VZ
2210 movq mm1, mm3 /* now use mm1 for getting LBCarrys */
2211 pand mm1, mm2 /* get LBCarrys for each byte where both */
2212 /* lsb's were == 1 (Only valid for active group) */
2213 psrlq mm2, 1 /* divide raw bytes by 2 */
2214 pand mm2, mm4 /* clear invalid bit 7 of each byte */
2215 paddb mm2, mm1 /* add LBCarrys to (Raw(x-bpp)/2) for each byte */
2216 pand mm2, mm6 /* Leave only Active Group 2 bytes to add to Avg */
2217 paddb mm0, mm2 /* add (Raw/2) + LBCarrys to Avg for each Active byte */
c6b71bff
GD
2218
2219 cmp ebx, MMXLength
7f88f624 2220 /* Now ready to write back to memory */
c6b71bff 2221 movq [edi + ebx - 8], mm0
7f88f624
VZ
2222 /* Prep Raw(x-bpp) for next loop */
2223 movq mm2, mm0 /* mov updated Raws to mm2 */
c6b71bff 2224 jb davg2lp
7f88f624 2225 } /* end _asm block */
c6b71bff
GD
2226 }
2227 break;
2228
7f88f624 2229 case 1: /* bpp == 1 */
c6b71bff
GD
2230 {
2231 _asm {
7f88f624
VZ
2232 /* Re-init address pointers and offset */
2233 mov ebx, diff /* ebx ==> x = offset to alignment boundary */
2234 mov edi, row /* edi ==> Avg(x) */
2235 cmp ebx, FullLength /* Test if offset at end of array */
c6b71bff 2236 jnb davg1end
7f88f624
VZ
2237 /* Do Paeth decode for remaining bytes */
2238 mov esi, prev_row /* esi ==> Prior(x) */
c6b71bff 2239 mov edx, edi
7f88f624
VZ
2240 xor ecx, ecx /* zero ecx before using cl & cx in loop below */
2241 sub edx, bpp /* edx ==> Raw(x-bpp) */
c6b71bff 2242davg1lp:
7f88f624 2243 /* Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2) */
c6b71bff 2244 xor eax, eax
7f88f624
VZ
2245 mov cl, [esi + ebx] /* load cl with Prior(x) */
2246 mov al, [edx + ebx] /* load al with Raw(x-bpp) */
c6b71bff
GD
2247 add ax, cx
2248 inc ebx
7f88f624
VZ
2249 shr ax, 1 /* divide by 2 */
2250 add al, [edi+ebx-1] /* Add Avg(x); -1 to offset inc ebx */
2251 cmp ebx, FullLength /* Check if at end of array */
2252 mov [edi+ebx-1], al /* Write back Raw(x); */
2253 /* mov does not affect flags; -1 to offset inc ebx */
c6b71bff
GD
2254 jb davg1lp
2255davg1end:
7f88f624 2256 } /* end _asm block */
c6b71bff
GD
2257 }
2258 return;
2259
7f88f624 2260 case 8: /* bpp == 8 */
c6b71bff
GD
2261 {
2262 _asm {
7f88f624
VZ
2263 /* Re-init address pointers and offset */
2264 mov ebx, diff /* ebx ==> x = offset to alignment boundary */
c6b71bff 2265 movq mm5, LBCarryMask
7f88f624 2266 mov edi, row /* edi ==> Avg(x) */
c6b71bff 2267 movq mm4, HBClearMask
7f88f624
VZ
2268 mov esi, prev_row /* esi ==> Prior(x) */
2269 /* PRIME the pump (load the first Raw(x-bpp) data set */
2270 movq mm2, [edi + ebx - 8] /* Load previous aligned 8 bytes */
2271 /* (NO NEED to correct position in loop below) */
c6b71bff
GD
2272davg8lp:
2273 movq mm0, [edi + ebx]
2274 movq mm3, mm5
2275 movq mm1, [esi + ebx]
2276 add ebx, 8
7f88f624
VZ
2277 pand mm3, mm1 /* get lsb for each prev_row byte */
2278 psrlq mm1, 1 /* divide prev_row bytes by 2 */
2279 pand mm3, mm2 /* get LBCarrys for each byte where both */
2280 /* lsb's were == 1 */
2281 psrlq mm2, 1 /* divide raw bytes by 2 */
2282 pand mm1, mm4 /* clear invalid bit 7 of each byte */
2283 paddb mm0, mm3 /* add LBCarrys to Avg for each byte */
2284 pand mm2, mm4 /* clear invalid bit 7 of each byte */
2285 paddb mm0, mm1 /* add (Prev_row/2) to Avg for each byte */
2286 paddb mm0, mm2 /* add (Raw/2) to Avg for each byte */
c6b71bff
GD
2287 cmp ebx, MMXLength
2288 movq [edi + ebx - 8], mm0
7f88f624 2289 movq mm2, mm0 /* reuse as Raw(x-bpp) */
c6b71bff 2290 jb davg8lp
7f88f624 2291 } /* end _asm block */
c6b71bff
GD
2292 }
2293 break;
7f88f624 2294 default: /* bpp greater than 8 */
c6b71bff
GD
2295 {
2296 _asm {
2297 movq mm5, LBCarryMask
7f88f624
VZ
2298 /* Re-init address pointers and offset */
2299 mov ebx, diff /* ebx ==> x = offset to alignment boundary */
2300 mov edi, row /* edi ==> Avg(x) */
c6b71bff
GD
2301 movq mm4, HBClearMask
2302 mov edx, edi
7f88f624
VZ
2303 mov esi, prev_row /* esi ==> Prior(x) */
2304 sub edx, bpp /* edx ==> Raw(x-bpp) */
c6b71bff
GD
2305davgAlp:
2306 movq mm0, [edi + ebx]
2307 movq mm3, mm5
2308 movq mm1, [esi + ebx]
7f88f624 2309 pand mm3, mm1 /* get lsb for each prev_row byte */
c6b71bff 2310 movq mm2, [edx + ebx]
7f88f624
VZ
2311 psrlq mm1, 1 /* divide prev_row bytes by 2 */
2312 pand mm3, mm2 /* get LBCarrys for each byte where both */
2313 /* lsb's were == 1 */
2314 psrlq mm2, 1 /* divide raw bytes by 2 */
2315 pand mm1, mm4 /* clear invalid bit 7 of each byte */
2316 paddb mm0, mm3 /* add LBCarrys to Avg for each byte */
2317 pand mm2, mm4 /* clear invalid bit 7 of each byte */
2318 paddb mm0, mm1 /* add (Prev_row/2) to Avg for each byte */
c6b71bff 2319 add ebx, 8
7f88f624 2320 paddb mm0, mm2 /* add (Raw/2) to Avg for each byte */
c6b71bff
GD
2321 cmp ebx, MMXLength
2322 movq [edi + ebx - 8], mm0
2323 jb davgAlp
7f88f624 2324 } /* end _asm block */
c6b71bff
GD
2325 }
2326 break;
7f88f624 2327 } /* end switch ( bpp ) */
c6b71bff
GD
2328
2329 _asm {
7f88f624
VZ
2330 /* MMX acceleration complete now do clean-up */
2331 /* Check if any remaining bytes left to decode */
2332 mov ebx, MMXLength /* ebx ==> x = offset bytes remaining after MMX */
2333 mov edi, row /* edi ==> Avg(x) */
2334 cmp ebx, FullLength /* Test if offset at end of array */
c6b71bff 2335 jnb davgend
7f88f624
VZ
2336 /* Do Paeth decode for remaining bytes */
2337 mov esi, prev_row /* esi ==> Prior(x) */
c6b71bff 2338 mov edx, edi
7f88f624
VZ
2339 xor ecx, ecx /* zero ecx before using cl & cx in loop below */
2340 sub edx, bpp /* edx ==> Raw(x-bpp) */
c6b71bff 2341davglp2:
7f88f624 2342 /* Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2) */
c6b71bff 2343 xor eax, eax
7f88f624
VZ
2344 mov cl, [esi + ebx] /* load cl with Prior(x) */
2345 mov al, [edx + ebx] /* load al with Raw(x-bpp) */
c6b71bff
GD
2346 add ax, cx
2347 inc ebx
7f88f624
VZ
2348 shr ax, 1 /* divide by 2 */
2349 add al, [edi+ebx-1] /* Add Avg(x); -1 to offset inc ebx */
2350 cmp ebx, FullLength /* Check if at end of array */
2351 mov [edi+ebx-1], al /* Write back Raw(x); */
2352 /* mov does not affect flags; -1 to offset inc ebx */
c6b71bff
GD
2353 jb davglp2
2354davgend:
7f88f624
VZ
2355 emms /* End MMX instructions; prep for possible FP instrs. */
2356 } /* end _asm block */
c6b71bff
GD
2357}
2358
7f88f624 2359/* Optimized code for PNG Paeth filter decoder */
c6b71bff
GD
2360void /* PRIVATE */
2361png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
2362 png_bytep prev_row)
2363{
2364 png_uint_32 FullLength;
2365 png_uint_32 MMXLength;
7f88f624 2366 /*png_uint_32 len; */
c6b71bff
GD
2367 int bpp;
2368 int diff;
7f88f624 2369 /*int ptemp; */
c6b71bff
GD
2370 int patemp, pbtemp, pctemp;
2371
7f88f624
VZ
2372 bpp = (row_info->pixel_depth + 7) >> 3; /* Get # bytes per pixel */
2373 FullLength = row_info->rowbytes; /* # of bytes to filter */
c6b71bff
GD
2374 _asm
2375 {
7f88f624 2376 xor ebx, ebx /* ebx ==> x offset */
c6b71bff 2377 mov edi, row
7f88f624 2378 xor edx, edx /* edx ==> x-bpp offset */
c6b71bff
GD
2379 mov esi, prev_row
2380 xor eax, eax
2381
7f88f624
VZ
2382 /* Compute the Raw value for the first bpp bytes */
2383 /* Note: the formula works out to be always */
2384 /* Paeth(x) = Raw(x) + Prior(x) where x < bpp */
c6b71bff
GD
2385dpthrlp:
2386 mov al, [edi + ebx]
2387 add al, [esi + ebx]
2388 inc ebx
2389 cmp ebx, bpp
2390 mov [edi + ebx - 1], al
2391 jb dpthrlp
7f88f624
VZ
2392 /* get # of bytes to alignment */
2393 mov diff, edi /* take start of row */
2394 add diff, ebx /* add bpp */
c6b71bff 2395 xor ecx, ecx
7f88f624
VZ
2396 add diff, 0xf /* add 7 + 8 to incr past alignment boundary */
2397 and diff, 0xfffffff8 /* mask to alignment boundary */
2398 sub diff, edi /* subtract from start ==> value ebx at alignment */
c6b71bff 2399 jz dpthgo
7f88f624 2400 /* fix alignment */
c6b71bff
GD
2401dpthlp1:
2402 xor eax, eax
7f88f624
VZ
2403 /* pav = p - a = (a + b - c) - a = b - c */
2404 mov al, [esi + ebx] /* load Prior(x) into al */
2405 mov cl, [esi + edx] /* load Prior(x-bpp) into cl */
2406 sub eax, ecx /* subtract Prior(x-bpp) */
2407 mov patemp, eax /* Save pav for later use */
c6b71bff 2408 xor eax, eax
7f88f624
VZ
2409 /* pbv = p - b = (a + b - c) - b = a - c */
2410 mov al, [edi + edx] /* load Raw(x-bpp) into al */
2411 sub eax, ecx /* subtract Prior(x-bpp) */
c6b71bff 2412 mov ecx, eax
7f88f624
VZ
2413 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
2414 add eax, patemp /* pcv = pav + pbv */
2415 /* pc = abs(pcv) */
c6b71bff
GD
2416 test eax, 0x80000000
2417 jz dpthpca
7f88f624 2418 neg eax /* reverse sign of neg values */
c6b71bff 2419dpthpca:
7f88f624
VZ
2420 mov pctemp, eax /* save pc for later use */
2421 /* pb = abs(pbv) */
c6b71bff
GD
2422 test ecx, 0x80000000
2423 jz dpthpba
7f88f624 2424 neg ecx /* reverse sign of neg values */
c6b71bff 2425dpthpba:
7f88f624
VZ
2426 mov pbtemp, ecx /* save pb for later use */
2427 /* pa = abs(pav) */
c6b71bff
GD
2428 mov eax, patemp
2429 test eax, 0x80000000
2430 jz dpthpaa
7f88f624 2431 neg eax /* reverse sign of neg values */
c6b71bff 2432dpthpaa:
7f88f624
VZ
2433 mov patemp, eax /* save pa for later use */
2434 /* test if pa <= pb */
c6b71bff
GD
2435 cmp eax, ecx
2436 jna dpthabb
7f88f624 2437 /* pa > pb; now test if pb <= pc */
c6b71bff
GD
2438 cmp ecx, pctemp
2439 jna dpthbbc
7f88f624
VZ
2440 /* pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp) */
2441 mov cl, [esi + edx] /* load Prior(x-bpp) into cl */
c6b71bff
GD
2442 jmp dpthpaeth
2443dpthbbc:
7f88f624
VZ
2444 /* pb <= pc; Raw(x) = Paeth(x) + Prior(x) */
2445 mov cl, [esi + ebx] /* load Prior(x) into cl */
c6b71bff
GD
2446 jmp dpthpaeth
2447dpthabb:
7f88f624 2448 /* pa <= pb; now test if pa <= pc */
c6b71bff
GD
2449 cmp eax, pctemp
2450 jna dpthabc
7f88f624
VZ
2451 /* pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp) */
2452 mov cl, [esi + edx] /* load Prior(x-bpp) into cl */
c6b71bff
GD
2453 jmp dpthpaeth
2454dpthabc:
7f88f624
VZ
2455 /* pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp) */
2456 mov cl, [edi + edx] /* load Raw(x-bpp) into cl */
c6b71bff
GD
2457dpthpaeth:
2458 inc ebx
2459 inc edx
7f88f624 2460 /* Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256 */
c6b71bff
GD
2461 add [edi + ebx - 1], cl
2462 cmp ebx, diff
2463 jb dpthlp1
2464dpthgo:
2465 mov ecx, FullLength
2466 mov eax, ecx
7f88f624
VZ
2467 sub eax, ebx /* subtract alignment fix */
2468 and eax, 0x00000007 /* calc bytes over mult of 8 */
2469 sub ecx, eax /* drop over bytes from original length */
c6b71bff 2470 mov MMXLength, ecx
7f88f624
VZ
2471 } /* end _asm block */
2472 /* Now do the math for the rest of the row */
c6b71bff
GD
2473 switch ( bpp )
2474 {
2475 case 3:
2476 {
2477 ActiveMask.use = 0x0000000000ffffff;
2478 ActiveMaskEnd.use = 0xffff000000000000;
7f88f624
VZ
2479 ShiftBpp.use = 24; /* == bpp(3) * 8 */
2480 ShiftRem.use = 40; /* == 64 - 24 */
c6b71bff
GD
2481 _asm
2482 {
2483 mov ebx, diff
2484 mov edi, row
2485 mov esi, prev_row
2486 pxor mm0, mm0
7f88f624 2487 /* PRIME the pump (load the first Raw(x-bpp) data set */
c6b71bff
GD
2488 movq mm1, [edi+ebx-8]
2489dpth3lp:
7f88f624
VZ
2490 psrlq mm1, ShiftRem /* shift last 3 bytes to 1st 3 bytes */
2491 movq mm2, [esi + ebx] /* load b=Prior(x) */
2492 punpcklbw mm1, mm0 /* Unpack High bytes of a */
2493 movq mm3, [esi+ebx-8] /* Prep c=Prior(x-bpp) bytes */
2494 punpcklbw mm2, mm0 /* Unpack High bytes of b */
2495 psrlq mm3, ShiftRem /* shift last 3 bytes to 1st 3 bytes */
2496 /* pav = p - a = (a + b - c) - a = b - c */
c6b71bff 2497 movq mm4, mm2
7f88f624
VZ
2498 punpcklbw mm3, mm0 /* Unpack High bytes of c */
2499 /* pbv = p - b = (a + b - c) - b = a - c */
c6b71bff
GD
2500 movq mm5, mm1
2501 psubw mm4, mm3
2502 pxor mm7, mm7
7f88f624 2503 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
c6b71bff
GD
2504 movq mm6, mm4
2505 psubw mm5, mm3
2506
7f88f624
VZ
2507 /* pa = abs(p-a) = abs(pav) */
2508 /* pb = abs(p-b) = abs(pbv) */
2509 /* pc = abs(p-c) = abs(pcv) */
2510 pcmpgtw mm0, mm4 /* Create mask pav bytes < 0 */
c6b71bff 2511 paddw mm6, mm5
7f88f624
VZ
2512 pand mm0, mm4 /* Only pav bytes < 0 in mm7 */
2513 pcmpgtw mm7, mm5 /* Create mask pbv bytes < 0 */
c6b71bff 2514 psubw mm4, mm0
7f88f624 2515 pand mm7, mm5 /* Only pbv bytes < 0 in mm0 */
c6b71bff
GD
2516 psubw mm4, mm0
2517 psubw mm5, mm7
2518 pxor mm0, mm0
7f88f624
VZ
2519 pcmpgtw mm0, mm6 /* Create mask pcv bytes < 0 */
2520 pand mm0, mm6 /* Only pav bytes < 0 in mm7 */
c6b71bff
GD
2521 psubw mm5, mm7
2522 psubw mm6, mm0
7f88f624 2523 /* test pa <= pb */
c6b71bff
GD
2524 movq mm7, mm4
2525 psubw mm6, mm0
7f88f624 2526 pcmpgtw mm7, mm5 /* pa > pb? */
c6b71bff 2527 movq mm0, mm7
7f88f624 2528 /* use mm7 mask to merge pa & pb */
c6b71bff 2529 pand mm5, mm7
7f88f624 2530 /* use mm0 mask copy to merge a & b */
c6b71bff
GD
2531 pand mm2, mm0
2532 pandn mm7, mm4
2533 pandn mm0, mm1
2534 paddw mm7, mm5
2535 paddw mm0, mm2
7f88f624
VZ
2536 /* test ((pa <= pb)? pa:pb) <= pc */
2537 pcmpgtw mm7, mm6 /* pab > pc? */
c6b71bff
GD
2538 pxor mm1, mm1
2539 pand mm3, mm7
2540 pandn mm7, mm0
2541 paddw mm7, mm3
2542 pxor mm0, mm0
2543 packuswb mm7, mm1
7f88f624 2544 movq mm3, [esi + ebx] /* load c=Prior(x-bpp) */
c6b71bff 2545 pand mm7, ActiveMask
7f88f624
VZ
2546 movq mm2, mm3 /* load b=Prior(x) step 1 */
2547 paddb mm7, [edi + ebx] /* add Paeth predictor with Raw(x) */
2548 punpcklbw mm3, mm0 /* Unpack High bytes of c */
2549 movq [edi + ebx], mm7 /* write back updated value */
2550 movq mm1, mm7 /* Now mm1 will be used as Raw(x-bpp) */
2551 /* Now do Paeth for 2nd set of bytes (3-5) */
2552 psrlq mm2, ShiftBpp /* load b=Prior(x) step 2 */
2553 punpcklbw mm1, mm0 /* Unpack High bytes of a */
c6b71bff 2554 pxor mm7, mm7
7f88f624
VZ
2555 punpcklbw mm2, mm0 /* Unpack High bytes of b */
2556 /* pbv = p - b = (a + b - c) - b = a - c */
c6b71bff 2557 movq mm5, mm1
7f88f624 2558 /* pav = p - a = (a + b - c) - a = b - c */
c6b71bff
GD
2559 movq mm4, mm2
2560 psubw mm5, mm3
2561 psubw mm4, mm3
7f88f624
VZ
2562 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = */
2563 /* pav + pbv = pbv + pav */
c6b71bff
GD
2564 movq mm6, mm5
2565 paddw mm6, mm4
2566
7f88f624
VZ
2567 /* pa = abs(p-a) = abs(pav) */
2568 /* pb = abs(p-b) = abs(pbv) */
2569 /* pc = abs(p-c) = abs(pcv) */
2570 pcmpgtw mm0, mm5 /* Create mask pbv bytes < 0 */
2571 pcmpgtw mm7, mm4 /* Create mask pav bytes < 0 */
2572 pand mm0, mm5 /* Only pbv bytes < 0 in mm0 */
2573 pand mm7, mm4 /* Only pav bytes < 0 in mm7 */
c6b71bff
GD
2574 psubw mm5, mm0
2575 psubw mm4, mm7
2576 psubw mm5, mm0
2577 psubw mm4, mm7
2578 pxor mm0, mm0
7f88f624
VZ
2579 pcmpgtw mm0, mm6 /* Create mask pcv bytes < 0 */
2580 pand mm0, mm6 /* Only pav bytes < 0 in mm7 */
c6b71bff 2581 psubw mm6, mm0
7f88f624 2582 /* test pa <= pb */
c6b71bff
GD
2583 movq mm7, mm4
2584 psubw mm6, mm0
7f88f624 2585 pcmpgtw mm7, mm5 /* pa > pb? */
c6b71bff 2586 movq mm0, mm7
7f88f624 2587 /* use mm7 mask to merge pa & pb */
c6b71bff 2588 pand mm5, mm7
7f88f624 2589 /* use mm0 mask copy to merge a & b */
c6b71bff
GD
2590 pand mm2, mm0
2591 pandn mm7, mm4
2592 pandn mm0, mm1
2593 paddw mm7, mm5
2594 paddw mm0, mm2
7f88f624
VZ
2595 /* test ((pa <= pb)? pa:pb) <= pc */
2596 pcmpgtw mm7, mm6 /* pab > pc? */
2597 movq mm2, [esi + ebx] /* load b=Prior(x) */
c6b71bff
GD
2598 pand mm3, mm7
2599 pandn mm7, mm0
2600 pxor mm1, mm1
2601 paddw mm7, mm3
2602 pxor mm0, mm0
2603 packuswb mm7, mm1
7f88f624 2604 movq mm3, mm2 /* load c=Prior(x-bpp) step 1 */
c6b71bff 2605 pand mm7, ActiveMask
7f88f624
VZ
2606 punpckhbw mm2, mm0 /* Unpack High bytes of b */
2607 psllq mm7, ShiftBpp /* Shift bytes to 2nd group of 3 bytes */
2608 /* pav = p - a = (a + b - c) - a = b - c */
c6b71bff 2609 movq mm4, mm2
7f88f624
VZ
2610 paddb mm7, [edi + ebx] /* add Paeth predictor with Raw(x) */
2611 psllq mm3, ShiftBpp /* load c=Prior(x-bpp) step 2 */
2612 movq [edi + ebx], mm7 /* write back updated value */
c6b71bff 2613 movq mm1, mm7
7f88f624
VZ
2614 punpckhbw mm3, mm0 /* Unpack High bytes of c */
2615 psllq mm1, ShiftBpp /* Shift bytes */
2616 /* Now mm1 will be used as Raw(x-bpp) */
2617 /* Now do Paeth for 3rd, and final, set of bytes (6-7) */
c6b71bff 2618 pxor mm7, mm7
7f88f624 2619 punpckhbw mm1, mm0 /* Unpack High bytes of a */
c6b71bff 2620 psubw mm4, mm3
7f88f624 2621 /* pbv = p - b = (a + b - c) - b = a - c */
c6b71bff 2622 movq mm5, mm1
7f88f624 2623 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
c6b71bff
GD
2624 movq mm6, mm4
2625 psubw mm5, mm3
2626 pxor mm0, mm0
2627 paddw mm6, mm5
2628
7f88f624
VZ
2629 /* pa = abs(p-a) = abs(pav) */
2630 /* pb = abs(p-b) = abs(pbv) */
2631 /* pc = abs(p-c) = abs(pcv) */
2632 pcmpgtw mm0, mm4 /* Create mask pav bytes < 0 */
2633 pcmpgtw mm7, mm5 /* Create mask pbv bytes < 0 */
2634 pand mm0, mm4 /* Only pav bytes < 0 in mm7 */
2635 pand mm7, mm5 /* Only pbv bytes < 0 in mm0 */
c6b71bff
GD
2636 psubw mm4, mm0
2637 psubw mm5, mm7
2638 psubw mm4, mm0
2639 psubw mm5, mm7
2640 pxor mm0, mm0
7f88f624
VZ
2641 pcmpgtw mm0, mm6 /* Create mask pcv bytes < 0 */
2642 pand mm0, mm6 /* Only pav bytes < 0 in mm7 */
c6b71bff 2643 psubw mm6, mm0
7f88f624 2644 /* test pa <= pb */
c6b71bff
GD
2645 movq mm7, mm4
2646 psubw mm6, mm0
7f88f624 2647 pcmpgtw mm7, mm5 /* pa > pb? */
c6b71bff 2648 movq mm0, mm7
7f88f624 2649 /* use mm0 mask copy to merge a & b */
c6b71bff 2650 pand mm2, mm0
7f88f624 2651 /* use mm7 mask to merge pa & pb */
c6b71bff
GD
2652 pand mm5, mm7
2653 pandn mm0, mm1
2654 pandn mm7, mm4
2655 paddw mm0, mm2
2656 paddw mm7, mm5
7f88f624
VZ
2657 /* test ((pa <= pb)? pa:pb) <= pc */
2658 pcmpgtw mm7, mm6 /* pab > pc? */
c6b71bff
GD
2659 pand mm3, mm7
2660 pandn mm7, mm0
2661 paddw mm7, mm3
2662 pxor mm1, mm1
2663 packuswb mm1, mm7
7f88f624 2664 /* Step ebx to next set of 8 bytes and repeat loop til done */
c6b71bff
GD
2665 add ebx, 8
2666 pand mm1, ActiveMaskEnd
7f88f624 2667 paddb mm1, [edi + ebx - 8] /* add Paeth predictor with Raw(x) */
c6b71bff
GD
2668
2669 cmp ebx, MMXLength
7f88f624
VZ
2670 pxor mm0, mm0 /* pxor does not affect flags */
2671 movq [edi + ebx - 8], mm1 /* write back updated value */
2672 /* mm1 will be used as Raw(x-bpp) next loop */
2673 /* mm3 ready to be used as Prior(x-bpp) next loop */
c6b71bff 2674 jb dpth3lp
7f88f624 2675 } /* end _asm block */
c6b71bff
GD
2676 }
2677 break;
2678
2679 case 6:
2680 case 7:
2681 case 5:
2682 {
2683 ActiveMask.use = 0x00000000ffffffff;
2684 ActiveMask2.use = 0xffffffff00000000;
7f88f624 2685 ShiftBpp.use = bpp << 3; /* == bpp * 8 */
c6b71bff
GD
2686 ShiftRem.use = 64 - ShiftBpp.use;
2687 _asm
2688 {
2689 mov ebx, diff
2690 mov edi, row
2691 mov esi, prev_row
7f88f624 2692 /* PRIME the pump (load the first Raw(x-bpp) data set */
c6b71bff
GD
2693 movq mm1, [edi+ebx-8]
2694 pxor mm0, mm0
2695dpth6lp:
7f88f624 2696 /* Must shift to position Raw(x-bpp) data */
c6b71bff 2697 psrlq mm1, ShiftRem
7f88f624
VZ
2698 /* Do first set of 4 bytes */
2699 movq mm3, [esi+ebx-8] /* read c=Prior(x-bpp) bytes */
2700 punpcklbw mm1, mm0 /* Unpack Low bytes of a */
2701 movq mm2, [esi + ebx] /* load b=Prior(x) */
2702 punpcklbw mm2, mm0 /* Unpack Low bytes of b */
2703 /* Must shift to position Prior(x-bpp) data */
c6b71bff 2704 psrlq mm3, ShiftRem
7f88f624 2705 /* pav = p - a = (a + b - c) - a = b - c */
c6b71bff 2706 movq mm4, mm2
7f88f624
VZ
2707 punpcklbw mm3, mm0 /* Unpack Low bytes of c */
2708 /* pbv = p - b = (a + b - c) - b = a - c */
c6b71bff
GD
2709 movq mm5, mm1
2710 psubw mm4, mm3
2711 pxor mm7, mm7
7f88f624 2712 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
c6b71bff
GD
2713 movq mm6, mm4
2714 psubw mm5, mm3
7f88f624
VZ
2715 /* pa = abs(p-a) = abs(pav) */
2716 /* pb = abs(p-b) = abs(pbv) */
2717 /* pc = abs(p-c) = abs(pcv) */
2718 pcmpgtw mm0, mm4 /* Create mask pav bytes < 0 */
c6b71bff 2719 paddw mm6, mm5
7f88f624
VZ
2720 pand mm0, mm4 /* Only pav bytes < 0 in mm7 */
2721 pcmpgtw mm7, mm5 /* Create mask pbv bytes < 0 */
c6b71bff 2722 psubw mm4, mm0
7f88f624 2723 pand mm7, mm5 /* Only pbv bytes < 0 in mm0 */
c6b71bff
GD
2724 psubw mm4, mm0
2725 psubw mm5, mm7
2726 pxor mm0, mm0
7f88f624
VZ
2727 pcmpgtw mm0, mm6 /* Create mask pcv bytes < 0 */
2728 pand mm0, mm6 /* Only pav bytes < 0 in mm7 */
c6b71bff
GD
2729 psubw mm5, mm7
2730 psubw mm6, mm0
7f88f624 2731 /* test pa <= pb */
c6b71bff
GD
2732 movq mm7, mm4
2733 psubw mm6, mm0
7f88f624 2734 pcmpgtw mm7, mm5 /* pa > pb? */
c6b71bff 2735 movq mm0, mm7
7f88f624 2736 /* use mm7 mask to merge pa & pb */
c6b71bff 2737 pand mm5, mm7
7f88f624 2738 /* use mm0 mask copy to merge a & b */
c6b71bff
GD
2739 pand mm2, mm0
2740 pandn mm7, mm4
2741 pandn mm0, mm1
2742 paddw mm7, mm5
2743 paddw mm0, mm2
7f88f624
VZ
2744 /* test ((pa <= pb)? pa:pb) <= pc */
2745 pcmpgtw mm7, mm6 /* pab > pc? */
c6b71bff
GD
2746 pxor mm1, mm1
2747 pand mm3, mm7
2748 pandn mm7, mm0
2749 paddw mm7, mm3
2750 pxor mm0, mm0
2751 packuswb mm7, mm1
7f88f624 2752 movq mm3, [esi + ebx - 8] /* load c=Prior(x-bpp) */
c6b71bff
GD
2753 pand mm7, ActiveMask
2754 psrlq mm3, ShiftRem
7f88f624
VZ
2755 movq mm2, [esi + ebx] /* load b=Prior(x) step 1 */
2756 paddb mm7, [edi + ebx] /* add Paeth predictor with Raw(x) */
c6b71bff 2757 movq mm6, mm2
7f88f624 2758 movq [edi + ebx], mm7 /* write back updated value */
c6b71bff
GD
2759 movq mm1, [edi+ebx-8]
2760 psllq mm6, ShiftBpp
2761 movq mm5, mm7
2762 psrlq mm1, ShiftRem
2763 por mm3, mm6
2764 psllq mm5, ShiftBpp
7f88f624 2765 punpckhbw mm3, mm0 /* Unpack High bytes of c */
c6b71bff 2766 por mm1, mm5
7f88f624
VZ
2767 /* Do second set of 4 bytes */
2768 punpckhbw mm2, mm0 /* Unpack High bytes of b */
2769 punpckhbw mm1, mm0 /* Unpack High bytes of a */
2770 /* pav = p - a = (a + b - c) - a = b - c */
c6b71bff 2771 movq mm4, mm2
7f88f624 2772 /* pbv = p - b = (a + b - c) - b = a - c */
c6b71bff
GD
2773 movq mm5, mm1
2774 psubw mm4, mm3
2775 pxor mm7, mm7
7f88f624 2776 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
c6b71bff
GD
2777 movq mm6, mm4
2778 psubw mm5, mm3
7f88f624
VZ
2779 /* pa = abs(p-a) = abs(pav) */
2780 /* pb = abs(p-b) = abs(pbv) */
2781 /* pc = abs(p-c) = abs(pcv) */
2782 pcmpgtw mm0, mm4 /* Create mask pav bytes < 0 */
c6b71bff 2783 paddw mm6, mm5
7f88f624
VZ
2784 pand mm0, mm4 /* Only pav bytes < 0 in mm7 */
2785 pcmpgtw mm7, mm5 /* Create mask pbv bytes < 0 */
c6b71bff 2786 psubw mm4, mm0
7f88f624 2787 pand mm7, mm5 /* Only pbv bytes < 0 in mm0 */
c6b71bff
GD
2788 psubw mm4, mm0
2789 psubw mm5, mm7
2790 pxor mm0, mm0
7f88f624
VZ
2791 pcmpgtw mm0, mm6 /* Create mask pcv bytes < 0 */
2792 pand mm0, mm6 /* Only pav bytes < 0 in mm7 */
c6b71bff
GD
2793 psubw mm5, mm7
2794 psubw mm6, mm0
7f88f624 2795 /* test pa <= pb */
c6b71bff
GD
2796 movq mm7, mm4
2797 psubw mm6, mm0
7f88f624 2798 pcmpgtw mm7, mm5 /* pa > pb? */
c6b71bff 2799 movq mm0, mm7
7f88f624 2800 /* use mm7 mask to merge pa & pb */
c6b71bff 2801 pand mm5, mm7
7f88f624 2802 /* use mm0 mask copy to merge a & b */
c6b71bff
GD
2803 pand mm2, mm0
2804 pandn mm7, mm4
2805 pandn mm0, mm1
2806 paddw mm7, mm5
2807 paddw mm0, mm2
7f88f624
VZ
2808 /* test ((pa <= pb)? pa:pb) <= pc */
2809 pcmpgtw mm7, mm6 /* pab > pc? */
c6b71bff
GD
2810 pxor mm1, mm1
2811 pand mm3, mm7
2812 pandn mm7, mm0
2813 pxor mm1, mm1
2814 paddw mm7, mm3
2815 pxor mm0, mm0
7f88f624 2816 /* Step ex to next set of 8 bytes and repeat loop til done */
c6b71bff
GD
2817 add ebx, 8
2818 packuswb mm1, mm7
7f88f624 2819 paddb mm1, [edi + ebx - 8] /* add Paeth predictor with Raw(x) */
c6b71bff 2820 cmp ebx, MMXLength
7f88f624
VZ
2821 movq [edi + ebx - 8], mm1 /* write back updated value */
2822 /* mm1 will be used as Raw(x-bpp) next loop */
c6b71bff 2823 jb dpth6lp
7f88f624 2824 } /* end _asm block */
c6b71bff
GD
2825 }
2826 break;
2827
2828 case 4:
2829 {
2830 ActiveMask.use = 0x00000000ffffffff;
2831 _asm {
2832 mov ebx, diff
2833 mov edi, row
2834 mov esi, prev_row
2835 pxor mm0, mm0
7f88f624
VZ
2836 /* PRIME the pump (load the first Raw(x-bpp) data set */
2837 movq mm1, [edi+ebx-8] /* Only time should need to read */
2838 /* a=Raw(x-bpp) bytes */
c6b71bff 2839dpth4lp:
7f88f624
VZ
2840 /* Do first set of 4 bytes */
2841 movq mm3, [esi+ebx-8] /* read c=Prior(x-bpp) bytes */
2842 punpckhbw mm1, mm0 /* Unpack Low bytes of a */
2843 movq mm2, [esi + ebx] /* load b=Prior(x) */
2844 punpcklbw mm2, mm0 /* Unpack High bytes of b */
2845 /* pav = p - a = (a + b - c) - a = b - c */
c6b71bff 2846 movq mm4, mm2
7f88f624
VZ
2847 punpckhbw mm3, mm0 /* Unpack High bytes of c */
2848 /* pbv = p - b = (a + b - c) - b = a - c */
c6b71bff
GD
2849 movq mm5, mm1
2850 psubw mm4, mm3
2851 pxor mm7, mm7
7f88f624 2852 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
c6b71bff
GD
2853 movq mm6, mm4
2854 psubw mm5, mm3
7f88f624
VZ
2855 /* pa = abs(p-a) = abs(pav) */
2856 /* pb = abs(p-b) = abs(pbv) */
2857 /* pc = abs(p-c) = abs(pcv) */
2858 pcmpgtw mm0, mm4 /* Create mask pav bytes < 0 */
c6b71bff 2859 paddw mm6, mm5
7f88f624
VZ
2860 pand mm0, mm4 /* Only pav bytes < 0 in mm7 */
2861 pcmpgtw mm7, mm5 /* Create mask pbv bytes < 0 */
c6b71bff 2862 psubw mm4, mm0
7f88f624 2863 pand mm7, mm5 /* Only pbv bytes < 0 in mm0 */
c6b71bff
GD
2864 psubw mm4, mm0
2865 psubw mm5, mm7
2866 pxor mm0, mm0
7f88f624
VZ
2867 pcmpgtw mm0, mm6 /* Create mask pcv bytes < 0 */
2868 pand mm0, mm6 /* Only pav bytes < 0 in mm7 */
c6b71bff
GD
2869 psubw mm5, mm7
2870 psubw mm6, mm0
7f88f624 2871 /* test pa <= pb */
c6b71bff
GD
2872 movq mm7, mm4
2873 psubw mm6, mm0
7f88f624 2874 pcmpgtw mm7, mm5 /* pa > pb? */
c6b71bff 2875 movq mm0, mm7
7f88f624 2876 /* use mm7 mask to merge pa & pb */
c6b71bff 2877 pand mm5, mm7
7f88f624 2878 /* use mm0 mask copy to merge a & b */
c6b71bff
GD
2879 pand mm2, mm0
2880 pandn mm7, mm4
2881 pandn mm0, mm1
2882 paddw mm7, mm5
2883 paddw mm0, mm2
7f88f624
VZ
2884 /* test ((pa <= pb)? pa:pb) <= pc */
2885 pcmpgtw mm7, mm6 /* pab > pc? */
c6b71bff
GD
2886 pxor mm1, mm1
2887 pand mm3, mm7
2888 pandn mm7, mm0
2889 paddw mm7, mm3
2890 pxor mm0, mm0
2891 packuswb mm7, mm1
7f88f624 2892 movq mm3, [esi + ebx] /* load c=Prior(x-bpp) */
c6b71bff 2893 pand mm7, ActiveMask
7f88f624
VZ
2894 movq mm2, mm3 /* load b=Prior(x) step 1 */
2895 paddb mm7, [edi + ebx] /* add Paeth predictor with Raw(x) */
2896 punpcklbw mm3, mm0 /* Unpack High bytes of c */
2897 movq [edi + ebx], mm7 /* write back updated value */
2898 movq mm1, mm7 /* Now mm1 will be used as Raw(x-bpp) */
2899 /* Do second set of 4 bytes */
2900 punpckhbw mm2, mm0 /* Unpack Low bytes of b */
2901 punpcklbw mm1, mm0 /* Unpack Low bytes of a */
2902 /* pav = p - a = (a + b - c) - a = b - c */
c6b71bff 2903 movq mm4, mm2
7f88f624 2904 /* pbv = p - b = (a + b - c) - b = a - c */
c6b71bff
GD
2905 movq mm5, mm1
2906 psubw mm4, mm3
2907 pxor mm7, mm7
7f88f624 2908 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
c6b71bff
GD
2909 movq mm6, mm4
2910 psubw mm5, mm3
7f88f624
VZ
2911 /* pa = abs(p-a) = abs(pav) */
2912 /* pb = abs(p-b) = abs(pbv) */
2913 /* pc = abs(p-c) = abs(pcv) */
2914 pcmpgtw mm0, mm4 /* Create mask pav bytes < 0 */
c6b71bff 2915 paddw mm6, mm5
7f88f624
VZ
2916 pand mm0, mm4 /* Only pav bytes < 0 in mm7 */
2917 pcmpgtw mm7, mm5 /* Create mask pbv bytes < 0 */
c6b71bff 2918 psubw mm4, mm0
7f88f624 2919 pand mm7, mm5 /* Only pbv bytes < 0 in mm0 */
c6b71bff
GD
2920 psubw mm4, mm0
2921 psubw mm5, mm7
2922 pxor mm0, mm0
7f88f624
VZ
2923 pcmpgtw mm0, mm6 /* Create mask pcv bytes < 0 */
2924 pand mm0, mm6 /* Only pav bytes < 0 in mm7 */
c6b71bff
GD
2925 psubw mm5, mm7
2926 psubw mm6, mm0
7f88f624 2927 /* test pa <= pb */
c6b71bff
GD
2928 movq mm7, mm4
2929 psubw mm6, mm0
7f88f624 2930 pcmpgtw mm7, mm5 /* pa > pb? */
c6b71bff 2931 movq mm0, mm7
7f88f624 2932 /* use mm7 mask to merge pa & pb */
c6b71bff 2933 pand mm5, mm7
7f88f624 2934 /* use mm0 mask copy to merge a & b */
c6b71bff
GD
2935 pand mm2, mm0
2936 pandn mm7, mm4
2937 pandn mm0, mm1
2938 paddw mm7, mm5
2939 paddw mm0, mm2
7f88f624
VZ
2940 /* test ((pa <= pb)? pa:pb) <= pc */
2941 pcmpgtw mm7, mm6 /* pab > pc? */
c6b71bff
GD
2942 pxor mm1, mm1
2943 pand mm3, mm7
2944 pandn mm7, mm0
2945 pxor mm1, mm1
2946 paddw mm7, mm3
2947 pxor mm0, mm0
7f88f624 2948 /* Step ex to next set of 8 bytes and repeat loop til done */
c6b71bff
GD
2949 add ebx, 8
2950 packuswb mm1, mm7
7f88f624 2951 paddb mm1, [edi + ebx - 8] /* add Paeth predictor with Raw(x) */
c6b71bff 2952 cmp ebx, MMXLength
7f88f624
VZ
2953 movq [edi + ebx - 8], mm1 /* write back updated value */
2954 /* mm1 will be used as Raw(x-bpp) next loop */
c6b71bff 2955 jb dpth4lp
7f88f624 2956 } /* end _asm block */
c6b71bff
GD
2957 }
2958 break;
7f88f624 2959 case 8: /* bpp == 8 */
c6b71bff
GD
2960 {
2961 ActiveMask.use = 0x00000000ffffffff;
2962 _asm {
2963 mov ebx, diff
2964 mov edi, row
2965 mov esi, prev_row
2966 pxor mm0, mm0
7f88f624
VZ
2967 /* PRIME the pump (load the first Raw(x-bpp) data set */
2968 movq mm1, [edi+ebx-8] /* Only time should need to read */
2969 /* a=Raw(x-bpp) bytes */
c6b71bff 2970dpth8lp:
7f88f624
VZ
2971 /* Do first set of 4 bytes */
2972 movq mm3, [esi+ebx-8] /* read c=Prior(x-bpp) bytes */
2973 punpcklbw mm1, mm0 /* Unpack Low bytes of a */
2974 movq mm2, [esi + ebx] /* load b=Prior(x) */
2975 punpcklbw mm2, mm0 /* Unpack Low bytes of b */
2976 /* pav = p - a = (a + b - c) - a = b - c */
c6b71bff 2977 movq mm4, mm2
7f88f624
VZ
2978 punpcklbw mm3, mm0 /* Unpack Low bytes of c */
2979 /* pbv = p - b = (a + b - c) - b = a - c */
c6b71bff
GD
2980 movq mm5, mm1
2981 psubw mm4, mm3
2982 pxor mm7, mm7
7f88f624 2983 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
c6b71bff
GD
2984 movq mm6, mm4
2985 psubw mm5, mm3
7f88f624
VZ
2986 /* pa = abs(p-a) = abs(pav) */
2987 /* pb = abs(p-b) = abs(pbv) */
2988 /* pc = abs(p-c) = abs(pcv) */
2989 pcmpgtw mm0, mm4 /* Create mask pav bytes < 0 */
c6b71bff 2990 paddw mm6, mm5
7f88f624
VZ
2991 pand mm0, mm4 /* Only pav bytes < 0 in mm7 */
2992 pcmpgtw mm7, mm5 /* Create mask pbv bytes < 0 */
c6b71bff 2993 psubw mm4, mm0
7f88f624 2994 pand mm7, mm5 /* Only pbv bytes < 0 in mm0 */
c6b71bff
GD
2995 psubw mm4, mm0
2996 psubw mm5, mm7
2997 pxor mm0, mm0
7f88f624
VZ
2998 pcmpgtw mm0, mm6 /* Create mask pcv bytes < 0 */
2999 pand mm0, mm6 /* Only pav bytes < 0 in mm7 */
c6b71bff
GD
3000 psubw mm5, mm7
3001 psubw mm6, mm0
7f88f624 3002 /* test pa <= pb */
c6b71bff
GD
3003 movq mm7, mm4
3004 psubw mm6, mm0
7f88f624 3005 pcmpgtw mm7, mm5 /* pa > pb? */
c6b71bff 3006 movq mm0, mm7
7f88f624 3007 /* use mm7 mask to merge pa & pb */
c6b71bff 3008 pand mm5, mm7
7f88f624 3009 /* use mm0 mask copy to merge a & b */
c6b71bff
GD
3010 pand mm2, mm0
3011 pandn mm7, mm4
3012 pandn mm0, mm1
3013 paddw mm7, mm5
3014 paddw mm0, mm2
7f88f624
VZ
3015 /* test ((pa <= pb)? pa:pb) <= pc */
3016 pcmpgtw mm7, mm6 /* pab > pc? */
c6b71bff
GD
3017 pxor mm1, mm1
3018 pand mm3, mm7
3019 pandn mm7, mm0
3020 paddw mm7, mm3
3021 pxor mm0, mm0
3022 packuswb mm7, mm1
7f88f624 3023 movq mm3, [esi+ebx-8] /* read c=Prior(x-bpp) bytes */
c6b71bff 3024 pand mm7, ActiveMask
7f88f624
VZ
3025 movq mm2, [esi + ebx] /* load b=Prior(x) */
3026 paddb mm7, [edi + ebx] /* add Paeth predictor with Raw(x) */
3027 punpckhbw mm3, mm0 /* Unpack High bytes of c */
3028 movq [edi + ebx], mm7 /* write back updated value */
3029 movq mm1, [edi+ebx-8] /* read a=Raw(x-bpp) bytes */
3030
3031 /* Do second set of 4 bytes */
3032 punpckhbw mm2, mm0 /* Unpack High bytes of b */
3033 punpckhbw mm1, mm0 /* Unpack High bytes of a */
3034 /* pav = p - a = (a + b - c) - a = b - c */
c6b71bff 3035 movq mm4, mm2
7f88f624 3036 /* pbv = p - b = (a + b - c) - b = a - c */
c6b71bff
GD
3037 movq mm5, mm1
3038 psubw mm4, mm3
3039 pxor mm7, mm7
7f88f624 3040 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
c6b71bff
GD
3041 movq mm6, mm4
3042 psubw mm5, mm3
7f88f624
VZ
3043 /* pa = abs(p-a) = abs(pav) */
3044 /* pb = abs(p-b) = abs(pbv) */
3045 /* pc = abs(p-c) = abs(pcv) */
3046 pcmpgtw mm0, mm4 /* Create mask pav bytes < 0 */
c6b71bff 3047 paddw mm6, mm5
7f88f624
VZ
3048 pand mm0, mm4 /* Only pav bytes < 0 in mm7 */
3049 pcmpgtw mm7, mm5 /* Create mask pbv bytes < 0 */
c6b71bff 3050 psubw mm4, mm0
7f88f624 3051 pand mm7, mm5 /* Only pbv bytes < 0 in mm0 */
c6b71bff
GD
3052 psubw mm4, mm0
3053 psubw mm5, mm7
3054 pxor mm0, mm0
7f88f624
VZ
3055 pcmpgtw mm0, mm6 /* Create mask pcv bytes < 0 */
3056 pand mm0, mm6 /* Only pav bytes < 0 in mm7 */
c6b71bff
GD
3057 psubw mm5, mm7
3058 psubw mm6, mm0
7f88f624 3059 /* test pa <= pb */
c6b71bff
GD
3060 movq mm7, mm4
3061 psubw mm6, mm0
7f88f624 3062 pcmpgtw mm7, mm5 /* pa > pb? */
c6b71bff 3063 movq mm0, mm7
7f88f624 3064 /* use mm7 mask to merge pa & pb */
c6b71bff 3065 pand mm5, mm7
7f88f624 3066 /* use mm0 mask copy to merge a & b */
c6b71bff
GD
3067 pand mm2, mm0
3068 pandn mm7, mm4
3069 pandn mm0, mm1
3070 paddw mm7, mm5
3071 paddw mm0, mm2
7f88f624
VZ
3072 /* test ((pa <= pb)? pa:pb) <= pc */
3073 pcmpgtw mm7, mm6 /* pab > pc? */
c6b71bff
GD
3074 pxor mm1, mm1
3075 pand mm3, mm7
3076 pandn mm7, mm0
3077 pxor mm1, mm1
3078 paddw mm7, mm3
3079 pxor mm0, mm0
7f88f624 3080 /* Step ex to next set of 8 bytes and repeat loop til done */
c6b71bff
GD
3081 add ebx, 8
3082 packuswb mm1, mm7
7f88f624 3083 paddb mm1, [edi + ebx - 8] /* add Paeth predictor with Raw(x) */
c6b71bff 3084 cmp ebx, MMXLength
7f88f624
VZ
3085 movq [edi + ebx - 8], mm1 /* write back updated value */
3086 /* mm1 will be used as Raw(x-bpp) next loop */
c6b71bff 3087 jb dpth8lp
7f88f624 3088 } /* end _asm block */
c6b71bff
GD
3089 }
3090 break;
3091
7f88f624
VZ
3092 case 1: /* bpp = 1 */
3093 case 2: /* bpp = 2 */
3094 default: /* bpp > 8 */
c6b71bff
GD
3095 {
3096 _asm {
3097 mov ebx, diff
3098 cmp ebx, FullLength
3099 jnb dpthdend
3100 mov edi, row
3101 mov esi, prev_row
7f88f624 3102 /* Do Paeth decode for remaining bytes */
c6b71bff 3103 mov edx, ebx
7f88f624
VZ
3104 xor ecx, ecx /* zero ecx before using cl & cx in loop below */
3105 sub edx, bpp /* Set edx = ebx - bpp */
c6b71bff
GD
3106dpthdlp:
3107 xor eax, eax
7f88f624
VZ
3108 /* pav = p - a = (a + b - c) - a = b - c */
3109 mov al, [esi + ebx] /* load Prior(x) into al */
3110 mov cl, [esi + edx] /* load Prior(x-bpp) into cl */
3111 sub eax, ecx /* subtract Prior(x-bpp) */
3112 mov patemp, eax /* Save pav for later use */
c6b71bff 3113 xor eax, eax
7f88f624
VZ
3114 /* pbv = p - b = (a + b - c) - b = a - c */
3115 mov al, [edi + edx] /* load Raw(x-bpp) into al */
3116 sub eax, ecx /* subtract Prior(x-bpp) */
c6b71bff 3117 mov ecx, eax
7f88f624
VZ
3118 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
3119 add eax, patemp /* pcv = pav + pbv */
3120 /* pc = abs(pcv) */
c6b71bff
GD
3121 test eax, 0x80000000
3122 jz dpthdpca
7f88f624 3123 neg eax /* reverse sign of neg values */
c6b71bff 3124dpthdpca:
7f88f624
VZ
3125 mov pctemp, eax /* save pc for later use */
3126 /* pb = abs(pbv) */
c6b71bff
GD
3127 test ecx, 0x80000000
3128 jz dpthdpba
7f88f624 3129 neg ecx /* reverse sign of neg values */
c6b71bff 3130dpthdpba:
7f88f624
VZ
3131 mov pbtemp, ecx /* save pb for later use */
3132 /* pa = abs(pav) */
c6b71bff
GD
3133 mov eax, patemp
3134 test eax, 0x80000000
3135 jz dpthdpaa
7f88f624 3136 neg eax /* reverse sign of neg values */
c6b71bff 3137dpthdpaa:
7f88f624
VZ
3138 mov patemp, eax /* save pa for later use */
3139 /* test if pa <= pb */
c6b71bff
GD
3140 cmp eax, ecx
3141 jna dpthdabb
7f88f624 3142 /* pa > pb; now test if pb <= pc */
c6b71bff
GD
3143 cmp ecx, pctemp
3144 jna dpthdbbc
7f88f624
VZ
3145 /* pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp) */
3146 mov cl, [esi + edx] /* load Prior(x-bpp) into cl */
c6b71bff
GD
3147 jmp dpthdpaeth
3148dpthdbbc:
7f88f624
VZ
3149 /* pb <= pc; Raw(x) = Paeth(x) + Prior(x) */
3150 mov cl, [esi + ebx] /* load Prior(x) into cl */
c6b71bff
GD
3151 jmp dpthdpaeth
3152dpthdabb:
7f88f624 3153 /* pa <= pb; now test if pa <= pc */
c6b71bff
GD
3154 cmp eax, pctemp
3155 jna dpthdabc
7f88f624
VZ
3156 /* pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp) */
3157 mov cl, [esi + edx] /* load Prior(x-bpp) into cl */
c6b71bff
GD
3158 jmp dpthdpaeth
3159dpthdabc:
7f88f624
VZ
3160 /* pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp) */
3161 mov cl, [edi + edx] /* load Raw(x-bpp) into cl */
c6b71bff
GD
3162dpthdpaeth:
3163 inc ebx
3164 inc edx
7f88f624 3165 /* Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256 */
c6b71bff
GD
3166 add [edi + ebx - 1], cl
3167 cmp ebx, FullLength
3168 jb dpthdlp
3169dpthdend:
7f88f624 3170 } /* end _asm block */
c6b71bff 3171 }
7f88f624
VZ
3172 return; /* No need to go further with this one */
3173 } /* end switch ( bpp ) */
c6b71bff
GD
3174 _asm
3175 {
7f88f624
VZ
3176 /* MMX acceleration complete now do clean-up */
3177 /* Check if any remaining bytes left to decode */
c6b71bff
GD
3178 mov ebx, MMXLength
3179 cmp ebx, FullLength
3180 jnb dpthend
3181 mov edi, row
3182 mov esi, prev_row
7f88f624 3183 /* Do Paeth decode for remaining bytes */
c6b71bff 3184 mov edx, ebx
7f88f624
VZ
3185 xor ecx, ecx /* zero ecx before using cl & cx in loop below */
3186 sub edx, bpp /* Set edx = ebx - bpp */
c6b71bff
GD
3187dpthlp2:
3188 xor eax, eax
7f88f624
VZ
3189 /* pav = p - a = (a + b - c) - a = b - c */
3190 mov al, [esi + ebx] /* load Prior(x) into al */
3191 mov cl, [esi + edx] /* load Prior(x-bpp) into cl */
3192 sub eax, ecx /* subtract Prior(x-bpp) */
3193 mov patemp, eax /* Save pav for later use */
c6b71bff 3194 xor eax, eax
7f88f624
VZ
3195 /* pbv = p - b = (a + b - c) - b = a - c */
3196 mov al, [edi + edx] /* load Raw(x-bpp) into al */
3197 sub eax, ecx /* subtract Prior(x-bpp) */
c6b71bff 3198 mov ecx, eax
7f88f624
VZ
3199 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
3200 add eax, patemp /* pcv = pav + pbv */
3201 /* pc = abs(pcv) */
c6b71bff
GD
3202 test eax, 0x80000000
3203 jz dpthpca2
7f88f624 3204 neg eax /* reverse sign of neg values */
c6b71bff 3205dpthpca2:
7f88f624
VZ
3206 mov pctemp, eax /* save pc for later use */
3207 /* pb = abs(pbv) */
c6b71bff
GD
3208 test ecx, 0x80000000
3209 jz dpthpba2
7f88f624 3210 neg ecx /* reverse sign of neg values */
c6b71bff 3211dpthpba2:
7f88f624
VZ
3212 mov pbtemp, ecx /* save pb for later use */
3213 /* pa = abs(pav) */
c6b71bff
GD
3214 mov eax, patemp
3215 test eax, 0x80000000
3216 jz dpthpaa2
7f88f624 3217 neg eax /* reverse sign of neg values */
c6b71bff 3218dpthpaa2:
7f88f624
VZ
3219 mov patemp, eax /* save pa for later use */
3220 /* test if pa <= pb */
c6b71bff
GD
3221 cmp eax, ecx
3222 jna dpthabb2
7f88f624 3223 /* pa > pb; now test if pb <= pc */
c6b71bff
GD
3224 cmp ecx, pctemp
3225 jna dpthbbc2
7f88f624
VZ
3226 /* pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp) */
3227 mov cl, [esi + edx] /* load Prior(x-bpp) into cl */
c6b71bff
GD
3228 jmp dpthpaeth2
3229dpthbbc2:
7f88f624
VZ
3230 /* pb <= pc; Raw(x) = Paeth(x) + Prior(x) */
3231 mov cl, [esi + ebx] /* load Prior(x) into cl */
c6b71bff
GD
3232 jmp dpthpaeth2
3233dpthabb2:
7f88f624 3234 /* pa <= pb; now test if pa <= pc */
c6b71bff
GD
3235 cmp eax, pctemp
3236 jna dpthabc2
7f88f624
VZ
3237 /* pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp) */
3238 mov cl, [esi + edx] /* load Prior(x-bpp) into cl */
c6b71bff
GD
3239 jmp dpthpaeth2
3240dpthabc2:
7f88f624
VZ
3241 /* pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp) */
3242 mov cl, [edi + edx] /* load Raw(x-bpp) into cl */
c6b71bff
GD
3243dpthpaeth2:
3244 inc ebx
3245 inc edx
7f88f624 3246 /* Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256 */
c6b71bff
GD
3247 add [edi + ebx - 1], cl
3248 cmp ebx, FullLength
3249 jb dpthlp2
3250dpthend:
7f88f624
VZ
3251 emms /* End MMX instructions; prep for possible FP instrs. */
3252 } /* end _asm block */
c6b71bff
GD
3253}
3254
7f88f624 3255/* Optimized code for PNG Sub filter decoder */
c6b71bff
GD
3256void /* PRIVATE */
3257png_read_filter_row_mmx_sub(png_row_infop row_info, png_bytep row)
3258{
7f88f624 3259 /*int test; */
c6b71bff
GD
3260 int bpp;
3261 png_uint_32 FullLength;
3262 png_uint_32 MMXLength;
3263 int diff;
3264
7f88f624
VZ
3265 bpp = (row_info->pixel_depth + 7) >> 3; /* Get # bytes per pixel */
3266 FullLength = row_info->rowbytes - bpp; /* # of bytes to filter */
c6b71bff
GD
3267 _asm {
3268 mov edi, row
7f88f624
VZ
3269 mov esi, edi /* lp = row */
3270 add edi, bpp /* rp = row + bpp */
c6b71bff 3271 xor eax, eax
7f88f624
VZ
3272 /* get # of bytes to alignment */
3273 mov diff, edi /* take start of row */
3274 add diff, 0xf /* add 7 + 8 to incr past */
3275 /* alignment boundary */
c6b71bff 3276 xor ebx, ebx
7f88f624
VZ
3277 and diff, 0xfffffff8 /* mask to alignment boundary */
3278 sub diff, edi /* subtract from start ==> value */
3279 /* ebx at alignment */
c6b71bff 3280 jz dsubgo
7f88f624 3281 /* fix alignment */
c6b71bff
GD
3282dsublp1:
3283 mov al, [esi+ebx]
3284 add [edi+ebx], al
3285 inc ebx
3286 cmp ebx, diff
3287 jb dsublp1
3288dsubgo:
3289 mov ecx, FullLength
3290 mov edx, ecx
7f88f624
VZ
3291 sub edx, ebx /* subtract alignment fix */
3292 and edx, 0x00000007 /* calc bytes over mult of 8 */
3293 sub ecx, edx /* drop over bytes from length */
c6b71bff 3294 mov MMXLength, ecx
7f88f624 3295 } /* end _asm block */
c6b71bff 3296
7f88f624 3297 /* Now do the math for the rest of the row */
c6b71bff
GD
3298 switch ( bpp )
3299 {
3300 case 3:
3301 {
3302 ActiveMask.use = 0x0000ffffff000000;
7f88f624
VZ
3303 ShiftBpp.use = 24; /* == 3 * 8 */
3304 ShiftRem.use = 40; /* == 64 - 24 */
c6b71bff
GD
3305 _asm {
3306 mov edi, row
7f88f624
VZ
3307 movq mm7, ActiveMask /* Load ActiveMask for 2nd active byte group */
3308 mov esi, edi /* lp = row */
3309 add edi, bpp /* rp = row + bpp */
c6b71bff
GD
3310 movq mm6, mm7
3311 mov ebx, diff
7f88f624
VZ
3312 psllq mm6, ShiftBpp /* Move mask in mm6 to cover 3rd active */
3313 /* byte group */
3314 /* PRIME the pump (load the first Raw(x-bpp) data set */
c6b71bff
GD
3315 movq mm1, [edi+ebx-8]
3316dsub3lp:
7f88f624
VZ
3317 psrlq mm1, ShiftRem /* Shift data for adding 1st bpp bytes */
3318 /* no need for mask; shift clears inactive bytes */
3319 /* Add 1st active group */
c6b71bff
GD
3320 movq mm0, [edi+ebx]
3321 paddb mm0, mm1
7f88f624
VZ
3322 /* Add 2nd active group */
3323 movq mm1, mm0 /* mov updated Raws to mm1 */
3324 psllq mm1, ShiftBpp /* shift data to position correctly */
3325 pand mm1, mm7 /* mask to use only 2nd active group */
c6b71bff 3326 paddb mm0, mm1
7f88f624
VZ
3327 /* Add 3rd active group */
3328 movq mm1, mm0 /* mov updated Raws to mm1 */
3329 psllq mm1, ShiftBpp /* shift data to position correctly */
3330 pand mm1, mm6 /* mask to use only 3rd active group */
c6b71bff
GD
3331 add ebx, 8
3332 paddb mm0, mm1
3333 cmp ebx, MMXLength
7f88f624
VZ
3334 movq [edi+ebx-8], mm0 /* Write updated Raws back to array */
3335 /* Prep for doing 1st add at top of loop */
c6b71bff
GD
3336 movq mm1, mm0
3337 jb dsub3lp
7f88f624 3338 } /* end _asm block */
c6b71bff
GD
3339 }
3340 break;
3341
3342 case 1:
3343 {
7f88f624
VZ
3344 /* Placed here just in case this is a duplicate of the */
3345 /* non-MMX code for the SUB filter in png_read_filter_row below */
c6b71bff 3346 //
7f88f624
VZ
3347 /* png_bytep rp; */
3348 /* png_bytep lp; */
3349 /* png_uint_32 i; */
3350 /* bpp = (row_info->pixel_depth + 7) >> 3; */
3351 /* for (i = (png_uint_32)bpp, rp = row + bpp, lp = row; */
3352 /* i < row_info->rowbytes; i++, rp++, lp++) */
3353 /* { */
3354 /* *rp = (png_byte)(((int)(*rp) + (int)(*lp)) & 0xff); */
3355 /* } */
c6b71bff
GD
3356 _asm {
3357 mov ebx, diff
3358 mov edi, row
3359 cmp ebx, FullLength
3360 jnb dsub1end
7f88f624 3361 mov esi, edi /* lp = row */
c6b71bff 3362 xor eax, eax
7f88f624 3363 add edi, bpp /* rp = row + bpp */
c6b71bff
GD
3364dsub1lp:
3365 mov al, [esi+ebx]
3366 add [edi+ebx], al
3367 inc ebx
3368 cmp ebx, FullLength
3369 jb dsub1lp
3370dsub1end:
7f88f624 3371 } /* end _asm block */
c6b71bff
GD
3372 }
3373 return;
3374
3375 case 6:
3376 case 7:
3377 case 4:
3378 case 5:
3379 {
3380 ShiftBpp.use = bpp << 3;
3381 ShiftRem.use = 64 - ShiftBpp.use;
3382 _asm {
3383 mov edi, row
3384 mov ebx, diff
7f88f624
VZ
3385 mov esi, edi /* lp = row */
3386 add edi, bpp /* rp = row + bpp */
3387 /* PRIME the pump (load the first Raw(x-bpp) data set */
c6b71bff
GD
3388 movq mm1, [edi+ebx-8]
3389dsub4lp:
7f88f624
VZ
3390 psrlq mm1, ShiftRem /* Shift data for adding 1st bpp bytes */
3391 /* no need for mask; shift clears inactive bytes */
c6b71bff
GD
3392 movq mm0, [edi+ebx]
3393 paddb mm0, mm1
7f88f624
VZ
3394 /* Add 2nd active group */
3395 movq mm1, mm0 /* mov updated Raws to mm1 */
3396 psllq mm1, ShiftBpp /* shift data to position correctly */
3397 /* there is no need for any mask */
3398 /* since shift clears inactive bits/bytes */
c6b71bff
GD
3399 add ebx, 8
3400 paddb mm0, mm1
3401 cmp ebx, MMXLength
3402 movq [edi+ebx-8], mm0
7f88f624 3403 movq mm1, mm0 /* Prep for doing 1st add at top of loop */
c6b71bff 3404 jb dsub4lp
7f88f624 3405 } /* end _asm block */
c6b71bff
GD
3406 }
3407 break;
3408
3409 case 2:
3410 {
3411 ActiveMask.use = 0x00000000ffff0000;
7f88f624
VZ
3412 ShiftBpp.use = 16; /* == 2 * 8 */
3413 ShiftRem.use = 48; /* == 64 - 16 */
c6b71bff 3414 _asm {
7f88f624 3415 movq mm7, ActiveMask /* Load ActiveMask for 2nd active byte group */
c6b71bff
GD
3416 mov ebx, diff
3417 movq mm6, mm7
3418 mov edi, row
7f88f624
VZ
3419 psllq mm6, ShiftBpp /* Move mask in mm6 to cover 3rd active */
3420 /* byte group */
3421 mov esi, edi /* lp = row */
c6b71bff 3422 movq mm5, mm6
7f88f624
VZ
3423 add edi, bpp /* rp = row + bpp */
3424 psllq mm5, ShiftBpp /* Move mask in mm5 to cover 4th active */
3425 /* byte group */
3426 /* PRIME the pump (load the first Raw(x-bpp) data set */
c6b71bff
GD
3427 movq mm1, [edi+ebx-8]
3428dsub2lp:
7f88f624
VZ
3429 /* Add 1st active group */
3430 psrlq mm1, ShiftRem /* Shift data for adding 1st bpp bytes */
3431 /* no need for mask; shift clears inactive */
3432 /* bytes */
c6b71bff
GD
3433 movq mm0, [edi+ebx]
3434 paddb mm0, mm1
7f88f624
VZ
3435 /* Add 2nd active group */
3436 movq mm1, mm0 /* mov updated Raws to mm1 */
3437 psllq mm1, ShiftBpp /* shift data to position correctly */
3438 pand mm1, mm7 /* mask to use only 2nd active group */
c6b71bff 3439 paddb mm0, mm1
7f88f624
VZ
3440 /* Add 3rd active group */
3441 movq mm1, mm0 /* mov updated Raws to mm1 */
3442 psllq mm1, ShiftBpp /* shift data to position correctly */
3443 pand mm1, mm6 /* mask to use only 3rd active group */
c6b71bff 3444 paddb mm0, mm1
7f88f624
VZ
3445 /* Add 4th active group */
3446 movq mm1, mm0 /* mov updated Raws to mm1 */
3447 psllq mm1, ShiftBpp /* shift data to position correctly */
3448 pand mm1, mm5 /* mask to use only 4th active group */
c6b71bff
GD
3449 add ebx, 8
3450 paddb mm0, mm1
3451 cmp ebx, MMXLength
7f88f624
VZ
3452 movq [edi+ebx-8], mm0 /* Write updated Raws back to array */
3453 movq mm1, mm0 /* Prep for doing 1st add at top of loop */
c6b71bff 3454 jb dsub2lp
7f88f624 3455 } /* end _asm block */
c6b71bff
GD
3456 }
3457 break;
3458 case 8:
3459 {
3460 _asm {
3461 mov edi, row
3462 mov ebx, diff
7f88f624
VZ
3463 mov esi, edi /* lp = row */
3464 add edi, bpp /* rp = row + bpp */
c6b71bff 3465 mov ecx, MMXLength
7f88f624
VZ
3466 movq mm7, [edi+ebx-8] /* PRIME the pump (load the first */
3467 /* Raw(x-bpp) data set */
3468 and ecx, 0x0000003f /* calc bytes over mult of 64 */
c6b71bff 3469dsub8lp:
7f88f624 3470 movq mm0, [edi+ebx] /* Load Sub(x) for 1st 8 bytes */
c6b71bff 3471 paddb mm0, mm7
7f88f624
VZ
3472 movq mm1, [edi+ebx+8] /* Load Sub(x) for 2nd 8 bytes */
3473 movq [edi+ebx], mm0 /* Write Raw(x) for 1st 8 bytes */
3474 /* Now mm0 will be used as Raw(x-bpp) for */
3475 /* the 2nd group of 8 bytes. This will be */
3476 /* repeated for each group of 8 bytes with */
3477 /* the 8th group being used as the Raw(x-bpp) */
3478 /* for the 1st group of the next loop. */
c6b71bff 3479 paddb mm1, mm0
7f88f624
VZ
3480 movq mm2, [edi+ebx+16] /* Load Sub(x) for 3rd 8 bytes */
3481 movq [edi+ebx+8], mm1 /* Write Raw(x) for 2nd 8 bytes */
c6b71bff 3482 paddb mm2, mm1
7f88f624
VZ
3483 movq mm3, [edi+ebx+24] /* Load Sub(x) for 4th 8 bytes */
3484 movq [edi+ebx+16], mm2 /* Write Raw(x) for 3rd 8 bytes */
c6b71bff 3485 paddb mm3, mm2
7f88f624
VZ
3486 movq mm4, [edi+ebx+32] /* Load Sub(x) for 5th 8 bytes */
3487 movq [edi+ebx+24], mm3 /* Write Raw(x) for 4th 8 bytes */
c6b71bff 3488 paddb mm4, mm3
7f88f624
VZ
3489 movq mm5, [edi+ebx+40] /* Load Sub(x) for 6th 8 bytes */
3490 movq [edi+ebx+32], mm4 /* Write Raw(x) for 5th 8 bytes */
c6b71bff 3491 paddb mm5, mm4
7f88f624
VZ
3492 movq mm6, [edi+ebx+48] /* Load Sub(x) for 7th 8 bytes */
3493 movq [edi+ebx+40], mm5 /* Write Raw(x) for 6th 8 bytes */
c6b71bff 3494 paddb mm6, mm5
7f88f624
VZ
3495 movq mm7, [edi+ebx+56] /* Load Sub(x) for 8th 8 bytes */
3496 movq [edi+ebx+48], mm6 /* Write Raw(x) for 7th 8 bytes */
c6b71bff
GD
3497 add ebx, 64
3498 paddb mm7, mm6
3499 cmp ebx, ecx
7f88f624 3500 movq [edi+ebx-8], mm7 /* Write Raw(x) for 8th 8 bytes */
c6b71bff
GD
3501 jb dsub8lp
3502 cmp ebx, MMXLength
3503 jnb dsub8lt8
3504dsub8lpA:
3505 movq mm0, [edi+ebx]
3506 add ebx, 8
3507 paddb mm0, mm7
3508 cmp ebx, MMXLength
7f88f624
VZ
3509 movq [edi+ebx-8], mm0 /* use -8 to offset early add to ebx */
3510 movq mm7, mm0 /* Move calculated Raw(x) data to mm1 to */
3511 /* be the new Raw(x-bpp) for the next loop */
c6b71bff
GD
3512 jb dsub8lpA
3513dsub8lt8:
7f88f624 3514 } /* end _asm block */
c6b71bff
GD
3515 }
3516 break;
3517
7f88f624 3518 default: /* bpp greater than 8 bytes */
c6b71bff
GD
3519 {
3520 _asm {
3521 mov ebx, diff
3522 mov edi, row
7f88f624
VZ
3523 mov esi, edi /* lp = row */
3524 add edi, bpp /* rp = row + bpp */
c6b71bff
GD
3525dsubAlp:
3526 movq mm0, [edi+ebx]
3527 movq mm1, [esi+ebx]
3528 add ebx, 8
3529 paddb mm0, mm1
3530 cmp ebx, MMXLength
7f88f624
VZ
3531 movq [edi+ebx-8], mm0 /* mov does not affect flags; -8 to offset */
3532 /* add ebx */
c6b71bff 3533 jb dsubAlp
7f88f624 3534 } /* end _asm block */
c6b71bff
GD
3535 }
3536 break;
3537
7f88f624 3538 } /* end switch ( bpp ) */
c6b71bff
GD
3539
3540 _asm {
3541 mov ebx, MMXLength
3542 mov edi, row
3543 cmp ebx, FullLength
3544 jnb dsubend
7f88f624 3545 mov esi, edi /* lp = row */
c6b71bff 3546 xor eax, eax
7f88f624 3547 add edi, bpp /* rp = row + bpp */
c6b71bff
GD
3548dsublp2:
3549 mov al, [esi+ebx]
3550 add [edi+ebx], al
3551 inc ebx
3552 cmp ebx, FullLength
3553 jb dsublp2
3554dsubend:
7f88f624
VZ
3555 emms /* End MMX instructions; prep for possible FP instrs. */
3556 } /* end _asm block */
c6b71bff
GD
3557}
3558
7f88f624 3559/* Optimized code for PNG Up filter decoder */
c6b71bff
GD
3560void /* PRIVATE */
3561png_read_filter_row_mmx_up(png_row_infop row_info, png_bytep row,
3562 png_bytep prev_row)
3563{
3564 png_uint_32 len;
7f88f624 3565 len = row_info->rowbytes; /* # of bytes to filter */
c6b71bff
GD
3566 _asm {
3567 mov edi, row
7f88f624 3568 /* get # of bytes to alignment */
c6b71bff
GD
3569 mov ecx, edi
3570 xor ebx, ebx
3571 add ecx, 0x7
3572 xor eax, eax
3573 and ecx, 0xfffffff8
3574 mov esi, prev_row
3575 sub ecx, edi
3576 jz dupgo
7f88f624 3577 /* fix alignment */
c6b71bff
GD
3578duplp1:
3579 mov al, [edi+ebx]
3580 add al, [esi+ebx]
3581 inc ebx
3582 cmp ebx, ecx
7f88f624 3583 mov [edi + ebx-1], al /* mov does not affect flags; -1 to offset inc ebx */
c6b71bff
GD
3584 jb duplp1
3585dupgo:
3586 mov ecx, len
3587 mov edx, ecx
7f88f624
VZ
3588 sub edx, ebx /* subtract alignment fix */
3589 and edx, 0x0000003f /* calc bytes over mult of 64 */
3590 sub ecx, edx /* drop over bytes from length */
3591 /* Unrolled loop - use all MMX registers and interleave to reduce */
3592 /* number of branch instructions (loops) and reduce partial stalls */
c6b71bff
GD
3593duploop:
3594 movq mm1, [esi+ebx]
3595 movq mm0, [edi+ebx]
3596 movq mm3, [esi+ebx+8]
3597 paddb mm0, mm1
3598 movq mm2, [edi+ebx+8]
3599 movq [edi+ebx], mm0
3600 paddb mm2, mm3
3601 movq mm5, [esi+ebx+16]
3602 movq [edi+ebx+8], mm2
3603 movq mm4, [edi+ebx+16]
3604 movq mm7, [esi+ebx+24]
3605 paddb mm4, mm5
3606 movq mm6, [edi+ebx+24]
3607 movq [edi+ebx+16], mm4
3608 paddb mm6, mm7
3609 movq mm1, [esi+ebx+32]
3610 movq [edi+ebx+24], mm6
3611 movq mm0, [edi+ebx+32]
3612 movq mm3, [esi+ebx+40]
3613 paddb mm0, mm1
3614 movq mm2, [edi+ebx+40]
3615 movq [edi+ebx+32], mm0
3616 paddb mm2, mm3
3617 movq mm5, [esi+ebx+48]
3618 movq [edi+ebx+40], mm2
3619 movq mm4, [edi+ebx+48]
3620 movq mm7, [esi+ebx+56]
3621 paddb mm4, mm5
3622 movq mm6, [edi+ebx+56]
3623 movq [edi+ebx+48], mm4
3624 add ebx, 64
3625 paddb mm6, mm7
3626 cmp ebx, ecx
7f88f624
VZ
3627 movq [edi+ebx-8], mm6 /* (+56)movq does not affect flags; */
3628 /* -8 to offset add ebx */
c6b71bff
GD
3629 jb duploop
3630
7f88f624 3631 cmp edx, 0 /* Test for bytes over mult of 64 */
c6b71bff
GD
3632 jz dupend
3633
3634
7f88f624
VZ
3635 /* 2 lines added by lcreeve@netins.net */
3636 /* (mail 11 Jul 98 in png-implement list) */
3637 cmp edx, 8 /*test for less than 8 bytes */
c6b71bff
GD
3638 jb duplt8
3639
3640
3641 add ecx, edx
7f88f624
VZ
3642 and edx, 0x00000007 /* calc bytes over mult of 8 */
3643 sub ecx, edx /* drop over bytes from length */
c6b71bff 3644 jz duplt8
7f88f624 3645 /* Loop using MMX registers mm0 & mm1 to update 8 bytes simultaneously */
c6b71bff
GD
3646duplpA:
3647 movq mm1, [esi+ebx]
3648 movq mm0, [edi+ebx]
3649 add ebx, 8
3650 paddb mm0, mm1
3651 cmp ebx, ecx
7f88f624 3652 movq [edi+ebx-8], mm0 /* movq does not affect flags; -8 to offset add ebx */
c6b71bff 3653 jb duplpA
7f88f624 3654 cmp edx, 0 /* Test for bytes over mult of 8 */
c6b71bff
GD
3655 jz dupend
3656duplt8:
3657 xor eax, eax
7f88f624
VZ
3658 add ecx, edx /* move over byte count into counter */
3659 /* Loop using x86 registers to update remaining bytes */
c6b71bff
GD
3660duplp2:
3661 mov al, [edi + ebx]
3662 add al, [esi + ebx]
3663 inc ebx
3664 cmp ebx, ecx
7f88f624 3665 mov [edi + ebx-1], al /* mov does not affect flags; -1 to offset inc ebx */
c6b71bff
GD
3666 jb duplp2
3667dupend:
7f88f624
VZ
3668 /* Conversion of filtered row completed */
3669 emms /* End MMX instructions; prep for possible FP instrs. */
3670 } /* end _asm block */
c6b71bff
GD
3671}
3672
3673
7f88f624 3674/* Optimized png_read_filter_row routines */
c6b71bff
GD
3675void /* PRIVATE */
3676png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep
3677 row, png_bytep prev_row, int filter)
3678{
3679#ifdef PNG_DEBUG
3680 char filnm[10];
3681#endif
3682
3683 if (mmx_supported == 2) {
5b02c8a1 3684#if !defined(PNG_1_0_X)
c6b71bff
GD
3685 /* this should have happened in png_init_mmx_flags() already */
3686 png_warning(png_ptr, "asm_flags may not have been initialized");
5b02c8a1 3687#endif
c6b71bff
GD
3688 png_mmx_support();
3689 }
3690
3691#ifdef PNG_DEBUG
3692 png_debug(1, "in png_read_filter_row\n");
3693 switch (filter)
3694 {
3695 case 0: sprintf(filnm, "none");
3696 break;
5b02c8a1 3697#if !defined(PNG_1_0_X)
c6b71bff
GD
3698 case 1: sprintf(filnm, "sub-%s",
3699 (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB)? "MMX" : "x86");
3700 break;
3701 case 2: sprintf(filnm, "up-%s",
3702 (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP)? "MMX" : "x86");
3703 break;
3704 case 3: sprintf(filnm, "avg-%s",
3705 (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG)? "MMX" : "x86");
3706 break;
3707 case 4: sprintf(filnm, "Paeth-%s",
3708 (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH)? "MMX":"x86");
3709 break;
5b02c8a1
VS
3710#else
3711 case 1: sprintf(filnm, "sub");
3712 break;
3713 case 2: sprintf(filnm, "up");
3714 break;
3715 case 3: sprintf(filnm, "avg");
3716 break;
3717 case 4: sprintf(filnm, "Paeth");
3718 break;
3719#endif
c6b71bff
GD
3720 default: sprintf(filnm, "unknw");
3721 break;
3722 }
3723 png_debug2(0,"row=%5d, %s, ", png_ptr->row_number, filnm);
3724 png_debug2(0, "pd=%2d, b=%d, ", (int)row_info->pixel_depth,
3725 (int)((row_info->pixel_depth + 7) >> 3));
3726 png_debug1(0,"len=%8d, ", row_info->rowbytes);
3727#endif /* PNG_DEBUG */
3728
3729 switch (filter)
3730 {
3731 case PNG_FILTER_VALUE_NONE:
3732 break;
3733
3734 case PNG_FILTER_VALUE_SUB:
3735 {
5b02c8a1 3736#if !defined(PNG_1_0_X)
c6b71bff
GD
3737 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB) &&
3738 (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
3739 (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
5b02c8a1
VS
3740#else
3741 if (mmx_supported)
3742#endif
c6b71bff
GD
3743 {
3744 png_read_filter_row_mmx_sub(row_info, row);
3745 }
3746 else
3747 {
3748 png_uint_32 i;
3749 png_uint_32 istop = row_info->rowbytes;
3750 png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
3751 png_bytep rp = row + bpp;
3752 png_bytep lp = row;
3753
3754 for (i = bpp; i < istop; i++)
3755 {
3756 *rp = (png_byte)(((int)(*rp) + (int)(*lp++)) & 0xff);
3757 rp++;
3758 }
3759 }
3760 break;
3761 }
3762
3763 case PNG_FILTER_VALUE_UP:
3764 {
5b02c8a1 3765#if !defined(PNG_1_0_X)
c6b71bff
GD
3766 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP) &&
3767 (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
3768 (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
5b02c8a1
VS
3769#else
3770 if (mmx_supported)
3771#endif
c6b71bff
GD
3772 {
3773 png_read_filter_row_mmx_up(row_info, row, prev_row);
3774 }
3775 else
3776 {
3777 png_uint_32 i;
3778 png_uint_32 istop = row_info->rowbytes;
3779 png_bytep rp = row;
3780 png_bytep pp = prev_row;
3781
3782 for (i = 0; i < istop; ++i)
3783 {
3784 *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
3785 rp++;
3786 }
3787 }
3788 break;
3789 }
3790
3791 case PNG_FILTER_VALUE_AVG:
3792 {
5b02c8a1 3793#if !defined(PNG_1_0_X)
c6b71bff
GD
3794 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG) &&
3795 (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
3796 (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
5b02c8a1
VS
3797#else
3798 if (mmx_supported)
3799#endif
c6b71bff
GD
3800 {
3801 png_read_filter_row_mmx_avg(row_info, row, prev_row);
3802 }
3803 else
3804 {
3805 png_uint_32 i;
3806 png_bytep rp = row;
3807 png_bytep pp = prev_row;
3808 png_bytep lp = row;
3809 png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
3810 png_uint_32 istop = row_info->rowbytes - bpp;
3811
3812 for (i = 0; i < bpp; i++)
3813 {
3814 *rp = (png_byte)(((int)(*rp) +
3815 ((int)(*pp++) >> 1)) & 0xff);
3816 rp++;
3817 }
3818
3819 for (i = 0; i < istop; i++)
3820 {
3821 *rp = (png_byte)(((int)(*rp) +
3822 ((int)(*pp++ + *lp++) >> 1)) & 0xff);
3823 rp++;
3824 }
3825 }
3826 break;
3827 }
3828
3829 case PNG_FILTER_VALUE_PAETH:
3830 {
5b02c8a1 3831#if !defined(PNG_1_0_X)
c6b71bff
GD
3832 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH) &&
3833 (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
3834 (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
5b02c8a1
VS
3835#else
3836 if (mmx_supported)
3837#endif
c6b71bff
GD
3838 {
3839 png_read_filter_row_mmx_paeth(row_info, row, prev_row);
3840 }
3841 else
3842 {
3843 png_uint_32 i;
3844 png_bytep rp = row;
3845 png_bytep pp = prev_row;
3846 png_bytep lp = row;
3847 png_bytep cp = prev_row;
3848 png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
3849 png_uint_32 istop=row_info->rowbytes - bpp;
3850
3851 for (i = 0; i < bpp; i++)
3852 {
3853 *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
3854 rp++;
3855 }
3856
7f88f624 3857 for (i = 0; i < istop; i++) /* use leftover rp,pp */
c6b71bff
GD
3858 {
3859 int a, b, c, pa, pb, pc, p;
3860
3861 a = *lp++;
3862 b = *pp++;
3863 c = *cp++;
3864
3865 p = b - c;
3866 pc = a - c;
3867
3868#ifdef PNG_USE_ABS
3869 pa = abs(p);
3870 pb = abs(pc);
3871 pc = abs(p + pc);
3872#else
3873 pa = p < 0 ? -p : p;
3874 pb = pc < 0 ? -pc : pc;
3875 pc = (p + pc) < 0 ? -(p + pc) : p + pc;
3876#endif
3877
3878 /*
3879 if (pa <= pb && pa <= pc)
3880 p = a;
3881 else if (pb <= pc)
3882 p = b;
3883 else
3884 p = c;
3885 */
3886
3887 p = (pa <= pb && pa <=pc) ? a : (pb <= pc) ? b : c;
3888
3889 *rp = (png_byte)(((int)(*rp) + p) & 0xff);
3890 rp++;
3891 }
3892 }
3893 break;
3894 }
3895
3896 default:
3897 png_warning(png_ptr, "Ignoring bad row filter type");
3898 *row=0;
3899 break;
3900 }
3901}
3902
3903#endif /* PNG_ASSEMBLER_CODE_SUPPORTED && PNG_USE_PNGVCRD */